Skip to content

Commit

Permalink
[1.26] Backport Extractor Regex Replace Functionality (#315)
Browse files Browse the repository at this point in the history
* Add regex replace functionality to transformation filter extractors [Revised] (#309)

* Add regex replace functionality to transformation filter extractors (#301)

* initial extractor implementation of replace functionality

* minor changes, testing against mergeExtractorsToBody and extraction callback

* add changelog entry

* update API to use new mode selector

update transformation_filter proto

* minor updates to comments in transformation_filter.proto

* remove existing references to no-longer-existing replace_all setting

* update replacement_text_ to a std::optional<std::string>

* remove duplicate mode enum

* update comment indicating that subgroup should never exceed regex_result size

* add AttemptReplaceFromNoMatchNonNilSubgroup test

* prevent string reallocation

* remove unnecessary if block + variable in replaceAllValues

* clean up new tests

* inline replacement_text in inja_transformer_test.cc

* more test cleanup

* update function signatures, remove replaced_value_

* support dynamic metadata as extractor input

* update changelog location

* add API changes to go with 3175ca9

* revert support for dynamic metadata as an extractor input 3175ca9 and e2668be

* refactor calls to extract/replace

* rename replace to extractDestructive, add breaks to switch statement

* update data types to match updated function signatures in inja_transformer_test.cc

* respond to review comments

* update changelog location

* update changelog location

* separate destructive extractors and non-destructive extractors

* fix match_not_null edge case

* update inline documentation for new proto field

* add test demonstrating use of format specifiers

* update REPLACE_ALL mode to return input on no match

* return input on no match in single replace case

* update changelog location
  • Loading branch information
ben-taussig-solo authored Mar 13, 2024
1 parent 0bae991 commit 52b93ad
Show file tree
Hide file tree
Showing 7 changed files with 832 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,19 @@ message Transformation {
// Extractions can be used to extract information from the request/response.
// The extracted information can then be referenced in template fields.
message Extraction {
// The mode of operation for the extraction.
// Selects whether the extractor reads a value out of the source (EXTRACT) or
// produces a modified copy of the source (SINGLE_REPLACE, REPLACE_ALL).
enum Mode {
// Default mode. Extract the value of the subgroup-th capturing group.
EXTRACT = 0;
// Replace the value of the subgroup-th capturing group with the replacement_text.
// Note: replacement_text must be set for this mode; configuration is rejected otherwise.
SINGLE_REPLACE = 1;
// Replace all matches of the regex in the source with the replacement_text.
// Note: replacement_text must be set for this mode; configuration is rejected otherwise.
// Note: subgroup is not used in this mode. Configuration is rejected if subgroup is set to a non-zero value.
// Note: Restrictions on the regex are different for this mode. See the regex field for more details.
REPLACE_ALL = 2;
}

// The source of the extraction
oneof source {
Expand All @@ -162,15 +175,37 @@ message Extraction {
google.protobuf.Empty body = 4;
}

// Only strings matching this regular expression will be part of the
// extraction. The most simple value for this field is '.*', which matches the
// whole source. The field is required. If extraction fails the result is an
// empty value.
// The regex field specifies the regular expression used for matching against the source content. This field is required.
// - In EXTRACT mode, the entire source must match the regex. The subgroup-th capturing group,
// if specified, determines which part of the match is extracted. If the regex does not match the source,
// the result of the extraction will be an empty value.
// - In SINGLE_REPLACE mode, the regex must also match the entire source. The subgroup-th capturing group
// is targeted for replacement with the replacement_text. If the regex does not match the source,
// the result of the extraction will be the source itself.
// - In REPLACE_ALL mode, the regex is applied repeatedly to find all occurrences within the source that match.
// Each matching occurrence is replaced with the replacement_text, and the subgroup field is not used. If the
// regex does not match the source, the result of the extraction will be the source itself.
string regex = 2;

// If your regex contains capturing groups, use this field to determine which
// group should be selected.
// For EXTRACT and SINGLE_REPLACE, refers to the portion of the text
// to extract/replace.
// Config will be rejected if this is specified in REPLACE_ALL mode.
uint32 subgroup = 3;

// Used in SINGLE_REPLACE and REPLACE_ALL modes.
// `replacement_text` is the text substituted for matched sequences in the input string.
// - In SINGLE_REPLACE mode, the content in the subgroup-th capturing group is replaced with the `replacement_text`.
// The text is inserted as-is in this mode.
// - In REPLACE_ALL mode, each sequence matching the specified regex in the input is replaced with the `replacement_text`.
// In this mode the replacement_text may contain special syntax, such as $1, $2, etc., to refer to captured groups
// within the regular expression, because the value is passed to std::regex_replace as the replacement string.
// See https://en.cppreference.com/w/cpp/regex/regex_replace for more details.
// The value contained within `replacement_text` is always treated as a string.
google.protobuf.StringValue replacement_text = 5;

// The mode of operation for the extraction.
// Defaults to EXTRACT.
Mode mode = 6;
}

// Defines a transformation template.
Expand Down
7 changes: 7 additions & 0 deletions changelog/v1.26.7-patch2/extractor_regex_replace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
changelog:
- type: NEW_FEATURE
resolvesIssue: false
issueLink: https://github.com/solo-io/gloo/issues/8706
description: >
Update transformation filter extractors to support regex
replace/replace all operations on extracted values.
180 changes: 170 additions & 10 deletions source/extensions/filters/http/transformation/inja_transformer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,9 @@ getHeader(const Http::RequestOrResponseHeaderMap &header_map,
Extractor::Extractor(const envoy::api::v2::filter::http::Extraction &extractor)
: headername_(extractor.header()), body_(extractor.has_body()),
group_(extractor.subgroup()),
extract_regex_(Regex::Utility::parseStdRegex(extractor.regex())) {
extract_regex_(Regex::Utility::parseStdRegex(extractor.regex())),
replacement_text_(extractor.has_replacement_text() ? std::make_optional(extractor.replacement_text().value()) : std::nullopt),
mode_(extractor.mode()) {
// mark count == number of sub groups, and we need to add one for match number
// 0 so we test for < instead of <= see:
// http://www.cplusplus.com/reference/regex/basic_regex/mark_count/
Expand All @@ -64,6 +66,26 @@ Extractor::Extractor(const envoy::api::v2::filter::http::Extraction &extractor)
fmt::format("group {} requested for regex with only {} sub groups",
group_, extract_regex_.mark_count()));
}

switch (mode_) {
case ExtractionApi::EXTRACT:
break;
case ExtractionApi::SINGLE_REPLACE:
if (!replacement_text_.has_value()) {
throw EnvoyException("SINGLE_REPLACE mode set but no replacement text provided");
}
break;
case ExtractionApi::REPLACE_ALL:
if (!replacement_text_.has_value()) {
throw EnvoyException("REPLACE_ALL mode set but no replacement text provided");
}
if (group_ != 0) {
throw EnvoyException("REPLACE_ALL mode set but subgroup is not 0");
}
break;
default:
throw EnvoyException("Unknown mode");
}
}

absl::string_view
Expand All @@ -83,6 +105,37 @@ Extractor::extract(Http::StreamFilterCallbacks &callbacks,
}
}

// Produces a modified copy of the extractor's source (the body or the first
// value of the configured header) using the configured replace mode.
//
// Returns an empty string when the source is a header that is not present.
// Throws EnvoyException when the extractor's mode is neither SINGLE_REPLACE
// nor REPLACE_ALL (the source is resolved before the mode is checked).
std::string
Extractor::extractDestructive(Http::StreamFilterCallbacks &callbacks,
                              const Http::RequestOrResponseHeaderMap &header_map,
                              GetBodyFunc &body) const {
  // Dispatches to the replace routine that corresponds to mode_.
  const auto apply_replacement = [this, &callbacks](absl::string_view input) -> std::string {
    if (mode_ == ExtractionApi::SINGLE_REPLACE) {
      return replaceIndividualValue(callbacks, input);
    }
    if (mode_ == ExtractionApi::REPLACE_ALL) {
      return replaceAllValues(callbacks, input);
    }
    // Any other mode is not a destructive extraction.
    throw EnvoyException("Cannot use extractDestructive with unsupported mode");
  };

  if (!body_) {
    // Header source: only the first matching header entry is considered.
    const Http::HeaderMap::GetResult matching_headers = getHeader(header_map, headername_);
    if (matching_headers.empty()) {
      return "";
    }
    return apply_replacement(matching_headers[0]->value().getStringView());
  }

  // Body source.
  const std::string &body_text = body();
  return apply_replacement(absl::string_view(body_text));
}

absl::string_view
Extractor::extractValue(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const {
Expand All @@ -104,6 +157,63 @@ Extractor::extractValue(Http::StreamFilterCallbacks &callbacks,
return "";
}

// Match a regex against the input value and replace the matched subgroup with
// the replacement_text_ value (SINGLE_REPLACE mode).
//
// The replacement text is inserted literally in place of the group_-th capture.
// The input is returned unchanged when:
//   - the regex does not match,
//   - the match does not span the entire input,
//   - the requested group does not exist (guarded in the ctor; asserted here), or
//   - the requested group did not participate in the match.
std::string
Extractor::replaceIndividualValue(Http::StreamFilterCallbacks &callbacks,
                                  absl::string_view value) const {
  std::match_results<absl::string_view::const_iterator> regex_result;

  // if there are no matches, return the original input value
  if (!std::regex_search(value.begin(), value.end(), regex_result, extract_regex_)) {
    ENVOY_STREAM_LOG(debug, "replaceIndividualValue: extractor regex did not match input. Returning input", callbacks);
    return std::string(value.begin(), value.end());
  }

  // if the subgroup specified is not one of the groups in the match result, return the original input value
  if (group_ >= regex_result.size()) {
    // this should never happen as we test this in the ctor.
    // NOTE: the condition must be `false`; asserting on a string literal is always true.
    ASSERT(false, "no such group in the regex");
    ENVOY_STREAM_LOG(debug, "replaceIndividualValue: invalid group specified for regex. Returning input", callbacks);
    return std::string(value.begin(), value.end());
  }

  // if the regex doesn't match the entire input value, return the original input value
  if (static_cast<size_t>(regex_result[0].length()) != value.length()) {
    ENVOY_STREAM_LOG(debug, "replaceIndividualValue: Regex did not match entire input value. This is not allowed in SINGLE_REPLACE mode. Returning input", callbacks);
    return std::string(value.begin(), value.end());
  }

  // A group that did not participate in the match (e.g. an optional group) has
  // both iterators set to the end of the input; replacing it would append the
  // replacement text to the input. Treat it as "nothing to replace" instead.
  const auto &target_group = regex_result[group_];
  if (!target_group.matched) {
    ENVOY_STREAM_LOG(debug, "replaceIndividualValue: requested group did not participate in the match. Returning input", callbacks);
    return std::string(value.begin(), value.end());
  }

  const std::string &replacement = replacement_text_.value();

  // Reserve the maximum possible length after replacement up front so the
  // appends below never reallocate.
  std::string replaced;
  replaced.reserve(value.length() + replacement.length());

  // Copy the initial part of the string until the match
  replaced.assign(value.begin(), target_group.first);

  // Append the replacement text
  replaced += replacement;

  // Append the remaining part of the string after the match
  replaced.append(target_group.second, value.end());

  return replaced;
}

// Match a regex against the input value and replace all instances of the regex
// with the replacement_text_ value (REPLACE_ALL mode).
//
// replacement_text_ is used as the std::regex_replace format string.
// When nothing matches, std::regex_replace copies the input through, so the
// input is returned unchanged. The callbacks parameter is unused.
std::string
Extractor::replaceAllValues(Http::StreamFilterCallbacks&,
                            absl::string_view value) const {
  // std::regex_replace has no string_view overload, so materialize the input.
  const std::string input(value.begin(), value.end());

  // match_not_null prevents the regex from matching empty sequences, so a
  // pattern such as ".*" does not yield an additional empty match.
  return std::regex_replace(input, extract_regex_, replacement_text_.value(),
                            std::regex_constants::match_not_null);
}

// A TransformerInstance is constructed by the InjaTransformer constructor at config time
// on the main thread. It access thread-local storage which is populated during the
// InjaTransformer::transform method call, which happens on the request path on any
Expand Down Expand Up @@ -180,6 +290,11 @@ json TransformerInstance::extracted_callback(const inja::Arguments &args) const
if (value_it != ctx.extractions_->end()) {
return value_it->second;
}

const auto destructive_value_it = ctx.destructive_extractions_->find(name);
if (destructive_value_it != ctx.destructive_extractions_->end()) {
return destructive_value_it->second;
}
return "";
}

Expand Down Expand Up @@ -545,26 +660,70 @@ void InjaTransformer::transform(Http::RequestOrResponseHeaderMap &header_map,
}
// get the extractions
std::unordered_map<std::string, absl::string_view> extractions;
std::unordered_map<std::string, std::string> destructive_extractions;

if (advanced_templates_) {
extractions.reserve(extractors_.size());
auto extractions_size = 0;
auto destructive_extractions_size = 0;
for (const auto &named_extractor : extractors_) {
switch(named_extractor.second.mode()) {
case ExtractionApi::REPLACE_ALL:
case ExtractionApi::SINGLE_REPLACE: {
destructive_extractions_size++;
break;
}
case ExtractionApi::EXTRACT: {
extractions_size++;
break;
}
default: {
PANIC_DUE_TO_CORRUPT_ENUM
}
}
}

extractions.reserve(extractions_size);
destructive_extractions.reserve(destructive_extractions_size);
}

for (const auto &named_extractor : extractors_) {
const std::string &name = named_extractor.first;
if (advanced_templates_) {
extractions[name] =
named_extractor.second.extract(callbacks, header_map, get_body);
} else {
absl::string_view name_to_split = name;
json *current = &json_body;

// prepare variables for non-advanced_templates_ scenario
absl::string_view name_to_split;
json* current = nullptr;
if (!advanced_templates_) {
name_to_split = name;
current = &json_body;
for (size_t pos = name_to_split.find("."); pos != std::string::npos;
pos = name_to_split.find(".")) {
auto &&field_name = name_to_split.substr(0, pos);
current = &(*current)[std::string(field_name)];
name_to_split = name_to_split.substr(pos + 1);
}
(*current)[std::string(name_to_split)] =
named_extractor.second.extract(callbacks, header_map, get_body);
}

switch(named_extractor.second.mode()) {
case ExtractionApi::REPLACE_ALL:
case ExtractionApi::SINGLE_REPLACE: {
if (advanced_templates_) {
destructive_extractions[name] = named_extractor.second.extractDestructive(callbacks, header_map, get_body);
} else {
(*current)[std::string(name_to_split)] = named_extractor.second.extractDestructive(callbacks, header_map, get_body);
}
break;
}
case ExtractionApi::EXTRACT: {
if (advanced_templates_) {
extractions[name] = named_extractor.second.extract(callbacks, header_map, get_body);
} else {
(*current)[std::string(name_to_split)] = named_extractor.second.extract(callbacks, header_map, get_body);
}
break;
}
default: {
PANIC_DUE_TO_CORRUPT_ENUM
}
}
}

Expand All @@ -583,6 +742,7 @@ void InjaTransformer::transform(Http::RequestOrResponseHeaderMap &header_map,
typed_tls_data.request_headers_ = request_headers;
typed_tls_data.body_ = &get_body;
typed_tls_data.extractions_ = &extractions;
typed_tls_data.destructive_extractions_ = &destructive_extractions;
typed_tls_data.context_ = &json_body;
typed_tls_data.environ_ = &environ_;
typed_tls_data.cluster_metadata_ = cluster_metadata;
Expand Down
13 changes: 12 additions & 1 deletion source/extensions/filters/http/transformation/inja_transformer.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ namespace HttpFilters {
namespace Transformation {

using GetBodyFunc = std::function<const std::string &()>;
using ExtractionApi = envoy::api::v2::filter::http::Extraction;

struct ThreadLocalTransformerContext : public ThreadLocal::ThreadLocalObject {
public:
Expand All @@ -33,6 +34,7 @@ struct ThreadLocalTransformerContext : public ThreadLocal::ThreadLocalObject {
const Http::RequestOrResponseHeaderMap *header_map_;
const Http::RequestHeaderMap *request_headers_;
const GetBodyFunc *body_;
const std::unordered_map<std::string, std::string> *destructive_extractions_;
const std::unordered_map<std::string, absl::string_view> *extractions_;
const nlohmann::json *context_;
const std::unordered_map<std::string, std::string> *environ_;
Expand Down Expand Up @@ -82,15 +84,24 @@ class Extractor : Logger::Loggable<Logger::Id::filter> {
absl::string_view extract(Http::StreamFilterCallbacks &callbacks,
const Http::RequestOrResponseHeaderMap &header_map,
GetBodyFunc &body) const;

std::string extractDestructive(Http::StreamFilterCallbacks &callbacks,
const Http::RequestOrResponseHeaderMap &header_map,
GetBodyFunc &body) const;
const ExtractionApi::Mode& mode() const { return mode_; }
private:
absl::string_view extractValue(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const;
std::string replaceIndividualValue(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const;
std::string replaceAllValues(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const;

const Http::LowerCaseString headername_;
const bool body_;
const unsigned int group_;
const std::regex extract_regex_;
const std::optional<const std::string> replacement_text_;
const ExtractionApi::Mode mode_;
};

class InjaTransformer : public Transformer {
Expand Down
15 changes: 15 additions & 0 deletions test/extensions/filters/http/transformation/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,21 @@ envoy_gloo_cc_test(
],
)

# Test target built from inja_transformer_replace_test.cc; links against the
# inja transformer library plus the Envoy mocks and helpers listed in deps.
envoy_gloo_cc_test(
name = "inja_transformer_replace_test",
srcs = ["inja_transformer_replace_test.cc"],
repository = "@envoy",
deps = [
"//source/extensions/filters/http/transformation:inja_transformer_lib",
"@envoy//source/common/common:random_generator_lib",
"@envoy//source/common/common:base64_lib",
"@envoy//test/test_common:environment_lib",
"@envoy//test/mocks/http:http_mocks",
"@envoy//test/mocks/server:server_mocks",
"@envoy//test/mocks/upstream:upstream_mocks",
],
)

envoy_cc_test_binary(
name = "inja_transformer_speed_test",
srcs = ["inja_transformer_speed_test.cc"],
Expand Down
Loading

0 comments on commit 52b93ad

Please sign in to comment.