Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[1.27] Backport Extractor Regex Replace Functionality #314

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,19 @@ message Transformation {
// Extractions can be used to extract information from the request/response.
// The extracted information can then be referenced in template fields.
message Extraction {
// The mode of operation for the extraction.
enum Mode {
// Default mode. Extract the value of the subgroup-th capturing group.
EXTRACT = 0;
// Replace the value of the subgroup-th capturing group with the replacement_text.
// Note: replacement_text must be set for this mode.
SINGLE_REPLACE = 1;
// Replace all matches of the regex in the source with the replacement_text.
// Note: replacement_text must be set for this mode.
// Note: subgroup is ignored for this mode. Configuration will fail if subgroup is set.
// Note: restrictions on the regex are different for this mode. See the regex field for more details.
REPLACE_ALL = 2;
}

// The source of the extraction
oneof source {
Expand All @@ -162,15 +175,37 @@ message Extraction {
google.protobuf.Empty body = 4;
}

// Only strings matching this regular expression will be part of the
// extraction. The most simple value for this field is '.*', which matches the
// whole source. The field is required. If extraction fails the result is an
// empty value.
// The regex field specifies the regular expression used for matching against the source content. This field is required.
// - In EXTRACT mode, the entire source must match the regex. The subgroup-th capturing group,
// if specified, determines which part of the match is extracted. If the regex does not match the source,
// the result of the extraction will be an empty value.
// - In SINGLE_REPLACE mode, the regex also needs to match the entire source. The subgroup-th capturing group
// is targeted for replacement with the replacement_text. If the regex does not match the source,
// the result of the extraction will be the source itself.
// - In REPLACE_ALL mode, the regex is applied repeatedly to find all occurrences within the source that match.
// Each matching occurrence is replaced with the replacement_text, and the subgroup field is not used. If the
// regex does not match the source, the result of the extraction will be the source itself.
string regex = 2;

// If your regex contains capturing groups, use this field to determine which
// group should be selected.
// For EXTRACT and SINGLE_REPLACE, refers to the portion of the text
// to extract/replace.
// Config will be rejected if this is specified in REPLACE_ALL mode.
uint32 subgroup = 3;

// Used in SINGLE_REPLACE and REPLACE_ALL modes.
// `replacement_text` is used to format the substitution for matched sequences in the input string
// - In SINGLE_REPLACE mode, the content in the subgroup-th capturing group is replaced with the `replacement_text`.
// - In REPLACE_ALL mode, each sequence matching the specified regex in the input is replaced with the `replacement_text`.
// In REPLACE_ALL mode, the replacement_text may contain special syntax, such as $1, $2, etc., to refer to captured groups within the regular expression,
// because the value contained within `replacement_text` is passed to std::regex_replace as the replacement string. In SINGLE_REPLACE mode the value is inserted verbatim in place of the subgroup.
// See https://en.cppreference.com/w/cpp/regex/regex_replace for more details.
google.protobuf.StringValue replacement_text = 5;

// The mode of operation for the extraction.
// Defaults to EXTRACT.
Mode mode = 6;
}

// Defines a transformation template.
Expand Down
7 changes: 7 additions & 0 deletions changelog/v1.27.3-patch2/extractor_regex_replace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
changelog:
- type: NEW_FEATURE
resolvesIssue: false
issueLink: https://github.com/solo-io/gloo/issues/8706
description: >
Update transformation filter extractors to support regex
replace/replace all operations on extracted values.
180 changes: 170 additions & 10 deletions source/extensions/filters/http/transformation/inja_transformer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ getHeader(const Http::RequestOrResponseHeaderMap &header_map,
Extractor::Extractor(const envoy::api::v2::filter::http::Extraction &extractor)
: headername_(extractor.header()), body_(extractor.has_body()),
group_(extractor.subgroup()),
extract_regex_(Solo::Regex::Utility::parseStdRegex(extractor.regex())) {
extract_regex_(Solo::Regex::Utility::parseStdRegex(extractor.regex())),
replacement_text_(extractor.has_replacement_text() ? std::make_optional(extractor.replacement_text().value()) : std::nullopt),
mode_(extractor.mode()) {
// mark count == number of sub groups, and we need to add one for match number
// 0 so we test for < instead of <= see:
// http://www.cplusplus.com/reference/regex/basic_regex/mark_count/
Expand All @@ -65,6 +67,26 @@ Extractor::Extractor(const envoy::api::v2::filter::http::Extraction &extractor)
fmt::format("group {} requested for regex with only {} sub groups",
group_, extract_regex_.mark_count()));
}

switch (mode_) {
case ExtractionApi::EXTRACT:
break;
case ExtractionApi::SINGLE_REPLACE:
if (!replacement_text_.has_value()) {
throw EnvoyException("SINGLE_REPLACE mode set but no replacement text provided");
}
break;
case ExtractionApi::REPLACE_ALL:
if (!replacement_text_.has_value()) {
throw EnvoyException("REPLACE_ALL mode set but no replacement text provided");
}
if (group_ != 0) {
throw EnvoyException("REPLACE_ALL mode set but subgroup is not 0");
}
break;
default:
throw EnvoyException("Unknown mode");
}
}

absl::string_view
Expand All @@ -84,6 +106,37 @@ Extractor::extract(Http::StreamFilterCallbacks &callbacks,
}
}

// Runs a replace-style extraction and returns the rewritten text as an owned string.
//
// The source text is the request/response body when body_ is set, otherwise the
// first value of the header named headername_. A missing header yields an empty
// string. Only SINGLE_REPLACE and REPLACE_ALL are handled; any other mode throws
// an EnvoyException.
std::string
Extractor::extractDestructive(Http::StreamFilterCallbacks &callbacks,
                              const Http::RequestOrResponseHeaderMap &header_map,
                              GetBodyFunc &body) const {
  // Hands the source text to the replacement routine that matches mode_.
  const auto apply_replacement = [this, &callbacks](absl::string_view source) -> std::string {
    if (mode_ == ExtractionApi::SINGLE_REPLACE) {
      return replaceIndividualValue(callbacks, source);
    }
    if (mode_ == ExtractionApi::REPLACE_ALL) {
      return replaceAllValues(callbacks, source);
    }
    // Any remaining mode is not a replace operation.
    throw EnvoyException("Cannot use extractDestructive with unsupported mode");
  };

  if (!body_) {
    // Header source: only the first matching header entry is considered.
    const Http::HeaderMap::GetResult header_entries = getHeader(header_map, headername_);
    if (header_entries.empty()) {
      return "";
    }
    return apply_replacement(header_entries[0]->value().getStringView());
  }

  // Body source: view the body in place; the replacement routines produce the copy.
  const std::string &string_body = body();
  return apply_replacement(absl::string_view(string_body));
}

absl::string_view
Extractor::extractValue(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const {
Expand All @@ -105,6 +158,63 @@ Extractor::extractValue(Http::StreamFilterCallbacks &callbacks,
return "";
}

// Match a regex against the input value and replace the matched subgroup with the replacement_text_ value.
//
// The regex must match the whole of `value`. The input is returned unchanged when:
//   - the regex does not match at all,
//   - the match does not span the entire input, or
//   - the configured subgroup (group_) did not take part in the match
//     (for example an optional capturing group that matched nothing).
// Otherwise the result is `value` with the subgroup's span swapped for
// replacement_text_, which is inserted verbatim (no $N expansion happens here).
std::string
Extractor::replaceIndividualValue(Http::StreamFilterCallbacks &callbacks,
                                  absl::string_view value) const {
  std::match_results<absl::string_view::const_iterator> regex_result;

  // if there are no matches, return the original input value
  if (!std::regex_search(value.begin(), value.end(), regex_result, extract_regex_)) {
    ENVOY_STREAM_LOG(debug, "replaceIndividualValue: extractor regex did not match input. Returning input", callbacks);
    return std::string(value.begin(), value.end());
  }

  // if the subgroup specified is greater than the number of subgroups in the regex, return the original input value
  if (group_ >= regex_result.size()) {
    // this should never happen as we test this in the ctor.
    // Assert on `false`: asserting on the string literal itself is always true
    // (a non-null pointer) and so could never fire.
    ASSERT(false, "no such group in the regex");
    ENVOY_STREAM_LOG(debug, "replaceIndividualValue: invalid group specified for regex. Returning input", callbacks);
    return std::string(value.begin(), value.end());
  }

  // if the regex doesn't match the entire input value, return the original input value
  if (static_cast<size_t>(regex_result[0].length()) != value.length()) {
    ENVOY_STREAM_LOG(debug, "replaceIndividualValue: Regex did not match entire input value. This is not allowed in SINGLE_REPLACE mode. Returning input", callbacks);
    return std::string(value.begin(), value.end());
  }

  // A subgroup that did not participate in the match has both of its iterators
  // at the end of the input, so splicing around it would append the replacement
  // text to the end of the string. There is nothing to replace; return the input.
  const auto &target_group = regex_result[group_];
  if (!target_group.matched) {
    ENVOY_STREAM_LOG(debug, "replaceIndividualValue: subgroup did not participate in the match. Returning input", callbacks);
    return std::string(value.begin(), value.end());
  }

  // Reserve an upper bound for the result: the replacement can at most be added
  // on top of the full input (when the subgroup captured an empty span).
  const auto max_possible_length = value.length() + replacement_text_.value().length();
  std::string replaced;
  replaced.reserve(max_possible_length);

  // Copy the initial part of the string until the subgroup
  replaced.assign(value.begin(), target_group.first);

  // Append the replacement text
  replaced += replacement_text_.value();

  // Append the remaining part of the string after the subgroup
  replaced.append(target_group.second, value.end());

  return replaced;
}

// Match a regex against the input value and replace all instances of the regex with the replacement_text_ value.
//
// replacement_text_ is handed to std::regex_replace as the format string, so
// capture-group references such as $1 are expanded by the standard library.
// match_not_null is passed so that empty matches are not treated as matches.
// When nothing matches, std::regex_replace returns a copy of the input.
// The callbacks parameter is unused.
std::string
Extractor::replaceAllValues(Http::StreamFilterCallbacks&,
                            absl::string_view value) const {
  // std::regex_replace's string overload needs an owning std::string input.
  const std::string input(value.begin(), value.end());

  // replace all instances of the regex in the input value with the replacement_text_ value
  return std::regex_replace(input, extract_regex_, replacement_text_.value(), std::regex_constants::match_not_null);
}

// A TransformerInstance is constructed by the InjaTransformer constructor at config time
// on the main thread. It access thread-local storage which is populated during the
// InjaTransformer::transform method call, which happens on the request path on any
Expand Down Expand Up @@ -181,6 +291,11 @@ json TransformerInstance::extracted_callback(const inja::Arguments &args) const
if (value_it != ctx.extractions_->end()) {
return value_it->second;
}

const auto destructive_value_it = ctx.destructive_extractions_->find(name);
if (destructive_value_it != ctx.destructive_extractions_->end()) {
return destructive_value_it->second;
}
return "";
}

Expand Down Expand Up @@ -546,26 +661,70 @@ void InjaTransformer::transform(Http::RequestOrResponseHeaderMap &header_map,
}
// get the extractions
std::unordered_map<std::string, absl::string_view> extractions;
std::unordered_map<std::string, std::string> destructive_extractions;

if (advanced_templates_) {
extractions.reserve(extractors_.size());
auto extractions_size = 0;
auto destructive_extractions_size = 0;
for (const auto &named_extractor : extractors_) {
switch(named_extractor.second.mode()) {
case ExtractionApi::REPLACE_ALL:
case ExtractionApi::SINGLE_REPLACE: {
destructive_extractions_size++;
break;
}
case ExtractionApi::EXTRACT: {
extractions_size++;
break;
}
default: {
PANIC_DUE_TO_CORRUPT_ENUM
}
}
}

extractions.reserve(extractions_size);
destructive_extractions.reserve(destructive_extractions_size);
}

for (const auto &named_extractor : extractors_) {
const std::string &name = named_extractor.first;
if (advanced_templates_) {
extractions[name] =
named_extractor.second.extract(callbacks, header_map, get_body);
} else {
absl::string_view name_to_split = name;
json *current = &json_body;

// prepare variables for non-advanced_templates_ scenario
absl::string_view name_to_split;
json* current = nullptr;
if (!advanced_templates_) {
name_to_split = name;
current = &json_body;
for (size_t pos = name_to_split.find("."); pos != std::string::npos;
pos = name_to_split.find(".")) {
auto &&field_name = name_to_split.substr(0, pos);
current = &(*current)[std::string(field_name)];
name_to_split = name_to_split.substr(pos + 1);
}
(*current)[std::string(name_to_split)] =
named_extractor.second.extract(callbacks, header_map, get_body);
}

switch(named_extractor.second.mode()) {
case ExtractionApi::REPLACE_ALL:
case ExtractionApi::SINGLE_REPLACE: {
if (advanced_templates_) {
destructive_extractions[name] = named_extractor.second.extractDestructive(callbacks, header_map, get_body);
} else {
(*current)[std::string(name_to_split)] = named_extractor.second.extractDestructive(callbacks, header_map, get_body);
}
break;
}
case ExtractionApi::EXTRACT: {
if (advanced_templates_) {
extractions[name] = named_extractor.second.extract(callbacks, header_map, get_body);
} else {
(*current)[std::string(name_to_split)] = named_extractor.second.extract(callbacks, header_map, get_body);
}
break;
}
default: {
PANIC_DUE_TO_CORRUPT_ENUM
}
}
}

Expand All @@ -584,6 +743,7 @@ void InjaTransformer::transform(Http::RequestOrResponseHeaderMap &header_map,
typed_tls_data.request_headers_ = request_headers;
typed_tls_data.body_ = &get_body;
typed_tls_data.extractions_ = &extractions;
typed_tls_data.destructive_extractions_ = &destructive_extractions;
typed_tls_data.context_ = &json_body;
typed_tls_data.environ_ = &environ_;
typed_tls_data.cluster_metadata_ = cluster_metadata;
Expand Down
13 changes: 12 additions & 1 deletion source/extensions/filters/http/transformation/inja_transformer.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ namespace HttpFilters {
namespace Transformation {

using GetBodyFunc = std::function<const std::string &()>;
using ExtractionApi = envoy::api::v2::filter::http::Extraction;

struct ThreadLocalTransformerContext : public ThreadLocal::ThreadLocalObject {
public:
Expand All @@ -33,6 +34,7 @@ struct ThreadLocalTransformerContext : public ThreadLocal::ThreadLocalObject {
const Http::RequestOrResponseHeaderMap *header_map_;
const Http::RequestHeaderMap *request_headers_;
const GetBodyFunc *body_;
const std::unordered_map<std::string, std::string> *destructive_extractions_;
const std::unordered_map<std::string, absl::string_view> *extractions_;
const nlohmann::json *context_;
const std::unordered_map<std::string, std::string> *environ_;
Expand Down Expand Up @@ -82,15 +84,24 @@ class Extractor : Logger::Loggable<Logger::Id::filter> {
absl::string_view extract(Http::StreamFilterCallbacks &callbacks,
const Http::RequestOrResponseHeaderMap &header_map,
GetBodyFunc &body) const;

std::string extractDestructive(Http::StreamFilterCallbacks &callbacks,
const Http::RequestOrResponseHeaderMap &header_map,
GetBodyFunc &body) const;
const ExtractionApi::Mode& mode() const { return mode_; }
private:
absl::string_view extractValue(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const;
std::string replaceIndividualValue(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const;
std::string replaceAllValues(Http::StreamFilterCallbacks &callbacks,
absl::string_view value) const;

const Http::LowerCaseString headername_;
const bool body_;
const unsigned int group_;
const std::regex extract_regex_;
const std::optional<const std::string> replacement_text_;
const ExtractionApi::Mode mode_;
};

class InjaTransformer : public Transformer {
Expand Down
15 changes: 15 additions & 0 deletions test/extensions/filters/http/transformation/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,21 @@ envoy_gloo_cc_test(
],
)

# Test target built from inja_transformer_replace_test.cc. It links against the
# inja transformer library under test plus Envoy's common utilities and the
# HTTP / server / upstream mocks listed in deps.
envoy_gloo_cc_test(
name = "inja_transformer_replace_test",
srcs = ["inja_transformer_replace_test.cc"],
repository = "@envoy",
deps = [
"//source/extensions/filters/http/transformation:inja_transformer_lib",
"@envoy//source/common/common:random_generator_lib",
"@envoy//source/common/common:base64_lib",
"@envoy//test/test_common:environment_lib",
"@envoy//test/mocks/http:http_mocks",
"@envoy//test/mocks/server:server_mocks",
"@envoy//test/mocks/upstream:upstream_mocks",
],
)

envoy_cc_test_binary(
name = "inja_transformer_speed_test",
srcs = ["inja_transformer_speed_test.cc"],
Expand Down
Loading
Loading