Skip to content

Commit

Permalink
Move get_possible_substr_types() into its own function; Use vector in…
Browse files Browse the repository at this point in the history
…stead of std::vector; Fix tuple return type of get_substring_variable_types
  • Loading branch information
SharafMohamed committed Aug 2, 2024
1 parent ceb5d4d commit db8e544
Show file tree
Hide file tree
Showing 2 changed files with 161 additions and 117 deletions.
257 changes: 140 additions & 117 deletions components/core/src/clp/Grep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1048,116 +1048,17 @@ Grep::generate_query_substring_logtypes(string& processed_search_string, ByteLex
if (begin_idx > 0 && is_escape[begin_idx - 1]) {
continue;
}
std::vector<QueryLogtype> possible_substr_types;

// Don't allow an isolated wildcard to be considered a variable
if (end_idx - 1 == begin_idx && is_greedy_wildcard[begin_idx]) {
possible_substr_types.emplace_back('*', "*", false);
} else if (end_idx - 1 == begin_idx && is_non_greedy_wildcard[begin_idx]) {
possible_substr_types.emplace_back('?', "?", false);
} else {
// As we extend substrings adjacent to wildcards, the substrings that begin or end
// with wildcards are redundant (e.g., for string "a*b", a decomposition of the form
// "a*" + "b" is a subset of the more general "a*" + "*" + "*b". Note, as this needs
// "*", the "*" substring is not redundant. This is already handled above). More
// detail about this is given below.
if (is_greedy_wildcard[begin_idx] || is_greedy_wildcard[end_idx - 1]) {
continue;
}

// If the substring isn't surrounded by delimiters there is no reason to consider
// the case where it is a variable as CLP would not compress it as such. Preceding
// delimiter counts the start of log, a wildcard, or an actual delimiter.
bool has_preceding_delimiter
= 0 == begin_idx || is_greedy_wildcard[begin_idx - 1]
|| is_non_greedy_wildcard[begin_idx - 1]
|| lexer.is_delimiter(processed_search_string[begin_idx - 1]);

// Proceeding delimiter counts the end of log, a wildcard, or an actual delimiter.
// However, we have to be careful about a proceeding escape character. First, if '\'
// is a delimiter, we avoid counting the escape character. Second, if a literal '*'
// or '?' is a delimiter, then it will appear after the escape character.
bool has_proceeding_delimiter
= processed_search_string.size() == end_idx || is_greedy_wildcard[end_idx]
|| is_non_greedy_wildcard[end_idx]
|| (false == is_escape[end_idx]
&& lexer.is_delimiter(processed_search_string[end_idx]))
|| (is_escape[end_idx]
&& lexer.is_delimiter(processed_search_string[end_idx + 1]));

// If the substring contains a wildcard, we need to consider the case that it can
// simultaneously match multiple variables and static text, and we need a different
// approach to compare against the archive.
bool contains_wildcard = false;
set<uint32_t> variable_types;
if (has_preceding_delimiter && has_proceeding_delimiter) {
// If the substring is preceded or proceeded by a greedy wildcard then it's
// possible the substring could be extended to match a var, so the wildcards are
// added to the substring. If we don't consider this case we could miss
// combinations. Take for example "a*b", "a*" and "*b" can both match a has#
// style variable ("\w*\d+\w*"). If we decompose the string into either
// substrings "a*" + "b" or "a" + "*b", neither would capture the possibility of
// a logtype with the form "<has#>*<has#>", which is a valid possibility during
// compression. Instead we desire to decompose the string into "a*" + "*" +
// "*b". Note, non-greedy wildcards do not need to be considered, for example
// "a?b" can never match "<has#>?<has#>" or "<has#><has#>".
uint32_t substr_start = begin_idx;
uint32_t substr_end = end_idx;
bool prev_char_is_star = begin_idx > 0 && is_greedy_wildcard[begin_idx - 1];
bool next_char_is_greedy_wildcard = end_idx < processed_search_string.length()
&& is_greedy_wildcard[end_idx];
if (prev_char_is_star) {
substr_start--;
}
if (next_char_is_greedy_wildcard) {
substr_end++;
}
auto [variable_types, contains_wildcard] = get_substring_variable_types(
string_view(processed_search_string)
.substr(substr_start, substr_end - substr_start),
substr_start,
is_greedy_wildcard,
is_non_greedy_wildcard,
is_escape,
lexer
);
bool already_added_var = false;
// Use the variable types to determine the possible_substr_types
for (int id : variable_types) {
auto& schema_type = lexer.m_id_symbol[id];
if (schema_type != "int" && schema_type != "float") {
if (already_added_var) {
continue;
}
already_added_var = true;
}
possible_substr_types.emplace_back();
QueryLogtype& suffix = possible_substr_types.back();
suffix.append_value(
id,
processed_search_string
.substr(substr_start, substr_end - substr_start),
contains_wildcard
);

// If the substring has no wildcards, we can safely exclude lower priority
// variable types.
if (false == contains_wildcard) {
break;
}
}
}
// If the substring matches no variables, or has a wildcard, it is potentially
// static-text.
if (variable_types.empty() || contains_wildcard) {
possible_substr_types.emplace_back();
auto& possible_substr_type = possible_substr_types.back();
for (uint32_t idx = begin_idx; idx < end_idx; idx++) {
char const& c = processed_search_string[idx];
std::string char_string({c});
possible_substr_type.append_value(c, char_string, false);
}
}
auto possible_substr_types = get_possible_substr_types(
processed_search_string,
begin_idx,
end_idx,
is_greedy_wildcard,
is_non_greedy_wildcard,
is_escape,
lexer
);
if (possible_substr_types.empty()) {
continue;
}

// Use the completed set of variable types for each substr(begin_idx,end_idx) to
Expand Down Expand Up @@ -1191,12 +1092,134 @@ Grep::generate_query_substring_logtypes(string& processed_search_string, ByteLex
return query_logtypes;
}

/**
 * Determines every way the substring [begin_idx, end_idx) of the search string can be interpreted:
 * as a lone wildcard, as one or more schema variable types, as static text, or a mix of these.
 *
 * The three flag vectors are indexed by character position in processed_search_string.
 *
 * @param processed_search_string The search string being decomposed
 * @param begin_idx Index of the first character of the substring (inclusive)
 * @param end_idx Index one past the last character of the substring (exclusive)
 * @param is_greedy_wildcard Per-character flags marking greedy wildcards
 * @param is_non_greedy_wildcard Per-character flags marking non-greedy wildcards
 * @param is_escape Per-character flags marking escape characters
 * @param lexer Lexer used to test for delimiters and to look up variable type names
 * @return The possible interpretations of the substring. Empty if the range is empty or if a
 * multi-character substring begins or ends with a greedy wildcard (such substrings are redundant).
 */
vector<QueryLogtype> Grep::get_possible_substr_types(
        string& processed_search_string,
        size_t begin_idx,
        size_t end_idx,
        vector<bool>& is_greedy_wildcard,
        vector<bool>& is_non_greedy_wildcard,
        vector<bool>& is_escape,
        ByteLexer& lexer
) {
    vector<QueryLogtype> possible_substr_types;

    // An empty range has no interpretation. Bail out before `end_idx - 1` can wrap around.
    if (begin_idx >= end_idx) {
        return possible_substr_types;
    }

    // Don't allow an isolated wildcard to be considered a variable
    if (end_idx - 1 == begin_idx && is_greedy_wildcard[begin_idx]) {
        possible_substr_types.emplace_back('*', "*", false);
    } else if (end_idx - 1 == begin_idx && is_non_greedy_wildcard[begin_idx]) {
        possible_substr_types.emplace_back('?', "?", false);
    } else {
        // As we extend substrings adjacent to wildcards, the substrings that begin or end with
        // wildcards are redundant (e.g., for string "a*b", a decomposition of the form
        // "a*" + "b" is a subset of the more general "a*" + "*" + "*b"). Note, as this needs "*",
        // the "*" substring is not redundant; it is already handled above. More detail about this
        // is given below.
        if (is_greedy_wildcard[begin_idx] || is_greedy_wildcard[end_idx - 1]) {
            return possible_substr_types;
        }

        // If the substring isn't surrounded by delimiters there is no reason to consider the case
        // where it is a variable as CLP would not compress it as such. Preceding delimiter counts
        // the start of log, a wildcard, or an actual delimiter.
        bool has_preceding_delimiter
                = 0 == begin_idx || is_greedy_wildcard[begin_idx - 1]
                  || is_non_greedy_wildcard[begin_idx - 1]
                  || lexer.is_delimiter(processed_search_string[begin_idx - 1]);

        // Proceeding delimiter counts the end of log, a wildcard, or an actual delimiter. However,
        // we have to be careful about a proceeding escape character. First, if '\' is a delimiter,
        // we avoid counting the escape character. Second, if a literal '*' or '?' is a delimiter,
        // then it will appear after the escape character.
        bool has_proceeding_delimiter
                = processed_search_string.size() == end_idx || is_greedy_wildcard[end_idx]
                  || is_non_greedy_wildcard[end_idx]
                  || (false == is_escape[end_idx]
                      && lexer.is_delimiter(processed_search_string[end_idx]))
                  || (is_escape[end_idx]
                      && lexer.is_delimiter(processed_search_string[end_idx + 1]));

        // If the substring contains a wildcard, we need to consider the case that it can
        // simultaneously match multiple variables and static text, and we need a different approach
        // to compare against the archive.
        bool contains_wildcard = false;
        set<uint32_t> variable_types;
        if (has_preceding_delimiter && has_proceeding_delimiter) {
            // If the substring is preceded or proceeded by a greedy wildcard then it's possible the
            // substring could be extended to match a var, so the wildcards are added to the
            // substring. If we don't consider this case we could miss combinations. Take for
            // example "a*b", "a*" and "*b" can both match a has# style variable ("\w*\d+\w*"). If
            // we decompose the string into either substrings "a*" + "b" or "a" + "*b", neither
            // would capture the possibility of a logtype with the form "<has#>*<has#>", which is a
            // valid possibility during compression. Instead we desire to decompose the string into
            // "a*" + "*" + "*b". Note, non-greedy wildcards do not need to be considered, for
            // example "a?b" can never match "<has#>?<has#>" or "<has#><has#>".
            uint32_t substr_start = begin_idx;
            uint32_t substr_end = end_idx;
            bool prev_char_is_star = begin_idx > 0 && is_greedy_wildcard[begin_idx - 1];
            bool next_char_is_greedy_wildcard
                    = end_idx < processed_search_string.length() && is_greedy_wildcard[end_idx];
            if (prev_char_is_star) {
                substr_start--;
            }
            if (next_char_is_greedy_wildcard) {
                substr_end++;
            }

            // Assign into the outer locals rather than declaring new ones with a structured
            // binding: the static-text check after this block must see these results.
            std::tie(variable_types, contains_wildcard) = get_substring_variable_types(
                    string_view(processed_search_string)
                            .substr(substr_start, substr_end - substr_start),
                    substr_start,
                    is_greedy_wildcard,
                    is_non_greedy_wildcard,
                    is_escape,
                    lexer
            );

            // The (possibly wildcard-extended) substring is the same for every variable type, so
            // copy it out of the search string once.
            string const extended_substr
                    = processed_search_string.substr(substr_start, substr_end - substr_start);

            bool already_added_var = false;
            // Use the variable types to determine the possible_substr_types
            for (int id : variable_types) {
                auto& schema_type = lexer.m_id_symbol[id];
                // Only the first type that is neither "int" nor "float" is kept; "int" and "float"
                // are always added.
                if (schema_type != "int" && schema_type != "float") {
                    if (already_added_var) {
                        continue;
                    }
                    already_added_var = true;
                }
                possible_substr_types.emplace_back();
                QueryLogtype& suffix = possible_substr_types.back();
                suffix.append_value(id, extended_substr, contains_wildcard);

                // If the substring has no wildcards, we can safely exclude lower priority variable
                // types.
                if (false == contains_wildcard) {
                    break;
                }
            }
        }
        // If the substring matches no variables, or has a wildcard, it is potentially static-text.
        if (variable_types.empty() || contains_wildcard) {
            possible_substr_types.emplace_back();
            auto& possible_substr_type = possible_substr_types.back();
            // Static text is recorded one character at a time, without the wildcard extension.
            for (uint32_t idx = begin_idx; idx < end_idx; idx++) {
                char const& c = processed_search_string[idx];
                std::string char_string({c});
                possible_substr_type.append_value(c, char_string, false);
            }
        }
    }
    return possible_substr_types;
}

tuple<vector<bool>, vector<bool>, vector<bool>> Grep::get_wildcard_and_escape_locations(
std::string const& processed_search_string
) {
std::vector<bool> is_greedy_wildcard;
std::vector<bool> is_non_greedy_wildcard;
std::vector<bool> is_escape;
vector<bool> is_greedy_wildcard;
vector<bool> is_non_greedy_wildcard;
vector<bool> is_escape;
is_greedy_wildcard.reserve(processed_search_string.size());
is_non_greedy_wildcard.reserve(processed_search_string.size());
is_escape.reserve(processed_search_string.size());
Expand Down Expand Up @@ -1231,12 +1254,12 @@ tuple<vector<bool>, vector<bool>, vector<bool>> Grep::get_wildcard_and_escape_lo
return {std::move(is_greedy_wildcard), std::move(is_non_greedy_wildcard), std::move(is_escape)};
}

tuple<set<uint32_t>, set<uint32_t>> Grep::get_substring_variable_types(
tuple<set<uint32_t>, bool> Grep::get_substring_variable_types(
string_view search_substr,
uint32_t substr_offset,
std::vector<bool>& is_greedy_wildcard,
std::vector<bool>& is_non_greedy_wildcard,
std::vector<bool>& is_escape,
vector<bool>& is_greedy_wildcard,
vector<bool>& is_non_greedy_wildcard,
vector<bool>& is_escape,
ByteLexer& lexer
) {
// To determine if a substring could be a variable we convert it to regex,
Expand Down
21 changes: 21 additions & 0 deletions components/core/src/clp/Grep.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,27 @@ class Grep {
log_surgeon::lexers::ByteLexer& lexer
);

/**
* Generates the possible static-text and variable types for the substring
* [begin_idx, end_idx) of the search string. A substring consisting of a single wildcard is
* only ever reported as that wildcard, never as a variable.
* @param processed_search_string The search string the substring is taken from
* @param begin_idx Index of the first character of the substring (inclusive)
* @param end_idx Index one past the last character of the substring (exclusive)
* @param is_greedy_wildcard Flags, indexed by character position in processed_search_string,
* marking greedy wildcards
* @param is_non_greedy_wildcard Flags, indexed by character position in
* processed_search_string, marking non-greedy wildcards
* @param is_escape Flags, indexed by character position in processed_search_string, marking
* escape characters
* @param lexer Lexer used to check for delimiters and to look up variable type names
* @return a vector containing the possible substring types; empty when a multi-character
* substring begins or ends with a greedy wildcard
*/
static std::vector<QueryLogtype> get_possible_substr_types(
std::string& processed_search_string,
size_t begin_idx,
size_t end_idx,
std::vector<bool>& is_greedy_wildcard,
std::vector<bool>& is_non_greedy_wildcard,
std::vector<bool>& is_escape,
log_surgeon::lexers::ByteLexer& lexer
);

/**
* Mark the locations of non-escaped wildcards '*', '?', and escape characters '\'.
* @param processed_search_string
Expand Down

0 comments on commit db8e544

Please sign in to comment.