Skip to content

Commit

Permalink
[sc-28238] Just redact all literals in the query
Browse files Browse the repository at this point in the history
  • Loading branch information
usefulalgorithm committed Aug 22, 2024
1 parent 2e48887 commit e916580
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 84 deletions.
9 changes: 2 additions & 7 deletions metaphor/common/docs/process_query.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,7 @@ Controls whether the crawler should process the SQL queries before storing the M
```yaml
process_query:
redact_literals:
where_clauses: <true | false> # Whether to redact all literal values in WHERE clauses. Default is `false`.

case_clauses: <true | false> # Whether to redact all literal values in CASE clauses. Default is `false`.

when_not_matched_insert_clauses: <true | false> # Whether to redact literal values in WHEN NOT MATCHED INSERT clauses. If set to `True`, all literal values will be redacted to a predefined string value. Default is `false`.
redact: <true | false> # Whether to redact the literal values. Default is `false`.

placeholder_literal: <placeholder literal> # The redacted values will be replaced by this placeholder string. Default is '<REDACTED>'.

Expand All @@ -18,6 +14,5 @@ process_query:
If any of the following boolean values is set to true, the crawler will process the incoming SQL queries:
- `redact_literals.where_clauses`
- `redact_literals.when_not_matched_insert_clauses`
- `redact_literals.redact`
- `ignore_insert_values_into`
21 changes: 2 additions & 19 deletions metaphor/common/sql/process_query/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,7 @@ class RedactPIILiteralsConfig:
Config to control whether we want to redact literal values. Useful if you want to remove PII from your queries.
"""

where_clauses: bool = False
"""
Whether to redact literal values in WHERE clauses. If set to `True`, all literal values will be redacted to a predefined string value.
"""

case_clauses: bool = False
"""
Whether to redact literal values in CASE clauses. If set to `True`, all literal values will be redacted to a predefined string value.
"""

when_not_matched_insert_clauses: bool = False
"""
Whether to redact literal values in WHEN NOT MATCHED INSERT clauses. If set to `True`, all literal values will be redacted to a predefined string value.
"""
redact: bool = False

placeholder_literal: str = "<REDACTED>"

Expand All @@ -50,8 +37,4 @@ def should_process(self) -> bool:
"""
Whether we should run the processing method at all.
"""
return (
self.redact_literals.where_clauses
or self.redact_literals.when_not_matched_insert_clauses
or self.ignore_insert_values_into
)
return self.redact_literals.redact or self.ignore_insert_values_into
71 changes: 22 additions & 49 deletions metaphor/common/sql/process_query/process_query.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import typing as t
from typing import Optional

from sqlglot import Expression, exp, maybe_parse
from sqlglot.dialects.dialect import Dialect
from sqlglot.errors import SqlglotError
from sqlglot.generator import Generator

from metaphor.common.logger import get_logger
from metaphor.common.sql.dialect import PLATFORM_TO_DIALECT
Expand All @@ -13,45 +16,6 @@
logger = get_logger()


def _redact_literal(lit: exp.Literal, config: ProcessQueryConfig) -> None:
    """
    Overwrite a literal node in place with the configured placeholder.

    The node's ``this`` argument is set to
    ``config.redact_literals.placeholder_literal`` and its ``is_string``
    argument is set to ``True``. Nothing is returned; ``lit`` is mutated.
    """
    lit.args.update(
        this=config.redact_literals.placeholder_literal,
        is_string=True,
    )


def _redact_literal_values_in_where_clauses(
    expression: Expression, config: ProcessQueryConfig
) -> None:
    """
    Redact every literal located under any WHERE node of ``expression``.

    The expression tree is mutated in place; nothing is returned.
    """
    # Kept lazy on purpose: literals are redacted while the tree is still
    # being walked, in the same order as a nested loop would visit them.
    where_literals = (
        literal
        for where_node in expression.find_all(exp.Where)
        for literal in where_node.find_all(exp.Literal)
    )
    for literal in where_literals:
        _redact_literal(literal, config)


def _redact_literal_values_in_case_clauses(
    expression: Expression, config: ProcessQueryConfig
) -> None:
    """
    Redact every literal located under any CASE node of ``expression``.

    The expression tree is mutated in place; nothing is returned.
    """
    for case_node in expression.find_all(exp.Case):
        case_literals = case_node.find_all(exp.Literal)
        for literal in case_literals:
            _redact_literal(literal, config)


def _redact_literal_values_in_when_not_matched_insert_clauses(
    expression: Expression,
    config: ProcessQueryConfig,
) -> None:
    """
    Redact literals in the value tuple of WHEN NOT MATCHED ... INSERT branches.

    A WHEN node is processed only if all of the following hold:
    - it has a ``matched`` argument and that argument is falsy,
    - its ``then`` argument is an ``exp.Insert``,
    - that insert's ``expression`` is an ``exp.Tuple``.

    Every literal under such a tuple is redacted in place; nothing is returned.
    """
    for when_node in expression.find_all(exp.When):
        node_args = when_node.args

        # Only branches explicitly flagged as "not matched" qualify.
        is_not_matched = "matched" in node_args and not node_args["matched"]
        if not is_not_matched:
            continue

        action = node_args.get("then")
        if isinstance(action, exp.Insert):
            inserted_values = action.expression
            if isinstance(inserted_values, exp.Tuple):
                for literal in inserted_values.find_all(exp.Literal):
                    _redact_literal(literal, config)


def _is_insert_values_into(expression: Expression) -> bool:
return isinstance(expression, exp.Insert) and isinstance(
expression.expression, exp.Values
Expand Down Expand Up @@ -111,13 +75,22 @@ def process_query(
if config.ignore_insert_values_into and _is_insert_values_into(expression):
return None

if config.redact_literals.where_clauses:
_redact_literal_values_in_where_clauses(expression, config)

if config.redact_literals.case_clauses:
_redact_literal_values_in_case_clauses(expression, config)

if config.redact_literals.when_not_matched_insert_clauses:
_redact_literal_values_in_when_not_matched_insert_clauses(expression, config)

return expression.sql(dialect=dialect)
if not config.redact_literals.redact:
return expression.sql(dialect=dialect)

DialectClass: t.Type[Dialect]
if dialect is None:
DialectClass = Dialect
else:
DialectClass = Dialect[dialect]
GeneratorClass: t.Type[Generator] = DialectClass().generator().__class__

# Mypy does not allow dynamic base classes, but that's the only way for us to do it
class LiteralsRedacted(DialectClass): # type: ignore
class Generator(GeneratorClass): # type: ignore
TRANSFORMS: t.Dict[t.Type[exp.Expression], t.Callable[..., str]] = {
**GeneratorClass.TRANSFORMS,
exp.Literal: lambda *_: f"'{config.redact_literals.placeholder_literal}'",
}

return expression.sql(dialect=LiteralsRedacted)
8 changes: 2 additions & 6 deletions tests/common/sql/process_query/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,10 @@


def test_config():
config = ProcessQueryConfig(
redact_literals=RedactPIILiteralsConfig(where_clauses=True)
)
config = ProcessQueryConfig(ignore_insert_values_into=True)
assert config.should_process

config = ProcessQueryConfig(
redact_literals=RedactPIILiteralsConfig(when_not_matched_insert_clauses=True)
)
config = ProcessQueryConfig(redact_literals=RedactPIILiteralsConfig(redact=True))
assert config.should_process

config = ProcessQueryConfig()
Expand Down
4 changes: 1 addition & 3 deletions tests/common/sql/process_query/test_process_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@

config = ProcessQueryConfig(
redact_literals=RedactPIILiteralsConfig(
where_clauses=True,
case_clauses=True,
when_not_matched_insert_clauses=True,
redact=True,
),
ignore_insert_values_into=True,
)
Expand Down

0 comments on commit e916580

Please sign in to comment.