From 38240d23135244709d24061bf58c213bc220947e Mon Sep 17 00:00:00 2001 From: Daniel D'Avella Date: Wed, 28 Feb 2024 21:05:49 -0500 Subject: [PATCH] Fix harden-pyyaml to handle bad default --- .../docs/pixee_python_harden-pyyaml.md | 10 +++---- src/core_codemods/harden_pyyaml.py | 30 +++++++++++++++++-- tests/codemods/test_harden_pyyaml.py | 13 ++++++++ 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/src/core_codemods/docs/pixee_python_harden-pyyaml.md b/src/core_codemods/docs/pixee_python_harden-pyyaml.md index 49dacb7d..2efee971 100644 --- a/src/core_codemods/docs/pixee_python_harden-pyyaml.md +++ b/src/core_codemods/docs/pixee_python_harden-pyyaml.md @@ -1,13 +1,13 @@ -This codemod hardens all [`yaml.load()`](https://pyyaml.org/wiki/PyYAMLDocumentation) calls against attacks that could result from deserializing untrusted data. +The default loaders in PyYAML are not safe to use with untrusted data. They potentially make your application vulnerable to arbitrary code execution attacks. If you open a YAML file from an untrusted source, and the file is loaded with the default loader, an attacker could execute arbitrary code on your machine. -The fix uses a safety check that already exists in the `yaml` module, replacing unsafe loader class with `SafeLoader`. -The changes from this codemod look like this: +This codemod hardens all [`yaml.load()`](https://pyyaml.org/wiki/PyYAMLDocumentation) calls against such attacks by replacing the default loader with `yaml.SafeLoader`. This is the recommended loader for loading untrusted data. For most use cases it functions as a drop-in replacement for the default loader. +Calling `yaml.load()` without an explicit loader argument is equivalent to calling it with `Loader=yaml.Loader`, which is unsafe. This usage [has been deprecated](https://github.com/yaml/pyyaml/wiki/PyYAML-yaml.load(input\)-Deprecation) since PyYAML 5.1. This codemod will add an explicit `SafeLoader` argument to all `yaml.load()` calls that don't use an explicit loader. + +The changes from this codemod look like the following: ```diff import yaml data = b'!!python/object/apply:subprocess.Popen \\n- ls' - deserialized_data = yaml.load(data, yaml.Loader) + deserialized_data = yaml.load(data, Loader=yaml.SafeLoader) ``` -The codemod will also catch if you pass in the loader argument as a kwarg and if you use any loader other than `SafeLoader`, -including `FullLoader` and `UnsafeLoader`. diff --git a/src/core_codemods/harden_pyyaml.py b/src/core_codemods/harden_pyyaml.py index dbdd5601..7662a155 100644 --- a/src/core_codemods/harden_pyyaml.py +++ b/src/core_codemods/harden_pyyaml.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, cast import libcst as cst from codemodder.codemods.utils_mixin import NameResolutionMixin from core_codemods.api import ( @@ -18,6 +18,9 @@ class HardenPyyaml(SimpleCodemod, NameResolutionMixin): Reference( url="https://owasp.org/www-community/vulnerabilities/Deserialization_of_untrusted_data" ), + Reference( + url="https://github.com/yaml/pyyaml/wiki/PyYAML-yaml.load(input)-Deprecation" + ), ], ) change_description = "Replace unsafe `pyyaml` loader with `SafeLoader` in calls to `yaml.load` or custom loader classes." @@ -26,6 +29,12 @@ class HardenPyyaml(SimpleCodemod, NameResolutionMixin): detector_pattern = """ rules: - pattern-either: + - patterns: + - pattern: yaml.load(...) + - pattern-inside: | + import yaml + ... + yaml.load($ARG) - patterns: - pattern: yaml.load(...) - pattern-inside: | @@ -81,10 +90,25 @@ def on_result_found( ) if (maybe_name := maybe_name or self._module_name) == self._module_name: self.add_needed_import(self._module_name) + updated_node = cast(cst.Call, updated_node) # satisfy the type checker new_args = [ *updated_node.args[:1], - updated_node.args[1].with_changes( - value=self.parse_expression(f"{maybe_name}.SafeLoader") + # This is the case where the arg is present but a bad value + ( + updated_node.args[1].with_changes( + value=self.parse_expression(f"{maybe_name}.SafeLoader") + ) + if len(updated_node.args) > 1 + # This is the case where the arg is not present + # Note that this case is deprecated in PyYAML 5.1 since the default is unsafe + else cst.Arg( + keyword=cst.Name("Loader"), + value=self.parse_expression(f"{maybe_name}.SafeLoader"), + equal=cst.AssignEqual( + whitespace_before=cst.SimpleWhitespace(""), + whitespace_after=cst.SimpleWhitespace(""), + ), + ) ), ] return self.update_arg_target(updated_node, new_args) diff --git a/tests/codemods/test_harden_pyyaml.py b/tests/codemods/test_harden_pyyaml.py index 61a273bb..e81c6e35 100644 --- a/tests/codemods/test_harden_pyyaml.py +++ b/tests/codemods/test_harden_pyyaml.py @@ -86,6 +86,19 @@ def test_preserve_custom_loader_kwarg(self, tmpdir): self.run_and_assert(tmpdir, input_code, expected) + def test_default_loader_unsafe(self, tmpdir): + input_code = """ + import yaml + + yaml.load(data) + """ + expected = """ + import yaml + + yaml.load(data, Loader=yaml.SafeLoader) + """ + self.run_and_assert(tmpdir, input_code, expected) + class TestHardenPyyamlClassInherit(BaseSemgrepCodemodTest): codemod = HardenPyyaml