Skip to content

Commit

Permalink
Implement safer dependency parsing
Browse files: browse the repository at this point in the history
  • Loading branch information
drdavella committed Dec 5, 2023
1 parent 235516b commit 1f6770f
Show file tree
Hide file tree
Showing 10 changed files with 152 additions and 129 deletions.
22 changes: 14 additions & 8 deletions src/codemodder/project_analysis/file_parsers/base_parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from abc import ABC, abstractmethod

from pathlib import Path
from typing import List
from .package_store import PackageStore
from packaging.requirements import Requirement

from codemodder.dependency import Requirement
from codemodder.logging import logger
from .package_store import FileType, PackageStore


class BaseParser(ABC):
Expand All @@ -12,8 +13,8 @@ def __init__(self, parent_directory: Path):

@property
@abstractmethod
def file_type(self):
... # pragma: no cover
def file_type(self) -> FileType:
pass

def _parse_dependencies(self, dependencies: List[str]):
return [
Expand All @@ -24,8 +25,8 @@ def _parse_dependencies(self, dependencies: List[str]):
]

@abstractmethod
def _parse_file(self, file: Path):
... # pragma: no cover
def _parse_file(self, file: Path) -> PackageStore | None:
pass

def find_file_locations(self) -> List[Path]:
return list(Path(self.parent_directory).rglob(self.file_type.value))
Expand All @@ -37,7 +38,12 @@ def parse(self) -> list[PackageStore]:
stores = []
req_files = self.find_file_locations()
for file in req_files:
store = self._parse_file(file)
try:
store = self._parse_file(file)
except Exception as e:
logger.debug("Error parsing file: %s", file, exc_info=e)
continue

if store:
stores.append(store)
return stores
6 changes: 4 additions & 2 deletions src/codemodder/project_analysis/file_parsers/package_store.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from dataclasses import dataclass
from enum import Enum
from packaging.requirements import Requirement
from pathlib import Path

from codemodder.dependency import Requirement


class FileType(Enum):
Expand All @@ -13,6 +15,6 @@ class FileType(Enum):
@dataclass
class PackageStore:
type: FileType
file: str
file: Path
dependencies: set[Requirement]
py_versions: list[str]
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from pathlib import Path

import toml

from codemodder.project_analysis.file_parsers.package_store import (
PackageStore,
FileType,
)
from pathlib import Path
import toml

from .base_parser import BaseParser


Expand All @@ -13,25 +14,18 @@ class PyprojectTomlParser(BaseParser):
def file_type(self):
return FileType.TOML

def _parse_dependencies_from_toml(self, toml_data: dict):
# todo: handle cases for
# 1. no dependencies
return self._parse_dependencies(toml_data["project"]["dependencies"])
def _parse_file(self, file: Path) -> PackageStore | None:
data = toml.load(file)

def _parse_py_versions(self, toml_data: dict) -> list:
# todo: handle cases for
# 1. multiple requires-python such as ">3.5.2", ">=3.11.1,<3.11.2"
maybe_project = toml_data.get("project")
maybe_python = maybe_project.get("requires-python") if maybe_project else None
return [maybe_python] if maybe_python else []
if not (project := data.get("project")):
return None

def _parse_file(self, file: Path):
data = toml.load(file)
# todo: handle no "project" in data
dependencies = project.get("dependencies", [])
version = project.get("requires-python", None)

return PackageStore(
type=self.file_type,
file=str(file),
dependencies=set(self._parse_dependencies_from_toml(data)),
py_versions=self._parse_py_versions(data),
file=file,
dependencies=set(dependencies) if dependencies else set(),
py_versions=[version] if version else [],
)
Original file line number Diff line number Diff line change
@@ -1,41 +1,39 @@
from typing import Optional
from pathlib import Path

import chardet

from packaging.requirements import InvalidRequirement
from codemodder.logging import logger
from codemodder.project_analysis.file_parsers.package_store import (
PackageStore,
FileType,
)
from pathlib import Path
from .base_parser import BaseParser
import chardet
from codemodder.logging import logger


class RequirementsTxtParser(BaseParser):
@property
def file_type(self):
return FileType.REQ_TXT

def _parse_file(self, file: Path) -> Optional[PackageStore]:
try:
with open(file, "rb") as f:
whole_file = f.read()
enc = chardet.detect(whole_file)
if enc["confidence"] > 0.9:
encoding = enc.get("encoding")
decoded = whole_file.decode(encoding.lower()) if encoding else ""
lines = decoded.splitlines() if decoded else []
else:
raise UnicodeError()
return PackageStore(
type=self.file_type,
file=str(file),
dependencies=set(self._parse_dependencies(lines)),
# requirements.txt files do not declare py versions explicitly
# though we could create a heuristic by analyzing each dependency
# and extracting py versions from them.
py_versions=[],
)
except (UnicodeError, OSError, InvalidRequirement):
logger.debug("Error parsing file: %s", file)
return None
def _parse_file(self, file: Path) -> PackageStore | None:
with open(file, "rb") as f:
whole_file = f.read()

enc = chardet.detect(whole_file)
if enc["confidence"] > 0.9:
encoding = enc.get("encoding")
decoded = whole_file.decode(encoding.lower()) if encoding else ""
lines = decoded.splitlines() if decoded else []
else:
logger.debug("Unknown encoding for file: %s", file)
return None

return PackageStore(
type=self.file_type,
file=file,
dependencies=set(self._parse_dependencies(lines)),
# requirements.txt files do not declare py versions explicitly
# though we could create a heuristic by analyzing each dependency
# and extracting py versions from them.
py_versions=[],
)
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,19 @@ class SetupCfgParser(BaseParser):
def file_type(self):
return FileType.SETUP_CFG

def _parse_dependencies_from_cfg(self, config: configparser.ConfigParser):
# todo: handle cases for
# 1. no dependencies, no options dict
# setup_requires, tests_require, extras_require
dependency_lines = config["options"]["install_requires"].split("\n")
return self._parse_dependencies(dependency_lines)

def _parse_py_versions(self, config: configparser.ConfigParser):
# todo: handle cases for
# 1. no options/ no requires-python
# 2. various requires-python such as ">3.5.2", ">=3.11.1,<3.11.2"
return [config["options"]["python_requires"]]

def _parse_file(self, file: Path):
def _parse_file(self, file: Path) -> PackageStore | None:
config = configparser.ConfigParser()
config.read(file)

# todo: handle no config, no "options" in config
if not (options := config["options"]):
return None

dependency_lines = options.get("install_requires", "").split("\n")
python_requires = options.get("python_requires", "")

return PackageStore(
type=self.file_type,
file=str(file),
dependencies=set(self._parse_dependencies_from_cfg(config)),
py_versions=self._parse_py_versions(config),
file=file,
dependencies=set(self._parse_dependencies(dependency_lines)),
py_versions=[python_requires] if python_requires else [],
)
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@
from codemodder.utils.utils import clean_simplestring
from pathlib import Path
import libcst as cst
from libcst import matchers
from packaging.requirements import Requirement
from typing import Optional

from .base_parser import BaseParser

Expand All @@ -17,67 +14,59 @@ class SetupPyParser(BaseParser):
def file_type(self):
return FileType.SETUP_PY

def _parse_dependencies(self, dependencies):
return [
Requirement(line)
for x in dependencies
# Skip empty lines and comments
if (line := clean_simplestring(x.value)) and not line.startswith("#")
]

def _parse_dependencies_from_cst(self, cst_dependencies: Optional[list]):
return self._parse_dependencies(cst_dependencies) if cst_dependencies else []

def _parse_py_versions(self, version_str):
# todo: handle for multiple versions
return [clean_simplestring(version_str)]
def _parse_file(self, file: Path) -> PackageStore | None:
with open(file, "r", encoding="utf8") as f:
module = cst.parse_module(f.read())

def _parse_file(self, file: Path):
visitor = SetupCallVisitor()
with open(str(file), "r", encoding="utf-8") as f:
# todo: handle failure in parsing
module = cst.parse_module(f.read())
module.visit(visitor)

# todo: handle no python_requires, install_requires

return PackageStore(
type=self.file_type,
file=str(file),
dependencies=set(
self._parse_dependencies_from_cst(visitor.install_requires)
),
py_versions=self._parse_py_versions(visitor.python_requires),
file=file,
dependencies=set(self._parse_dependencies(visitor.install_requires)),
py_versions=visitor.python_requires,
)


class SetupCallVisitor(cst.CSTVisitor):
python_requires: list[str]
install_requires: list[str]

def __init__(self):
self.python_requires = None
self.install_requires = None
# todo setup_requires, tests_require, extras_require
self.python_requires = []
self.install_requires = []
# TODO: setup_requires, tests_require, extras_require

def visit_Call(self, node: cst.Call) -> None:
# todo: only handle setup from setuptools, not others tho unlikely
if matchers.matches(node.func, cst.Name(value="setup")):
visitor = SetupArgVisitor()
node.visit(visitor)
self.python_requires = visitor.python_requires
self.install_requires = visitor.install_requires
# TODO: only handle setup from setuptools, not others tho unlikely
match node.func:
case cst.Name(value="setup"):
visitor = SetupArgVisitor()
node.visit(visitor)
self.python_requires.extend(visitor.python_requires)
self.install_requires.extend(visitor.install_requires)


class SetupArgVisitor(cst.CSTVisitor):
python_requires: list[str]
install_requires: list[str]

def __init__(self):
self.python_requires = None
self.install_requires = None
self.python_requires = []
self.install_requires = []

def visit_Arg(self, node: cst.Arg) -> None:
if matchers.matches(node.keyword, matchers.Name(value="python_requires")):
# todo: this works for `python_requires=">=3.7",` but what about
# a list of versions?
self.python_requires = node.value.value
if matchers.matches(
node.keyword, matchers.Name(value="install_requires")
) and matchers.matches(node.value, matchers.List()):
# todo: if node.value is Name node, find requirements in the variable at node.value
self.install_requires = node.value.elements
match node.keyword, node.value:
case cst.Name(value="python_requires"), cst.SimpleString() as string_node:
# TODO: this works for `python_requires=">=3.7",` but what about a list of versions?
self.python_requires.append(clean_simplestring(string_node.value))
case cst.Name(value="install_requires"), cst.List() as list_node:
for elm in list_node.elements:
match elm:
case cst.Element(value=cst.SimpleString() as string_node):
self.install_requires.append(
clean_simplestring(string_node.value)
)
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def test_parse(self, pkg_with_pyproject_toml):
assert len(found) == 1
store = found[0]
assert store.type.value == "pyproject.toml"
assert store.file == str(pkg_with_pyproject_toml / parser.file_type.value)
assert store.file == pkg_with_pyproject_toml / parser.file_type.value
assert store.py_versions == [">=3.10.0"]
assert len(store.dependencies) == 6

Expand All @@ -66,13 +66,20 @@ def test_parse_no_python(self, pkg_with_pyproject_toml_no_python):
assert len(found) == 1
store = found[0]
assert store.type.value == "pyproject.toml"
assert store.file == str(
pkg_with_pyproject_toml_no_python / parser.file_type.value
)
assert store.file == pkg_with_pyproject_toml_no_python / parser.file_type.value
assert store.py_versions == []
assert len(store.dependencies) == 1

def test_parse_no_file(self, pkg_with_pyproject_toml):
parser = PyprojectTomlParser(pkg_with_pyproject_toml / "foo")
found = parser.parse()
assert len(found) == 0

def test_parser_error(self, pkg_with_pyproject_toml, mocker):
mocker.patch(
"codemodder.project_analysis.file_parsers.pyproject_toml_file_parser.toml.load",
side_effect=Exception,
)
parser = PyprojectTomlParser(pkg_with_pyproject_toml)
found = parser.parse()
assert len(found) == 0
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def test_parse(self, pkg_with_reqs_txt):
assert len(found) == 1
store = found[0]
assert store.type.value == "requirements.txt"
assert store.file == str(pkg_with_reqs_txt / parser.file_type.value)
assert store.file == pkg_with_reqs_txt / parser.file_type.value
assert store.py_versions == []
assert len(store.dependencies) == 4

Expand All @@ -18,7 +18,7 @@ def test_parse_utf_16(self, pkg_with_reqs_txt_utf_16):
assert len(found) == 1
store = found[0]
assert store.type.value == "requirements.txt"
assert store.file == str(pkg_with_reqs_txt_utf_16 / parser.file_type.value)
assert store.file == pkg_with_reqs_txt_utf_16 / parser.file_type.value
assert store.py_versions == []
assert len(store.dependencies) == 4

Expand All @@ -31,3 +31,9 @@ def test_parse_no_file(self, pkg_with_reqs_txt):
parser = RequirementsTxtParser(pkg_with_reqs_txt / "foo")
found = parser.parse()
assert len(found) == 0

def test_open_error(self, pkg_with_reqs_txt, mocker):
mocker.patch("builtins.open", side_effect=Exception)
parser = RequirementsTxtParser(pkg_with_reqs_txt)
found = parser.parse()
assert len(found) == 0
Loading

0 comments on commit 1f6770f

Please sign in to comment.