diff --git a/libsast/__init__.py b/libsast/__init__.py index 275df1e..ab5df5c 100644 --- a/libsast/__init__.py +++ b/libsast/__init__.py @@ -12,7 +12,7 @@ __title__ = 'libsast' __authors__ = 'Ajin Abraham' __copyright__ = f'Copyright {year} Ajin Abraham, opensecurity.in' -__version__ = '3.1.0' +__version__ = '3.1.1' __version_info__ = tuple(int(i) for i in __version__.split('.')) __all__ = [ 'Scanner', diff --git a/libsast/core_matcher/choice_matcher.py b/libsast/core_matcher/choice_matcher.py index 0c52879..20e26f2 100644 --- a/libsast/core_matcher/choice_matcher.py +++ b/libsast/core_matcher/choice_matcher.py @@ -31,14 +31,18 @@ def __init__(self, options: dict) -> None: def scan(self, paths: list) -> dict: """Scan file(s) or directory per rule.""" - if not (self.scan_rules and paths): - return - self.validate_rules() - if self.show_progress: pbar = common.ProgressBar('Choice Match', len(self.scan_rules)) self.scan_rules = pbar.progress_loop(self.scan_rules) + file_contents = self.read_file_contents(paths) + return self.regex_scan(file_contents) + + def read_file_contents(self, paths: list) -> list: + """Load file(s) content.""" + if not (self.scan_rules and paths): + return + self.validate_rules() choice_args = [] for rule in self.scan_rules: scan_paths = paths @@ -46,25 +50,29 @@ def scan(self, paths: list) -> dict: # Scan only alternative path scan_paths = [Path(self.alternative_path)] choice_args.append((scan_paths, rule)) + if not choice_args: + return [] - # Use ThreadPoolExecutor for reading file contents and - # ProcessPoolExecutor for processing regex - with ThreadPoolExecutor() as io_executor, ProcessPoolExecutor( - max_workers=self.cpu) as cpu_executor: + # Use ThreadPoolExecutor for file reading + with ThreadPoolExecutor() as io_executor: + # Submit file reading tasks and wait for results futures = [] for args_tuple in choice_args: - # Submit each read task and store the future along with the args future = io_executor.submit( self._read_file_contents, args_tuple) - futures.append((future, args_tuple)) + futures.append(future) + return [future.result() for future in futures] + + def regex_scan(self, file_contents) -> list: + """Process regex matches on the file contents.""" + # Use ProcessPoolExecutor for regex processing + with ProcessPoolExecutor(max_workers=self.cpu) as cpu_executor: results = [] - for future, _ in futures: - file_contents = future.result() - # This will block until the file reading is done - # Process the file contents with ProcessPoolExecutor + for content in file_contents: + # Process Choice Matcher on the file contents process_future = cpu_executor.submit( - self.choice_matcher, file_contents) + self.choice_matcher, content) results.append(process_future.result()) self.add_finding(results) diff --git a/libsast/core_matcher/pattern_matcher.py b/libsast/core_matcher/pattern_matcher.py index 19c802d..e150dc3 100644 --- a/libsast/core_matcher/pattern_matcher.py +++ b/libsast/core_matcher/pattern_matcher.py @@ -31,27 +31,39 @@ def __init__(self, options: dict) -> None: def scan(self, paths: list) -> dict: """Scan file(s) or directory.""" - if not (self.scan_rules and paths): - return - self.validate_rules() - if self.show_progress: pbar = common.ProgressBar('Pattern Match', len(paths)) paths = pbar.progress_loop(paths) + file_contents = self.read_file_contents(paths) + return self.regex_scan(file_contents) + + def read_file_contents(self, paths: list) -> list: + """Load file(s) content.""" + if not (self.scan_rules and paths): + return + self.validate_rules() + # Filter files by extension and size, prepare list for processing files_to_scan = { sfile for sfile in paths if is_file_valid(sfile, self.exts, 5) } + if not files_to_scan: + return [] - # Use a ThreadPool for file reading, and ProcessPool for CPU-bound regex - with ThreadPoolExecutor() as io_executor, ProcessPoolExecutor( - max_workers=self.cpu) as cpu_executor: + # Use a ThreadPool for file reading + with ThreadPoolExecutor() as io_executor: # Read all files file_contents = list(io_executor.map( self._read_file_content, files_to_scan)) + return file_contents + + def regex_scan(self, file_contents: list) -> dict: + """Scan file(s) content.""" + # Use a ProcessPool for CPU-bound regex + with ProcessPoolExecutor(max_workers=self.cpu) as cpu_executor: # Run regex on file data results = cpu_executor.map( diff --git a/poetry.lock b/poetry.lock index 5621700..896ab29 100644 --- a/poetry.lock +++ b/poetry.lock @@ -325,13 +325,13 @@ yaml = ["PyYAML"] [[package]] name = "googleapis-common-protos" -version = "1.65.0" +version = "1.66.0" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" files = [ - {file = "googleapis_common_protos-1.65.0-py2.py3-none-any.whl", hash = "sha256:2972e6c496f435b92590fd54045060867f3fe9be2c82ab148fc8885035479a63"}, - {file = "googleapis_common_protos-1.65.0.tar.gz", hash = "sha256:334a29d07cddc3aa01dee4988f9afd9b2916ee2ff49d6b757155dc0d197852c0"}, + {file = "googleapis_common_protos-1.66.0-py2.py3-none-any.whl", hash = "sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed"}, + {file = "googleapis_common_protos-1.66.0.tar.gz", hash = "sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c"}, ] [package.dependencies] @@ -621,13 +621,13 @@ files = [ [[package]] name = "packaging" -version = "24.1" +version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] [[package]] @@ -643,12 +643,12 @@ files = [ [[package]] name = "peewee" -version = "3.17.7" +version = "3.17.8" description = "a little orm" optional = false python-versions = "*" files = [ - {file = "peewee-3.17.7.tar.gz", hash = "sha256:6aefc700bd530fc6ac23fa19c9c5b47041751d92985b799169c8e318e97eabaa"}, + {file = "peewee-3.17.8.tar.gz", hash = "sha256:ce1d05db3438830b989a1b9d0d0aa4e7f6134d5f6fd57686eeaa26a3e6485a8c"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index 3630448..1130e37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "libsast" -version = "3.1.0" +version = "3.1.1" description = "A generic SAST library built on top of semgrep and regex" keywords = ["libsast", "SAST", "Python SAST", "SAST API", "Regex SAST", "Pattern Matcher"] authors = ["Ajin Abraham "]