Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split file read + regex scan #50

Merged
merged 2 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion libsast/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
__title__ = 'libsast'
__authors__ = 'Ajin Abraham'
__copyright__ = f'Copyright {year} Ajin Abraham, opensecurity.in'
__version__ = '3.1.0'
__version__ = '3.1.1'
__version_info__ = tuple(int(i) for i in __version__.split('.'))
__all__ = [
'Scanner',
Expand Down
36 changes: 21 additions & 15 deletions libsast/core_matcher/choice_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,18 @@

def scan(self, paths: list) -> dict:
"""Scan file(s) or directory per rule."""
if not (self.scan_rules and paths):
return
self.validate_rules()

if self.show_progress:
pbar = common.ProgressBar('Choice Match', len(self.scan_rules))
self.scan_rules = pbar.progress_loop(self.scan_rules)

file_contents = self.read_file_contents(paths)
return self.regex_scan(file_contents)

def read_file_contents(self, paths: list) -> list:
Dismissed Show dismissed Hide dismissed
"""Load file(s) content."""
if not (self.scan_rules and paths):
return
self.validate_rules()
choice_args = []
for rule in self.scan_rules:
scan_paths = paths
Expand All @@ -47,24 +51,26 @@
scan_paths = [Path(self.alternative_path)]
choice_args.append((scan_paths, rule))

# Use ThreadPoolExecutor for reading file contents and
# ProcessPoolExecutor for processing regex
with ThreadPoolExecutor() as io_executor, ProcessPoolExecutor(
max_workers=self.cpu) as cpu_executor:
# Use ThreadPoolExecutor for file reading
with ThreadPoolExecutor() as io_executor:
# Submit file reading tasks and wait for results
futures = []
for args_tuple in choice_args:
# Submit each read task and store the future along with the args
future = io_executor.submit(
self._read_file_contents, args_tuple)
futures.append((future, args_tuple))
futures.append(future)
return [future.result() for future in futures]

def regex_scan(self, file_contents) -> list:
"""Process regex matches on the file contents."""
# Use ProcessPoolExecutor for regex processing
with ProcessPoolExecutor(max_workers=self.cpu) as cpu_executor:

results = []
for future, _ in futures:
file_contents = future.result()
# This will block until the file reading is done
# Process the file contents with ProcessPoolExecutor
for content in file_contents:
# Process Choice Matcher on the file contents
process_future = cpu_executor.submit(
self.choice_matcher, file_contents)
self.choice_matcher, content)
results.append(process_future.result())

self.add_finding(results)
Expand Down
24 changes: 17 additions & 7 deletions libsast/core_matcher/pattern_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,27 +31,37 @@

def scan(self, paths: list) -> dict:
"""Scan file(s) or directory."""
if not (self.scan_rules and paths):
return
self.validate_rules()

if self.show_progress:
pbar = common.ProgressBar('Pattern Match', len(paths))
paths = pbar.progress_loop(paths)

file_contents = self.read_file_contents(paths)
return self.regex_scan(file_contents)

def read_file_contents(self, paths: list) -> list:
Dismissed Show dismissed Hide dismissed
"""Load file(s) content."""
if not (self.scan_rules and paths):
return
self.validate_rules()

# Filter files by extension and size, prepare list for processing
files_to_scan = {
sfile for sfile in paths
if is_file_valid(sfile, self.exts, 5)
}

# Use a ThreadPool for file reading, and ProcessPool for CPU-bound regex
with ThreadPoolExecutor() as io_executor, ProcessPoolExecutor(
max_workers=self.cpu) as cpu_executor:
# Use a ThreadPool for file reading
with ThreadPoolExecutor() as io_executor:

# Read all files
file_contents = list(io_executor.map(
self._read_file_content, files_to_scan))
return file_contents

def regex_scan(self, file_contents: list) -> dict:
"""Scan file(s) content."""
# Use a ProcessPool for CPU-bound regex
with ProcessPoolExecutor(max_workers=self.cpu) as cpu_executor:

# Run regex on file data
results = cpu_executor.map(
Expand Down
16 changes: 8 additions & 8 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "libsast"
version = "3.1.0"
version = "3.1.1"
description = "A generic SAST library built on top of semgrep and regex"
keywords = ["libsast", "SAST", "Python SAST", "SAST API", "Regex SAST", "Pattern Matcher"]
authors = ["Ajin Abraham <[email protected]>"]
Expand Down
Loading