diff --git a/gato/cli/output.py b/gato/cli/output.py index d0896bf..aaf02cc 100644 --- a/gato/cli/output.py +++ b/gato/cli/output.py @@ -4,8 +4,6 @@ from gato.cli import (RED_DASH, GREEN_PLUS, GREEN_EXCLAIM, RED_EXCLAIM, BRIGHT_DASH, YELLOW_EXCLAIM, YELLOW_DASH) - - from colorama import Style, Fore diff --git a/gato/enumerate/enumerate.py b/gato/enumerate/enumerate.py index 718e338..d882afc 100644 --- a/gato/enumerate/enumerate.py +++ b/gato/enumerate/enumerate.py @@ -8,6 +8,7 @@ from gato.enumerate.repository import RepositoryEnum from gato.enumerate.organization import OrganizationEnum from gato.enumerate.recommender import Recommender +from gato.enumerate.ingest.ingest import DataIngestor from gato.caching import CacheManager logger = logging.getLogger(__name__) @@ -184,7 +185,7 @@ def enumerate_organization(self, org: str): result = self.org_e.api.call_post('/graphql', wf_query) # Sometimes we don't get a 200, fall back in this case. if result.status_code == 200: - self.repo_e.construct_workflow_cache(result.json()['data']['nodes']) + DataIngestor.construct_workflow_cache(result.json()['data']['nodes']) else: Output.warn("GraphQL query failed, will revert to REST workflow query for impacted repositories!") try: @@ -196,6 +197,7 @@ def enumerate_organization(self, org: str): Output.tabbed( f"Enumerating: {Output.bright(repo.name)}!" ) + self.repo_e.enumerate_repository(repo, large_org_enum=len(enum_list) > 25) self.repo_e.enumerate_repository_secrets(repo) @@ -240,6 +242,7 @@ def enumerate_repo_only(self, repo_name: str, large_enum=False): Output.tabbed( f"Enumerating: {Output.bright(repo.name)}!" 
) + self.repo_e.enumerate_repository(repo, large_org_enum=large_enum) self.repo_e.enumerate_repository_secrets(repo) @@ -282,7 +285,7 @@ def enumerate_repos(self, repo_names: list): for i in range (0, 3): result = self.repo_e.api.call_post('/graphql', wf_query) if result.status_code == 200: - self.repo_e.construct_workflow_cache(result.json()['data'].values()) + DataIngestor.construct_workflow_cache(result.json()['data'].values()) break else: Output.warn(f"GraphQL query failed with {result.status_code} on attempt {str(i+1)}, will try again!") diff --git a/gato/enumerate/ingest/__init__.py b/gato/enumerate/ingest/__init__.py new file mode 100644 index 0000000..02711f8 --- /dev/null +++ b/gato/enumerate/ingest/__init__.py @@ -0,0 +1 @@ +from .ingest import DataIngestor \ No newline at end of file diff --git a/gato/enumerate/ingest/ingest.py b/gato/enumerate/ingest/ingest.py new file mode 100644 index 0000000..a0e936a --- /dev/null +++ b/gato/enumerate/ingest/ingest.py @@ -0,0 +1,72 @@ +from gato.caching.cache_manager import CacheManager +from gato.models import Workflow, Repository + +class DataIngestor: + + def __init__(self, queue): + """ + Args: + queue (queue): Queue to use for processing data. + """ + self.queue = queue + + + @staticmethod + def construct_workflow_cache(yml_results): + """Creates a cache of workflow yml files retrieved from graphQL. Since + graphql and REST do not have parity, we still need to use rest for most + enumeration calls. This method saves off all yml files, so during org + level enumeration if we perform yml enumeration the cached file is used + instead of making github REST requests. + + Args: + yml_results (list): List of results from individual GraphQL queries + (100 nodes at a time). + """ + + cache = CacheManager() + for result in yml_results: + # If we get any malformed/missing data just skip it and + # Gato will fall back to the contents API for these few cases. 
+ if not result: + continue + + if 'nameWithOwner' not in result: + continue + + owner = result['nameWithOwner'] + cache.set_empty(owner) + # Empty means no yamls, so just skip. + if not result['object']: + continue + + for yml_node in result['object']['entries']: + yml_name = yml_node['name'] + if yml_name.lower().endswith('yml') or yml_name.lower().endswith('yaml'): + contents = yml_node['object']['text'] + wf_wrapper = Workflow(owner, contents, yml_name) + + cache.set_workflow(owner, yml_name, wf_wrapper) + repo_data = { + 'full_name': result['nameWithOwner'], + 'html_url': result['url'], + 'visibility': 'private' if result['isPrivate'] else 'public', + 'default_branch': result['defaultBranchRef']['name'], + 'fork': result['isFork'], + 'stargazers_count': result['stargazers']['totalCount'], + 'pushed_at': result['pushedAt'], + 'permissions': { + 'pull': result['viewerPermission'] == 'READ' or \ + result['viewerPermission'] == 'TRIAGE' or \ + result['viewerPermission'] == 'WRITE' or \ + result['viewerPermission'] == 'ADMIN', + 'push': result['viewerPermission'] == 'WRITE' or \ + result['viewerPermission'] == 'ADMIN', + 'admin': result['viewerPermission'] == 'ADMIN' + }, + 'archived': result['isArchived'], + 'isFork': False + } + + repo_wrapper = Repository(repo_data) + cache.set_repository(repo_wrapper) \ No newline at end of file diff --git a/gato/enumerate/repository.py b/gato/enumerate/repository.py index 2d92c41..911483d 100644 --- a/gato/enumerate/repository.py +++ b/gato/enumerate/repository.py @@ -10,7 +10,6 @@ from gato.caching import CacheManager from gato.notifications import send_slack_webhook - logger = logging.getLogger(__name__) class RepositoryEnum(): @@ -182,20 +181,27 @@ def __perform_yml_enumeration(self, repository: Repository): # Checks any local workflows referenced by this self.__check_callees(parsed_yml, repository, rules) - + if wf_injection and not skip_checks and not rules: injection_package = 
self.__create_info_package(parsed_yml.wf_name,workflow_url, wf_injection, rules) - commit_date, author = self.api.get_file_last_updated(repository.name, ".github/workflows/" + parsed_yml.wf_name) + + # We first check the result from GQL, if the last push was within 24 hours, then we check if the last + # push impacted the specific workflow. + if self.__is_within_last_day(repository.repo_data['pushed_at']): + commit_date, author = self.api.get_file_last_updated(repository.name, ".github/workflows/" + parsed_yml.wf_name) + if self.__is_within_last_day(commit_date) and '[bot]' not in author: + send_slack_webhook(injection_package) repository.set_injection(injection_package) - if self.is_within_last_day(commit_date) and '[bot]' not in author: - send_slack_webhook(injection_package) - if pwn_reqs and not skip_checks: + if pwn_reqs and not skip_checks: pwn_request_package = self.__create_info_package(parsed_yml.wf_name,workflow_url, pwn_reqs, rules) - commit_date, author = self.api.get_file_last_updated(repository.name, ".github/workflows/" + parsed_yml.wf_name) - - if self.is_within_last_day(commit_date) and '[bot]' not in author: - send_slack_webhook(pwn_request_package) + + # We first check the result from GQL, if the last push was within 24 hours, then we check if the last + # push impacted the specific workflow. 
+ if self.__is_within_last_day(repository.repo_data['pushed_at']): + commit_date, author = self.api.get_file_last_updated(repository.name, ".github/workflows/" + parsed_yml.wf_name) + if self.__is_within_last_day(commit_date) and '[bot]' not in author: + send_slack_webhook(pwn_request_package) repository.set_pwn_request(pwn_request_package) if self_hosted_jobs: @@ -216,14 +222,14 @@ return runner_wfs - def is_within_last_day(self, timestamp_str, format='%Y-%m-%dT%H:%M:%SZ'): + def __is_within_last_day(self, timestamp_str, format='%Y-%m-%dT%H:%M:%SZ'): # Convert the timestamp string to a datetime object date = datetime.strptime(timestamp_str, format) # Get the current date and time now = datetime.now() - # Calculate the date 3 days ago + # Calculate the date 1 day ago seven_days_ago = now - timedelta(days=1) # Return True if the date is within the last day, False otherwise @@ -312,61 +318,3 @@ def enumerate_repository_secrets( if org_secrets: repository.set_accessible_org_secrets(org_secrets) - - def construct_workflow_cache(self, yml_results): - """Creates a cache of workflow yml files retrieved from graphQL. Since - graphql and REST do not have parity, we still need to use rest for most - enumeration calls. This method saves off all yml files, so during org - level enumeration if we perform yml enumeration the cached file is used - instead of making github REST requests. - - Args: - yml_results (list): List of results from individual GraphQL queries - (100 nodes at a time). - """ - - cache = CacheManager() - for result in yml_results: - # If we get any malformed/missing data just skip it and - # Gato will fall back to the contents API for these few cases. - - if not result: - continue - - if 'nameWithOwner' not in result: - continue - - owner = result['nameWithOwner'] - cache.set_empty(owner) - # Empty means no yamls, so just skip. 
- if not result['object']: - continue - - for yml_node in result['object']['entries']: - yml_name = yml_node['name'] - if yml_name.lower().endswith('yml') or yml_name.lower().endswith('yaml'): - contents = yml_node['object']['text'] - wf_wrapper = Workflow(owner, contents, yml_name) - - cache.set_workflow(owner, yml_name, wf_wrapper) - repo_data = { - 'full_name': result['nameWithOwner'], - 'html_url': result['url'], - 'visibility': 'private' if result['isPrivate'] else 'public', - 'default_branch': result['defaultBranchRef']['name'], - 'fork': result['isFork'], - 'stargazers_count': result['stargazers']['totalCount'], - 'permissions': { - 'pull': result['viewerPermission'] == 'READ' or \ - result['viewerPermission'] == 'TRIAGE' or \ - result['viewerPermission'] == 'WRITE' or \ - result['viewerPermission'] == 'ADMIN', - 'push': result['viewerPermission'] == 'WRITE' or \ - result['viewerPermission'] == 'ADMIN', - 'admin': result['viewerPermission'] == 'ADMIN' - }, - 'archived': result['isArchived'], - 'isFork': False - } - - repo_wrapper = Repository(repo_data) - cache.set_repository(repo_wrapper) \ No newline at end of file diff --git a/gato/github/api.py b/gato/github/api.py index eb2fc2d..f03df36 100644 --- a/gato/github/api.py +++ b/gato/github/api.py @@ -1173,7 +1173,6 @@ def get_file_last_updated(self, repo_name: str, file_path: str): f'/repos/{repo_name}/commits',params={"path": file_path} ) - commit_date = resp.json()[0]['commit']['author']['date'] commit_author = resp.json()[0]['commit']['author']['name']