Skip to content

Commit

Permalink
Continued refactoring.
Browse files Browse the repository at this point in the history
  • Loading branch information
AdnaneKhan committed May 9, 2024
1 parent 62ef002 commit 5bce2cd
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 75 deletions.
2 changes: 0 additions & 2 deletions gato/cli/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
from gato.cli import (RED_DASH, GREEN_PLUS, GREEN_EXCLAIM, RED_EXCLAIM,
BRIGHT_DASH, YELLOW_EXCLAIM, YELLOW_DASH)



from colorama import Style, Fore


Expand Down
7 changes: 5 additions & 2 deletions gato/enumerate/enumerate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from gato.enumerate.repository import RepositoryEnum
from gato.enumerate.organization import OrganizationEnum
from gato.enumerate.recommender import Recommender
from gato.enumerate.ingest.ingest import DataIngestor
from gato.caching import CacheManager

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -184,7 +185,7 @@ def enumerate_organization(self, org: str):
result = self.org_e.api.call_post('/graphql', wf_query)
# Sometimes we don't get a 200, fall back in this case.
if result.status_code == 200:
self.repo_e.construct_workflow_cache(result.json()['data']['nodes'])
DataIngestor.construct_workflow_cache(result.json()['data']['nodes'])
else:
Output.warn("GraphQL query failed, will revert to REST workflow query for impacted repositories!")
try:
Expand All @@ -196,6 +197,7 @@ def enumerate_organization(self, org: str):
Output.tabbed(
f"Enumerating: {Output.bright(repo.name)}!"
)

self.repo_e.enumerate_repository(repo, large_org_enum=len(enum_list) > 25)
self.repo_e.enumerate_repository_secrets(repo)

Expand Down Expand Up @@ -240,6 +242,7 @@ def enumerate_repo_only(self, repo_name: str, large_enum=False):
Output.tabbed(
f"Enumerating: {Output.bright(repo.name)}!"
)


self.repo_e.enumerate_repository(repo, large_org_enum=large_enum)
self.repo_e.enumerate_repository_secrets(repo)
Expand Down Expand Up @@ -282,7 +285,7 @@ def enumerate_repos(self, repo_names: list):
for i in range (0, 3):
result = self.repo_e.api.call_post('/graphql', wf_query)
if result.status_code == 200:
self.repo_e.construct_workflow_cache(result.json()['data'].values())
DataIngestor.construct_workflow_cache(result.json()['data'].values())
break
else:
Output.warn(f"GraphQL query failed with {result.status_code} on attempt {str(i+1)}, will try again!")
Expand Down
1 change: 1 addition & 0 deletions gato/enumerate/ingest/__init.__.py (note: this filename appears misspelled — a package init must be named __init__.py, so `from gato.enumerate.ingest import DataIngestor` will not resolve through it)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .ingest import DataIngestor
72 changes: 72 additions & 0 deletions gato/enumerate/ingest/ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from gato.caching.cache_manager import CacheManager
from gato.models import Workflow, Repository

class DataIngestor:
    """Ingests GraphQL query results into Gato's repository/workflow cache.

    Holds an optional work queue for streamed processing and provides a
    static helper to bulk-load workflow yml files and repository metadata
    into the shared ``CacheManager``.
    """

    def __init__(self, queue):
        """
        Args:
            queue (queue): Queue to use for processing data.
        """
        self.queue = queue

    @staticmethod
    def construct_workflow_cache(yml_results):
        """Creates a cache of workflow yml files retrieved from graphQL. Since
        graphql and REST do not have parity, we still need to use rest for most
        enumeration calls. This method saves off all yml files, so during org
        level enumeration if we perform yml enumeration the cached file is used
        instead of making github REST requests.

        Args:
            yml_results (list): List of results from individual GraphQL queries
                (100 nodes at a time).
        """
        cache = CacheManager()
        for result in yml_results:
            # If we get any malformed/missing data just skip it and
            # Gato will fall back to the contents API for these few cases.
            if not result or 'nameWithOwner' not in result:
                continue

            owner = result['nameWithOwner']
            cache.set_empty(owner)
            # Empty 'object' means no workflow yamls, so just skip.
            if not result['object']:
                continue

            for yml_node in result['object']['entries']:
                yml_name = yml_node['name']
                # endswith accepts a tuple of suffixes — one call covers both.
                if yml_name.lower().endswith(('yml', 'yaml')):
                    contents = yml_node['object']['text']
                    wf_wrapper = Workflow(owner, contents, yml_name)

                    cache.set_workflow(owner, yml_name, wf_wrapper)

            # GitHub's viewerPermission is a single enum value; pull access is
            # implied by any of READ/TRIAGE/WRITE/ADMIN, push by WRITE/ADMIN.
            permission = result['viewerPermission']
            repo_data = {
                'full_name': result['nameWithOwner'],
                'html_url': result['url'],
                'visibility': 'private' if result['isPrivate'] else 'public',
                'default_branch': result['defaultBranchRef']['name'],
                'fork': result['isFork'],
                'stargazers_count': result['stargazers']['totalCount'],
                'pushed_at': result['pushedAt'],
                'permissions': {
                    'pull': permission in ('READ', 'TRIAGE', 'WRITE', 'ADMIN'),
                    'push': permission in ('WRITE', 'ADMIN'),
                    'admin': permission == 'ADMIN'
                },
                'archived': result['isArchived'],
                'isFork': False
            }

            repo_wrapper = Repository(repo_data)
            cache.set_repository(repo_wrapper)
88 changes: 18 additions & 70 deletions gato/enumerate/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from gato.caching import CacheManager
from gato.notifications import send_slack_webhook


logger = logging.getLogger(__name__)

class RepositoryEnum():
Expand Down Expand Up @@ -182,20 +181,27 @@ def __perform_yml_enumeration(self, repository: Repository):

# Checks any local workflows referenced by this
self.__check_callees(parsed_yml, repository, rules)

if wf_injection and not skip_checks and not rules:
injection_package = self.__create_info_package(parsed_yml.wf_name,workflow_url, wf_injection, rules)
commit_date, author = self.api.get_file_last_updated(repository.name, ".github/workflows/" + parsed_yml.wf_name)

# We first check the result from GQL, if the last push was within 24 hours, then we check if the last
# push impacted the specific workflow.
if self.__is_within_last_day(repository.repo_data['pushed_at']):
commit_date, author = self.api.get_file_last_updated(repository.name, ".github/workflows/" + parsed_yml.wf_name)
if self.__is_within_last_day(commit_date) and '[bot]' not in author:
send_slack_webhook(injection_package)
repository.set_injection(injection_package)
if self.is_within_last_day(commit_date) and '[bot]' not in author:
send_slack_webhook(injection_package)
if pwn_reqs and not skip_checks:

if pwn_reqs and not skip_checks:
pwn_request_package = self.__create_info_package(parsed_yml.wf_name,workflow_url, pwn_reqs, rules)
commit_date, author = self.api.get_file_last_updated(repository.name, ".github/workflows/" + parsed_yml.wf_name)

if self.is_within_last_day(commit_date) and '[bot]' not in author:
send_slack_webhook(pwn_request_package)

# We first check the result from GQL, if the last push was within 24 hours, then we check if the last
# push impacted the specific workflow.
if self.__is_within_last_day(repository.repo_data['pushed_at']):
commit_date, author = self.api.get_file_last_updated(repository.name, ".github/workflows/" + parsed_yml.wf_name)
if self.__is_within_last_day(commit_date) and '[bot]' not in author:
send_slack_webhook(pwn_request_package)
repository.set_pwn_request(pwn_request_package)

if self_hosted_jobs:
Expand All @@ -216,14 +222,14 @@ def __perform_yml_enumeration(self, repository: Repository):

return runner_wfs

def is_within_last_day(self, timestamp_str, format='%Y-%m-%dT%H:%M:%SZ'):
def __is_within_last_day(self, timestamp_str, format='%Y-%m-%dT%H:%M:%SZ'):
# Convert the timestamp string to a datetime object
date = datetime.strptime(timestamp_str, format)

# Get the current date and time
now = datetime.now()

# Calculate the date 3 days ago
# Calculate the date 1 days ago
seven_days_ago = now - timedelta(days=1)

# Return True if the date is within the last day, False otherwise
Expand Down Expand Up @@ -312,61 +318,3 @@ def enumerate_repository_secrets(

if org_secrets:
repository.set_accessible_org_secrets(org_secrets)

    def construct_workflow_cache(self, yml_results):
        """Creates a cache of workflow yml files retrieved from graphQL. Since
        graphql and REST do not have parity, we still need to use rest for most
        enumeration calls. This method saves off all yml files, so during org
        level enumeration if we perform yml enumeration the cached file is used
        instead of making github REST requests.

        Args:
            yml_results (list): List of results from individual GraphQL queries
                (100 nodes at a time).
        """
        # NOTE(review): this method duplicates
        # DataIngestor.construct_workflow_cache in
        # gato/enumerate/ingest/ingest.py (minus the 'pushed_at' key);
        # callers should migrate to the DataIngestor version.
        cache = CacheManager()
        for result in yml_results:
            # If we get any malformed/missing data just skip it and
            # Gato will fall back to the contents API for these few cases.
            if not result:
                continue

            if 'nameWithOwner' not in result:
                continue

            owner = result['nameWithOwner']
            # Mark the repo as cached-but-empty up front; overwritten below
            # if workflow files are found.
            cache.set_empty(owner)
            # Empty means no yamls, so just skip.
            if not result['object']:
                continue

            for yml_node in result['object']['entries']:
                yml_name = yml_node['name']
                # NOTE(review): endswith('yml') also matches names like
                # "build-yml" with no dot — likely harmless here, but a
                # ('.yml', '.yaml') tuple check would be stricter.
                if yml_name.lower().endswith('yml') or yml_name.lower().endswith('yaml'):
                    contents = yml_node['object']['text']
                    wf_wrapper = Workflow(owner, contents, yml_name)

                    cache.set_workflow(owner, yml_name, wf_wrapper)
            # Build a REST-shaped repo dict from GraphQL fields so the
            # Repository model can consume it unchanged.
            repo_data = {
                'full_name': result['nameWithOwner'],
                'html_url': result['url'],
                'visibility': 'private' if result['isPrivate'] else 'public',
                'default_branch': result['defaultBranchRef']['name'],
                'fork': result['isFork'],
                'stargazers_count': result['stargazers']['totalCount'],
                # viewerPermission is a single enum; pull is implied by any
                # of READ/TRIAGE/WRITE/ADMIN, push by WRITE/ADMIN.
                'permissions': {
                    'pull': result['viewerPermission'] == 'READ' or \
                    result['viewerPermission'] == 'TRIAGE' or \
                    result['viewerPermission'] == 'WRITE' or \
                    result['viewerPermission'] == 'ADMIN',
                    'push': result['viewerPermission'] == 'WRITE' or \
                    result['viewerPermission'] == 'ADMIN',
                    'admin': result['viewerPermission'] == 'ADMIN'
                },
                'archived': result['isArchived'],
                'isFork': False
            }

            repo_wrapper = Repository(repo_data)
            cache.set_repository(repo_wrapper)
1 change: 0 additions & 1 deletion gato/github/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1173,7 +1173,6 @@ def get_file_last_updated(self, repo_name: str, file_path: str):
f'/repos/{repo_name}/commits',params={"path": file_path}
)


commit_date = resp.json()[0]['commit']['author']['date']
commit_author = resp.json()[0]['commit']['author']['name']

Expand Down

0 comments on commit 5bce2cd

Please sign in to comment.