Auto scan entire repo ! (#202)

* scan entire repo * fix tests * increased coverage * Added tree_sitter as hidden import. * Incrementing version * dw/debug scan (#203) * scan works on python fastapi test repo * keep original entrypoint * algo cleaning and improvements --------- Co-authored-by: Embedded DevOps <[email protected]> Co-authored-by: David Wurtz <[email protected]>
qodo-ai · Nov 7, 2024 · 3be9ba6 · 3be9ba6
1 parent 738bf47
commit 3be9ba6
Show file tree

Hide file tree

Showing 24 changed files with 1,135 additions and 417 deletions.
diff --git a/.github/workflows/ci_pipeline.yml b/.github/workflows/ci_pipeline.yml
@@ -135,7 +135,7 @@ jobs:
 
     - name: Install Dependencies
       run: |
-        pip install poetry wandb
+        pip install poetry wandb tree_sitter
         poetry install
     - name: Build Executable
       run: make installer

diff --git a/Makefile b/Makefile
@@ -28,6 +28,7 @@ installer:
 		--hidden-import=tiktoken_ext.openai_public \
 		--hidden-import=tiktoken_ext \
 		--hidden-import=wandb \
+		--hidden-import=tree_sitter \
 		--hidden-import=wandb_gql \
 		--onefile \
 		--name cover-agent \

diff --git a/README.md b/README.md
@@ -32,6 +32,31 @@ CodiumAI Cover Agent aims to help efficiently increasing code coverage, by autom
 
 ## News and Updates
 
+### 2024-11-05:
+New mode - scan an entire repo, automatically identify the test files, automatically collect context for each test file, and extend the test suite with new tests.
+How to run:
+
+1) Create a branch in your repo
+2) cd to your repo
+3) Run the following command:
+```shell
+poetry run cover-agent \
+  --project-language="python" \
+  --project-root="<path_to_your_repo>" \
+  --code-coverage-report-path="<path_to_your_repo>/coverage.xml" \
+  --test-command="coverage run -m pytest <relative_path_to_unittest_folder> --cov=<path_to_your_repo> --cov-report=xml --cov-report=term --log-cli-level=INFO --timeout=30" \
+  --model=bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0
+```
+
+Notes:
+- `<relative_path_to_unittest_folder>` is optional, but providing it prevents e2e test files (if any exist) from being run, which may take a long time.
+- You can use other models, like 'gpt-4o' or 'o1-mini', but we recommend 'sonnet-3.5', as it is currently the best code model in the world.
+
+Additional configuration options:
+- `--max-test-files-allowed-to-analyze` - The maximum number of test files to analyze. Default is 20 (to avoid long running times).
+- `--look-for-oldest-unchanged-test-files` - If set, the tool will sort the test files by the last modified date and analyze the oldest ones first. This is useful to find the test files that are most likely to be outdated, and for multiple runs. Default is False.
+
+
 ### 2024-09-29:
 We are excited to announce the latest series of updates to CoverAgent, delivering significant improvements to functionality, documentation, and testing frameworks. These updates reflect our ongoing commitment to enhancing the developer experience, improving error handling, and refining the testing processes.
 
@@ -119,6 +144,7 @@ After downloading the executable or installing the Pip package you can run the C
 cover-agent \
   --source-file-path "<path_to_source_file>" \
   --test-file-path "<path_to_test_file>" \
+  --project-root "<path_to_project_root>" \
   --code-coverage-report-path "<path_to_coverage_report>" \
   --test-command "<test_command_to_run>" \
   --test-command-dir "<directory_to_run_test_command>" \
@@ -138,6 +164,7 @@ Follow the steps in the README.md file located in the `templated_tests/python_fa
 cover-agent \
   --source-file-path "templated_tests/python_fastapi/app.py" \
   --test-file-path "templated_tests/python_fastapi/test_app.py" \
+  --project-root "templated_tests/python_fastapi" \
   --code-coverage-report-path "templated_tests/python_fastapi/coverage.xml" \
   --test-command "pytest --cov=. --cov-report=xml --cov-report=term" \
   --test-command-dir "templated_tests/python_fastapi" \

diff --git a/cover_agent/AICaller.py b/cover_agent/AICaller.py
@@ -103,17 +103,20 @@ def call_model(self, prompt: dict, max_tokens=4096, stream=True):
             completion_tokens = int(usage.completion_tokens)
 
         if "WANDB_API_KEY" in os.environ:
-            root_span = Trace(
-                name="inference_"
-                + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
-                kind="llm",  # kind can be "llm", "chain", "agent", or "tool"
-                inputs={
-                    "user_prompt": prompt["user"],
-                    "system_prompt": prompt["system"],
-                },
-                outputs={"model_response": content},
-            )
-            root_span.log(name="inference")
+            try:
+                root_span = Trace(
+                    name="inference_"
+                    + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
+                    kind="llm",  # kind can be "llm", "chain", "agent", or "tool"
+                    inputs={
+                        "user_prompt": prompt["user"],
+                        "system_prompt": prompt["system"],
+                    },
+                    outputs={"model_response": content},
+                )
+                root_span.log(name="inference")
+            except Exception as e:
+                print(f"Error logging to W&B: {e}")
 
         # Returns: Response, Prompt token count, and Completion token count
         return content, prompt_tokens, completion_tokens

diff --git a/cover_agent/CoverAgent.py b/cover_agent/CoverAgent.py
@@ -29,6 +29,7 @@ def __init__(self, args):
         self.test_gen = UnitTestGenerator(
             source_file_path=args.source_file_path,
             test_file_path=args.test_file_output_path,
+            project_root=args.project_root,
             code_coverage_report_path=args.code_coverage_report_path,
             test_command=args.test_command,
             test_command_dir=args.test_command_dir,
@@ -58,6 +59,13 @@ def _validate_paths(self):
             raise FileNotFoundError(
                 f"Test file not found at {self.args.test_file_path}"
             )
+
+        # Ensure the project root exists
+        if self.args.project_root and not os.path.isdir(self.args.project_root):
+            raise FileNotFoundError(
+                f"Project root not found at {self.args.project_root}"
+            )
+
         # Create default DB file if not provided
         if not self.args.log_db_path:
             self.args.log_db_path = "cover_agent_unit_test_runs.db"

diff --git a/cover_agent/PromptBuilder.py b/cover_agent/PromptBuilder.py
@@ -43,6 +43,7 @@ def __init__(
         failed_test_runs: str = "",
         language: str = "python",
         testing_framework: str = "NOT KNOWN",
+        project_root: str = "",
     ):
         """
         The `PromptBuilder` class is responsible for building a formatted prompt string by replacing placeholders with the actual content of files read during initialization. It takes in various paths and settings as parameters and provides a method to generate the prompt.
@@ -67,8 +68,11 @@ def __init__(
             build_prompt(self)
                 Replaces placeholders with the actual content of files read during initialization and returns the formatted prompt string.
         """
-        self.source_file_name = os.path.basename(source_file_path)
-        self.test_file_name = os.path.basename(test_file_path)
+        self.project_root = project_root
+        self.source_file_path = source_file_path
+        self.test_file_path = test_file_path
+        self.source_file_name_rel = os.path.relpath(source_file_path, project_root)
+        self.test_file_name_rel = os.path.relpath(test_file_path, project_root)
         self.source_file = self._read_file(source_file_path)
         self.test_file = self._read_file(test_file_path)
         self.code_coverage_report = code_coverage_report
@@ -123,8 +127,8 @@ def _read_file(self, file_path):
 
     def build_prompt(self) -> dict:
         variables = {
-            "source_file_name": self.source_file_name,
-            "test_file_name": self.test_file_name,
+            "source_file_name": self.source_file_name_rel,
+            "test_file_name": self.test_file_name_rel,
             "source_file_numbered": self.source_file_numbered,
             "test_file_numbered": self.test_file_numbered,
             "source_file": self.source_file,
@@ -165,8 +169,8 @@ def build_prompt_custom(self, file) -> dict:
             dict: A dictionary containing the system and user prompts.
         """
         variables = {
-            "source_file_name": self.source_file_name,
-            "test_file_name": self.test_file_name,
+            "source_file_name": self.source_file_name_rel,
+            "test_file_name": self.test_file_name_rel,
             "source_file_numbered": self.source_file_numbered,
             "test_file_numbered": self.test_file_numbered,
             "source_file": self.source_file,

diff --git a/cover_agent/UnitTestGenerator.py b/cover_agent/UnitTestGenerator.py
@@ -30,6 +30,7 @@ def __init__(
         desired_coverage: int = 90,  # Default to 90% coverage if not specified
         additional_instructions: str = "",
         use_report_coverage_feature_flag: bool = False,
+        project_root: str = "",
     ):
         """
         Initialize the UnitTestGenerator class with the provided parameters.
@@ -57,6 +58,7 @@ def __init__(
         self.relevant_line_number_to_insert_imports_after = None
         self.relevant_line_number_to_insert_tests_after = None
         self.test_headers_indentation = None
+        self.project_root = project_root
         self.source_file_path = source_file_path
         self.test_file_path = test_file_path
         self.code_coverage_report_path = code_coverage_report_path
@@ -303,6 +305,7 @@ def build_prompt(self) -> dict:
             failed_test_runs=failed_test_runs_value,
             language=self.language,
             testing_framework=self.testing_framework,
+            project_root=self.project_root,
         )
 
         return self.prompt_builder.build_prompt()

diff --git a/cover_agent/lsp_logic/logic.py b/cover_agent/lsp_logic/logic.py
@@ -3,7 +3,8 @@
 from cover_agent.lsp_logic.utils.utils import uri_to_path, is_forbidden_directory
 
 
-async def get_direct_context(captures, language, lsp, project_dir, rel_file, target_file):
+async def get_direct_context(captures, language, lsp, project_dir, rel_file):
+    target_file = str(os.path.join(project_dir, rel_file))
     skip_found_symbols = True
     context_files = set()
     context_symbols = set()
@@ -30,14 +31,15 @@ async def get_direct_context(captures, language, lsp, project_dir, rel_file, tar
                 if project_dir not in d_path:
                     continue
                 if not is_forbidden_directory(d_path, language):
-                    print(f"Context definition: \'{name_symbol}\' at line {line} from file \'{rel_d_path}\'")
+                    # print(f"Context definition: \'{name_symbol}\' at line {line} from file \'{rel_d_path}\'")
                     context_files.add(d_path)
                     context_symbols.add(name_symbol)
                     context_symbols_and_files.add((name_symbol, rel_d_path))
     return context_files, context_symbols
 
 
-async def get_reverse_context(captures, lsp, project_dir, rel_file, target_file):
+async def get_reverse_context(captures, lsp, project_dir, rel_file):
+    target_file = str(os.path.join(project_dir, rel_file))
     skip_found_symbols = True
     reverse_context_files = set()
     reverse_context_symbols = set()

diff --git a/cover_agent/lsp_logic/scripts/main.py b/cover_agent/lsp_logic/scripts/main.py
@@ -58,16 +58,14 @@ async def run():
                                                                   language,
                                                                   lsp,
                                                                   project_dir,
-                                                                  rel_file,
-                                                                  target_file)
+                                                                  rel_file)
         print("Getting context done.")
 
         print("\nGetting reverse context ...")
         reverse_context_files, reverse_context_symbols = await get_reverse_context(captures,
                                                           lsp,
                                                           project_dir,
-                                                          rel_file,
-                                                          target_file)
+                                                          rel_file)
         print("Getting reverse context done.")
 
     print("\n\n================")

diff --git a/cover_agent/lsp_logic/utils/utils_context.py b/cover_agent/lsp_logic/utils/utils_context.py
@@ -0,0 +1,100 @@
+import os
+from time import sleep
+
+from jinja2 import Environment, StrictUndefined
+
+from cover_agent.lsp_logic.file_map.file_map import FileMap
+from cover_agent.lsp_logic.logic import get_direct_context
+from cover_agent.lsp_logic.multilspy import LanguageServer
+from cover_agent.lsp_logic.multilspy.multilspy_config import MultilspyConfig
+from cover_agent.lsp_logic.multilspy.multilspy_logger import MultilspyLogger
+
+from cover_agent.settings.config_loader import get_settings
+from cover_agent.utils import load_yaml
+
+
+async def analyze_context(test_file, context_files, args, ai_caller):
+    """
+    Ask the model whether `test_file` is a unit test and which context file it targets.
+
+    The prompt templates come from the `analyze_test_against_context` settings entry and are
+    rendered with the test file content and the project-relative paths of the context files.
+    The parsed response is read for two keys: `is_this_a_unit_test` and `main_file`.
+
+    Args:
+        test_file: Path of the test file to analyze.
+        context_files: Paths of the files the test file depends on.
+        args: Parsed arguments; `project_root` and `project_language` are read.
+        ai_caller: Object exposing `call_model(prompt=...)` that returns a
+            `(response, prompt_token_count, response_token_count)` tuple.
+
+    Returns:
+        A `(source_file, context_files_include)` tuple. `source_file` is the main source file
+        joined onto `args.project_root`, or None when the test file is not reported as a unit
+        test, no main file is named, or the analysis fails. `context_files_include` holds the
+        context files other than the main source file (all of them when there is none).
+    """
+    source_file = None
+    context_files_include = context_files
+    try:
+        test_file_rel_str = os.path.relpath(test_file, args.project_root)
+        # One back-quoted relative path per line; the closing backtick must come before the
+        # newline, otherwise every entry is split across two lines in the prompt.
+        context_files_rel_filtered_list_str = "".join(
+            f"`{os.path.relpath(context_file, args.project_root)}`\n"
+            for context_file in context_files
+        )
+        # Read through a context manager so the file handle is always closed.
+        with open(test_file, 'r') as f:
+            test_file_content = f.read()
+        variables = {"language": args.project_language,
+                     "test_file_name_rel": test_file_rel_str,
+                     "test_file_content": test_file_content,
+                     "context_files_names_rel": context_files_rel_filtered_list_str
+                     }
+        environment = Environment(undefined=StrictUndefined)
+        settings = get_settings().get('analyze_test_against_context')
+        system_prompt = environment.from_string(settings.system).render(variables)
+        user_prompt = environment.from_string(settings.user).render(variables)
+        response, _, _ = (
+            ai_caller.call_model(prompt={"system": system_prompt, "user": user_prompt})
+        )
+        response_dict = load_yaml(response)
+        if int(response_dict.get('is_this_a_unit_test', 0)) == 1:
+            source_file_rel = (response_dict.get('main_file') or "").strip().strip('`')
+            # An empty `main_file` would join to the project root itself, which is a
+            # directory and not a usable source file, so it is treated as "no source file".
+            if source_file_rel:
+                source_file = os.path.join(args.project_root, source_file_rel)
+                # Everything except the main source file is passed on as extra context.
+                context_files_include = [
+                    context_file for context_file in context_files
+                    if os.path.relpath(context_file, args.project_root) != source_file_rel
+                ]
+
+        if source_file:
+            print(f"Test file: `{test_file}` is a unit test file for source file: `{source_file}`")
+        else:
+            print(f"Test file: `{test_file}` is not a unit test file")
+    except Exception as e:
+        # Best effort: a failed analysis only skips this test file, it does not abort the scan.
+        print(f"Error while analyzing test file {test_file} against context files: {e}")
+
+    return source_file, context_files_include
+
+
+async def find_test_file_context(args, lsp, test_file):
+    """
+    Collect the non-empty files that `test_file` directly depends on.
+
+    Args:
+        args: Parsed arguments; `project_root` and `project_language` are read.
+        lsp: Language server object forwarded to `get_direct_context`.
+        test_file: Path of the test file whose context is wanted.
+
+    Returns:
+        A list of context file paths whose content is not blank, or an empty list when
+        any step raises (the error is printed, not propagated).
+    """
+    def _has_content(path):
+        # A file counts as empty when it holds nothing but whitespace.
+        with open(path, 'r') as handle:
+            return bool(handle.read().strip())
+
+    try:
+        project_root = args.project_root
+        relative_test_path = os.path.relpath(test_file, project_root)
+
+        # Tree-sitter query captures of the test file drive the context lookup below.
+        file_map = FileMap(test_file, parent_context=False, child_context=False,
+                           header_max=0, project_base_path=project_root)
+        _, captures = file_map.get_query_results()
+
+        direct_files, _ = await get_direct_context(captures,
+                                                   args.project_language,
+                                                   lsp,
+                                                   project_root,
+                                                   relative_test_path)
+        # Drop files with no content.
+        non_empty_files = [path for path in direct_files if _has_content(path)]
+    except Exception as e:
+        print(f"Error while getting context for test file {test_file}: {e}")
+        non_empty_files = []
+
+    return non_empty_files
+
+
+async def initialize_language_server(args):
+    """
+    Build the language server object for the project being scanned.
+
+    Args:
+        args: Parsed arguments; `project_language` selects the server configuration and
+            `project_root` is passed to `LanguageServer.create`.
+
+    Returns:
+        The object returned by `LanguageServer.create`.
+    """
+    import asyncio  # function-scope import keeps this block self-contained
+
+    logger = MultilspyLogger()
+    config = MultilspyConfig.from_dict({"code_language": args.project_language})
+    lsp = LanguageServer.create(config, logger, args.project_root)
+    # Same short pause as before, but awaited so the event loop is not blocked
+    # the way a synchronous time.sleep() inside a coroutine would block it.
+    await asyncio.sleep(0.1)
+    return lsp
diff --git a/cover_agent/main.py b/cover_agent/main.py
@@ -15,6 +15,9 @@ def parse_args():
     parser.add_argument(
         "--test-file-path", required=True, help="Path to the input test file."
     )
+    parser.add_argument(
+        "--project-root", required=False, help="Path to the root of the project.", default=""
+    )
     parser.add_argument(
         "--test-file-output-path",
         required=False,

diff --git a/cover_agent/main_full_repo.py b/cover_agent/main_full_repo.py
@@ -0,0 +1,54 @@
+import asyncio
+import copy
+import os
+from cover_agent.AICaller import AICaller
+from cover_agent.lsp_logic.utils.utils_context import analyze_context, find_test_file_context, \
+    initialize_language_server
+from cover_agent.utils import parse_args_full_repo, find_test_files
+from cover_agent.CoverAgent import CoverAgent
+
+
+async def run():
+    """
+    Scan the project for test files and run CoverAgent on each one that has a source file.
+
+    For every test file found, the context files are collected through the language server,
+    the model is asked which of them is the main source file, and, when one is reported,
+    a CoverAgent is run with a per-file copy of the parsed arguments.
+    """
+    args = parse_args_full_repo()
+
+    # Discover the candidate test files up front and list them.
+    test_files = find_test_files(args)
+    found_listing = ''.join(f"{f}\n" for f in test_files)
+    print("Test files found:\n" + found_listing)
+
+    print("\nInitializing language server...")
+    lsp = await initialize_language_server(args)
+
+    # The server stays up for the whole scan.
+    async with lsp.start_server():
+        print("LSP server initialized.")
+
+        ai_caller = AICaller(model=args.model)
+
+        for test_file in test_files:
+            # Files the test file depends on.
+            context_files = await find_test_file_context(args, lsp, test_file)
+            context_listing = ''.join(f"{f}\n" for f in context_files)
+            print("Context files for test file '{}':\n{}".format(test_file, context_listing))
+
+            # Decide which context file, if any, is the file under test.
+            print("\nAnalyzing test file against context files...")
+            source_file, context_files_include = await analyze_context(test_file, context_files, args, ai_caller)
+
+            if not source_file:
+                continue
+
+            # Each agent gets its own copy so per-file overrides never leak between iterations.
+            per_file_args = copy.deepcopy(args)
+            per_file_args.source_file_path = source_file
+            per_file_args.test_command_dir = args.project_root
+            per_file_args.test_file_path = test_file
+            per_file_args.included_files = context_files_include
+            CoverAgent(per_file_args).run()
+
+
+def main():
+    """Synchronous entry point: drive the async `run` coroutine to completion."""
+    scan = run()
+    asyncio.run(scan)
+
+
+# Allow the module to be executed directly as a script.
+if __name__ == "__main__":
+    main()