diff --git a/app/extract_all_code.py b/app/extract_all_code.py index 4d5f7a9..e39be03 100644 --- a/app/extract_all_code.py +++ b/app/extract_all_code.py @@ -6,7 +6,6 @@ import openai import time import ast -import os # Set OpenAI API key openai.api_key = 'YOUR_API_KEY_HERE' @@ -16,23 +15,24 @@ # Specify project headers project_headers = { - "Authorization" : "Bearer " + openai.api_key, + "Authorization": "Bearer " + openai.api_key, # "OpenAI-Project" : ocrroo_project_id } # ChatGPT -# python multiprocessing program to extract only programming code from video using opencv and tesseract ocr with limited memory saving frames into text file +# python multiprocessing program to extract only programming code from video using opencv and tesseract ocr with +# limited memory saving frames into text file # Set up pytesseract path (if required) # For example, on Windows: pytesseract.pytesseract.tesseract_cmd = r'C:\Users\user\AppData\Local\Programs\Tesseract-OCR\tesseract.exe' -# Function to check if the text is likely to be programming code +# Function to check if the text is likely to be programming code # using common keywords and symbols def is_code(text): code_pattern = re.compile(r""" - (\b(if|else|while|for|return|int|float|double|char|void|import|from|class|def|print|include|main)\b| # common keywords + (\b(if|else|while|for|return|int|float|double|char|void|import|from|class|def|print|include|main)\b| [\{\}\[\]\(\)<>;:=]| # common symbols \b\d+\b| # numbers [\w]+\.\w+| # object properties or functions @@ -109,6 +109,7 @@ def extract_code_from_frame(frame): code_lines = [line.strip() for line in text.split('\n') if is_code(line)] return '\n'.join(code_lines) + # Function to save frames containing code as images def save_frames(frames, output_dir, output_file): unique_code = set() @@ -127,6 +128,7 @@ def save_frames(frames, output_dir, output_file): for code in unique_code: f.write(code + '\n') + def is_valid_python_code(code_line): """ Validate if a line of Python code is syntactically correct. @@ -137,6 +139,7 @@ def is_valid_python_code(code_line): except SyntaxError: return False + def process_code_file(input_filename, output_filename): """ Read a code file, trim off lines starting with '>>>', validate remaining lines, @@ -158,6 +161,7 @@ def process_code_file(input_filename, output_filename): for valid_line in valid_lines: outfile.write(valid_line + '\n') + def process_text_file(input_file, output_file): with open(input_file, 'r') as file: text = file.read() @@ -174,15 +178,15 @@ def process_text_file(input_file, output_file): # {"role": "system", # "content": f"You are a coding assistant. You reply only in programming code " # "that is correct and formatted. Do NOT reply with any explanation, " - # f"only code. If you are given something that is not programming code, " + # "only code. If you are given something that is not programming code, " # "you must NOT include it in your response. If nothing is present, " # "simply return 'ERROR' and nothing else. Do NOT return leading or " # "trailing" # "backticks and do NOT return the language before the code snippet."}, {"role": "system", - "content": f"You are a coding assistant. You reply only in programming code " + "content": "You are a coding assistant. You reply only in programming code " "that is correct and formatted. Do NOT reply with any explanation, " - f"only code. If you are given something that is not programming code, " + "only code. If you are given something that is not programming code, " "you must NOT include it in your response. Do NOT return leading or " "trailing " "backticks and do NOT return the language before the code snippet."}, @@ -221,4 +225,5 @@ def process_text_file(input_file, output_file): process_text_file(valid_code_file, gpt_output_file) - print(f"Code-containing frames extraction complete. Check '{output_dir}' for the output images. Check '{gpt_output_file}' for the output file.") + print(f"Code-containing frames extraction complete. Check '{output_dir}' for the output images. " + f"Check '{gpt_output_file}' for the output file.") diff --git a/app/extract_text.py b/app/extract_text.py index 522169c..d3b864a 100644 --- a/app/extract_text.py +++ b/app/extract_text.py @@ -77,13 +77,6 @@ def extract_frame_at_timestamp(filename: str, timestamp: float) -> Union[cv2.Vid @staticmethod def openai_format_raw_ocr(extracted_text: str, language: str) -> str: """ - "Given an input of potentially raw OCR capture from a video containing Python code, your task is to correct - and format the code. Ensure the code's indentation and syntax are accurate. Exclude any content that isn't - valid Python code. If no recognizable Python content is detected, return 'ERROR'. Do not provide - explanations, leading or trailing backticks, or specify the language in your response. Simply return the - corrected code. Avoid making extensive alterations; the goal is to retain the original intent of the capture - as closely as possible." - prompt = f"Fix up the following {language} code snippet: '{extracted_text}'" response = openai.ChatCompletion.create( model="gpt-3.5-turbo", messages=[ {"role": "system", "content": "Given an input of potentially raw OCR capture from a video containing code, your task is to correct and format the code. diff --git a/app/templates/index.html b/app/templates/index.html index 12e7546..3d70c5c 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -13,22 +13,22 @@

Complete the following actions in the settings menu to finalise setup.


-

+

Configure Tesseract OCR Executable -

-

+

+

Configure IDE Executable -

-

+

+

Provide OpenAI API Key -

-

+

+

Set a Username to Collaborate With Other Users -

+