-
-
Notifications
You must be signed in to change notification settings - Fork 334
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update bulk image conversion script to support multiple formats #226
base: dev
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,10 @@ | |
import argparse | ||
import os | ||
|
||
import argparse | ||
import os | ||
from scripts.local.utils.bulk_ops_common import convert_image, convert_pdf_to_jpg | ||
|
||
from scripts.utils.image_utils import convert_image, get_size_in_kb, get_size_reduction | ||
Comment on lines
+10
to
12
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
|
||
|
@@ -73,4 +77,4 @@ def parse_args(): | |
exit(1) | ||
else: | ||
# print("All sample images are jpgs. Commit accepted.") | ||
exit(0) | ||
exit(0) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,59 +1,38 @@ | ||
#!/usr/bin/env python3 | ||
import argparse | ||
import os | ||
from scripts.local.utils.bulk_ops_common import convert_image | ||
|
||
from scripts.local.utils.bulk_ops_common import add_common_args, run_argparser | ||
|
||
|
||
def convert_image_to(): | ||
# Wrapper to handle all available extensions | ||
pass | ||
|
||
|
||
def convert_images_in_tree(args): | ||
input_directory = args.get("input", None) | ||
recursive = args.get("recursive", None) | ||
output_directory = args.get("output", None) | ||
trigger_size = args.get("trigger_size", None) | ||
def convert_images(filenames, output_format): | ||
converted_count = 0 | ||
for image_path in filenames: | ||
old_size = get_size_in_kb(image_path) | ||
if old_size <= trigger_size: | ||
continue | ||
|
||
# Note: the pre-commit hook takes care of ensuring only image files are passed here. | ||
new_image_path = convert_image(image_path) | ||
new_size = get_size_in_kb(new_image_path) | ||
if new_size <= old_size: | ||
print( | ||
f"Converted png to jpg: {image_path}: {new_size:.2f}KB {get_size_reduction(old_size, new_size)}" | ||
) | ||
for input_path in filenames: | ||
if input_path.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.tif')): | ||
name, ext = os.path.splitext(input_path) | ||
output_path = f"{name}.{output_format.lower()}" | ||
convert_image(input_path, output_path, output_format) | ||
print(f"Converted {input_path} to {output_format}") | ||
converted_count += 1 | ||
else: | ||
print( | ||
f"Skipping conversion for {image_path} as size is more than before ({new_size:.2f} KB > {old_size:.2f} KB)" | ||
) | ||
os.remove(new_image_path) | ||
|
||
print(f"Skipping unsupported file: {input_path}") | ||
return converted_count | ||
|
||
|
||
def parse_args(): | ||
# construct the argument parse and parse the arguments | ||
argparser = argparse.ArgumentParser() | ||
add_common_args(argparser, ["--input", "--output", "--trigger-size", "--recursive"]) | ||
args = run_argparser(argparser) | ||
parser = argparse.ArgumentParser(description="Convert images for pre-commit hook") | ||
parser.add_argument( | ||
"--format", | ||
choices=['jpg', 'png', 'jpeg'], | ||
default='jpg', | ||
help="Output format for images (default: jpg)" | ||
) | ||
parser.add_argument("filenames", nargs="*", help="Files to convert.") | ||
args = parser.parse_args() | ||
return args | ||
|
||
|
||
if __name__ == "__main__": | ||
args = parse_args() | ||
|
||
converted_count = convert_images_in_tree(args) | ||
trigger_size = args["trigger_size"] | ||
converted_count = convert_images(args.filenames, args.format.upper()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here we can call the function |
||
if converted_count > 0: | ||
print( | ||
f"Note: {converted_count} png images above {trigger_size}KB were converted to jpg.\nPlease manually remove the png files and add your commit again." | ||
) | ||
print(f"Note: {converted_count} images were converted.") | ||
exit(1) | ||
else: | ||
# print("All sample images are jpgs. Commit accepted.") | ||
exit(0) | ||
exit(0) |
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -3,43 +3,20 @@ | |||||||||||||||||||||||
import glob | ||||||||||||||||||||||||
import operator | ||||||||||||||||||||||||
import os | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
from PIL import Image | ||||||||||||||||||||||||
from pdf2image import convert_from_path | ||||||||||||||||||||||||
Anushlinux marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||||||||
from src.utils.file import PathUtils | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
# TODO: add shell utilities for simple local images processing such as: | ||||||||||||||||||||||||
# From issue: https://github.com/Udayraj123/OMRChecker/issues/213 | ||||||||||||||||||||||||
# - bulk resize, | ||||||||||||||||||||||||
# - clip to max width (or height) | ||||||||||||||||||||||||
# - with a conditional trigger if the file size exceeds a provided value | ||||||||||||||||||||||||
# - bulk convert : | ||||||||||||||||||||||||
# - pdf to jpg | ||||||||||||||||||||||||
# - png to jpg or vice versa | ||||||||||||||||||||||||
# - tiff | ||||||||||||||||||||||||
# - bulk rename files | ||||||||||||||||||||||||
# - adding folder name to file name | ||||||||||||||||||||||||
# - removing non-utf characters from filename (to avoid processing errors) | ||||||||||||||||||||||||
# - add watermark to all images | ||||||||||||||||||||||||
# - blur a particular section of the images (e.g. student names and signatures) | ||||||||||||||||||||||||
# - create a gif from a folder of images | ||||||||||||||||||||||||
# - Save output of cropped pages to avoid cropping in each run (and merge with manually cropped images) | ||||||||||||||||||||||||
# - Save output of cropped markers to avoid cropping in each run (and merge with manually cropped images) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
# Make sure to be cross-os compatible i.e. use Posix paths wherever possible | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
# Maybe have a common util file for bulk ops and then create one file for each of the above util. | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
# Usual pre-processing commands for speedups (useful during re-runs) | ||||||||||||||||||||||||
# python3 scripts/local/convert_images.py -i inputs/ --replace [--filter-ext png,jpg] --output-ext jpg | ||||||||||||||||||||||||
# python3 scripts/local/resize_images.py -i inputs/ -o outputs --max-width=1500 | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's keep all these comments as a guide for the contributors to figure out what needs to be done here |
||||||||||||||||||||||||
|
||||||||||||||||||||||||
# TODO: add shell utilities for bulk image processing, resizing, watermarking, etc. | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
def walk_and_extract_files(input_dir, file_extensions): | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
Walks through the directory to extract files with specified extensions. | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
extracted_files = [] | ||||||||||||||||||||||||
for _dir, _subdir, _files in os.walk(input_dir): | ||||||||||||||||||||||||
matching_globs = [ | ||||||||||||||||||||||||
glob(os.path.join(_dir, f"*.{file_extension}")) | ||||||||||||||||||||||||
glob.glob(os.path.join(_dir, f"*.{file_extension}")) | ||||||||||||||||||||||||
Anushlinux marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||||||||
for file_extension in file_extensions | ||||||||||||||||||||||||
] | ||||||||||||||||||||||||
matching_files = functools.reduce(operator.iconcat, matching_globs, []) | ||||||||||||||||||||||||
|
@@ -49,8 +26,11 @@ def walk_and_extract_files(input_dir, file_extensions): | |||||||||||||||||||||||
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
def get_local_argparser(): | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
Returns an argument parser with common input, output, and optional recursive processing flags. | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
local_argparser = argparse.ArgumentParser() | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
local_argparser.add_argument( | ||||||||||||||||||||||||
"-i", | ||||||||||||||||||||||||
"--input", | ||||||||||||||||||||||||
|
@@ -72,29 +52,79 @@ def get_local_argparser(): | |||||||||||||||||||||||
local_argparser.add_argument( | ||||||||||||||||||||||||
"-r", | ||||||||||||||||||||||||
"--recursive", | ||||||||||||||||||||||||
required=True, | ||||||||||||||||||||||||
type=bool, | ||||||||||||||||||||||||
action='store_true', | ||||||||||||||||||||||||
Anushlinux marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||||||||
dest="recursive", | ||||||||||||||||||||||||
help="Specify whether to process subdirectories recursively", | ||||||||||||||||||||||||
) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
local_argparser.add_argument( | ||||||||||||||||||||||||
"--trigger-size", | ||||||||||||||||||||||||
default=200, | ||||||||||||||||||||||||
required=True, | ||||||||||||||||||||||||
type=int, | ||||||||||||||||||||||||
dest="trigger_size", | ||||||||||||||||||||||||
help="Specify minimum file size to trigger the hook.", | ||||||||||||||||||||||||
help="Specify minimum file size (KB) to trigger the hook.", | ||||||||||||||||||||||||
) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
return local_argparser | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
def convert_image(input_path, output_path, output_format): | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
Converts an image to the specified output format. | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
with Image.open(input_path) as img: | ||||||||||||||||||||||||
if output_format == 'JPG': | ||||||||||||||||||||||||
output_format = 'JPEG' | ||||||||||||||||||||||||
img.save(output_path, output_format) | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's check the existing util in image_utils file. Merge the two functions as per the new output_formats support
Please remove the existing image_path[:-4] logic too |
||||||||||||||||||||||||
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
def convert_pdf_to_jpg(input_path, output_dir): | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
Converts a PDF to a series of JPG images, one per page. | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
pages = convert_from_path(input_path) | ||||||||||||||||||||||||
for i, page in enumerate(pages): | ||||||||||||||||||||||||
output_path = os.path.join(output_dir, f"page_{i + 1}.jpg") | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have a feature extension idea here, see if you can implement - |
||||||||||||||||||||||||
page.save(output_path, 'JPEG') | ||||||||||||||||||||||||
Comment on lines
+85
to
+88
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's also include file name here
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also convert_image and convert_pdf_to_jpg should come from image_utils instead of bulk_ops_common.py. |
||||||||||||||||||||||||
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
def bulk_convert(input_dir, output_dir, output_format, in_place=False): | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
Bulk converts images and PDFs to the specified format. | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
os.makedirs(output_dir, exist_ok=True) | ||||||||||||||||||||||||
extensions = ['png', 'jpg', 'jpeg', 'tiff', 'tif', 'pdf'] | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's keep pdf handling in a separate script (check other comments)
Suggested change
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
filepaths = walk_and_extract_files(input_dir, extensions) | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. snake case
Suggested change
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
for input_path in filepaths: | ||||||||||||||||||||||||
relative_path = os.path.relpath(os.path.dirname(input_path), input_dir) | ||||||||||||||||||||||||
output_subdir = os.path.join(output_dir, relative_path) if not in_place else os.path.dirname(input_path) | ||||||||||||||||||||||||
os.makedirs(output_subdir, exist_ok=True) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
Comment on lines
+100
to
+104
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
Let's discuss more about this in the discord group if needed. |
||||||||||||||||||||||||
filename = os.path.basename(input_path) | ||||||||||||||||||||||||
if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.tif')): | ||||||||||||||||||||||||
name, _ = os.path.splitext(filename) | ||||||||||||||||||||||||
output_path = os.path.join(output_subdir, f"{name}.{output_format.lower()}") | ||||||||||||||||||||||||
Comment on lines
+107
to
+108
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this shouldn't be needed here. The convert_image util already takes output_format as the input we can move this logic there. |
||||||||||||||||||||||||
convert_image(input_path, output_path, output_format) | ||||||||||||||||||||||||
print(f"Converted {filename} to {output_format}") | ||||||||||||||||||||||||
elif filename.lower().endswith('.pdf'): | ||||||||||||||||||||||||
pdf_output_dir = os.path.join(output_subdir, os.path.splitext(filename)[0]) | ||||||||||||||||||||||||
os.makedirs(pdf_output_dir, exist_ok=True) | ||||||||||||||||||||||||
convert_pdf_to_jpg(input_path, pdf_output_dir) | ||||||||||||||||||||||||
Comment on lines
+112
to
+114
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Move this logic to a separate script (see the other comment below)
|
||||||||||||||||||||||||
print(f"Converted {filename} to JPG") | ||||||||||||||||||||||||
else: | ||||||||||||||||||||||||
print(f"Skipping unsupported file: {filename}") | ||||||||||||||||||||||||
Comment on lines
+91
to
+117
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function looks bulky. Also the user is less likely to have both pdfs and other image formats together. So we can break the function down this way:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you want you can raise a separate PR for PDF handling altogether (for more credit in terms of # of PRs) |
||||||||||||||||||||||||
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
def add_common_args(argparser, arguments): | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
Adds arguments from the local argparser to the main argument parser. | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
local_argparser = get_local_argparser() | ||||||||||||||||||||||||
for argument in arguments: | ||||||||||||||||||||||||
for action in local_argparser._actions: | ||||||||||||||||||||||||
if argument in action.option_strings: | ||||||||||||||||||||||||
# Copy the argument from local_argparser to argparser | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's not remove this comment, it is to explain what the next line is doing |
||||||||||||||||||||||||
argparser.add_argument( | ||||||||||||||||||||||||
*action.option_strings, | ||||||||||||||||||||||||
dest=action.dest, | ||||||||||||||||||||||||
|
@@ -107,15 +137,44 @@ def add_common_args(argparser, arguments): | |||||||||||||||||||||||
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
def run_argparser(argparser): | ||||||||||||||||||||||||
( | ||||||||||||||||||||||||
args, | ||||||||||||||||||||||||
unknown, | ||||||||||||||||||||||||
) = argparser.parse_known_args() | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
Runs the argument parser and returns parsed arguments. | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
args, unknown = argparser.parse_known_args() | ||||||||||||||||||||||||
args = vars(args) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
if len(unknown) > 0: | ||||||||||||||||||||||||
if unknown: | ||||||||||||||||||||||||
argparser.print_help() | ||||||||||||||||||||||||
raise Exception(f"\nError: Unknown arguments: {unknown}") | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
return args | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
def main(): | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
Main entry point for the script. Handles argument parsing and starts the bulk conversion process. | ||||||||||||||||||||||||
""" | ||||||||||||||||||||||||
parser = argparse.ArgumentParser(description="Bulk image and PDF converter") | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function should be present in local/convert_images.py instead. The bulk_ops_common.py is for utilities across all bulk converters |
||||||||||||||||||||||||
|
||||||||||||||||||||||||
# Add standard arguments | ||||||||||||||||||||||||
add_common_args(parser, ['-i', '--input', '-o', '--output', '--recursive', '--trigger-size']) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
parser.add_argument( | ||||||||||||||||||||||||
"--format", | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's make this a bit more specific
Suggested change
|
||||||||||||||||||||||||
choices=['jpg', 'png', 'jpeg'], | ||||||||||||||||||||||||
default='jpg', | ||||||||||||||||||||||||
help="Output format for images (default: jpg)" | ||||||||||||||||||||||||
) | ||||||||||||||||||||||||
parser.add_argument( | ||||||||||||||||||||||||
"--in-place", | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||||||
action='store_true', | ||||||||||||||||||||||||
help="Modify files in place" | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also mention what happens if inplace=False is passed in the description |
||||||||||||||||||||||||
) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
args = run_argparser(parser) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
bulk_convert(args['input'], args['output'], args['format'].upper(), args['in_place']) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
|
||||||||||||||||||||||||
if __name__ == "__main__": | ||||||||||||||||||||||||
main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is intentional. We are okay with having png images in the codebase if they're below this size