diff --git a/classify-split-extract-workflow/classify-job/utils.py b/classify-split-extract-workflow/classify-job/utils.py index e4f4853c7..8845af90c 100644 --- a/classify-split-extract-workflow/classify-job/utils.py +++ b/classify-split-extract-workflow/classify-job/utils.py @@ -12,6 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Utility functions for obtaining authentication tokens, +sending callback requests, splitting PDF files into multiple parts, +getting the current UTC timestamp, and deleting directories. +""" + import glob import io import shutil @@ -73,11 +79,13 @@ def split_pages(file_pattern: str, bucket_name: str, output_dir: str) -> None: num_pages = len(reader.pages) num_shards = (num_pages + 14) // 15 - pdf_writers = [PyPDF2.PdfWriter() for _ in range(num_shards)] - for page_index, page in enumerate(reader.pages): - pdf_writers[page_index // 15].add_page(page) + for shard_index in range(num_shards): + pdf_writer = PyPDF2.PdfWriter() + for page_index in range(15): + page_number = shard_index * 15 + page_index + if page_number < num_pages: + pdf_writer.add_page(reader.pages[page_number]) - for shard_index, pdf_writer in enumerate(pdf_writers): output_filename = f"{output_dir}/{file_path[3:-4]} - " \ f"part {shard_index + 1} of {num_shards}.pdf" blob = bucket.blob(output_filename)