diff --git a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 18_it_IT.xlsx b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 18_it_IT.xlsx index ab09d09..4fc72d8 100644 Binary files a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 18_it_IT.xlsx and b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 18_it_IT.xlsx differ diff --git a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 22_it_IT.xlsx b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 22_it_IT.xlsx index 761bc3e..0a6d748 100644 Binary files a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 22_it_IT.xlsx and b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 22_it_IT.xlsx differ diff --git a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 26_it_IT.xlsx b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 26_it_IT.xlsx index a1a8cc1..3375d6d 100644 Binary files a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 26_it_IT.xlsx and b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 26_it_IT.xlsx differ diff --git a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 30_it_IT.xlsx b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 30_it_IT.xlsx index 8334eee..c9b16a5 100644 Binary files a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 30_it_IT.xlsx and b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 30_it_IT.xlsx differ diff --git a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 32_it_IT.xlsx b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 32_it_IT.xlsx index 4e78177..543355a 100644 Binary files a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 32_it_IT.xlsx and b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 32_it_IT.xlsx differ diff --git a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 36_it_IT.xlsx b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 36_it_IT.xlsx index c6959f3..29eff6e 100644 Binary files a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 36_it_IT.xlsx and b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 36_it_IT.xlsx differ diff --git a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 37_it_IT.xlsx b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 37_it_IT.xlsx index e1baba0..c70d282 100644 Binary files a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 37_it_IT.xlsx and b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 37_it_IT.xlsx differ diff --git a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 38_it_IT.xlsx b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 38_it_IT.xlsx index 8b315c1..56df42e 100644 Binary files a/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 38_it_IT.xlsx and b/inputs/VolumesExcel_06_07_2022/it_IT/Paesi Bassi VOLUME 38_it_IT.xlsx differ diff --git a/python/make_contentdm_csv.py b/python/make_contentdm_csv.py index 85edcde..3c412b6 100644 --- a/python/make_contentdm_csv.py +++ b/python/make_contentdm_csv.py @@ -4,14 +4,16 @@ import csv import json +import os import re from collections.abc import Iterator from contextlib import contextmanager from dataclasses import dataclass -from locale import nl_langinfo +from itertools import chain from pathlib import Path import openpyxl +from contentdm.file_inspector import FileIdentifier, FileInspector, VolumeIdentifier from data_parsing import control_title from data_parsing.load_database import initialize_database_for_xml from openpyxl import load_workbook @@ -21,10 +23,11 @@ @dataclass -class ImageRow: # pylint: disable=too-many-instance-attributes +class FileRow: # pylint: disable=too-many-instance-attributes + scans: list[str] file_id: str series_id: str - title: str + title: str | None year: int | None month: int | None day: int | None @@ -34,7 +37,7 @@ class ImageRow: # pylint: disable=too-many-instance-attributes subjects: list[str] @classmethod - def from_row(cls, row: list[openpyxl.cell.Cell]) -> ImageRow: + def from_row(cls, row: list[openpyxl.cell.Cell], scans: list[str]) -> FileRow: # Check for empty lines, title lines or incorrect lines file_id = row[0].value assert isinstance(file_id, str) @@ -47,25 +50,18 @@ def from_row(cls, row: list[openpyxl.cell.Cell]) -> ImageRow: else: raise ValueError(f"Can't parse series ID of: '{file_id}'") - # Perform type checks on all rows - title = row[1].value - assert isinstance(title, str) - year = row[2].value - assert year is None or isinstance(year, int) - month = row[3].value - assert month is None or isinstance(month, int) - day = row[4].value - assert day is None or isinstance(day, int) - location = row[5].value - assert isinstance(location, str) or location is None - authors = row[6].value - assert isinstance(authors, str) or authors is None - recipients = row[7].value - assert isinstance(recipients, str) or recipients is None - subjects = row[8].value - assert isinstance(subjects, str) or subjects is None + # Cast all rows to their expected type + title = str(row[1].value) if row[1].value is not None else None + year = int(row[2].value) if row[2].value is not None else None + month = int(row[3].value) if row[3].value is not None else None + day = int(row[4].value) if row[4].value is not None else None + location = str(row[5].value) if row[5].value is not None else None + authors = str(row[6].value) if row[6].value is not None else None + recipients = str(row[7].value) if row[7].value is not None else None + subjects = str(row[8].value) if row[8].value is not None else None return cls( + scans=scans, file_id=file_id, series_id=series_id, title=title, @@ -114,68 +110,51 @@ def __init__(self, input_dir: str, sanitize: bool = True) -> None: def run(self) -> None: """Create a CSV file for ContentDM.""" - with open(self.csv_file, "w", encoding="utf-8", newline="") as csv_file: - images = self._get_images() - first_item = next(images) - csv_writer = csv.DictWriter( - csv_file, - dialect=csv.excel_tab, - fieldnames=list(self._get_excel_row(first_item).keys()), - ) - csv_writer.writeheader() - csv_writer.writerow(self._get_excel_row(first_item)) - - for image in images: - csv_writer.writerow(self._get_excel_row(image)) - - def _get_images(self) -> Iterator[ImageRow]: - # Iterate files sorted by volume number + output_dir = Path("outputs") / "contentdm" + for griglie in self._get_all_griglie(): + os.makedirs(output_dir / griglie.name, exist_ok=True) + for file in self._get_all_files(griglie): + with open( + output_dir / griglie.name / f"{file.file_id}.txt", + "w", + encoding="utf-8", + ) as csv_file: + row = self._get_actual_excel_row(file) + row["filename"] = None + csv_writer = csv.DictWriter( + csv_file, dialect=csv.excel_tab, fieldnames=list(row.keys()) + ) + csv_writer.writeheader() + csv_writer.writerow(row) + for scan in file.scans: + row["filename"] = scan + csv_writer.writerow(row) + + # Temporary early exit + break + + def _get_all_griglie(self) -> Iterator[Path]: + """Yields all file paths to the santizied griglie in sorted order""" files = [ (i, i.name.replace("Paesi Bassi VOLUME", "").replace("_it_IT.xlsx", "")) for i in self.sanitized_dir.iterdir() if i.name.startswith("Paesi") ] - for file in sorted(files, key=lambda x: int(x[1])): - if int(file[1]) > 1: - return - - with self._excel_sheet(file[0], int(file[1])) as sheet: - for row in sheet.iter_rows(): - if re.match("(.*)_title", row[0].value or ""): - # TODO: HANDLE SERIES DATA - pass - else: - yield ImageRow.from_row(row) - - # self._create_xml_individual_files(sheet, sub_levels) - - @contextmanager - def _excel_sheet( - self, file: Path, vol_num: int - ) -> Iterator[openpyxl.worksheet.worksheet.Worksheet]: - """Open an Excel workbook, return the first sheet and close it after use.""" - workbook = load_workbook(file) + for file, _ in sorted(files, key=lambda x: int(x[1])): + yield file + print(f"Finished parsing {file.name}") + + def _get_all_files(self, griglie: Path) -> Iterator[FileRow]: + files_and_scans = FileInspector(griglie).run() + + workbook = load_workbook(griglie) sheet = workbook[workbook.sheetnames[0]] - # Little sanity check - if not re.match("(.*)_title", sheet["A"][0].value): - raise ValueError( - f"Can't determine the volume/ms number of {sheet['A'][0].value}. " - "Does it have the correct format?" - ) - yield sheet - print(f"Finished writing volume {vol_num + 275} - Volume {vol_num}") - workbook.close() - def _get_excel_row(self, image: ImageRow) -> dict[str, str]: - """Since I can't change metadata templates, I'm doing it like this.""" - actual_dict = self._get_actual_excel_row(image) - return { - "file_name": f"{image.file_id.upper()}r.tif", - "title": image.title, - "actual_data": json.dumps(actual_dict), - } + for row in sheet.iter_rows(): + if row[0].value in files_and_scans: + yield FileRow.from_row(row, files_and_scans[row[0].value]) - def _get_actual_excel_row(self, image: ImageRow) -> dict[str, str]: + def _get_actual_excel_row(self, image: FileRow) -> dict[str, str]: """Turn an ImageRow into a dict with the correct column headers.""" title_nl, title_en, title_it = self._translate_title(image, self.database) if image.location: @@ -216,7 +195,7 @@ def _get_actual_excel_row(self, image: ImageRow) -> dict[str, str]: } def _translate_title( - self, image: ImageRow, database: Database + self, image: FileRow, database: Database ) -> tuple[str, str, str]: """Translate a title into Dutch, English and Italian.""" # Find document translation @@ -256,8 +235,5 @@ def _translate_title( if __name__ == "__main__": - eadmaker = ContentDMFileWriter( - "inputs/VolumesExcel_06_07_2022/it_IT", - False, - ) + eadmaker = ContentDMFileWriter("inputs/VolumesExcel_06_07_2022/it_IT", False) eadmaker.run() diff --git a/python/xlsx_functions/helper_functions.py b/python/xlsx_functions/helper_functions.py index 73f1eb0..9237b33 100644 --- a/python/xlsx_functions/helper_functions.py +++ b/python/xlsx_functions/helper_functions.py @@ -1,6 +1,8 @@ from openpyxl.cell.cell import Cell -def compare_rows(row1: tuple[Cell, ...], row2: tuple[Cell, ...]) -> bool: +def compare_rows(row1: tuple[Cell, ...] | None, row2: tuple[Cell, ...]) -> bool: """Compare the values of two rows.""" + if row1 is None: + return False return [i.value for i in row1[1:]] == [i.value for i in row2[1:]]