Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
DanielNoord committed Sep 10, 2023
1 parent 771f427 commit 85facc8
Show file tree
Hide file tree
Showing 10 changed files with 59 additions and 81 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
136 changes: 56 additions & 80 deletions python/make_contentdm_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@

import csv
import json
import os
import re
from collections.abc import Iterator
from contextlib import contextmanager
from dataclasses import dataclass
from locale import nl_langinfo
from itertools import chain
from pathlib import Path

import openpyxl
from contentdm.file_inspector import FileIdentifier, FileInspector, VolumeIdentifier
from data_parsing import control_title
from data_parsing.load_database import initialize_database_for_xml
from openpyxl import load_workbook
Expand All @@ -21,10 +23,11 @@


@dataclass
class ImageRow: # pylint: disable=too-many-instance-attributes
class FileRow: # pylint: disable=too-many-instance-attributes
scans: list[str]
file_id: str
series_id: str
title: str
title: str | None
year: int | None
month: int | None
day: int | None
Expand All @@ -34,7 +37,7 @@ class ImageRow: # pylint: disable=too-many-instance-attributes
subjects: list[str]

@classmethod
def from_row(cls, row: list[openpyxl.cell.Cell]) -> ImageRow:
def from_row(cls, row: list[openpyxl.cell.Cell], scans: list[str]) -> FileRow:
# Check for empty lines, title lines or incorrect lines
file_id = row[0].value
assert isinstance(file_id, str)
Expand All @@ -47,25 +50,18 @@ def from_row(cls, row: list[openpyxl.cell.Cell]) -> ImageRow:
else:
raise ValueError(f"Can't parse series ID of: '{file_id}'")

# Perform type checks on all rows
title = row[1].value
assert isinstance(title, str)
year = row[2].value
assert year is None or isinstance(year, int)
month = row[3].value
assert month is None or isinstance(month, int)
day = row[4].value
assert day is None or isinstance(day, int)
location = row[5].value
assert isinstance(location, str) or location is None
authors = row[6].value
assert isinstance(authors, str) or authors is None
recipients = row[7].value
assert isinstance(recipients, str) or recipients is None
subjects = row[8].value
assert isinstance(subjects, str) or subjects is None
# Cast all rows to their expected type
title = str(row[1].value) if row[1].value is not None else None
year = int(row[2].value) if row[2].value is not None else None
month = int(row[3].value) if row[3].value is not None else None
day = int(row[4].value) if row[4].value is not None else None
location = str(row[5].value) if row[5].value is not None else None
authors = str(row[6].value) if row[6].value is not None else None
recipients = str(row[7].value) if row[7].value is not None else None
subjects = str(row[8].value) if row[8].value is not None else None

return cls(
scans=scans,
file_id=file_id,
series_id=series_id,
title=title,
Expand Down Expand Up @@ -114,68 +110,51 @@ def __init__(self, input_dir: str, sanitize: bool = True) -> None:

def run(self) -> None:
"""Create a CSV file for ContentDM."""
with open(self.csv_file, "w", encoding="utf-8", newline="") as csv_file:
images = self._get_images()
first_item = next(images)
csv_writer = csv.DictWriter(
csv_file,
dialect=csv.excel_tab,
fieldnames=list(self._get_excel_row(first_item).keys()),
)
csv_writer.writeheader()
csv_writer.writerow(self._get_excel_row(first_item))

for image in images:
csv_writer.writerow(self._get_excel_row(image))

def _get_images(self) -> Iterator[ImageRow]:
# Iterate files sorted by volume number
output_dir = Path("outputs") / "contentdm"
for griglie in self._get_all_griglie():
os.makedirs(output_dir / griglie.name, exist_ok=True)
for file in self._get_all_files(griglie):
with open(
output_dir / griglie.name / f"{file.file_id}.txt",
"w",
encoding="utf-8",
) as csv_file:
row = self._get_actual_excel_row(file)
row["filename"] = None
csv_writer = csv.DictWriter(
csv_file, dialect=csv.excel_tab, fieldnames=list(row.keys())
)
csv_writer.writeheader()
csv_writer.writerow(row)
for scan in file.scans:
row["filename"] = scan
csv_writer.writerow(row)

# Temporary early exit
break

def _get_all_griglie(self) -> Iterator[Path]:
"""Yields all file paths to the sanitized griglie in sorted order"""
files = [
(i, i.name.replace("Paesi Bassi VOLUME", "").replace("_it_IT.xlsx", ""))
for i in self.sanitized_dir.iterdir()
if i.name.startswith("Paesi")
]
for file in sorted(files, key=lambda x: int(x[1])):
if int(file[1]) > 1:
return

with self._excel_sheet(file[0], int(file[1])) as sheet:
for row in sheet.iter_rows():
if re.match("(.*)_title", row[0].value or ""):
# TODO: HANDLE SERIES DATA
pass
else:
yield ImageRow.from_row(row)

# self._create_xml_individual_files(sheet, sub_levels)

@contextmanager
def _excel_sheet(
self, file: Path, vol_num: int
) -> Iterator[openpyxl.worksheet.worksheet.Worksheet]:
"""Open an Excel workbook, return the first sheet and close it after use."""
workbook = load_workbook(file)
for file, _ in sorted(files, key=lambda x: int(x[1])):
yield file
print(f"Finished parsing {file.name}")

def _get_all_files(self, griglie: Path) -> Iterator[FileRow]:
files_and_scans = FileInspector(griglie).run()

workbook = load_workbook(griglie)
sheet = workbook[workbook.sheetnames[0]]
# Little sanity check
if not re.match("(.*)_title", sheet["A"][0].value):
raise ValueError(
f"Can't determine the volume/ms number of {sheet['A'][0].value}. "
"Does it have the correct format?"
)
yield sheet
print(f"Finished writing volume {vol_num + 275} - Volume {vol_num}")
workbook.close()

def _get_excel_row(self, image: ImageRow) -> dict[str, str]:
"""Since I can't change metadata templates, I'm doing it like this."""
actual_dict = self._get_actual_excel_row(image)
return {
"file_name": f"{image.file_id.upper()}r.tif",
"title": image.title,
"actual_data": json.dumps(actual_dict),
}
for row in sheet.iter_rows():
if row[0].value in files_and_scans:
yield FileRow.from_row(row, files_and_scans[row[0].value])

def _get_actual_excel_row(self, image: ImageRow) -> dict[str, str]:
def _get_actual_excel_row(self, image: FileRow) -> dict[str, str]:
"""Turn a FileRow into a dict with the correct column headers."""
title_nl, title_en, title_it = self._translate_title(image, self.database)
if image.location:
Expand Down Expand Up @@ -216,7 +195,7 @@ def _get_actual_excel_row(self, image: ImageRow) -> dict[str, str]:
}

def _translate_title(
self, image: ImageRow, database: Database
self, image: FileRow, database: Database
) -> tuple[str, str, str]:
"""Translate a title into Dutch, English and Italian."""
# Find document translation
Expand Down Expand Up @@ -256,8 +235,5 @@ def _translate_title(


if __name__ == "__main__":
eadmaker = ContentDMFileWriter(
"inputs/VolumesExcel_06_07_2022/it_IT",
False,
)
eadmaker = ContentDMFileWriter("inputs/VolumesExcel_06_07_2022/it_IT", False)
eadmaker.run()
4 changes: 3 additions & 1 deletion python/xlsx_functions/helper_functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from openpyxl.cell.cell import Cell


def compare_rows(row1: tuple[Cell, ...], row2: tuple[Cell, ...]) -> bool:
def compare_rows(row1: tuple[Cell, ...] | None, row2: tuple[Cell, ...]) -> bool:
"""Compare the values of two rows."""
if row1 is None:
return False
return [i.value for i in row1[1:]] == [i.value for i in row2[1:]]

2 comments on commit 85facc8

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are the currently missing titles:
no. Missing titles
These are the currently missing translations:
no. Missing translations
These are the errors found in titles:
no. Errors in titles

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the rest of the output log. Please check for additional errors:
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 10_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 11_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 12_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 13_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 14_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 15_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 16_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 17_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 18_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 19_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 1_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 20_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 21_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 22_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 23_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 24_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 25_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 26_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 27_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 28_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 29_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 2_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 30_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 31_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 32_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 33_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 34_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 35_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 36_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 37_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 38_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 39_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 3_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 40_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 41_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 42_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 43_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 44_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 45_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 46_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 47_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 48_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 49_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 4_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 50_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 51_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 52_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 53_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 54_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 55_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 56_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 57_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 58_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 59_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 5_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 60_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 61_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 62_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 63_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 64_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 65_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 66_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 67_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 68_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 69_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 6_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 7_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 8_it_IT.xlsx
File written to outputs/VolumesExcelSanitized_06_07_2022/it_IT/Paesi Bassi VOLUME 9_it_IT.xlsx
Starting to create XML file!
Finished writing volume ms276 - Volume 1
Finished writing volume ms277 - Volume 2
Finished writing volume ms278 - Volume 3
Finished writing volume ms279 - Volume 4
Finished writing volume ms280 - Volume 5
Finished writing volume ms281 - Volume 6
Finished writing volume ms282 - Volume 7
Finished writing volume ms283 - Volume 8
Finished writing volume ms284 - Volume 9
Finished writing volume ms285 - Volume 10
Finished writing volume ms286 - Volume 11
Finished writing volume ms287 - Volume 12
Finished writing volume ms288 - Volume 13
Finished writing volume ms289 - Volume 14
Finished writing volume ms290 - Volume 15
Finished writing volume ms291 - Volume 16
Finished writing volume ms292 - Volume 17
Finished writing volume ms293 - Volume 18
Finished writing volume ms294 - Volume 19
Finished writing volume ms295 - Volume 20
Finished writing volume ms296 - Volume 21
Finished writing volume ms297 - Volume 22
Finished writing volume ms298 - Volume 23
Finished writing volume ms299 - Volume 24
Finished writing volume ms300 - Volume 25
Finished writing volume ms301 - Volume 26
Finished writing volume ms302 - Volume 27
Finished writing volume ms303 - Volume 28
Finished writing volume ms304 - Volume 29
Finished writing volume ms305 - Volume 30
Finished writing volume ms306 - Volume 31
Finished writing volume ms307 - Volume 32
Finished writing volume ms308 - Volume 33
Finished writing volume ms309 - Volume 34
Finished writing volume ms310 - Volume 35
Finished writing volume ms311 - Volume 36
Finished writing volume ms312 - Volume 37
Finished writing volume ms313 - Volume 38
Finished writing volume ms314 - Volume 39
Finished writing volume ms315 - Volume 40
Finished writing volume ms316 - Volume 41
Finished writing volume ms317 - Volume 42
Finished writing volume ms318 - Volume 43
Finished writing volume ms319 - Volume 44
Finished writing volume ms320 - Volume 45
Finished writing volume ms321 - Volume 46
Finished writing volume ms322 - Volume 47
Finished writing volume ms323 - Volume 48
Finished writing volume ms324 - Volume 49
Finished writing volume ms325 - Volume 50
Finished writing volume ms326 - Volume 51
Finished writing volume ms327 - Volume 52
Finished writing volume ms328 - Volume 53
Finished writing volume ms329 - Volume 54
Finished writing volume ms330 - Volume 55
Finished writing volume ms331 - Volume 56
Finished writing volume ms332 - Volume 57
Finished writing volume ms333 - Volume 58
Finished writing volume ms334 - Volume 59
Finished writing volume ms335 - Volume 60
Finished writing volume ms336 - Volume 61
Finished writing volume ms337 - Volume 62
Finished writing volume ms338 - Volume 63
Finished writing volume ms339 - Volume 64
Finished writing volume ms340 - Volume 65
Finished writing volume ms341 - Volume 66
Finished writing volume ms342 - Volume 67
Finished writing volume ms343 - Volume 68
Finished writing volume ms344 - Volume 69
Printed file to outputs/Legation_Archive.xml
Writing XML complete!
XML-DTD check complete!

Please sign in to comment.