Skip to content

Commit

Permalink
Merge branch 'release/0.4.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
stumpylog committed Jul 27, 2023
2 parents 4e2f2cb + d02a122 commit f65968a
Show file tree
Hide file tree
Showing 12 changed files with 236 additions and 38 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,8 @@ jobs:
-
name: Display release info
run: |
echo "$Version: ${{ steps.query-release-info.outputs.version }}"
echo "$Date: ${{ steps.query-release-info.outputs.release-date }}"
echo "Version: ${{ steps.query-release-info.outputs.version }}"
echo "Date: ${{ steps.query-release-info.outputs.release-date }}"
echo "${{ steps.query-release-info.outputs.release-notes }}"
-
uses: ncipollo/release-action@v1
Expand All @@ -177,4 +177,4 @@ jobs:
path: dist
-
name: Publish build to PyPI
uses: pypa/[email protected].7
uses: pypa/[email protected].8
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ repos:
exclude: "(^Pipfile\\.lock$)"
# Python hooks
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: 'v0.0.278'
rev: 'v0.0.280'
hooks:
- id: ruff
- repo: https://github.com/psf/black
Expand Down
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.4.0] - 2023-07-27

### Added

- More extensive testing of date and time strings in various formats, including
[RFC-3339](https://www.ietf.org/rfc/rfc3339.txt), ISO-8601 and things in between

### Changed

- Date parsing no longer assumes a timezone if none is provided (the parsed datetime will be naive)
- `pypa/gh-action-pypi-publish` updated to v1.8.8

## [0.3.0] - 2023-07-19

### Added
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,8 @@ ban-relative-imports = "all"
"S101",
"TID252",
# Allow more complex pytest.raises
"PT012"
"PT012",
"DTZ001"
]

[tool.coverage.run]
Expand Down
2 changes: 1 addition & 1 deletion src/tika_client/__about__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.0"
__version__ = "0.4.0"
12 changes: 10 additions & 2 deletions src/tika_client/client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import logging
from types import TracebackType
from typing import Dict
from typing import Optional
from typing import Type

from httpx import Client

Expand Down Expand Up @@ -35,7 +38,7 @@ def __init__(
self.tika = Tika(self._client, compress=compress)
self.rmeta = Recursive(self._client, compress=compress)

def add_headers(self, header: Dict[str, str]): # pragma: no cover
def add_headers(self, header: Dict[str, str]) -> None: # pragma: no cover
"""
Updates the httpx Client headers with the given values
"""
Expand All @@ -44,5 +47,10 @@ def add_headers(self, header: Dict[str, str]): # pragma: no cover
def __enter__(self) -> "TikaClient":
return self

def __exit__(self, exc_type, exc_val, exc_tb) -> None:
def __exit__(
self,
exc_type: Optional[Type[BaseException]],
exc_val: Optional[BaseException],
exc_tb: Optional[TracebackType],
) -> None:
self._client.close()
66 changes: 40 additions & 26 deletions src/tika_client/data_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from enum import Enum
from typing import Dict
from typing import List
Expand All @@ -11,7 +12,17 @@
# Based on https://cwiki.apache.org/confluence/display/TIKA/Metadata+Overview

logger = logging.getLogger("tika-client.data")
_FRACTION_REGEX = re.compile("(.*)([\\.,][0-9]+)(.*)")
# Pattern for timestamps shaped like "YYYY-MM-DD<sep>HH:MM:SS[.fraction][timezone]".
#   - <sep> is a single space, "t" or "T"
#   - "fractional_seconds" is optional and, when present, includes its leading "."
#   - "timezone" is optional and, when present, is either "z"/"Z" or a signed
#     "HH:MM" offset (for example "+08:00" or "-08:00")
# Each component is exposed as a named group. The pattern contains no "^" or "$"
# anchors of its own, so how strictly it matches depends on the method used to
# apply it.
_TIME_RE = re.compile(
r"(?P<year>\d{4})-"
r"(?P<month>\d{2})-"
r"(?P<day>\d{2})"
r"[ tT]"
r"(?P<hour>\d{2}):"
r"(?P<minute>\d{2}):"
r"(?P<second>\d{2})"
r"(?P<fractional_seconds>\.\d+)?"
r"(?P<timezone>[zZ]|[+-]\d{2}:\d{2})?",
)


class TikaKey(str, Enum):
Expand Down Expand Up @@ -105,31 +116,34 @@ def get_optional_datetime(self, key: Union[TikaKey, DublinCoreKey, XmpKey, str])

date_str: str = self.data[key]

# Handle fractional seconds
frac = _FRACTION_REGEX.match(date_str)
if frac is not None:
logger.info("Located fractional seconds")
delta = timedelta(seconds=float(frac.group(2)))
date_str = frac.group(1)
# Attempt to include the timezone info still
if frac.group(3) is not None:
date_str += frac.group(3)
else:
delta = timedelta()

# Handle Zulu time as UTC
if "Z" in date_str:
date_str = date_str.replace("Z", "+00:00")

# Assume UTC if it is not set
if "+" not in date_str:
date_str += "+00:00"

try:
return datetime.fromisoformat(date_str) + delta
except ValueError as e:
logger.error(f"{e} during datetime parsing")
return None
m = _TIME_RE.match(date_str)
if not m:
return None

(year, month, day, hour, minute, second, frac_sec, timezone_str) = m.groups()

microseconds = int(float(frac_sec) * 1000000.0) if frac_sec is not None else 0
tzinfo = None
if timezone_str is not None:
if timezone_str.lower() == "z":
tzinfo = timezone.utc
else:
multi = -1 if timezone_str[0:1] == "-" else 1
hours = int(timezone_str[1:3])
minutes = int(timezone_str[4:])
delta = timedelta(hours=hours, minutes=minutes) * multi
tzinfo = timezone(delta)

return datetime(
year=int(year),
month=int(month),
day=int(day),
hour=int(hour),
minute=int(minute),
second=int(second),
microsecond=microseconds,
tzinfo=tzinfo,
)

def get_optional_string(self, key: Union[TikaKey, DublinCoreKey, XmpKey, str]) -> Optional[str]:
if key not in self.data:
Expand Down
131 changes: 131 additions & 0 deletions tests/test_datetime_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone

import magic
from pytest_httpx import HTTPXMock

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient
from tika_client.data_models import DublinCoreKey
from tika_client.data_models import TikaKey


class TestDateTimeFormat:
def test_parse_offset_date_format_utc(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
"""
Test the datetime parsing properly handles a time with a UTC timezone in the +xx:yy format
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-05-17T16:30:44+00:00"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))

assert resp.created == datetime(
year=2023,
month=5,
day=17,
hour=16,
minute=30,
second=44,
tzinfo=timezone.utc,
)

def test_parse_offset_date_format_zulu(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
"""
Test the datetime parsing properly handles a time with a UTC timezone in the Z format
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-01-17T16:35:44Z"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))

assert resp.created == datetime(
year=2023,
month=1,
day=17,
hour=16,
minute=35,
second=44,
tzinfo=timezone.utc,
)

def test_parse_offset_date_format_positive(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
"""
Test the datetime parsing properly handles a time with a timezone in the +xx:yy format offset from UTC
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-06-17T16:30:44+08:00"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))

assert resp.created == datetime(
year=2023,
month=6,
day=17,
hour=16,
minute=30,
second=44,
tzinfo=timezone(timedelta(hours=8)),
)

def test_parse_offset_date_format_negative(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
"""
Test the datetime parsing properly handles a time with a timezone in the -xx:yy format offset from UTC
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "2023-06-17T16:30:44-08:00"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))

assert resp.created == datetime(
year=2023,
month=6,
day=17,
hour=16,
minute=30,
second=44,
tzinfo=timezone(timedelta(hours=-8)),
)

def test_parse_offset_date_format_python_isoformat(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
"""
Test the datetime parsing properly handles a time with a timezone in the ISO 8601 format (as done by Python)
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

expected = datetime.now(tz=timezone.utc)

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: expected.isoformat()},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))

assert resp.created == expected

def test_parse_offset_date_no_match(self, tika_client: TikaClient, httpx_mock: HTTPXMock):
"""
Test the datetime parsing properly handles a time string which doesn't match the correct formats
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"

httpx_mock.add_response(
json={TikaKey.ContentType: "test", TikaKey.Parsers: [], DublinCoreKey.Created: "202-06-17T16:30:44-0"},
)

resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))

assert resp.created is None
6 changes: 4 additions & 2 deletions tests/test_file_formats.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from datetime import datetime
from datetime import timezone

import magic

Expand All @@ -9,6 +8,9 @@

class TestLibreOfficeFormats:
def test_parse_libre_office_writer_document(self, tika_client: TikaClient):
"""
Test handling of an ODT document produced by LibreOffice
"""
test_file = SAMPLE_DIR / "sample-libre-office.odt"
resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True))

Expand All @@ -27,5 +29,5 @@ def test_parse_libre_office_writer_document(self, tika_client: TikaClient):
minute=30,
second=44,
microsecond=719000,
tzinfo=timezone.utc,
tzinfo=None,
)
6 changes: 6 additions & 0 deletions tests/test_image_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,18 @@

class TestParseImageMetadata:
def test_image_jpeg(self, tika_client: TikaClient):
"""
Test the handling of a JPEG file metadata retrieval
"""
test_file = SAMPLE_DIR / "sample.jpg"
resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))

assert resp.type == "image/jpeg"

def test_image_png(self, tika_client: TikaClient):
"""
Test the handling of a PNG file metadata retrieval
"""
test_file = SAMPLE_DIR / "sample.png"
resp = tika_client.metadata.from_file(test_file, magic.from_file(str(test_file), mime=True))

Expand Down
Loading

0 comments on commit f65968a

Please sign in to comment.