Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set exiftool path explicitly. #267

Merged
merged 1 commit into from
Jan 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 29 additions & 10 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -892,14 +892,25 @@ class MediaConverter(DocumentConverter):
Abstract class for multi-modal media (e.g., images and audio)
"""

def _get_metadata(self, local_path):
exiftool = shutil.which("exiftool")
if not exiftool:
def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,

md = MarkItDown(exiftool_path="{which_exiftool}")

This warning will be removed in future releases.
""",
DeprecationWarning,
)

return None
else:
try:
result = subprocess.run(
[exiftool, "-json", local_path], capture_output=True, text=True
[exiftool_path, "-json", local_path], capture_output=True, text=True
).stdout
return json.loads(result)[0]
except Exception:
Expand All @@ -920,7 +931,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
md_content = ""

# Add metadata
metadata = self._get_metadata(local_path)
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
if metadata:
for f in [
"Title",
Expand Down Expand Up @@ -975,7 +986,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
md_content = ""

# Add metadata
metadata = self._get_metadata(local_path)
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
if metadata:
for f in [
"Title",
Expand Down Expand Up @@ -1036,7 +1047,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
md_content = ""

# Add metadata
metadata = self._get_metadata(local_path)
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
if metadata:
for f in [
"ImageSize",
Expand Down Expand Up @@ -1325,6 +1336,7 @@ def __init__(
llm_client: Optional[Any] = None,
llm_model: Optional[str] = None,
style_map: Optional[str] = None,
exiftool_path: Optional[str] = None,
# Deprecated
mlm_client: Optional[Any] = None,
mlm_model: Optional[str] = None,
Expand All @@ -1334,6 +1346,9 @@ def __init__(
else:
self._requests_session = requests_session

if exiftool_path is None:
exiftool_path = os.environ.get("EXIFTOOL_PATH")

# Handle deprecation notices
#############################
if mlm_client is not None:
Expand Down Expand Up @@ -1366,6 +1381,7 @@ def __init__(
self._llm_client = llm_client
self._llm_model = llm_model
self._style_map = style_map
self._exiftool_path = exiftool_path

self._page_converters: List[DocumentConverter] = []

Expand Down Expand Up @@ -1549,12 +1565,15 @@ def _convert(
if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model

# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters

if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map

if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
_kwargs["exiftool_path"] = self._exiftool_path

# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters

# If we hit an error log it and keep trying
try:
res = converter.convert(local_path, **_kwargs)
Expand Down
32 changes: 26 additions & 6 deletions tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,29 @@ def test_markitdown_local() -> None:
reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool() -> None:
markitdown = MarkItDown()
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()

# Test JPG metadata processing
# Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool")
markitdown = MarkItDown(exiftool_path=which_exiftool)
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
for key in JPG_TEST_EXIFTOOL:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
assert target in result.text_content

# Test setting the exiftool path through an environment variable
os.environ["EXIFTOOL_PATH"] = which_exiftool
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
for key in JPG_TEST_EXIFTOOL:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
Expand Down Expand Up @@ -341,8 +361,8 @@ def test_markitdown_llm() -> None:

if __name__ == "__main__":
"""Runs this file's tests from the command line."""
test_markitdown_remote()
test_markitdown_local()
# test_markitdown_remote()
# test_markitdown_local()
test_markitdown_exiftool()
test_markitdown_deprecation()
test_markitdown_llm()
Comment on lines -344 to -348

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, I noticed that the tests in test_markitdown_exiftool() were commented out in this commit. Was this an intentional change, or was it possibly included by mistake?

# test_markitdown_deprecation()
# test_markitdown_llm()
Loading