improvements

holunda-io · Mar 27, 2024 · e476ab5 · e476ab5
1 parent 961b635
commit e476ab5
Show file tree

Hide file tree

Showing 5 changed files with 79 additions and 193 deletions.
diff --git a/bpm-ai-core/bpm_ai_core/llm/common/blob.py b/bpm-ai-core/bpm_ai_core/llm/common/blob.py
@@ -1,4 +1,3 @@
-import mimetypes
 import os
 from io import BytesIO
 from pathlib import PurePath
@@ -7,6 +6,7 @@
 import requests
 from pydantic import BaseModel, Field, model_validator
 
+from bpm_ai_core.util.file import guess_mimetype
 from bpm_ai_core.util.storage import read_file_from_azure_blob, read_file_from_s3, is_s3_url, is_azure_blob_url
 
 
@@ -122,7 +122,7 @@ def from_path_or_url(
             Blob instance
         """
         if mime_type is None and guess_type:
-            _mimetype = mimetypes.guess_type(path)[0] if guess_type else None
+            _mimetype = guess_mimetype(path) if guess_type else None
         else:
             _mimetype = mime_type
 

diff --git a/bpm-ai-core/bpm_ai_core/util/audio.py b/bpm-ai-core/bpm_ai_core/util/audio.py
@@ -3,15 +3,18 @@
 
 import requests
 
-from bpm_ai_core.util.file import is_supported_file
 
-supported_audio_extensions = [
-   "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav2", "webm"
-]
-
-
-def is_supported_audio_file(url_or_path: str) -> bool:
-    return is_supported_file(url_or_path, supported_extensions=supported_audio_extensions)
+audio_ext_map = {  # lowercase audio file extension (no dot) -> MIME type
+    'flac': 'audio/flac',
+    'mp3': 'audio/mpeg',
+    'mp4': 'audio/mp4',  # MPEG-4 container, not MPEG-1/2 audio (RFC 4337)
+    'mpeg': 'audio/mpeg',
+    'mpga': 'audio/mpeg',
+    'm4a': 'audio/mp4',  # MPEG-4 audio shares the audio/mp4 type
+    'ogg': 'audio/ogg',
+    'wav': 'audio/vnd.wav',
+    'webm': 'audio/webm',
+}
 
 
 def load_audio(path: str) -> io.BytesIO:
@@ -28,7 +31,7 @@ def load_audio(path: str) -> io.BytesIO:
         # Handle web URL
         response = requests.get(path)
         audio = io.BytesIO(response.content)
-        if path.endswith(tuple(supported_audio_extensions)):
+        if path.endswith(tuple(audio_ext_map.keys())):
             audio.name = f"audio.{path.rsplit('.', 1)[-1]}"
     elif os.path.isfile(path):
         # Handle local file path
@@ -38,4 +41,4 @@ def load_audio(path: str) -> io.BytesIO:
     else:
         raise ValueError("The path provided is neither a valid URL nor a file path.")
 
-    return audio
+    return audio
diff --git a/bpm-ai-core/bpm_ai_core/util/file.py b/bpm-ai-core/bpm_ai_core/util/file.py
@@ -2,23 +2,47 @@
 from typing import List
 from urllib.parse import urlparse
 
+from bpm_ai_core.util.audio import audio_ext_map
+from bpm_ai_core.util.image import image_ext_map, pdf_ext_map
 
-def is_supported_file(url_or_path: str, supported_extensions: List[str]) -> bool:
-    url_or_path = url_or_path.strip()
+supported_ext_map = audio_ext_map | image_ext_map | pdf_ext_map  # right-hand maps win on duplicate keys
+supported_extensions = supported_ext_map.keys()  # live dict_keys view of the merged map, not a list
+
+
+def guess_mimetype(filename: str) -> str | None:
+    """Return the MIME type mapped to the extension of ``filename`` (path or URL), or None if unmapped."""
+    ext = _extract_extension(filename)
+    for ext_map in (image_ext_map, pdf_ext_map, audio_ext_map):  # images first, then PDF, then audio
+        if ext in ext_map:
+            return ext_map[ext]
+    return None
+
+
+def is_supported_file(url_or_path: str, extensions: List[str] = supported_extensions) -> bool:  # default: every key of supported_ext_map
+    file_extension = _extract_extension(url_or_path)  # lowercased, leading dot stripped
     # Normalize the extensions to lowercase
-    supported_extensions = [ext.lower() for ext in supported_extensions]
+    extensions = [ext.lower() for ext in extensions]
+    # Check if the file extension is in the list of supported extensions
+    return file_extension in extensions
+
+
+def is_supported_img_file(url_or_path: str) -> bool:
+    return is_supported_file(url_or_path, extensions=[*image_ext_map, *pdf_ext_map])  # image extensions plus 'pdf'
 
+
+def is_supported_audio_file(url_or_path: str) -> bool:
+    return is_supported_file(url_or_path, extensions=list(audio_ext_map))  # audio extensions only
+
+
+def _extract_extension(url_or_path):  # returns the lowercase extension without the dot ('' if there is none)
+    url_or_path = url_or_path.strip()  # ignore surrounding whitespace
     # Extract the path from URL if it's a URL
     parsed_url = urlparse(url_or_path)
     path = parsed_url.path if parsed_url.scheme else url_or_path
-
     # Remove trailing slash if present
     if path.endswith('/'):
         path = path[:-1]
-
     # Extract the file extension
     _, file_extension = os.path.splitext(path)
     file_extension = file_extension.lower().lstrip('.')
-
-    # Check if the file extension is in the list of supported extensions
-    return file_extension in supported_extensions
+    return file_extension  # membership checks are left to the caller
diff --git a/bpm-ai-core/bpm_ai_core/util/image.py b/bpm-ai-core/bpm_ai_core/util/image.py
@@ -8,31 +8,34 @@
 from PIL import Image, ImageDraw
 from pdf2image import convert_from_path, convert_from_bytes
 
-from bpm_ai_core.llm.common.blob import Blob
-from bpm_ai_core.util.file import is_supported_file
-
 logger = logging.getLogger(__name__)
 
-supported_img_extensions = [
-    'bmp', 'dib',
-    'gif',
-    'icns', 'ico',
-    'jfif', 'jpe', 'jpeg', 'jpg',
-    'j2c', 'j2k', 'jp2', 'jpc', 'jpf', 'jpx',
-    'apng', 'png',
-    'pbm', 'pgm', 'pnm', 'ppm',
-    'tif', 'tiff',
-    'webp',
-    'emf', 'wmf',
-    'pdf'
-]
-
-
-def is_supported_img_file(url_or_path: str) -> bool:
-    return is_supported_file(url_or_path, supported_extensions=supported_img_extensions)
-
 
-async def blob_as_images(blob: Blob, accept_formats: list[str], return_bytes: bool = False) -> Union[list[Image.Image], list[bytes]]:
+image_ext_map = {  # lowercase image file extension (no dot) -> MIME type
+    'bmp': 'image/bmp',
+    'gif': 'image/gif',
+    'icns': 'image/x-icns',
+    'ico': 'image/x-icon',
+    'jfif': 'image/jpeg',
+    'jpe': 'image/jpeg',
+    'jpeg': 'image/jpeg',
+    'jpg': 'image/jpeg',
+    'png': 'image/png',
+    'pbm': 'image/x-portable-bitmap',
+    'pgm': 'image/x-portable-graymap',
+    'pnm': 'image/x-portable-anymap',
+    'ppm': 'image/x-portable-pixmap',
+    'tif': 'image/tiff',
+    'tiff': 'image/tiff',
+    'webp': 'image/webp',
+}
+
+pdf_ext_map = {  # lowercase extension (no dot) -> MIME type; PDF is kept apart from image_ext_map
+    'pdf': 'application/pdf'
+}
+
+
+async def blob_as_images(blob, accept_formats: list[str], return_bytes: bool = False) -> Union[list[Image.Image], list[bytes]]:
     """
     Load an image, PDF, or other file in a Blob into a Pillow Image object or raw bytes of accepted format.