Skip to content

Commit

Permalink
improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
Benjoyo committed Mar 27, 2024
1 parent 961b635 commit e476ab5
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 193 deletions.
4 changes: 2 additions & 2 deletions bpm-ai-core/bpm_ai_core/llm/common/blob.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import mimetypes
import os
from io import BytesIO
from pathlib import PurePath
Expand All @@ -7,6 +6,7 @@
import requests
from pydantic import BaseModel, Field, model_validator

from bpm_ai_core.util.file import guess_mimetype
from bpm_ai_core.util.storage import read_file_from_azure_blob, read_file_from_s3, is_s3_url, is_azure_blob_url


Expand Down Expand Up @@ -122,7 +122,7 @@ def from_path_or_url(
Blob instance
"""
if mime_type is None and guess_type:
_mimetype = mimetypes.guess_type(path)[0] if guess_type else None
_mimetype = guess_mimetype(path) if guess_type else None
else:
_mimetype = mime_type

Expand Down
23 changes: 13 additions & 10 deletions bpm-ai-core/bpm_ai_core/util/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,18 @@

import requests

from bpm_ai_core.util.file import is_supported_file

supported_audio_extensions = [
"flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav2", "webm"
]


def is_supported_audio_file(url_or_path: str) -> bool:
return is_supported_file(url_or_path, supported_extensions=supported_audio_extensions)
# Maps supported audio file extensions (lowercase, no leading dot) to MIME types.
audio_ext_map = {
    'flac': 'audio/flac',
    'mp3': 'audio/mpeg',
    # MP4 containers are registered as audio/mp4 (RFC 4337), not audio/mpeg.
    'mp4': 'audio/mp4',
    'mpeg': 'audio/mpeg',
    'mpga': 'audio/mpeg',
    'm4a': 'audio/mp4',
    'ogg': 'audio/ogg',
    # NOTE(review): the IANA-registered name is audio/vnd.wave and common aliases
    # are audio/wav / audio/x-wav - confirm what consumers of this value expect.
    'wav': 'audio/vnd.wav',
    'webm': 'audio/webm',
}


def load_audio(path: str) -> io.BytesIO:
Expand All @@ -28,7 +31,7 @@ def load_audio(path: str) -> io.BytesIO:
# Handle web URL
response = requests.get(path)
audio = io.BytesIO(response.content)
if path.endswith(tuple(supported_audio_extensions)):
if path.endswith(tuple(audio_ext_map.keys())):
audio.name = f"audio.{path.rsplit('.', 1)[-1]}"
elif os.path.isfile(path):
# Handle local file path
Expand All @@ -38,4 +41,4 @@ def load_audio(path: str) -> io.BytesIO:
else:
raise ValueError("The path provided is neither a valid URL nor a file path.")

return audio
return audio
40 changes: 32 additions & 8 deletions bpm-ai-core/bpm_ai_core/util/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,47 @@
from typing import List
from urllib.parse import urlparse

from bpm_ai_core.util.audio import audio_ext_map
from bpm_ai_core.util.image import image_ext_map, pdf_ext_map

def is_supported_file(url_or_path: str, supported_extensions: List[str]) -> bool:
url_or_path = url_or_path.strip()
# Combined extension -> MIME type lookup across audio, image and PDF files.
supported_ext_map = audio_ext_map | image_ext_map | pdf_ext_map
# Key view over the combined map (a dict view, not a list copy).
supported_extensions = supported_ext_map.keys()


def guess_mimetype(filename: str) -> str | None:
    """Guess the MIME type of a file from its extension.

    Args:
        filename: Local path or URL whose extension should be inspected.

    Returns:
        The MIME type registered for the extension in ``image_ext_map``,
        ``pdf_ext_map`` or ``audio_ext_map``, or ``None`` if the extension
        is not in any of them.
    """
    # Parse the extension once instead of once per candidate map.
    extension = _extract_extension(filename)
    # Lookup order: images, then PDF, then audio.
    for ext_map in (image_ext_map, pdf_ext_map, audio_ext_map):
        if extension in ext_map:
            return ext_map[extension]
    return None


def is_supported_file(url_or_path: str, extensions: List[str] = supported_extensions) -> bool:
    """Check whether the extension of a path or URL is among ``extensions``.

    Args:
        url_or_path: Local file path or URL to inspect.
        extensions: Accepted extensions without a leading dot; compared
            case-insensitively. Defaults to the module-level
            ``supported_extensions``.

    Returns:
        ``True`` if the extracted extension matches one of ``extensions``,
        ``False`` otherwise.
    """
    file_extension = _extract_extension(url_or_path)
    # Normalize the candidates to lowercase into a local set; the module-level
    # ``supported_extensions`` must not be rebound inside this function.
    normalized = {ext.lower() for ext in extensions}
    # Check if the file extension is one of the accepted extensions
    return file_extension in normalized


def is_supported_img_file(url_or_path: str) -> bool:
    """Tell whether ``url_or_path`` has an extension listed in ``image_ext_map`` or ``pdf_ext_map``."""
    image_and_pdf_extensions = [*image_ext_map, *pdf_ext_map]
    return is_supported_file(url_or_path, extensions=image_and_pdf_extensions)


def is_supported_audio_file(url_or_path: str) -> bool:
    """Tell whether ``url_or_path`` has an extension listed in ``audio_ext_map``."""
    audio_extensions = list(audio_ext_map)
    return is_supported_file(url_or_path, extensions=audio_extensions)


def _extract_extension(url_or_path):
url_or_path = url_or_path.strip()
# Extract the path from URL if it's a URL
parsed_url = urlparse(url_or_path)
path = parsed_url.path if parsed_url.scheme else url_or_path

# Remove trailing slash if present
if path.endswith('/'):
path = path[:-1]

# Extract the file extension
_, file_extension = os.path.splitext(path)
file_extension = file_extension.lower().lstrip('.')

# Check if the file extension is in the list of supported extensions
return file_extension in supported_extensions
return file_extension
47 changes: 25 additions & 22 deletions bpm-ai-core/bpm_ai_core/util/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,34 @@
from PIL import Image, ImageDraw
from pdf2image import convert_from_path, convert_from_bytes

from bpm_ai_core.llm.common.blob import Blob
from bpm_ai_core.util.file import is_supported_file

logger = logging.getLogger(__name__)

supported_img_extensions = [
'bmp', 'dib',
'gif',
'icns', 'ico',
'jfif', 'jpe', 'jpeg', 'jpg',
'j2c', 'j2k', 'jp2', 'jpc', 'jpf', 'jpx',
'apng', 'png',
'pbm', 'pgm', 'pnm', 'ppm',
'tif', 'tiff',
'webp',
'emf', 'wmf',
'pdf'
]


def is_supported_img_file(url_or_path: str) -> bool:
return is_supported_file(url_or_path, supported_extensions=supported_img_extensions)


async def blob_as_images(blob: Blob, accept_formats: list[str], return_bytes: bool = False) -> Union[list[Image.Image], list[bytes]]:
# Maps supported image file extensions (lowercase, no leading dot) to MIME types.
image_ext_map = {
    # Bitmap / GIF
    "bmp": "image/bmp",
    "gif": "image/gif",
    # Icon formats
    "icns": "image/x-icns",
    "ico": "image/x-icon",
    # JPEG family
    "jfif": "image/jpeg",
    "jpe": "image/jpeg",
    "jpeg": "image/jpeg",
    "jpg": "image/jpeg",
    # PNG
    "png": "image/png",
    # Netpbm family
    "pbm": "image/x-portable-bitmap",
    "pgm": "image/x-portable-graymap",
    "pnm": "image/x-portable-anymap",
    "ppm": "image/x-portable-pixmap",
    # TIFF
    "tif": "image/tiff",
    "tiff": "image/tiff",
    # WebP
    "webp": "image/webp",
}

# Maps the PDF file extension (lowercase, no leading dot) to its MIME type.
pdf_ext_map = {"pdf": "application/pdf"}


async def blob_as_images(blob, accept_formats: list[str], return_bytes: bool = False) -> Union[list[Image.Image], list[bytes]]:
"""
Load an image, PDF, or other file in a Blob into a Pillow Image object or raw bytes of accepted format.
Expand Down
Loading

0 comments on commit e476ab5

Please sign in to comment.