Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#38: Ingest an instrument from a URL #75

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 36 additions & 19 deletions src/harmony/schemas/errors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,34 +25,51 @@

'''

from pydantic import BaseModel

class BaseHarmonyError(Exception):
    """Root of the Harmony error hierarchy.

    Each instance carries two attributes:

    * ``status_code`` - the numeric status associated with the error.
    * ``detail`` - the human-readable message (also used as ``str(error)``).

    Subclasses only need to override the class attributes ``status_code`` and
    ``default_detail``; construction logic lives here once instead of being
    copied into every subclass.

    Args:
        message: Optional custom message. When omitted (or empty) the
            class's ``default_detail`` is used.
    """

    status_code = 500
    default_detail = "Something went wrong"

    def __init__(self, message: str = None):
        # Copy the class-level defaults onto the instance so ``status_code``
        # and ``detail`` stay ordinary instance attributes, as before.
        cls = type(self)
        self.status_code = cls.status_code
        self.detail = message or cls.default_detail
        super().__init__(self.detail)


class BadRequestError(BaseHarmonyError):
    """The request data was rejected (status code 400)."""

    status_code = 400
    default_detail = "Bad request data"


class SomethingWrongError(BaseHarmonyError):
    """An unspecified failure occurred (status code 500)."""

    status_code = 500
    default_detail = "Something went wrong"


class UnauthorizedError(BaseHarmonyError):
    """The caller is not authorized (status code 401)."""

    status_code = 401
    default_detail = "Unauthorized"


class ForbiddenError(BaseHarmonyError):
    """The operation is forbidden (status code 403)."""

    status_code = 403
    default_detail = "Forbidden"


class ConflictError(BaseHarmonyError):
    """The operation conflicts with current state (status code 409)."""

    status_code = 409
    default_detail = "Conflict"


class ResourceNotFoundError(BaseHarmonyError):
    """The requested resource does not exist (status code 404)."""

    status_code = 404
    default_detail = "Resource not found"
14 changes: 7 additions & 7 deletions src/harmony/schemas/requests/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,12 @@

'''

from typing import List, Optional

from pydantic import ConfigDict, BaseModel, Field

from harmony.schemas.catalogue_instrument import CatalogueInstrument
from harmony.schemas.catalogue_question import CatalogueQuestion
from harmony.schemas.enums.file_types import FileType
from harmony.schemas.enums.languages import Language
from pydantic import ConfigDict, BaseModel, Field
from typing import Any, Dict, List, Optional

DEFAULT_FRAMEWORK = "huggingface"
DEFAULT_MODEL = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
Expand All @@ -45,6 +43,7 @@ class RawFile(BaseModel):
content: str = Field(description="The raw file contents")
text_content: Optional[str] = Field(None, description="The plain text content")
tables: list = Field([], description="The tables in the file")
metadata: Optional[Dict[str, Any]] = Field(default=None, description="Optional metadata about the file")
model_config = ConfigDict(
json_schema_extra={
"example": {
Expand All @@ -65,7 +64,8 @@ class Question(BaseModel):
instrument_id: Optional[str] = Field(None, description="Unique identifier for the instrument (UUID-4)")
instrument_name: Optional[str] = Field(None, description="Human readable name for the instrument")
topics_auto: Optional[list] = Field(None, description="Automated list of topics identified by model")
topics_strengths: Optional[dict] = Field(None, description="Automated list of topics identified by model with strength of topic")
topics_strengths: Optional[dict] = Field(None,
description="Automated list of topics identified by model with strength of topic")
nearest_match_from_mhc_auto: Optional[dict] = Field(None, description="Automatically identified nearest MHC match")
closest_catalogue_question_match: Optional[CatalogueQuestion] = Field(
None, description="The closest question match in the catalogue for the question"
Expand Down Expand Up @@ -95,7 +95,7 @@ class Instrument(BaseModel):
study: Optional[str] = Field(None, description="The study")
sweep: Optional[str] = Field(None, description="The sweep")
metadata: Optional[dict] = Field(None,
description="Optional metadata about the instrument (URL, citation, DOI, copyright holder)")
description="Optional metadata about the instrument (URL, citation, DOI, copyright holder)")
language: Language = Field(Language.English,
description="The ISO 639-2 (alpha-2) encoding of the instrument language")
questions: List[Question] = Field(description="The items inside the instrument")
Expand Down Expand Up @@ -124,7 +124,7 @@ class Instrument(BaseModel):
"source_page": 0
}]
}
})
})


class MatchParameters(BaseModel):
Expand Down
221 changes: 221 additions & 0 deletions src/harmony/util/url_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
'''
MIT License

Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk).
Project: Harmony (https://harmonydata.ac.uk)
Maintainer: Thomas Wood (https://fastdatascience.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

'''

import base64
import hashlib
import requests
import ssl
import urllib.parse
import uuid
from datetime import datetime, timedelta
from harmony.parsing.wrapper_all_parsers import convert_files_to_instruments
from harmony.schemas.errors.base import BadRequestError, ForbiddenError, ConflictError, SomethingWrongError
from harmony.schemas.requests.text import RawFile, Instrument, FileType
from pathlib import Path
from requests.adapters import HTTPAdapter
from typing import List, Dict

# Tunables and lookup tables for URLDownloader.

# Largest download accepted, in bytes; compared against the Content-Length
# response header in URLDownloader.download.
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
# Passed as the ``timeout`` argument of the HTTP GET.
DOWNLOAD_TIMEOUT = 30 # seconds
# Intended maximum number of HTTP redirects to follow.
MAX_REDIRECTS = 5
# URL schemes accepted by URLDownloader._validate_url.
ALLOWED_SCHEMES = {'https'}
# Per-domain rate limit: at most RATE_LIMIT_REQUESTS requests are allowed
# within any RATE_LIMIT_WINDOW-second sliding window.
RATE_LIMIT_REQUESTS = 60 # requests per min
RATE_LIMIT_WINDOW = 60 # seconds

# Content-Type (MIME) -> FileType. The keys are also joined to form the
# ``Accept`` request header sent by URLDownloader.download.
MIME_TO_FILE_TYPE = {
'application/pdf': FileType.pdf,
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': FileType.xlsx,
'text/plain': FileType.txt,
'text/csv': FileType.csv,
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': FileType.docx
}

# Lower-cased URL path suffix -> FileType; consulted only when the response's
# Content-Type is not a key of MIME_TO_FILE_TYPE.
EXT_TO_FILE_TYPE = {
'.pdf': FileType.pdf,
'.xlsx': FileType.xlsx,
'.txt': FileType.txt,
'.csv': FileType.csv,
'.docx': FileType.docx
}


class URLDownloader:
    """Fetch a remote document over HTTPS and wrap it in a ``RawFile``.

    The downloader validates the URL, applies a per-host sliding-window rate
    limit, performs the request through a ``requests.Session`` and then runs
    a series of checks (TLS certificate, opt-out headers, size, content type)
    before returning the payload.
    """

    def __init__(self):
        # Host -> timestamps of the requests made inside the current window.
        self.rate_limit_storage: Dict[str, List[datetime]] = {}
        self.session = requests.Session()
        self.session.mount('https://', HTTPAdapter(max_retries=3))
        self.session.verify = True
        # Apply the configured redirect cap; otherwise the session falls back
        # to the requests library default.
        self.session.max_redirects = MAX_REDIRECTS

    def _check_rate_limit(self, domain: str) -> None:
        """Record a request for *domain*, enforcing the sliding-window limit.

        Raises:
            ConflictError: if RATE_LIMIT_REQUESTS requests were already made
                for this domain within the last RATE_LIMIT_WINDOW seconds.
        """
        now = datetime.now()
        window_start = now - timedelta(seconds=RATE_LIMIT_WINDOW)
        recent = [ts for ts in self.rate_limit_storage.get(domain, []) if ts > window_start]
        self.rate_limit_storage[domain] = recent

        if len(recent) >= RATE_LIMIT_REQUESTS:
            raise ConflictError("Rate limit exceeded")

        recent.append(now)

    @staticmethod
    def _is_blocked_host(hostname: str) -> bool:
        """Return True when *hostname* names a local or otherwise internal target.

        Covers ``localhost`` (with or without a trailing dot, and
        ``*.localhost``) and IP literals that are loopback, private,
        link-local, multicast, reserved or unspecified.

        NOTE: a public DNS name that *resolves* to an internal address is not
        detected here; that requires checking the resolved address.
        """
        # Local import: the module's import block does not provide ipaddress.
        import ipaddress

        host = (hostname or '').rstrip('.').lower()
        if host == 'localhost' or host.endswith('.localhost'):
            return True
        try:
            address = ipaddress.ip_address(host)
        except ValueError:
            return False  # not an IP literal, i.e. a DNS name
        return (
            address.is_loopback
            or address.is_private
            or address.is_link_local
            or address.is_multicast
            or address.is_reserved
            or address.is_unspecified
        )

    def _validate_url(self, url: str) -> None:
        """Reject URLs that must not be fetched.

        Raises:
            BadRequestError: unparsable URL, non-HTTPS scheme, missing or
                dot-less host, or a URL fragment.
            ForbiddenError: internal host, or a path containing ``..``/``//``.
        """
        try:
            parsed = urllib.parse.urlparse(url)
            # ``hostname`` strips userinfo and port and lower-cases the host,
            # so "localhost:8443" or "user@127.0.0.1" cannot slip past.
            hostname = parsed.hostname or ''
        except (ValueError, TypeError, AttributeError) as e:
            raise BadRequestError(f"Invalid URL: {str(e)}") from e

        # The checks below raise directly: wrapping them in a catch-all used
        # to turn every ForbiddenError into a BadRequestError.
        if parsed.scheme not in ALLOWED_SCHEMES:
            raise BadRequestError("URL must use HTTPS")

        if self._is_blocked_host(hostname):
            raise ForbiddenError("Access to internal domains blocked")

        if '.' not in hostname:
            raise BadRequestError("Invalid domain")

        if '..' in parsed.path or '//' in parsed.path:
            raise ForbiddenError("Path traversal detected")

        if parsed.fragment:
            raise BadRequestError("URL fragments not supported")

    def _validate_ssl(self, response: requests.Response) -> None:
        """Check that the peer presented a certificate that has not expired.

        Raises:
            ForbiddenError: if no certificate can be inspected or it expired.
        """
        # The underlying socket is only reachable while the streamed
        # connection is still open; treat an unreachable socket as a failed
        # check instead of crashing with AttributeError.
        connection = getattr(response.raw, 'connection', None)
        sock = getattr(connection, 'sock', None)
        cert = sock.getpeercert() if hasattr(sock, 'getpeercert') else None
        if not cert:
            raise ForbiddenError("Invalid SSL certificate")

        not_after = ssl.cert_time_to_seconds(cert['notAfter'])
        if datetime.fromtimestamp(not_after) < datetime.now():
            raise ForbiddenError("Expired SSL certificate")

    def _check_legal_headers(self, response: requests.Response) -> None:
        """Refuse content whose response headers signal that it must not be reused.

        Raises:
            ForbiddenError: on an ``X-Robots-Tag`` containing ``noindex`` or
                the presence of ``X-Copyright`` / ``X-Terms-Of-Service``.
        """
        # X-Robots-Tag may list several comma-separated directives
        # (e.g. "noindex, nofollow"), so test membership rather than equality.
        robots_value = response.headers.get('X-Robots-Tag', '').lower()
        directives = {part.strip() for part in robots_value.split(',')}
        if 'noindex' in directives:
            raise ForbiddenError("Access not allowed by robots directive")

        if 'X-Copyright' in response.headers:
            raise ForbiddenError("Content is copyright protected")

        if 'X-Terms-Of-Service' in response.headers:
            raise ForbiddenError("Terms of service acceptance required")

    def _validate_content_type(self, url: str, content_type: str) -> FileType:
        """Map the response to a supported ``FileType``.

        The Content-Type header is tried first (parameters such as
        ``charset`` are ignored); the URL path suffix is the fallback.

        Raises:
            BadRequestError: if neither identifies a supported type.
        """
        try:
            mime = (content_type or '').split(';')[0].strip().lower()

            if mime in MIME_TO_FILE_TYPE:
                return MIME_TO_FILE_TYPE[mime]

            ext = Path(urllib.parse.urlparse(url).path).suffix.lower()
            if ext in EXT_TO_FILE_TYPE:
                return EXT_TO_FILE_TYPE[ext]

            raise BadRequestError(f"Unsupported file type: {mime}")
        except BadRequestError:
            raise
        except Exception as e:
            raise BadRequestError(f"Error validating content type: {str(e)}") from e

    def _read_response(self, url: str, response: requests.Response) -> RawFile:
        """Run the post-request checks and build the ``RawFile`` from *response*."""
        response.raise_for_status()

        # Redirects are followed by the session, so re-check where we ended
        # up before reading any of the body.
        final = urllib.parse.urlparse(response.url or url)
        if final.scheme not in ALLOWED_SCHEMES or self._is_blocked_host(final.hostname or ''):
            raise ForbiddenError("Redirect to a disallowed location")

        self._validate_ssl(response)
        self._check_legal_headers(response)

        content_length = response.headers.get('content-length')
        try:
            declared_size = int(content_length) if content_length else None
        except ValueError:
            declared_size = None  # malformed header; the streaming cap still applies
        if declared_size is not None and declared_size > MAX_FILE_SIZE:
            raise ForbiddenError(f"File too large: {content_length} bytes (max {MAX_FILE_SIZE})")

        file_type = self._validate_content_type(url, response.headers.get('content-type', ''))

        # Content-Length is optional and untrusted, so enforce the cap while
        # streaming. Chunks are joined once to avoid repeated bytes copies.
        hasher = hashlib.sha256()
        chunks = []
        received = 0
        for chunk in response.iter_content(chunk_size=8192):
            received += len(chunk)
            if received > MAX_FILE_SIZE:
                raise ForbiddenError(f"File too large: more than {MAX_FILE_SIZE} bytes (max {MAX_FILE_SIZE})")
            hasher.update(chunk)
            chunks.append(chunk)
        content = b''.join(chunks)

        if file_type in (FileType.pdf, FileType.xlsx, FileType.docx):
            # The type may have been inferred from the URL suffix, in which
            # case there may be no Content-Type header to echo back.
            mime = response.headers.get('content-type') or next(
                known for known, mapped in MIME_TO_FILE_TYPE.items() if mapped == file_type
            )
            content_str = f"data:{mime};base64," + base64.b64encode(content).decode('ascii')
        else:
            try:
                content_str = content.decode('utf-8')
            except UnicodeDecodeError as e:
                raise BadRequestError("File content is not valid UTF-8 text") from e

        return RawFile(
            file_id=str(uuid.uuid4()),
            file_name=Path(urllib.parse.urlparse(url).path).name or "downloaded_file",
            file_type=file_type,
            content=content_str,
            metadata={
                'content_hash': hasher.hexdigest(),
                'download_timestamp': datetime.now().isoformat(),
                'source_url': url
            }
        )

    def download(self, url: str) -> RawFile:
        """Download *url* and return its content as a ``RawFile``.

        Binary formats (pdf, xlsx, docx) are returned as a base64 ``data:``
        URI; text formats are decoded as UTF-8. The returned metadata holds
        the SHA-256 of the payload, the download timestamp and the source URL.

        Raises:
            BadRequestError: invalid URL, unsupported type or undecodable text.
            ForbiddenError: blocked host, failed TLS/header checks, oversized
                file, too many redirects, or HTTP 401/403.
            ConflictError: local rate limit exceeded or HTTP 429.
            SomethingWrongError: timeouts and any other failure.
        """
        try:
            self._validate_url(url)
            # Key the limiter on the bare host so userinfo/port variations of
            # the same host share one bucket.
            domain = urllib.parse.urlparse(url).hostname
            self._check_rate_limit(domain)

            response = self.session.get(
                url,
                timeout=DOWNLOAD_TIMEOUT,
                stream=True,
                verify=True,
                allow_redirects=True,
                headers={
                    'User-Agent': 'HarmonyBot/1.0 (+https://harmonydata.ac.uk)',
                    'Accept': ', '.join(MIME_TO_FILE_TYPE.keys())
                }
            )
            try:
                return self._read_response(url, response)
            finally:
                # stream=True keeps the connection checked out until the body
                # is consumed or the response is closed.
                response.close()

        except (BadRequestError, ForbiddenError, ConflictError, SomethingWrongError):
            raise
        except requests.Timeout as e:
            raise SomethingWrongError("Download timeout") from e
        except requests.TooManyRedirects as e:
            raise ForbiddenError("Too many redirects") from e
        except requests.RequestException as e:
            if e.response is not None:
                if e.response.status_code == 401:
                    raise ForbiddenError("Resource requires authentication") from e
                elif e.response.status_code == 403:
                    raise ForbiddenError("Access forbidden") from e
                elif e.response.status_code == 429:
                    raise ConflictError("Rate limit exceeded") from e
            raise SomethingWrongError(f"Download error: {str(e)}") from e
        except Exception as e:
            raise SomethingWrongError(f"Unexpected error: {str(e)}") from e


def load_instruments_from_url(url: str) -> List[Instrument]:
    """Download the document at *url* and parse it into instruments.

    Args:
        url: Address of the file to ingest.

    Returns:
        The instruments produced by ``convert_files_to_instruments`` for the
        downloaded file.

    Raises:
        Whatever ``URLDownloader.download`` raises for an unusable URL or a
        failed download.
    """
    # NOTE: a fresh URLDownloader is built per call, and its rate-limit
    # bookkeeping is created in __init__, so the limit is not shared
    # between separate calls to this function.
    downloader = URLDownloader()
    try:
        raw_file = downloader.download(url)
    finally:
        # The downloader owns a requests.Session; close it so its pooled
        # connections are released instead of lingering until garbage collection.
        downloader.session.close()
    return convert_files_to_instruments([raw_file])
Loading
Loading