Skip to content

Commit

Permalink
Merge pull request #6 from harmonydata/catalogue-match-instruments
Browse files Browse the repository at this point in the history
Catalogue match instruments
  • Loading branch information
woodthom2 authored Aug 21, 2024
2 parents 7d91350 + cc43e54 commit e4a51f0
Show file tree
Hide file tree
Showing 17 changed files with 534 additions and 213 deletions.
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,10 @@ data/

# Cache
instruments_cache.json
vectors_cache.json
vectors_cache.json

# Catalogue data
huggingface_sentence_transformers_paraphrase_multilingual_MiniLM_L12_v2_embeddings_all_float16.pkl.bz2
all_questions_ever_seen.json
all_instruments_preprocessed.json
instrument_idx_to_question_idxs.json
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[submodule "harmony"]
path = harmony
url = https://github.com/harmonydata/harmony.git
branch = main
branch = catalogue-match-instruments
[submodule "mhc_embeddings"]
path = mhc_embeddings
url = https://github.com/harmonydata/mhc_embeddings.git
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,9 @@ environment. Make sure to give the service account the required `Vertex AI` role

`TIKA_SERVER_ENDPOINT` - This is the endpoint where `Tika` is served from.

`AZURE_STORAGE_URL` - The Azure Blob storage URL. This is required for downloading the
catalogue data.

Ideally, you should set these environment variables to show Harmony where to look for dependencies and data, but it will
work without them (it will download the sentence transformer from HuggingFace Hub, etc).

Expand Down
1 change: 1 addition & 0 deletions docker_compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ services:
AZURE_OPENAI_API_KEY:
AZURE_OPENAI_ENDPOINT:
TIKA_SERVER_ENDPOINT: http://tika:9998
AZURE_STORAGE_URL:
volumes:
- harmonyapimount:/data

Expand Down
2 changes: 1 addition & 1 deletion harmony
Submodule harmony updated 29 files
+0 −1 MANIFEST.in
+2 −4 pyproject.toml
+1 −3 requirements.txt
+2 −5 src/harmony/__init__.py
+2 −3 src/harmony/matching/default_matcher.py
+335 −20 src/harmony/matching/matcher.py
+68 −94 src/harmony/matching/negator.py
+ src/harmony/parsing/20240719_pdf_question_extraction_sklearn_crf_model.pkl
+6 −55 src/harmony/parsing/pdf_parser.py
+27 −0 src/harmony/parsing/text_extraction/__init__.py
+240 −0 src/harmony/parsing/text_extraction/dictionary_options_matcher.py
+182 −0 src/harmony/parsing/text_extraction/ensemble_named_entity_recogniser.py
+92 −0 src/harmony/parsing/text_extraction/options_extractor.py
+72 −0 src/harmony/parsing/text_extraction/options_words.py
+13 −19 src/harmony/parsing/text_extraction/rule_based_extractor.py
+84 −0 src/harmony/parsing/text_extraction/sequence_finder.py
+135 −0 src/harmony/parsing/text_extraction/smart_document_parser.py
+73 −0 src/harmony/parsing/text_extraction/smart_table_analyser.py
+14 −39 src/harmony/parsing/text_extraction/spacy_options_matcher.py
+193 −0 src/harmony/parsing/text_extraction/spacy_wrapper.py
+3 −0 src/harmony/parsing/text_parser.py
+0 −127 src/harmony/parsing/util/feature_extraction.py
+9 −0 src/harmony/schemas/catalogue_instrument.py
+10 −0 src/harmony/schemas/catalogue_question.py
+17 −5 src/harmony/schemas/requests/text.py
+11 −4 src/harmony/schemas/responses/text.py
+1 −1 src/harmony/util/file_helper.py
+0 −61 src/harmony/util/instrument_helper.py
+2 −2 tests/test_negator.py
1 change: 1 addition & 0 deletions harmony_api/core/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class Settings(BaseSettings):
OPENAI_API_KEY: str | None = os.getenv("OPENAI_API_KEY")
AZURE_OPENAI_API_KEY: str | None = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT: str | None = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_STORAGE_URL: str | None = os.getenv("AZURE_STORAGE_URL")
GOOGLE_APPLICATION_CREDENTIALS: dict = GOOGLE_APPLICATION_CREDENTIALS


Expand Down
24 changes: 24 additions & 0 deletions harmony_api/dependencies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from harmony.schemas.requests.text import MatchBody
from harmony_api import http_exceptions, helpers, constants


def model_from_match_body_is_available(match_body: MatchBody) -> bool:
    """
    Validate the model requested in a match body.

    The match body's ``parameters`` are converted to a dict and handed to
    ``__check_model``, which raises ``CouldNotProcessRequestHTTPException``
    when the model is not a known Harmony API model or is not available.

    :param match_body: the match request whose ``parameters`` describe the model.
    :return: ``True`` whenever the check completes without raising.
    """

    # Failure is signalled by an exception, so reaching the return means success.
    __check_model(match_body.parameters.dict())
    return True


def __check_model(model_dict: dict):
    """
    Raise unless the given model is both known and available.

    :param model_dict: dict describing the model; it is compared against the
        entries of ``constants.ALL_HARMONY_API_MODELS``.
    :raises http_exceptions.CouldNotProcessRequestHTTPException: when the
        model is not in ``constants.ALL_HARMONY_API_MODELS``, or when
        ``helpers.check_model_availability`` returns a falsy value for it.
    """
    # Unknown models are rejected first, before availability is ever queried.
    if model_dict not in constants.ALL_HARMONY_API_MODELS:
        message = "Could not process request because the model does not exist."
        raise http_exceptions.CouldNotProcessRequestHTTPException(message)

    if helpers.check_model_availability(model_dict):
        return

    raise http_exceptions.CouldNotProcessRequestHTTPException(
        "Could not process request because the model is not available."
    )
Loading

0 comments on commit e4a51f0

Please sign in to comment.