Commit e63e3a0

Fixed tests, fixed bug with package extras
Pringled committed Oct 31, 2024
1 parent 8b481ce · commit e63e3a0
Showing 5 changed files with 96 additions and 16 deletions.
23 changes: 16 additions & 7 deletions model2vec/hf_utils.py
@@ -42,15 +42,16 @@ def save_pretrained(
     folder_path.mkdir(exist_ok=True, parents=True)
     save_file({"embeddings": embeddings}, folder_path / "model.safetensors")
     tokenizer.save(str(folder_path / "tokenizer.json"))
-    json.dump(config, open(folder_path / "config.json", "w"))
+    with open(folder_path / "config.json", "w") as config_file:
+        json.dump(config, config_file, indent=4, sort_keys=True)
 
     # Save vocab.txt
     with open(folder_path / "vocab.txt", "w") as vocab_file:
         vocab = tokenizer.get_vocab()
         for token in sorted(vocab, key=vocab.get):
             vocab_file.write(f"{token}\n")
 
-    # Load tokenizer.json
+    # Load tokenizer.json to use for generating tokenizer_config.json
     with open(folder_path / "tokenizer.json", "r") as f:
         tokenizer_data = json.load(f)

@@ -62,7 +63,14 @@ def save_pretrained(
         "unk_token": "[UNK]",
         "mask_token": "[MASK]",
     }
-    json.dump(special_tokens, open(folder_path / "special_tokens_map.json", "w"), indent=4)
+    with open(folder_path / "special_tokens_map.json", "w") as special_tokens_file:
+        json.dump(special_tokens, special_tokens_file, indent=4, sort_keys=True)
+
+    # Set fallback values for normalizer attributes in case normalizer is None
+    normalizer = tokenizer_data.get("normalizer")
+    do_lower_case = normalizer.get("lowercase") if normalizer else config.get("do_lower_case", True)
+    strip_accents = normalizer.get("strip_accents") if normalizer else None
+    tokenize_chinese_chars = normalizer.get("handle_chinese_chars", True) if normalizer else True
 
     # Save tokenizer_config.json based on tokenizer.json
     tokenizer_config = {
@@ -79,17 +87,18 @@
         },
         "clean_up_tokenization_spaces": True,
         "cls_token": special_tokens["cls_token"],
-        "do_lower_case": tokenizer_data.get("normalizer", {}).get("lowercase", config.get("do_lower_case", True)),
+        "do_lower_case": do_lower_case,
         "mask_token": special_tokens["mask_token"],
         "model_max_length": config.get("seq_length", 512),
        "pad_token": special_tokens["pad_token"],
         "sep_token": special_tokens["sep_token"],
-        "strip_accents": tokenizer_data.get("normalizer", {}).get("strip_accents"),
-        "tokenize_chinese_chars": tokenizer_data.get("normalizer", {}).get("handle_chinese_chars", True),
+        "strip_accents": strip_accents,
+        "tokenize_chinese_chars": tokenize_chinese_chars,
         "tokenizer_class": "BertTokenizer",
         "unk_token": special_tokens["unk_token"],
     }
-    json.dump(tokenizer_config, open(folder_path / "tokenizer_config.json", "w"), indent=4)
+    with open(folder_path / "tokenizer_config.json", "w") as tokenizer_config_file:
+        json.dump(tokenizer_config, tokenizer_config_file, indent=4, sort_keys=True)
 
     logger.info(f"Saved model to {folder_path}")

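Why the new fallback block matters: in tokenizer.json the "normalizer" key can be present but explicitly null, in which case the old chained lookup tokenizer_data.get("normalizer", {}) returns None (the {} default only applies when the key is absent) and the second .get raises AttributeError. A minimal sketch of both code paths, using a hypothetical parsed tokenizer.json and simplifying the config fallback to a plain True:

    tokenizer_data = {"normalizer": None}  # key present, value null -> None

    # Old lookup: the {} default is skipped because the key exists,
    # so .get() is called on None and raises AttributeError.
    try:
        tokenizer_data.get("normalizer", {}).get("lowercase", True)
    except AttributeError as error:
        print(f"old lookup fails: {error}")  # 'NoneType' object has no attribute 'get'

    # Fixed lookup: branch on the normalizer explicitly, as in the diff above.
    normalizer = tokenizer_data.get("normalizer")
    do_lower_case = normalizer.get("lowercase") if normalizer else True
    print(do_lower_case)  # True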
18 changes: 10 additions & 8 deletions model2vec/utils.py
@@ -26,17 +26,19 @@ def get_tensor(self, key: str) -> np.ndarray:
 
 def get_package_extras(package: str, extra: str) -> Iterator[str]:
     """Get the extras of the package."""
-    message = metadata(package)
+    try:
+        message = metadata(package)
+    except Exception as e:
+        raise ImportError(f"Could not retrieve metadata for package '{package}': {e}")
 
     all_packages = message.get_all("Requires-Dist") or []
     for package in all_packages:
         name, *rest = package.split(";", maxsplit=1)
-        if not rest:
-            continue
-        _, found_extra = rest[0].split("==", maxsplit=1)
-        # Strip off quotes
-        found_extra = found_extra.strip(' "')
-        if found_extra == extra:
-            yield name
+        if rest:
+            # Extract and clean the extra requirement
+            found_extra = rest[0].split("==")[-1].strip(" \"'")
+            if found_extra == extra:
+                yield name.strip()
 
 
 def importable(module: str, extra: str) -> None:

[Codecov / codecov/patch annotation on model2vec/utils.py: added lines 31-32 were not covered by tests.]
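To see the fixed extras parsing in action, here is a small self-contained sketch against a hypothetical Requires-Dist list (importlib.metadata returns requirement strings of the form `torch; extra == "onnx"`):

    from typing import Iterator

    requires_dist = [
        "numpy",                             # core dependency: no marker, never yielded
        'onnx; extra == "onnx"',
        'torch; extra == "onnx"',
        'scikit-learn; extra == "distill"',
    ]

    def extras_for(requirements: list[str], extra: str) -> Iterator[str]:
        # Mirrors the loop body added in the diff above.
        for requirement in requirements:
            name, *rest = requirement.split(";", maxsplit=1)
            if rest:
                found_extra = rest[0].split("==")[-1].strip(" \"'")
                if found_extra == extra:
                    yield name.strip()

    print(list(extras_for(requires_dist, "onnx")))  # ['onnx', 'torch']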
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     "rich",
     "tqdm",
     "tokenizers>=0.20",
+    "safetensors",
     "setuptools",
 ]

@@ -53,6 +54,8 @@ dev = [
 ]
 distill = ["torch", "transformers", "scikit-learn"]
 
+onnx = ["onnx", "torch"]
+
 [project.urls]
 "Homepage" = "https://github.com/MinishLab"
 "Bug Reports" = "https://github.com/MinishLab/model2vec/issues"
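With the new group in place, the ONNX export dependencies can be pulled in via the standard extras syntax, e.g. pip install "model2vec[onnx]" (quoting the argument so the shell does not expand the brackets).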
10 changes: 10 additions & 0 deletions scripts/export_to_onnx.py
@@ -1,3 +1,12 @@
+from model2vec.utils import get_package_extras, importable
+
+# Define the optional dependency group name
+_REQUIRED_EXTRA = "onnx"
+
+# Check if each dependency for the "onnx" group is importable
+for extra_dependency in get_package_extras("model2vec", _REQUIRED_EXTRA):
+    importable(extra_dependency, _REQUIRED_EXTRA)
+
 import argparse
 import logging
 from pathlib import Path
@@ -6,6 +15,7 @@
 
 from model2vec import StaticModel
 
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 

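The diff only shows importable being called; its signature (def importable(module: str, extra: str) -> None, visible in the utils.py context above) suggests a fail-fast import guard. A plausible sketch under that assumption — the name and signature come from the diff, but the body and error message here are guesses, not the library's actual implementation:

    import importlib

    def importable(module: str, extra: str) -> None:
        # Hypothetical body: try the import and fail fast with an install hint.
        try:
            importlib.import_module(module)
        except ImportError:
            raise ImportError(
                f"`{module}` is required for the '{extra}' extra. "
                f"Install it with: pip install 'model2vec[{extra}]'"
            )

For the onnx group the distribution names ("onnx", "torch") double as importable module names, so passing them straight to import_module works; a group like distill would need a distribution-to-module mapping (scikit-learn imports as sklearn).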
58 changes: 57 additions & 1 deletion uv.lock

(Diff not rendered: uv.lock is a generated file.)
