Skip to content

Commit

Permalink
chore: Update transformers on 1.x branch (#8528)
Browse files Browse the repository at this point in the history
* Update transformers

* New Anthropic tokenizer.json URL

* Black it

* Fix mypy failures

* Fix pylint failures

---------

Co-authored-by: Silvano Cerza <[email protected]>
  • Loading branch information
vblagoje and silvanocerza authored Nov 11, 2024
1 parent a7005f6 commit ab28a2b
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 24 deletions.
12 changes: 8 additions & 4 deletions haystack/document_stores/mongodb_atlas.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
import re
from typing import Dict, Generator, List, Optional, Union

import numpy as np
from tqdm import tqdm

from haystack import __version__ as haystack_version
from haystack.document_stores import BaseDocumentStore
from haystack.errors import DocumentStoreError
from haystack.nodes.retriever import DenseRetriever
from haystack.schema import Document, FilterType
from haystack.utils import get_batches_from_generator
from haystack import __version__ as haystack_version
from .mongodb_filters import mongo_filter_converter

from ..lazy_imports import LazyImport
from .mongodb_filters import mongo_filter_converter

with LazyImport("Run 'pip install farm-haystack[mongodb]'") as mongodb_import:
import pymongo
from pymongo import InsertOne, ReplaceOne, UpdateOne
from pymongo.collection import Collection as MongoCollection
from pymongo.driver_info import DriverInfo

METRIC_TYPES = ["euclidean", "cosine", "dotProduct"]
Expand Down Expand Up @@ -82,7 +86,7 @@ def __init__(
def _create_document_field_map(self) -> Dict:
return {self.embedding_field: "embedding"}

def _get_collection(self, index=None) -> "pymongo.collection.Collection":
def _get_collection(self, index=None) -> "MongoCollection":
"""
Returns the collection named by index or returns the collection specified when the
driver was initialized.
Expand Down Expand Up @@ -126,7 +130,7 @@ def delete_documents(
elif (ids, filters) == (ids, filters):
mongo_filters = {"$and": [mongo_filter_converter(filters), {"id": {"$in": ids}}]}

collection.delete_many(filter=mongo_filters)
collection.delete_many(filter=mongo_filters) # pylint: disable=possibly-used-before-assignment

def delete_index(self, index=None):
"""
Expand Down
17 changes: 7 additions & 10 deletions haystack/modeling/model/language_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,24 @@
Thanks for the great work!
"""

from typing import Type, Optional, Dict, Any, Union, List

import re
import json
import logging
import os
import re
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional, Type, Union

import numpy as np
import torch
from torch import nn
import transformers
from transformers import PretrainedConfig, PreTrainedModel
from transformers import AutoModel, AutoConfig
from torch import nn
from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel
from transformers.modeling_utils import SequenceSummary

from haystack.errors import ModelingError
from haystack.modeling.utils import silence_transformers_logs


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -213,8 +211,7 @@ def _pool_tokens(
):
token_vecs = sequence_output.cpu().numpy()
# we only take the aggregated value of non-padding tokens
padding_mask = padding_mask.cpu().numpy()
ignore_mask_2d = padding_mask == 0
ignore_mask_2d = padding_mask.cpu().numpy() == 0
# sometimes we want to exclude the CLS token as well from our aggregation operation
if ignore_first_token:
ignore_mask_2d[:, 0] = True
Expand All @@ -225,7 +222,7 @@ def _pool_tokens(
if strategy == "reduce_mean":
pooled_vecs = np.ma.array(data=token_vecs, mask=ignore_mask_3d).mean(axis=1).data

return pooled_vecs
return pooled_vecs # pylint: disable=possibly-used-before-assignment


class HFLanguageModel(LanguageModel):
Expand Down
7 changes: 3 additions & 4 deletions haystack/modeling/model/prediction_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,15 +502,14 @@ def logits_to_preds(
# sorted_candidates.shape : (batch_size, max_seq_len^2, 2)
start_indices = torch.div(flat_sorted_indices, max_seq_len, rounding_mode="trunc")
end_indices = flat_sorted_indices % max_seq_len
sorted_candidates = torch.cat((start_indices, end_indices), dim=2)

# Get the n_best candidate answers for each sample
sorted_candidates = sorted_candidates.cpu().numpy()
start_end_matrix = start_end_matrix.cpu().numpy()
sorted_candidates = torch.cat((start_indices, end_indices), dim=2).cpu().numpy()
start_end_matrix_array = start_end_matrix.cpu().numpy()
for sample_idx in range(batch_size):
sample_top_n = self.get_top_candidates(
sorted_candidates[sample_idx],
start_end_matrix[sample_idx],
start_end_matrix_array[sample_idx],
sample_idx,
start_matrix=start_matrix[sample_idx],
end_matrix=end_matrix[sample_idx],
Expand Down
7 changes: 4 additions & 3 deletions haystack/nodes/prompt/invocation_layer/anthropic_claude.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@
# Taken from:
# https://github.com/anthropics/anthropic-sdk-python/blob/main/anthropic/tokenizer.py#L7
# This is a JSON config to load the tokenizer used for Anthropic Claude.
CLAUDE_TOKENIZER_REMOTE_FILE = (
"https://raw.githubusercontent.com/anthropics/anthropic-sdk-python/main/src/anthropic/tokenizer.json"
)
# Anthropic removed tokenizer.json from their repo (https://github.com/anthropics/anthropic-sdk-python/pull/726),
# so we pin the URL to the last commit that still contains it: SDK version 0.38.0,
# commit hash 14afc93ffd809e60666a267763a57a328184c5e4.
CLAUDE_TOKENIZER_REMOTE_FILE = "https://raw.githubusercontent.com/anthropics/anthropic-sdk-python/14afc93ffd809e60666a267763a57a328184c5e4/src/anthropic/tokenizer.json"


class AnthropicClaudeInvocationLayer(PromptModelInvocationLayer):
Expand Down
2 changes: 1 addition & 1 deletion haystack/utils/experiment_tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def track_params(self, params: Dict[str, Any]):

def track_artifacts(self, dir_path: Union[str, Path], artifact_path: Optional[str] = None):
try:
mlflow.log_artifacts(dir_path, artifact_path)
mlflow.log_artifacts(str(dir_path), artifact_path)
except ConnectionError:
logger.warning("ConnectionError in logging artifacts to MLflow")
except Exception as e:
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ dependencies = [
"requests",
"httpx",
"pydantic<2",
"transformers==4.39.3",
"transformers>=4.46,<5.0",
"pandas",
"rank_bm25",
"scikit-learn>=1.3.0", # TF-IDF and metrics
Expand Down Expand Up @@ -86,7 +86,7 @@ dependencies = [

[project.optional-dependencies]
inference = [
"transformers[torch,sentencepiece]==4.39.3",
"transformers[torch,sentencepiece]>=4.46,<5.0",
"sentence-transformers<=3.0.0,>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
"huggingface-hub>=0.5.0",
]
Expand Down

0 comments on commit ab28a2b

Please sign in to comment.