chore: Update transformers on 1.x branch (#8528)

* Update transformers
* New Anthropic tokenizer.json URL
* Black it
* Fix mypy failures
* Fix pylint failures

---------

Co-authored-by: Silvano Cerza <[email protected]>
deepset-ai · Nov 11, 2024 · ab28a2b · ab28a2b
1 parent a7005f6
commit ab28a2b
Show file tree

Hide file tree

Showing 6 changed files with 25 additions and 24 deletions.
diff --git a/haystack/document_stores/mongodb_atlas.py b/haystack/document_stores/mongodb_atlas.py
@@ -1,19 +1,23 @@
 import re
 from typing import Dict, Generator, List, Optional, Union
+
 import numpy as np
 from tqdm import tqdm
+
+from haystack import __version__ as haystack_version
 from haystack.document_stores import BaseDocumentStore
 from haystack.errors import DocumentStoreError
 from haystack.nodes.retriever import DenseRetriever
 from haystack.schema import Document, FilterType
 from haystack.utils import get_batches_from_generator
-from haystack import __version__ as haystack_version
-from .mongodb_filters import mongo_filter_converter
+
 from ..lazy_imports import LazyImport
+from .mongodb_filters import mongo_filter_converter
 
 with LazyImport("Run 'pip install farm-haystack[mongodb]'") as mongodb_import:
     import pymongo
     from pymongo import InsertOne, ReplaceOne, UpdateOne
+    from pymongo.collection import Collection as MongoCollection
     from pymongo.driver_info import DriverInfo
 
 METRIC_TYPES = ["euclidean", "cosine", "dotProduct"]
@@ -82,7 +86,7 @@ def __init__(
     def _create_document_field_map(self) -> Dict:
         return {self.embedding_field: "embedding"}
 
-    def _get_collection(self, index=None) -> "pymongo.collection.Collection":
+    def _get_collection(self, index=None) -> "MongoCollection":
         """
         Returns the collection named by index or returns the collection specified when the
         driver was initialized.
@@ -126,7 +130,7 @@ def delete_documents(
         elif (ids, filters) == (ids, filters):
             mongo_filters = {"$and": [mongo_filter_converter(filters), {"id": {"$in": ids}}]}
 
-        collection.delete_many(filter=mongo_filters)
+        collection.delete_many(filter=mongo_filters)  # pylint: disable=possibly-used-before-assignment
 
     def delete_index(self, index=None):
         """

diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py
@@ -18,26 +18,24 @@
 Thanks for the great work!
 """
 
-from typing import Type, Optional, Dict, Any, Union, List
-
-import re
 import json
 import logging
 import os
+import re
 from abc import ABC, abstractmethod
 from pathlib import Path
+from typing import Any, Dict, List, Optional, Type, Union
+
 import numpy as np
 import torch
-from torch import nn
 import transformers
-from transformers import PretrainedConfig, PreTrainedModel
-from transformers import AutoModel, AutoConfig
+from torch import nn
+from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel
 from transformers.modeling_utils import SequenceSummary
 
 from haystack.errors import ModelingError
 from haystack.modeling.utils import silence_transformers_logs
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -213,8 +211,7 @@ def _pool_tokens(
     ):
         token_vecs = sequence_output.cpu().numpy()
         # we only take the aggregated value of non-padding tokens
-        padding_mask = padding_mask.cpu().numpy()
-        ignore_mask_2d = padding_mask == 0
+        ignore_mask_2d = padding_mask.cpu().numpy() == 0
         # sometimes we want to exclude the CLS token as well from our aggregation operation
         if ignore_first_token:
             ignore_mask_2d[:, 0] = True
@@ -225,7 +222,7 @@ def _pool_tokens(
         if strategy == "reduce_mean":
             pooled_vecs = np.ma.array(data=token_vecs, mask=ignore_mask_3d).mean(axis=1).data
 
-        return pooled_vecs
+        return pooled_vecs  # pylint: disable=possibly-used-before-assignment
 
 
 class HFLanguageModel(LanguageModel):

diff --git a/haystack/modeling/model/prediction_head.py b/haystack/modeling/model/prediction_head.py
@@ -502,15 +502,14 @@ def logits_to_preds(
         # sorted_candidates.shape : (batch_size, max_seq_len^2, 2)
         start_indices = torch.div(flat_sorted_indices, max_seq_len, rounding_mode="trunc")
         end_indices = flat_sorted_indices % max_seq_len
-        sorted_candidates = torch.cat((start_indices, end_indices), dim=2)
 
         # Get the n_best candidate answers for each sample
-        sorted_candidates = sorted_candidates.cpu().numpy()
-        start_end_matrix = start_end_matrix.cpu().numpy()
+        sorted_candidates = torch.cat((start_indices, end_indices), dim=2).cpu().numpy()
+        start_end_matrix_array = start_end_matrix.cpu().numpy()
         for sample_idx in range(batch_size):
             sample_top_n = self.get_top_candidates(
                 sorted_candidates[sample_idx],
-                start_end_matrix[sample_idx],
+                start_end_matrix_array[sample_idx],
                 sample_idx,
                 start_matrix=start_matrix[sample_idx],
                 end_matrix=end_matrix[sample_idx],

diff --git a/haystack/nodes/prompt/invocation_layer/anthropic_claude.py b/haystack/nodes/prompt/invocation_layer/anthropic_claude.py
@@ -22,9 +22,10 @@
 # Taken from:
 # https://github.com/anthropics/anthropic-sdk-python/blob/main/anthropic/tokenizer.py#L7
 # This is a JSON config to load the tokenizer used for Anthropic Claude.
-CLAUDE_TOKENIZER_REMOTE_FILE = (
-    "https://raw.githubusercontent.com/anthropics/anthropic-sdk-python/main/src/anthropic/tokenizer.json"
-)
+# Anthropic removed tokenizer.json from their repo (https://github.com/anthropics/anthropic-sdk-python/pull/726),
+# we need to use the commit from the latest version of the SDK that still
+# has it, i.e. 0.38.0 and commit hash 14afc93ffd809e60666a267763a57a328184c5e4.
+CLAUDE_TOKENIZER_REMOTE_FILE = "https://raw.githubusercontent.com/anthropics/anthropic-sdk-python/14afc93ffd809e60666a267763a57a328184c5e4/src/anthropic/tokenizer.json"
 
 
 class AnthropicClaudeInvocationLayer(PromptModelInvocationLayer):

diff --git a/haystack/utils/experiment_tracking.py b/haystack/utils/experiment_tracking.py
@@ -213,7 +213,7 @@ def track_params(self, params: Dict[str, Any]):
 
     def track_artifacts(self, dir_path: Union[str, Path], artifact_path: Optional[str] = None):
         try:
-            mlflow.log_artifacts(dir_path, artifact_path)
+            mlflow.log_artifacts(str(dir_path), artifact_path)
         except ConnectionError:
             logger.warning("ConnectionError in logging artifacts to MLflow")
         except Exception as e:

diff --git a/pyproject.toml b/pyproject.toml
@@ -49,7 +49,7 @@ dependencies = [
   "requests",
   "httpx",
   "pydantic<2",
-  "transformers==4.39.3",
+  "transformers>=4.46,<5.0",
   "pandas",
   "rank_bm25",
   "scikit-learn>=1.3.0", # TF-IDF and metrics
@@ -86,7 +86,7 @@ dependencies = [
 
 [project.optional-dependencies]
 inference = [
-  "transformers[torch,sentencepiece]==4.39.3",
+  "transformers[torch,sentencepiece]>=4.46,<5.0",
   "sentence-transformers<=3.0.0,>=2.3.1",  # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
   "huggingface-hub>=0.5.0",
 ]