Merge pull request FlagOpen#1227 from 545999961/master
update mteb eval
545999961 authored Nov 15, 2024
2 parents d1c3b3f + abd9ae8 commit fde4abd
Showing 6 changed files with 42 additions and 31 deletions.
3 changes: 3 additions & 0 deletions FlagEmbedding/abc/evaluation/searcher.py

@@ -138,6 +138,9 @@ def __call__(
                 (not os.path.exists(os.path.join(corpus_embd_save_dir, "doc.npy")) or self.overwrite):
             os.makedirs(corpus_embd_save_dir, exist_ok=True)
             np.save(os.path.join(corpus_embd_save_dir, "doc.npy"), corpus_emb)
+
+        gc.collect()
+        torch.cuda.empty_cache()
 
         faiss_index = index(corpus_embeddings=corpus_emb)
         all_scores, all_indices = search(query_embeddings=queries_emb, faiss_index=faiss_index, k=self.search_top_k)
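The two added lines free memory between embedding the corpus and building the FAISS index: gc.collect() drops dead Python references, and torch.cuda.empty_cache() returns PyTorch's cached GPU blocks to the driver. A minimal standalone sketch of the pattern (array sizes are made up; the empty_cache call is guarded so it is safe on CPU-only machines):

    import gc

    import numpy as np
    import torch

    corpus_emb = np.random.rand(100_000, 768).astype(np.float32)  # stand-in embeddings

    # Release host-side garbage first, then PyTorch's cached CUDA blocks, so
    # the FAISS index build that follows has as much free memory as possible.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()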
3 changes: 1 addition & 2 deletions FlagEmbedding/abc/inference/AbsEmbedder.py

@@ -264,8 +264,7 @@ def encode(
         return embeddings
 
     def __del__(self):
-        if self.pool is not None:
-            self.stop_multi_process_pool(self.pool)
+        self.stop_self_pool()
 
     @abstractmethod
     def encode_single_device(
3 changes: 1 addition & 2 deletions FlagEmbedding/abc/inference/AbsReranker.py

@@ -210,8 +210,7 @@ def compute_score(
         return scores
 
     def __del__(self):
-        if self.pool is not None:
-            self.stop_multi_process_pool(self.pool)
+        self.stop_self_pool()
 
    @abstractmethod
    def compute_score_single_gpu(
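Both destructors now delegate to a shared stop_self_pool() helper instead of stopping the pool inline. The helper itself is outside this diff; judging from the stop_self_query_pool() added to icl.py below, it plausibly looks like this sketch (an assumption, not the committed code):

    def stop_self_pool(self):
        # Stop the multi-process encoding pool if one is running, then move
        # the model off the GPU and release cached host/device memory.
        if self.pool is not None:
            self.stop_multi_process_pool(self.pool)
            self.pool = None
        self.model.to('cpu')
        gc.collect()
        torch.cuda.empty_cache()

Centralizing the shutdown also lets the other call sites in this commit (encode_queries in icl.py, the MTEB searcher's stop_pool) reuse the same cleanup path.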
26 changes: 12 additions & 14 deletions FlagEmbedding/evaluation/mteb/runner.py

@@ -82,14 +82,15 @@ def read_results(self, output_folder, tasks):
                     print('ERROR')
                     break
 
-                temp_data = data['scores'][split][0]
-                if metric == 'ap':
-                    tasks_results[t_type][task_name] = round(temp_data['cos_sim']['ap'] * 100, 2)
-                elif metric == 'cosine_spearman':
-                    tasks_results[t_type][task_name] = round(temp_data['cos_sim']['spearman'] * 100, 2)
-                else:
-                    tasks_results[t_type][task_name] = round(temp_data[metric] * 100, 2)
+                temp_datas = data['scores'][split]
+                temp_data = None
+                for td in temp_datas:
+                    if td['hf_subset'] == 'default':
+                        temp_data = td
+                if temp_data is None:
+                    temp_data = temp_datas[0]
+                tasks_results[t_type][task_name] = round(temp_data['main_score'] * 100, 2)
 
         print(f"tasks_results: {tasks_results}")
        return tasks_results
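The rewrite stops dispatching on per-task metric names ('ap', 'cosine_spearman', ...) and instead reads the main_score that MTEB result files expose for every task, preferring the 'default' hf_subset when several subsets are reported. A sketch of the result-file shape this code consumes (subset names and scores are made up):

    data = {
        "scores": {
            "test": [
                {"hf_subset": "en-ext", "main_score": 0.7412},
                {"hf_subset": "default", "main_score": 0.7533},
            ]
        }
    }

    # Equivalent selection in one line: prefer 'default', else take the first.
    scores = data["scores"]["test"]
    chosen = next((s for s in scores if s["hf_subset"] == "default"), scores[0])
    assert round(chosen["main_score"] * 100, 2) == 75.33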

@@ -145,16 +146,13 @@ def run(self):
                 task_types=task_types
             )
             output_folder = self.eval_args.output_dir
-            new_tasks = []
-            for task in tasks:
-                if task.languages is not None:
-                    if len(task.languages) == len([e for e in languages if e in task.languages]):
-                        new_tasks.append(task)
-
-            for task in new_tasks:
+            for task in tasks:
                 task_name = task.metadata.name
                 task_type = task.metadata.type
 
+                self.retriever.stop_pool()
+
                 if self.eval_args.use_special_instructions:
                     try:
                         instruction = get_task_def_by_task_name_and_type(task_name, task_type)
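Two things change here: the manual language filter is dropped (presumably because the task list is already filtered by language upstream), and self.retriever.stop_pool() now runs at the top of every iteration, so each task starts with the previous task's worker pools and GPU cache released. A rough sketch of the per-task reset pattern, assuming the standard mteb API (mteb.MTEB and its run() signature belong to the mteb package, not this repository):

    import mteb

    for task in tasks:
        retriever.stop_pool()  # free pools/GPU memory left by the previous task
        evaluation = mteb.MTEB(tasks=[task])
        evaluation.run(retriever, output_folder=output_folder)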
15 changes: 12 additions & 3 deletions FlagEmbedding/evaluation/mteb/searcher.py

@@ -1,3 +1,5 @@
+import numpy as np
+
 from typing import List, Dict, Optional
 from FlagEmbedding.abc.evaluation import EvalDenseRetriever, EvalReranker
 
@@ -41,6 +43,13 @@ def set_normalize_embeddings(self, normalize_embeddings: bool = True):
         """
         self.embedder.normalize_embeddings = normalize_embeddings
 
+    def stop_pool(self):
+        self.embedder.stop_self_pool()
+        try:
+            self.embedder.stop_self_query_pool()
+        except:
+            pass
+
     def encode_queries(self, queries: List[str], **kwargs):
         """Encode input queries.
@@ -53,7 +62,7 @@ def encode_queries(self, queries: List[str], **kwargs):
         emb = self.embedder.encode_queries(queries)
         if isinstance(emb, dict):
             emb = emb["dense_vecs"]
-        return emb
+        return emb.astype(np.float32)
 
     def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs):
         """Encode input corpus.
 
@@ -71,7 +80,7 @@ def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs):
         emb = self.embedder.encode_corpus(input_texts)
         if isinstance(emb, dict):
             emb = emb["dense_vecs"]
-        return emb
+        return emb.astype(np.float32)
 
     def encode(self, corpus: List[Dict[str, str]], **kwargs):
         """Encode the input.
 
@@ -89,7 +98,7 @@ def encode(self, corpus: List[Dict[str, str]], **kwargs):
         emb = self.embedder.encode_queries(input_texts)
         if isinstance(emb, dict):
             emb = emb["dense_vecs"]
-        return emb
+        return emb.astype(np.float32)
 
 class MTEBEvalReranker(EvalReranker):
     """
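All three encode paths now cast to float32 before returning. FAISS only accepts float32 input, so an embedder running in fp16 or bf16 would otherwise hand back arrays the downstream index rejects. A quick illustration (assumes faiss-cpu or faiss-gpu is installed):

    import faiss
    import numpy as np

    emb = np.random.rand(4, 8).astype(np.float16)  # e.g. raw fp16 model output
    index = faiss.IndexFlatIP(8)
    # index.add(emb) would raise here: FAISS requires float32 vectors.
    index.add(emb.astype(np.float32))
    print(index.ntotal)  # 4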
23 changes: 13 additions & 10 deletions FlagEmbedding/inference/embedder/decoder_only/icl.py

@@ -4,6 +4,7 @@
 import queue
 from multiprocessing import Queue
 
+import gc
 import torch
 import numpy as np
 from transformers import AutoModel, AutoTokenizer
 
@@ -121,10 +122,8 @@ def __init__(
         self.query_pool = None
 
     def __del__(self):
-        if self.pool is not None:
-            self.stop_multi_process_pool(self.pool)
-        if self.query_pool is not None:
-            self.stop_multi_process_pool(self.query_pool)
+        self.stop_self_pool()
+        self.stop_self_query_pool()
 
     def set_examples(self, examples_for_task: Optional[List[dict]] = None):
         """Set the prefix to the provided examples.
 
@@ -175,6 +174,14 @@ def get_detailed_example(instruction_format: str, instruction: str, query: str, response: str):
         """
         return instruction_format.format(instruction, query, response)
 
+    def stop_self_query_pool(self):
+        if self.query_pool is not None:
+            self.stop_multi_process_pool(self.query_pool)
+            self.query_pool = None
+        self.model.to('cpu')
+        gc.collect()
+        torch.cuda.empty_cache()
+
     def encode_queries(
         self,
         queries: Union[List[str], str],
 
@@ -209,9 +216,7 @@ def encode_queries(
             **kwargs
         )
 
-        if self.pool is not None:
-            self.stop_multi_process_pool(self.pool)
-            self.pool = None
+        self.stop_self_pool()
         if self.query_pool is None:
             self.query_pool = self.start_multi_process_pool(ICLLLMEmbedder._encode_queries_multi_process_worker)
         embeddings = self.encode_multi_process(
 
@@ -244,9 +249,7 @@ def encode_corpus(
         Returns:
             Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
         """
-        if self.query_pool is not None:
-            self.stop_multi_process_pool(self.query_pool)
-            self.query_pool = None
+        self.stop_self_query_pool()
         return super().encode_corpus(
             corpus,
             batch_size=batch_size,
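The ICL embedder keeps two multi-process pools, one for queries (which carry the in-context examples) and one for the corpus, and the refactored helpers make them mutually exclusive: encode_queries stops the corpus pool before spawning the query pool, and encode_corpus stops the query pool (moving the model to CPU and emptying the CUDA cache) before delegating to the base class. The resulting usage pattern, sketched with the public wrapper (the model name is only an example):

    from FlagEmbedding import FlagICLModel  # wraps ICLLLMEmbedder

    model = FlagICLModel('BAAI/bge-en-icl')                # example checkpoint
    q_emb = model.encode_queries(["what does faiss do?"])  # query pool spawns
    d_emb = model.encode_corpus(["FAISS is a similarity search library."])
    # encode_corpus first ran stop_self_query_pool(), so the query workers'
    # GPU memory was released before corpus encoding began.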
