Commit
feat, chore: add CerebrasProvider, update readme
vTuanpham committed Oct 4, 2024
1 parent c9acb08 commit f837592
Showing 6 changed files with 401 additions and 2 deletions.
12 changes: 12 additions & 0 deletions README.md
@@ -136,6 +136,18 @@ https://cloud.google.com/translate/docs/languages
## Known Issues
* TypeError: 'NoneType' object is not iterable
This issue relates to gender-specific translation; you can read more here: https://github.com/ssut/py-googletrans/issues/260
## Other use cases
The tool can be used for various NLP tasks that can be parallelized, as long as the input and output data can be represented as a string or a list of strings. Some examples include:
* Image captioning
* Text summarization
* Text generation
* Text classification
* Text to speech
* Text to image

Implementing these tasks requires creating a new provider class that inherits from the `Provider` class and implements `__init__` and `_do_translate`, as in the sketch below. Configure the throttle decorator to control the rate of requests.
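A minimal sketch of such a provider follows. The exact `_do_translate` signature and the throttle decorator's name and parameters are assumptions here, so the decorator is only indicated in a comment; check `providers/base_provider.py` for the real contract.

```python
from typing import List, Union

from providers.base_provider import Provider


class UppercaseProvider(Provider):
    """A toy provider that 'translates' by uppercasing, standing in for
    any string-in/string-out task (captioning, summarization, TTS, ...)."""

    def __init__(self):
        # Set up whatever client or model handle your task needs here.
        pass

    # A throttle decorator would go here to control the request rate,
    # e.g. @throttle(...); its exact interface is not shown in this excerpt.
    def _do_translate(self, input_data: Union[str, List[str]],
                      src: str, dest: str, **kwargs) -> Union[str, List[str]]:
        # Handle both a single string and a batch of strings.
        if isinstance(input_data, list):
            return [text.upper() for text in input_data]
        return input_data.upper()
```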


#### Feel free to star 🌟 the repository if the test was successful!
#### Disclaimer: This repo is for private use only.

77 changes: 77 additions & 0 deletions examples/argilla-magpie-ultra-v0.1-cerebras/MagpieUltraV01.py
@@ -0,0 +1,77 @@
import random
import sys

sys.path.insert(0, r'./')
from tqdm.auto import tqdm

from datasets import load_dataset

from configs import BaseConfig
from translator import DataParser, VerboseCallback
from providers import CerebrasProvider


PARSER_NAME = "MagpieUltraV01_500"

# Patience is key here, since the dataset is large and we are using an LLM-based translator
class MagpieUltraV01Parser(DataParser):
def __init__(self, file_path: str, output_path: str):
super().__init__(file_path, output_path,
parser_name=PARSER_NAME,
                         target_config=BaseConfig,   # The data config to validate against; it checks whether your own "convert" implementation is correct.
                                                     # You must map the data to the matching fields of the @dataclass in configs/base_config.py
                         target_fields=['question_text', 'orig_answer_texts'],   # The fields to be translated (these fields belong to BaseConfig)
                         do_translate=True,
                         no_translated_code=False,   # If True, remove any string that appears to be code (e.g. Python, HTML, etc.)
                         translator=CerebrasProvider,   # Cerebras is very slow, but it is a high-quality translator
                         parser_callbacks=[VerboseCallback],   # Callbacks to run after the data has been converted and translated
                         max_example_per_thread=25,   # Keep this low: one failed translation restarts its whole thread, losing that thread's progress
                         large_chunks_threshold=3000)

# Read function must assign data that has been read to self.data_read
def read(self) -> None:
# The read function must call the read function in DataParser class
# I just want to be sure that the file path is correct
super(MagpieUltraV01Parser, self).read()

self.data_read = load_dataset("argilla/magpie-ultra-v0.1")
self.system_prompts = load_dataset("teilomillet/system_prompt")

return None

# Convert function must assign data that has been converted to self.converted_data
def convert(self) -> None:
# The convert function must call the convert function in DataParser class
# I just want to be sure the read function has actually assigned the self.data_read
super(MagpieUltraV01Parser, self).convert()

data_converted = []
for split in self.data_read:
for data in tqdm(self.data_read[split], desc=f"Converting {split} data"):
data_dict = {}
random_index = random.randint(0, len(self.system_prompts['train']) - 1)

if random.random() < 0.5:
data_dict['system_prompt'] = None
else:
data_dict['system_prompt'] = self.system_prompts['train'][random_index]['prompt']

data_dict['qas_id'] = self.id_generator()
data_dict['question_text'] = data['instruction']
data_dict['orig_answer_texts'] = data['response']
data_dict['answer_lengths'] = None

data_converted.append(data_dict)

# Be sure to assign the final data list to self.converted_data
        self.converted_data = data_converted[:500]  # Keep only 500 examples per run to avoid hitting token limits

return None


if __name__ == '__main__':
magpie_ultra_v01_parser = MagpieUltraV01Parser(r"examples/argilla-magpie-ultra-v0.1-cerebras/dummy.txt",
r"examples/argilla-magpie-ultra-v0.1-cerebras")
magpie_ultra_v01_parser.read()
magpie_ultra_v01_parser.convert()
magpie_ultra_v01_parser.save
Empty file.
7 changes: 6 additions & 1 deletion install.sh
@@ -4,12 +4,17 @@ echo "Installing dependencies..."

pip install -r requirements.txt
pip install groq==0.9.0
pip install cerebras_cloud_sdk==1.5.0

if [ -z "$GROQ_API_KEY" ]; then
echo "GROQ_API_KEY environment variable is not set. Please set it to your project's GROQ API key. to use the groq provider."
fi

pip install httpx==1.0.0.beta0 --force-reinstall
if [ -z "$CEREBRAS_API_KEY" ]; then
echo "CEREBRAS_API_KEY environment variable is not set. Please set it to your project's CEREBRAS API key. to use the CEREBRAS provider."
fi

pip install httpx==1.0.0.beta0 --force-reinstall # Must be last to avoid conflicts with other dependencies

python string_ops/build.py

3 changes: 2 additions & 1 deletion providers/__init__.py
@@ -1,4 +1,5 @@
from .base_provider import Provider
from .google_provider import GoogleProvider
from .multiple_providers import MultipleProviders
from .groq_provider import GroqProvider
from .groq_provider import GroqProvider
from .cerebras_provider import CerebrasProvider
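The new `providers/cerebras_provider.py` added by this commit is not shown in this excerpt. As a rough, non-authoritative sketch, a Cerebras-backed provider built on the `cerebras_cloud_sdk` OpenAI-style chat client might look like the following; the model name, prompt format, and internals are assumptions, not the commit's actual code:

```python
import os
from typing import List, Union

# cerebras_cloud_sdk is imported through the cerebras.cloud.sdk namespace.
from cerebras.cloud.sdk import Cerebras

from providers.base_provider import Provider


class CerebrasProviderSketch(Provider):
    """Illustrative only; the real CerebrasProvider may differ in model
    choice, prompting, throttling, and error handling."""

    def __init__(self):
        # Reads the key that the install.sh check above asks for.
        self.client = Cerebras(api_key=os.environ.get("CEREBRAS_API_KEY"))

    def _do_translate(self, input_data: Union[str, List[str]],
                      src: str, dest: str, **kwargs) -> Union[str, List[str]]:
        # Normalize to a batch, translate one string at a time.
        texts = input_data if isinstance(input_data, list) else [input_data]
        outputs = []
        for text in texts:
            response = self.client.chat.completions.create(
                model="llama3.1-8b",  # assumed model name
                messages=[{
                    "role": "user",
                    "content": f"Translate the following from {src} to {dest}:\n{text}",
                }],
            )
            outputs.append(response.choices[0].message.content)
        return outputs if isinstance(input_data, list) else outputs[0]
```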
