diff --git a/milvus/build/merlinite-qq.sh b/milvus/build/merlinite-qq.sh
deleted file mode 100755
index a70f526..0000000
--- a/milvus/build/merlinite-qq.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-## EXPECTED INPUT IS STRING ECAPSULATED
-input="$1"
-echo "input: $input"
-request_body='{"model":"ibm/merlinite-7b","logprobs":false,"messages":[{"role": "system","content": "You are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."},{"role":"user","content": "'$input'"}],"stream":false}'
-echo $request_body
-curl -X 'POST' 'https://merlinite-7b-vllm-openai.apps.fmaas-backend.fmaas.res.ibm.com/v1/chat/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -k -d $request_body
diff --git a/milvus/seed/README.md b/milvus/seed/README.md
index e69de29..5df33a2 100644
--- a/milvus/seed/README.md
+++ b/milvus/seed/README.md
@@ -0,0 +1,29 @@
+RAG application with ILAB
+
+1. Set up a vector DB (Milvus)
+
+Development story:
+    0. Starting Goal:
+        - Naive RAG, no KG aided
+    - Addition:
+        1. Identify what the model lacks knowledge in
+        2. Can I use the internal trained model or do I have to use the HF model?
+    -
+
+- UI integration
+
+-----------------------------------------------
+
+variable definition
+class Config
+
+_identify_params,
+_llm_type, _extract_token_usage,
+
+Inherent in defining this spec, which could eventually live as a contribution to langchain, are some assumptions / questions I made:
+    - Is the model serializable: assumed no
+    - Max tokens for merlinite and granite: both assumed 4096
+    - Does this model have attention / memory?
+    - Do these models have a verbosity option for output?
+    - Recommended default values:
+    -
\ No newline at end of file
diff --git a/milvus/seed/__pycache__/ilab_model.cpython-311.pyc b/milvus/seed/__pycache__/ilab_model.cpython-311.pyc
new file mode 100644
index 0000000..2b8da03
Binary files /dev/null and b/milvus/seed/__pycache__/ilab_model.cpython-311.pyc differ
diff --git a/milvus/seed/__pycache__/merlinite_model.cpython-311.pyc b/milvus/seed/__pycache__/merlinite_model.cpython-311.pyc
new file mode 100644
index 0000000..19d0734
Binary files /dev/null and b/milvus/seed/__pycache__/merlinite_model.cpython-311.pyc differ
diff --git a/milvus/seed/client.py b/milvus/seed/client.py
index 0e23b13..53d1e4e 100644
--- a/milvus/seed/client.py
+++ b/milvus/seed/client.py
@@ -1,6 +1,12 @@
-import httpx
+import requests
+import json
 import os
-import ssl
+from ilab_model import IlabLLM
+from dotenv import load_dotenv
+from langchain_core.prompts import PromptTemplate
+from langchain.chains import LLMChain
+
+load_dotenv()
 
 # manage ENV
 model_endpoint=os.getenv('MODEL_ENDPOINT')
@@ -11,48 +17,50 @@
 if model_name == "":
     model_name = "ibm/merlinite-7b"
 
-model_token=os.getenv('MODEL_TOKEN')
+model_token=os.getenv('ILAB_API_TOKEN')
 
 # HTTPS client
-client_key_path = "/home/fedora/client-tls-key.pem2"
-client_crt_path = "/home/fedora/client-tls-crt.pem2"
-server_ca_crt = "/home/fedora/server-ca-crt.pem2"
-
-ssl_context = ssl.create_default_context(cafile=server_ca_crt)
-ssl_context.load_cert_chain(certfile=client_crt_path, keyfile=client_key_path)
-
-client = httpx.Client(verify=ssl_context)
-
-
-def get_openai_response(prompt, **kwargs):
-    url = model_endpoint
-    headers = {
-        "Authorization": f"Bearer {model_token}",
-        "Content-Type": "application/json"
-    }
-    data = {
-        "model": model_name,
-        "max_tokens": 4096,
-        "messages": [
-            {
-                "role": "system",
-                "content": "You are an AI language model developed by IBM Research. You are a cautious assistant that carefully follows instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
-            },
-            {
-                "role":"user",
-                "content": prompt
-            }
-        ],
-        "logprobs":False,
-        "stream":False
-    }
-
-    response = client.post(url, json=data, headers=headers)
-    response.raise_for_status()
-    return response.json()
-
-question = """ Question: I am training for an upcoming marathon but I am completely out of shape! Can you help me to implement a plan to prepare me for running a marathon in 12 weeks?
-
""" - -# get_openai_response(question) +# client_key_path = "/home/fedora/client-tls-key.pem2" +# client_crt_path = "/home/fedora/client-tls-crt.pem2" +# server_ca_crt = "/home/fedora/server-ca-crt.pem2" + +# ssl_context = ssl.create_default_context(cafile=server_ca_crt) +# ssl_context.load_cert_chain(certfile=client_crt_path, keyfile=client_key_path) + +# client = httpx.Client(verify=ssl_context) + +# data = { +# "model": "instructlab/granite-7b-lab", +# "messages": [ +# {"role": "system", "content": "your name is carl"}, +# {"role": "user", "content": "what is your name?"} +# ], +# "temperature": 1, +# "max_tokens": 1792, +# "top_p": 1, +# "repetition_penalty": 1.05, +# "stop": ["<|endoftext|>"], +# "logprobs": False, +# "stream": False +# } + +# response = requests.post(url, headers=headers, data=json.dumps(data), verify=False) +# print(response.json()) +print(f'model_name={model_name}') +llm = IlabLLM( + model_endpoint=model_endpoint, + model_name=model_name, + apikey=model_token, + temperature=1, + max_tokens=500, + top_p=1, + repetition_penalty=1.05, + stop=["<|endoftext|>"], + streaming=False +) + +prompt="I am training for a marathon in 12 weeks. Can you help me build an exercise plan to help prepare myself?" +prompts=[prompt] +# prompt_template = PromptTemplate.from_template(prompt) +llm.generate(prompts) +# llm.invoke("dog") diff --git a/milvus/seed/dumb_client.py b/milvus/seed/dumb_client.py new file mode 100644 index 0000000..e08c912 --- /dev/null +++ b/milvus/seed/dumb_client.py @@ -0,0 +1,40 @@ +import requests +import json +import os +from dotenv import load_dotenv + +load_dotenv() + +# manage ENV +model_endpoint=os.getenv('MODEL_ENDPOINT') +if model_endpoint == "": + model_endpoint = "http://localhost:8001" + +model_name=os.getenv('MODEL_NAME') +if model_name == "": + model_name = "ibm/merlinite-7b" + +model_token=os.getenv('MODEL_TOKEN') + +headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {model_token}" +} + +data = { + "model": model_name, + "messages": [ + {"role": "system", "content": "your name is carl"}, + {"role": "user", "content": "what is your name?"} + ], + "temperature": 1, + "max_tokens": 1792, + "top_p": 1, + "repetition_penalty": 1.05, + "stop": ["<|endoftext|>"], + "logprobs": False, + "stream": False +} + +response = requests.post(model_endpoint, headers=headers, data=json.dumps(data), verify=False) +print(response.json()) \ No newline at end of file diff --git a/milvus/seed/ilab_model.py b/milvus/seed/ilab_model.py new file mode 100644 index 0000000..bc0b009 --- /dev/null +++ b/milvus/seed/ilab_model.py @@ -0,0 +1,372 @@ +#!/bin/python3 + +## This is a langchain compatabible implementation for the Ilab models. It will remain in this repo until we publish APIKey +## functionality and route backendservice endpoints through a proxy that can be exposed, similary to openAI. At which point +## we can move this pr as a contribution to langchain and easily scale our usage! + +### Fixes in progress: + ### - override self params with calls invoke or generate for temperature, etc. 
+    ### - test that invoke works, generate starts
+    ### - Feat: streaming implementation
+    ### - Callbacks with streaming
+    ### - Authentication enablement via user and password rather than just API keys
+    ### - Authentication checking for API keys (whole backend API setup)
+    ### - Utilize tags and metadata with langserve
+    ### - Allow logprobs as an option
+
+import os
+import httpx
+import requests
+import json
+from langchain_core.language_models.llms import BaseLLM
+from dotenv import load_dotenv
+from langchain_core.outputs import Generation, LLMResult
+from langchain_core.pydantic_v1 import Field, SecretStr, root_validator
+from langchain_core.utils import (
+    convert_to_secret_str,
+    get_from_dict_or_env,
+    get_pydantic_field_names,
+)
+from langchain_core.utils.utils import build_extra_kwargs
+
+load_dotenv()
+from typing import (
+    Any,
+    Dict,
+    List,
+    Set,
+    Optional,
+    Mapping
+)
+
+class IlabLLM(BaseLLM):
+    """
+    Instructlab large language model.
+
+    As this model is currently private, you must have pre-arranged access.
+    """
+
+    # REQUIRED PARAMS
+
+    model_endpoint: str = ""
+    """The model endpoint to use."""
+
+    model_name: str = Field(alias="model")
+    """Type of deployed model to use."""
+
+    # OPTIONAL BUT DEFAULTS
+
+    system_prompt: Optional[str] = "You are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
+    """Default system prompt to use."""
+
+    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
+    """Holds any model parameters valid for the `create` call not explicitly specified."""
+
+    max_tokens: int = 4096
+    """The maximum number of tokens to generate in the completion.
+    -1 returns as many tokens as possible given the prompt and
+    the model's maximal context size."""
+
+    # TOTALLY OPTIONAL
+
+    apikey: Optional[SecretStr] = None
+    """API key for the Ilab model APIs (merlinite or granite)."""
+
+    top_p: Optional[float] = 1
+    """Total probability mass of tokens to consider at each step."""
+
+    frequency_penalty: Optional[float] = 0
+    """Penalizes repeated tokens according to frequency."""
+
+    repetition_penalty: Optional[float] = 0
+    """Penalizes repeated tokens."""
+
+    temperature: Optional[float] = 0.7
+    """What sampling temperature to use."""
+
+    # verbose: Optional[str] = None
+    # """If the model should return verbose output or standard"""
+
+    streaming: bool = False
+    """Whether to stream the results or not."""
+
+    # FUTURE EXTENSIONS
+
+    tags: Optional[List[str]] = None
+    """Tags to add to the run trace."""
+
+    metadata: Optional[Dict[str, Any]] = None
+    """Metadata to add to the run trace."""
+
+    # This gets implemented with stream
+    # callbacks: Optional[SecretStr] = None
+    # """callbacks"""
+
+    # END PARAMS
+
+    class Config:
+        """Configuration for this pydantic object."""
+        allow_population_by_field_name = True
+
+    @property
+    def lc_secrets(self) -> Dict[str, str]:
+        """A map of constructor argument names to secret ids.
+
+        For example:
+            {
+                "apikey": "ILAB_API_KEY",
+            }
+        """
+        return {
+            "apikey": "ILAB_API_KEY",
+        }
+
+    @classmethod
+    def is_lc_serializable(cls) -> bool:
+        """Return whether this model can be serialized by Langchain."""
+        return False
+
+    @root_validator(pre=True)
+    def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Build extra kwargs from additional params that were passed in."""
+        all_required_field_names = get_pydantic_field_names(cls)
+        extra = values.get("model_kwargs", {})
+        values["model_kwargs"] = build_extra_kwargs(
+            extra, values, all_required_field_names
+        )
+        return values
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        if values["streaming"] == True:
+            raise ValueError("streaming has not yet been implemented.")
+        if values["apikey"] or "ILAB_API_KEY" in os.environ:
+            values["apikey"] = convert_to_secret_str(
+                get_from_dict_or_env(values, "apikey", "ILAB_API_KEY")
+            )
+        values['model_name'] = get_from_dict_or_env(
+            values,
+            "model_name",
+            "MODEL_NAME",
+        )
+        ## extension for more options for required auth params
+        ## client_params = {
+        ##     "api_key": (
+        ##         values["apikey"].get_secret_value()
+        ##         if values["apikey"]
+        ##         else None
+        ##     )
+        ## }
+        # CURRENTLY WE DON'T CHECK KEYS
+        ## if not client_params['values']['apikey']:
+        ##     raise ValueError("Did not find token `apikey`.")
+        return values
+
+    @property
+    def _params(self) -> Mapping[str, Any]:
+        """Get the identifying parameters."""
+        params = {**{
+            "model_name": self.model_name,
+            "model_endpoint": self.model_endpoint,
+        }, **self._default_params}
+        if self.apikey:
+            params['apikey'] = self.apikey
+        if self.model_name:
+            params['model_name'] = self.model_name
+        return params
+
+    @property
+    def _default_params(self) -> Dict[str, Any]:
+        """Get the default parameters for calling the Merlinite API."""
+        normal_params: Dict[str, Any] = {
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.repetition_penalty,
+        }
+
+        if self.max_tokens is not None:
+            normal_params["max_tokens"] = self.max_tokens
+
+        return {**normal_params, **self.model_kwargs}
+
+    def _invocation_params(self) -> Dict[str, Any]:
+        """Get the parameters used to invoke the model."""
+        return self._params
+
+    def make_request(self, params: Dict[str, Any], prompt: str, stop: Optional[List[str]]) -> Dict[str, Any]:
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.apikey}"
+        }
+
+        data = {
+            "model": params['model_name'],
+            "messages": [
+                {
+                    "role": "system",
+                    "content": self.system_prompt
+                },
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ],
+            "temperature": params['temperature'],
+            "max_tokens": params['max_tokens'],
+            "top_p": params['top_p'],
+            "stop": stop,
+            "logprobs": False,
+        }
+
+        if 'repetition_penalty' in params:
+            data["repetition_penalty"] = params['repetition_penalty']
+
+        if 'streaming' in params:
+            # Shadowing basemodel re-route for streaming
+            data["stream"] = params["streaming"]
+
+        response = requests.post(self.model_endpoint, headers=headers, data=json.dumps(data), verify=False)
+        response_json = response.json()
+        return response_json
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs: Any) -> str:
+        """Call the ilab inference endpoint. The result of invoke.
+
+        Args:
+            prompt: The prompt to pass into the model.
+            stop: Optional list of stop words to use when generating.
+            run_manager: Optional callback manager.
+
+        Returns:
+            The string generated by the model.
+
+        Example:
+            .. code-block:: python
+
+                response = merlinite.invoke("What is a molecule")
+        """
+
+        invocation_params = self._invocation_params()
+        params = {**invocation_params, **kwargs}
+
+        if stop is None:
+            stop = ["<|endoftext|>"]
+        response_json = self.make_request(
+            params=params, prompt=prompt, stop=stop
+        )
+        return response_json['choices'][0]['message']['content']
+
+    def _generate(
+        self,
+        prompts: List[str],
+        stop: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> LLMResult:
+        """Call out to Ilab's endpoint with prompt.
+
+        Args:
+            prompts: The prompts to pass into the model.
+            stop: Optional list of stop words to use when generating.
+
+        Returns:
+            The full LLM output.
+
+        Example:
+            .. code-block:: python
+
+                response = ilab.generate(["Tell me a joke."])
+        """
+
+        invocation_params = self._invocation_params()
+        params = {**invocation_params, **kwargs}
+        token_usage: Dict[str, int] = {}
+        system_fingerprint: Optional[str] = None
+
+        response_json = self.make_request(
+            params=params, prompt=prompts[0], stop=stop
+        )
+
+        if not ('choices' in response_json and len(response_json['choices']) > 0):
+            raise ValueError("No valid response from the model")
+
+        if response_json.get("error"):
+            raise ValueError(response_json.get("error"))
+
+        if not system_fingerprint:
+            system_fingerprint = response_json.get("system_fingerprint")
+        return self._create_llm_result(
+            response=response_json,
+        )
+
+    @property
+    def _llm_type(self) -> str:
+        """Get the type of language model used by this chat model. Used for logging purposes only."""
+        return "instructlab"
+
+    @property
+    def max_context_size(self) -> int:
+        """Get max context size for this model."""
+        return self.modelname_to_contextsize(self.model_name)
+
+    def _create_llm_result(self, response: List[dict]) -> LLMResult:
+        """Create the LLMResult from the choices and prompt."""
+        generations = []
+        for res in response:
+            results = res.get("results")
+            if results:
+                finish_reason = results[0].get("choices")[0].get('finished_reason')
+                gen = Generation(
+                    text=results[0].get("choices")[0].get('message').get('content'),
+                    generation_info={"finish_reason": finish_reason},
+                )
+                generations.append([gen])
+        final_token_usage = self._extract_token_usage(response)
+        llm_output = {
+            "token_usage": final_token_usage,
+            "model_name": self.model_name
+        }
+        return LLMResult(generations=generations, llm_output=llm_output)
+
+    @staticmethod
+    def _extract_token_usage(
+        response: Optional[List[Dict[str, Any]]] = None,
+    ) -> Dict[str, Any]:
+        if response is None:
+            return {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
+
+        prompt_tokens = 0
+        completion_tokens = 0
+        total_tokens = 0
+
+        def get_count_value(key: str, result: Dict[str, Any]) -> int:
+            return result.get(key, 0) or 0
+
+        for res in response:
+            results = res.get("results")
+            if results:
+                prompt_tokens += get_count_value("prompt_tokens", results[0])
+                completion_tokens += get_count_value(
+                    "completion_tokens", results[0]
+                )
+                total_tokens += get_count_value("total_tokens", results[0])
+
+        return {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens
+        }
+
+    @staticmethod
+    def modelname_to_contextsize(modelname: str) -> int:
+        """Calculate the maximum number of tokens possible to generate for a model."""
+        model_token_mapping = {
+            "ibm/merlinite-7b": 4096,
+            "instructlab/granite-7b-lab": 4096
+        }
+
+        context_size = model_token_mapping.get(modelname, None)
+
+        if context_size is None:
+            raise ValueError(
+                f"Unknown model: {modelname}. Please provide a valid Ilab model name. "
+                "Known models are: " + ", ".join(model_token_mapping.keys())
+            )
+
+        return context_size
diff --git a/milvus/seed/new-seed.py b/milvus/seed/new-seed.py
deleted file mode 100644
index c6ea909..0000000
--- a/milvus/seed/new-seed.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import os
-from pymilvus import MilvusClient, DataType
-from langchain_experimental.text_splitter import SemanticChunker
-from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
-from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInstructEmbeddings
-from tika import parser # pip install tika
-
-def log_step(step_num, step_name) -> None:
-    print("-----------------------------------------------")
-    print(f"{step_num}. {step_name}")
-    print("-----------------------------------------------")
-
-model_name = "ibm/merlinite-7b"
-model_kwargs = {"device": "cpu"}
-encode_kwargs = {"normalize_embeddings": True}
-
-log_step(0, "Generate embeddings")
-embeddings = HuggingFaceBgeEmbeddings(
-    model_name=model_name,
-    model_kwargs=model_kwargs,
-    encode_kwargs=encode_kwargs,
-    query_instruction = "search_query:",
-    embed_instruction = "search_document:"
-)
-
-
-# data_url = "https://orkerhulen.dk/onewebmedia/DnD%205e%20Players%20Handbook%20%28BnW%20OCR%29.pdf"
-# loader = WebBaseLoader(data_url)
-# data = loader.load()
-raw = parser.from_file("data/DnD-5e-Handbook.pdf")
-print(raw['content'])
diff --git a/milvus/seed/new_seed.py b/milvus/seed/new_seed.py
new file mode 100644
index 0000000..60311c7
--- /dev/null
+++ b/milvus/seed/new_seed.py
@@ -0,0 +1,41 @@
+import os
+from pymilvus import MilvusClient, DataType
+from langchain_community.vectorstores import Milvus
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
+from langchain import hub
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+from tika import parser # pip install tika
+from langchain_openai import OpenAI
+from ilab_models import IlabOpenAILLM
+
+
+def log_step(step_num, step_name) -> None:
+    print("-----------------------------------------------")
+    print(f"{step_num}. {step_name}")
{step_name}") + print("-----------------------------------------------") + +embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") + +text_splitter = SemanticChunker(embeddings=embeddings) # fails + +loader = PyPDFLoader('./data/DnD-5e-Handbook.pdf') +data = loader.load() +split_data = text_splitter.split_documents(data) +print(len(split_data)) +vector_store = Milvus.from_documents( + documents=split_data, + embedding=embeddings, + connection_args={"host": "localhost", "port": 19530}, + collection_name="dnd" +) + +llm = IlabOpenAILLM( + +) + +retreiver = vector_store.as_retreiver() +prompt = hub.pull("rlm/rag-prompt") \ No newline at end of file diff --git a/milvus/seed/requirements.txt b/milvus/seed/requirements.txt index 297139d..431c4f8 100644 --- a/milvus/seed/requirements.txt +++ b/milvus/seed/requirements.txt @@ -7,3 +7,4 @@ langchain-experimental==0.0.59 tika==2.6.0 sentence-transformers==2.7.0 beautifulsoup4==4.12.3 +python-dotenv==1.0.1 diff --git a/milvus/seed/seed.py b/milvus/seed/seed.py index 09a0f4d..044158e 100644 --- a/milvus/seed/seed.py +++ b/milvus/seed/seed.py @@ -4,7 +4,7 @@ from langchain_experimental.text_splitter import SemanticChunker from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInstructEmbeddings -from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter from langchain import hub from langchain_core.runnables import RunnablePassthrough from langchain_core.output_parsers import StrOutputParser @@ -24,18 +24,25 @@ def milvus_init() -> MilvusClient: def fill_dnd_collection(text_splitter: any, embeddings: any) -> None: # local - raw = parser.from_file("data/DnD-5e-Handbook.pdf") - print(len(raw['content'])) - docs = text_splitter.create_documents([raw['content']]) + # raw = parser.from_file("data/DnD-5e-Handbook.pdf") + # print(len(raw['content'])) + # docs = text_splitter.create_documents([raw['content']]) + # vector_store = Milvus.from_documents( + # docs, + # embedding=embeddings, + # connection_args={"host": "localhost", "port": 19530}, + # collection_name="dnd" + # ) + # remote + loader = PyPDFLoader('https://orkerhulen.dk/onewebmedia/DnD%205e%20Players%20Handbook%20%28BnW%20OCR%29.pdf') + data = loader.load() + split_data = text_splitter.split_documents(data) vector_store = Milvus.from_documents( - docs, + documents=split_data, embedding=embeddings, connection_args={"host": "localhost", "port": 19530}, collection_name="dnd" ) - # remote - # loader = PyPDFLoader('https://orkerhulen.dk/onewebmedia/DnD%205e%20Players%20Handbook%20%28BnW%20OCR%29.pdf') - # data = loader.load() def generate_embeddings() -> any: # model_name = "ibm/merlinite-7b"