Adding chat history to RAG app and refactor to better utilize LangChain #648

Open · wants to merge 61 commits into base: main

Commits (61):
dda40b9
Also introduced a basic session history mechanism in the browser to k…
alpha-amundson May 3, 2024
5cc85b9
tflint formatting fixes
alpha-amundson May 3, 2024
6898666
TPU Provisioner: JobSet related fixes (#645)
nstogner May 6, 2024
1d6c052
Updated image to use code in this branch
alpha-amundson May 6, 2024
981e777
making tflint happy
alpha-amundson May 6, 2024
d1d1211
Working on improvements for rag application (#731)
german-grandas Jul 12, 2024
5a16b54
Rag langchain chat history (#747)
german-grandas Jul 22, 2024
9e416c8
Rag langchain chat history (#755)
german-grandas Jul 29, 2024
e750d12
Fixing issues and updating chat history on frontend
german-grandas Jul 31, 2024
a000c46
Fixing files on working tree
german-grandas Jul 31, 2024
0d853ea
Ignoring test rag, to review how the rag application is working
german-grandas Aug 1, 2024
386c437
ignoring unit test to review cloud build process
german-grandas Aug 1, 2024
be1839d
refactoring cloud sql connection helper
german-grandas Aug 6, 2024
7f081ff
Merge branch 'main' into rag-langchain-chat-history
german-grandas Aug 6, 2024
35f67e4
Change TPU Metrics Source for Autoscaling (#770)
Bslabe123 Aug 8, 2024
0022053
Refactor: move workload identity service account out of kuberay-opera…
genlu2011 Aug 15, 2024
48f655b
updating branch
german-grandas Aug 20, 2024
a9895d6
fixing conflicts with remote branch
german-grandas Aug 20, 2024
cd95c98
fixing conflicts with remote branch
german-grandas Aug 20, 2024
bc8d745
fixing conflicts with remote branch
german-grandas Aug 20, 2024
e9beeef
fixing conflicts applying rebase
german-grandas Aug 20, 2024
eb9ab02
Updating files based on reviewer comments
german-grandas Aug 20, 2024
dff8d94
reverting change on cloudbuild.yaml file
german-grandas Aug 20, 2024
138920f
Reverting comment of line
german-grandas Aug 26, 2024
c8e5d35
Updating length of variable
german-grandas Aug 26, 2024
c437736
updating branch with main
german-grandas Aug 30, 2024
8b4f55d
Merge branch 'main' of https://github.com/GoogleCloudPlatform/ai-on-g…
german-grandas Sep 2, 2024
4261818
Updating rag frontend image.
german-grandas Sep 4, 2024
a8258f1
updating rag frontend images with the latest changes
german-grandas Sep 9, 2024
4e1a4c5
Merge branch 'main' of https://github.com/GoogleCloudPlatform/ai-on-g…
german-grandas Sep 9, 2024
863ee72
updating branch
german-grandas Sep 9, 2024
4f02546
Fixing issue with database connection
german-grandas Sep 9, 2024
324abf7
Merge branch 'rag-langchain-chat-history' of github.com:GoogleCloudPl…
german-grandas Sep 9, 2024
88ee300
Updating Rag application test.
german-grandas Sep 9, 2024
e209b46
Merge branch 'rag-langchain-chat-history' of https://github.com/Googl…
german-grandas Sep 9, 2024
cf0a447
Adding exceptions to test
german-grandas Sep 9, 2024
bf2f990
Fixing bug on unit test
german-grandas Sep 9, 2024
74b6e9d
fixing unit test
german-grandas Sep 9, 2024
e94cab0
updating notebook to use the PostgresVectorStore instead of the custo…
german-grandas Sep 10, 2024
329417b
fixing issue with notebook
german-grandas Sep 10, 2024
f1bf05a
Fixing issue with missing environment variables on notebook
german-grandas Sep 11, 2024
88fe07d
Refactoring example notebooks to handle new cloudsql vector store
german-grandas Sep 11, 2024
e38a101
Adding missing package to notebook
german-grandas Sep 11, 2024
799c8db
Creating a notebook for testing rag with a sample of the data
german-grandas Sep 12, 2024
14ff203
updating notebook to test rag
german-grandas Sep 12, 2024
a73f987
Merge branch 'main' of https://github.com/GoogleCloudPlatform/ai-on-g…
german-grandas Sep 12, 2024
c01cff6
Reverting changes on files, updating database model on notebook
german-grandas Sep 12, 2024
0a04782
Fixing name with column on notebook query
german-grandas Sep 12, 2024
68a11f7
Merge branch 'main' of github.com:GoogleCloudPlatform/ai-on-gke into …
german-grandas Sep 13, 2024
d872679
Merge branch 'main' of https://github.com/GoogleCloudPlatform/ai-on-g…
german-grandas Sep 16, 2024
e9a79ce
Merge branch 'rag-langchain-chat-history' of github.com:GoogleCloudPl…
german-grandas Sep 16, 2024
7fe461c
resolving conflicts
german-grandas Sep 17, 2024
c489d73
Delete applications/rag/example_notebooks/ingest_database.ipynb
german-grandas Sep 17, 2024
aa44fcb
updating Embedding model with missing column
german-grandas Sep 25, 2024
5882e28
Merge branch 'main' of https://github.com/GoogleCloudPlatform/ai-on-g…
german-grandas Sep 25, 2024
e1b8e50
Merge branch 'main' of https://github.com/GoogleCloudPlatform/ai-on-g…
german-grandas Sep 30, 2024
0ef245f
Updating packages, improving chain prompt
german-grandas Oct 2, 2024
8558a46
updating rag frontend sha
german-grandas Oct 2, 2024
44b3e72
updating column name
german-grandas Oct 8, 2024
ab06c07
updating max tokens length for inference service
german-grandas Oct 24, 2024
4209af8
Merge branch 'main' of https://github.com/GoogleCloudPlatform/ai-on-g…
german-grandas Oct 24, 2024
136 changes: 97 additions & 39 deletions applications/rag/frontend/container/cloud_sql/cloud_sql.py
@@ -1,18 +1,43 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import (List, Optional, Iterable, Any)

from google.cloud.sql.connector import Connector, IPTypes
import pymysql
import sqlalchemy
from sentence_transformers import SentenceTransformer
from langchain_core.vectorstores import VectorStore
import pg8000
from langchain_core.embeddings import Embeddings
from langchain_core.documents import Document
from sqlalchemy.engine import Engine
from langchain_google_cloud_sql_pg import PostgresEngine

db = None
VECTOR_EMBEDDINGS_TABLE_NAME = os.environ.get('TABLE_NAME', '') # CloudSQL table name for vector embeddings
# TODO make this configurable from tf
CHAT_HISTORY_TABLE_NAME = "message_store" # CloudSQL table name where chat history is stored

TABLE_NAME = os.environ.get('TABLE_NAME', '') # CloudSQL table name
INSTANCE_CONNECTION_NAME = os.environ.get('INSTANCE_CONNECTION_NAME', '')
SENTENCE_TRANSFORMER_MODEL = 'intfloat/multilingual-e5-small' # Transformer to use for converting text chunks to vector embeddings
DB_NAME = "pgvector-database"

PROJECT_ID = os.environ.get('PROJECT_ID', '')
REGION = os.environ.get('REGION', '')
INSTANCE = os.environ.get('INSTANCE', '')

db_username_file = open("/etc/secret-volume/username", "r")
DB_USER = db_username_file.read()
db_username_file.close()
@@ -23,14 +48,6 @@

transformer = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)

def init_db() -> sqlalchemy.engine.base.Engine:
"""Initiates connection to database and its structure."""
global db
connector = Connector()
if db is None:
db = init_connection_pool(connector)


# helper function to return SQLAlchemy connection pool
def init_connection_pool(connector: Connector) -> sqlalchemy.engine.Engine:
# function used to generate database connection
@@ -52,32 +69,73 @@ def getconn() -> pymysql.connections.Connection:
)
return pool

def fetchContext(query_text):
with db.connect() as conn:
try:
results = conn.execute(sqlalchemy.text("SELECT * FROM " + TABLE_NAME)).fetchall()
print(f"query database results:")
for row in results:
print(row)

# chunkify query & fetch matches
query_emb = transformer.encode(query_text).tolist()
query_request = "SELECT id, text, text_embedding, 1 - ('[" + ",".join(map(str, query_emb)) + "]' <=> text_embedding) AS cosine_similarity FROM " + TABLE_NAME + " ORDER BY cosine_similarity DESC LIMIT 5;"
query_results = conn.execute(sqlalchemy.text(query_request)).fetchall()
conn.commit()

if not query_results:
message = f"Table {TABLE_NAME} returned empty result"
raise ValueError(message)
for row in query_results:
print(row)
except sqlalchemy.exc.DBAPIError or pg8000.exceptions.DatabaseError as err:
message = f"Table {TABLE_NAME} does not exist: {err}"
raise sqlalchemy.exc.DataError(message)
except sqlalchemy.exc.DatabaseError as err:
message = f"Database {INSTANCE_CONNECTION_NAME} does not exist: {err}"
raise sqlalchemy.exc.DataError(message)
except Exception as err:
raise Exception(f"General error: {err}")

return query_results[0][1]
def create_sync_postgres_engine():
engine = PostgresEngine.from_instance(
project_id=PROJECT_ID,
region=REGION,
instance=INSTANCE,
database=DB_NAME,
user=DB_USER,
password=DB_PASS,
ip_type=IPTypes.PRIVATE
)
engine.init_chat_history_table(table_name=CHAT_HISTORY_TABLE_NAME)
return engine

#TODO replace this with the Cloud SQL vector store for langchain,
# once the notebook also uses it (and creates the correct schema)
class CustomVectorStore(VectorStore):
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
):
raise NotImplementedError

def __init__(self, embedding: Embeddings, engine: Engine):
self.embedding = embedding
self.engine = engine

@property
def embeddings(self) -> Embeddings:
return self.embedding


# TODO implement
def add_texts(self, texts: Iterable[str], metadatas: List[dict] | None = None, **kwargs: Any) -> List[str]:
raise NotImplementedError

#TODO implement similarity search with cosine similarity threshold

def similarity_search(self, query: dict, k: int = 4, **kwargs: Any) -> List[Document]:
with self.engine.connect() as conn:
try:
q = query["question"]
# embed query & fetch matches
query_emb = self.embedding.embed_query(q)
emb_str = ",".join(map(str, query_emb))
query_request = f"""SELECT id, text, 1 - ('[{emb_str}]' <=> text_embedding) AS cosine_similarity
FROM {VECTOR_EMBEDDINGS_TABLE_NAME}
ORDER BY cosine_similarity DESC LIMIT {k};"""
query_results = conn.execute(sqlalchemy.text(query_request)).fetchall()
print(f"GOT {len(query_results)} results")
conn.commit()

if not query_results:
message = f"Table {VECTOR_EMBEDDINGS_TABLE_NAME} returned empty result"
raise ValueError(message)
        # NOTE: `except A or B` only catches A; multiple exception types need a tuple
        except (sqlalchemy.exc.DBAPIError, pg8000.exceptions.DatabaseError) as err:
message = f"Table {VECTOR_EMBEDDINGS_TABLE_NAME} does not exist: {err}"
raise sqlalchemy.exc.DataError(message)
except sqlalchemy.exc.DatabaseError as err:
message = f"Database {INSTANCE_CONNECTION_NAME} does not exist: {err}"
raise sqlalchemy.exc.DataError(message)
except Exception as err:
raise Exception(f"General error: {err}")

#convert query results into List[Document]
texts = [result[1] for result in query_results]
return [Document(page_content=text) for text in texts]
96 changes: 43 additions & 53 deletions applications/rag/frontend/container/main.py
Original file line number Diff line number Diff line change
@@ -16,60 +16,33 @@
import logging as log
import google.cloud.logging as logging
import traceback
import uuid

from flask import Flask, render_template, request, jsonify
from langchain.chains import LLMChain
from langchain.llms import HuggingFaceTextGenInference
from langchain.prompts import PromptTemplate
from flask import Flask, render_template, request, jsonify, session
from rai import dlp_filter # Google's Cloud Data Loss Prevention (DLP) API. https://cloud.google.com/security/products/dlp
from rai import nlp_filter # https://cloud.google.com/natural-language/docs/moderating-text
from cloud_sql import cloud_sql
import sqlalchemy
from rag_langchain.rag_chain import clear_chat_history, create_chain, take_chat_turn, engine
from datetime import datetime, timedelta, timezone

# Setup logging
logging_client = logging.Client()
logging_client.setup_logging()

# TODO: refactor the app startup code into a flask app factory
# TODO: include the chat history cache in the app lifecycle and ensure that it's threadsafe.
app = Flask(__name__, static_folder='static')
app.jinja_env.trim_blocks = True
app.jinja_env.lstrip_blocks = True
app.config['ENGINE'] = engine # force the connection pool to warm up eagerly

# initialize parameters
INFERENCE_ENDPOINT=os.environ.get('INFERENCE_ENDPOINT', '127.0.0.1:8081')

llm = HuggingFaceTextGenInference(
inference_server_url=f'http://{INFERENCE_ENDPOINT}/',
max_new_tokens=512,
top_k=10,
top_p=0.95,
typical_p=0.95,
temperature=0.01,
repetition_penalty=1.03,
)

prompt_template = """
### [INST]
Instruction: Always assist with care, respect, and truth. Respond with utmost utility yet securely.
Avoid harmful, unethical, prejudiced, or negative content.
Ensure replies promote fairness and positivity.
Here is context to help:

{context}

### QUESTION:
{user_prompt}

[/INST]
"""

# Create prompt from prompt template
prompt = PromptTemplate(
input_variables=["context", "user_prompt"],
template=prompt_template,
)
SESSION_TIMEOUT_MINUTES = 30
#TODO replace with real secret
SECRET_KEY = "TODO replace this with an actual secret that is stored and managed by kubernetes and added to the terraform configuration."
app.config['SECRET_KEY'] = SECRET_KEY

# Create llm chain
llm_chain = LLMChain(llm=llm, prompt=prompt)
llm_chain = create_chain()

@app.route('/get_nlp_status', methods=['GET'])
def get_nlp_status():
@@ -80,6 +53,7 @@
def get_dlp_status():
dlp_enabled = dlp_filter.is_dlp_api_enabled()
return jsonify({"dlpEnabled": dlp_enabled})

@app.route('/get_inspect_templates')
def get_inspect_templates():
return jsonify(dlp_filter.list_inspect_templates_from_parent())
@@ -89,15 +63,36 @@
return jsonify(dlp_filter.list_deidentify_templates_from_parent())

@app.before_request
def init_db():
cloud_sql.init_db()
def check_new_session():
if 'session_id' not in session:
# instantiate a new session using a generated UUID
session_id = str(uuid.uuid4())
session['session_id'] = session_id

@app.before_request
def check_inactivity():
# Inactivity cleanup
if 'last_activity' in session:
time_elapsed = datetime.now(timezone.utc) - session['last_activity']

if time_elapsed > timedelta(minutes=SESSION_TIMEOUT_MINUTES):
print("Session inactive: Cleaning up resources...")
session_id = session['session_id']
# TODO: implement garbage collection process for idle sessions that have timed out
clear_chat_history(session_id)
session.clear()

# Always update the 'last_activity' data
session['last_activity'] = datetime.now(timezone.utc)

@app.route('/')
def index():
return render_template('index.html')

@app.route('/prompt', methods=['POST'])
def handlePrompt():
# TODO on page refresh, load chat history into browser.
session['last_activity'] = datetime.now(timezone.utc)
data = request.get_json()
warnings = []

@@ -107,19 +102,12 @@
user_prompt = data['prompt']
log.info(f"handle user prompt: {user_prompt}")

context = ""
try:
context = cloud_sql.fetchContext(user_prompt)
except Exception as err:
error_traceback = traceback.format_exc()
log.warn(f"Error: {err}\nTraceback:\n{error_traceback}")
warnings.append(f"Error: {err}\nTraceback:\n{error_traceback}")
response = {}
result = take_chat_turn(llm_chain, session['session_id'], user_prompt)
response['text'] = result

try:
response = llm_chain.invoke({
"context": context,
"user_prompt": user_prompt
})
# TODO: enable filtering in chain
if 'nlpFilterLevel' in data:
if nlp_filter.is_content_inappropriate(response['text'], data['nlpFilterLevel']):
response['text'] = 'The response is deemed inappropriate for display.'
@@ -149,4 +137,6 @@


if __name__ == '__main__':
app.run(debug=True, host='0.0.0.0', port=int(os.environ.get('PORT', 8080)))
# TODO using gunicorn to start the server results in the first request being really slow.
# Sometimes, the worker thread has to restart due to an unknown error.
app.run(debug=True, host='0.0.0.0', port=int(os.environ.get('PORT', 8080)))
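The `check_inactivity` hook added to main.py expires a session once more than `SESSION_TIMEOUT_MINUTES` have passed since its last activity. A hedged, standalone sketch of that check (Flask's `session` proxy is replaced here by a plain dict so the logic runs on its own; the helper name `is_session_expired` is hypothetical, not from the PR):

```python
from datetime import datetime, timedelta, timezone

SESSION_TIMEOUT_MINUTES = 30  # matches the constant introduced in main.py

def is_session_expired(session: dict, now: datetime) -> bool:
    # Mirrors the diff: compare elapsed time since 'last_activity'
    # against the timeout; a session with no recorded activity is new.
    last = session.get('last_activity')
    if last is None:
        return False
    return now - last > timedelta(minutes=SESSION_TIMEOUT_MINUTES)

now = datetime.now(timezone.utc)
fresh = {'last_activity': now - timedelta(minutes=5)}
stale = {'last_activity': now - timedelta(minutes=45)}
print(is_session_expired(fresh, now), is_session_expired(stale, now))  # → False True
```

In the PR itself, an expired session additionally triggers `clear_chat_history(session_id)` and `session.clear()` before `last_activity` is refreshed on the next request.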