-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPanversal-ChromaDB-client-integration.py
231 lines (199 loc) · 10.1 KB
/
Panversal-ChromaDB-client-integration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
"""
Universal ChromaDB Integration Template
This script is a comprehensive guide for integrating ChromaDB with various document sources and credential management options.
It is designed to cater to users with varying levels of coding experience. Below is a schema representing the general structure
of this template:
1. Import necessary libraries and load the ChromaDB Client.
2. Create or access a ChromaDB collection.
3. Initialize lists to store documents, metadatas, and ids.
4. Fetch documents from desired sources (Local Files, Google Drive, OneDrive, Dropbox, iCloud, Web, Databases).
5. Add documents to the collection.
6. Optionally query the collection to test the integration.
Each data source section includes optional alternatives for credential management using .env files or Key Vault Managers like
Google Vault, AWS Key Management Services, Azure Key Vault, Hashicorp Vault, etc. Uncomment and configure the section(s) that
align with your requirements.
Before proceeding, ensure you have installed and imported the necessary packages:
- chromadb
- os
- dotenv (for .env approach)
- The respective libraries if you're utilizing the Key-Vault Manager approach
- Other libraries as needed based on your document sources and credential management choices
The current structure of the script allows for pulling data from multiple sources simultaneously, as long as the corresponding
sections are uncommented and properly configured. Each section is independent, so you can activate as many or as few as needed
for your use case. The script will iterate through each enabled section, fetching and appending documents, metadatas, and ids
to their respective lists before adding them all to the ChromaDB collection at once.
Incorporating Key Vault Managers can enhance security by managing sensitive information like credentials and API keys. Below is
a version of the script that includes a generic template for integrating Key Vault Managers. You would need to install
and import the respective libraries for the Key Vault Manager you choose to use and configure the access according to the
service's API.
This template also includes sections for OneDrive, Dropbox, Apple's iCloud, and a generic SQL database. Each section requires
setup specific to the service, such as API keys, access tokens, client IDs, and authentication credentials. These details are
often found in the respective service's API documentation.
Please note that for Apple's iCloud, accessing documents programmatically may be more complex due to Apple's security measures,
and it may require additional steps not covered in this template.
"""
# Instructional section for the USER
# Here, you would provide detailed instructions about the script's intentions, usage, and how to configure each section. This should include the example schema and discussions on key management options.
import chromadb
import os
# Load the environment variables from the .env file (if using .env approach)
from dotenv import load_dotenv
# Import additional libraries as needed based on your document source and key management strategy
# Load your ChromaDB Client
chroma_client = chromadb.Client()
# Create or access your collection
collection_name = "your_collection_name" # Replace with your actual collection name
try:
collection = chroma_client.create_collection(name=collection_name)
except:
collection = chroma_client.get_collection(name=collection_name)
# Initialize lists to store documents, metadatas, and ids
documents = []
metadatas = []
ids = []
# Credential Management Options
# This template includes placeholders for integrating various Key Vault Managers.
# Replace these with the actual implementation details specific to your chosen
# Key Vault Manager. Each Key Vault Manager has its own methods for accessing
# and retrieving secrets, so you'll need to refer to their respective documentation
# for precise implementation details.
# ---------------------------
# .env file approach
# ---------------------------
# from dotenv import load_dotenv
# load_dotenv()
# example_variable = os.getenv('EXAMPLE_VARIABLE')
# ---------------------------
# Key Vault Managers (Generic Template)
# ---------------------------
# Replace with specific Key Vault Manager implementation
# For Google Vault:
# from google.cloud import secretmanager
# client = secretmanager.SecretManagerServiceClient()
# secret_name = f"projects/{project_id}/secrets/{secret_id}/versions/latest"
# response = client.access_secret_version(request={"name": secret_name})
# secret_value = response.payload.data.decode('UTF-8')
# For AWS Key Management Service:
# import boto3
# client = boto3.client('kms')
# response = client.decrypt(CiphertextBlob=b'encrypted_blob')
# secret_value = response['Plaintext']
# For Azure Key Vault:
# from azure.keyvault.secrets import SecretClient
# from azure.identity import DefaultAzureCredential
# credential = DefaultAzureCredential()
# client = SecretClient(vault_url="https://your-vault-name.vault.azure.net/", credential=credential)
# secret_value = client.get_secret("your-secret-name").value
# For Hashicorp Vault:
# import hvac
# client = hvac.Client(url='https://your.vault.server:8200')
# client.auth.approle.login(role_id='your_role_id', secret_id='your_secret_id')
# secret_value = client.secrets.kv.v2.read_secret_version(path='your/secret/path')['data']['data']['your_key']
# Uncomment and configure the sections as needed based on your document source and key management strategy
# Document source and credential management options
# ---------------------------
# Local Files
# ---------------------------
# directory_path = "path_to_your_documents" # Replace with the path to your documents
# for filename in os.listdir(directory_path):
# if filename.endswith(".txt"): # or .pdf, .docx, etc. based on your files
# file_path = os.path.join(directory_path, filename)
# # Open and read the file content
# with open(file_path, 'r') as file:
# content = file.read()
# documents.append(content)
# metadatas.append({"filename": filename}) # Add any other metadata you might have
# ids.append(filename) # Or any other unique identifier
# ---------------------------
# Google Drive (requires setup)
# ---------------------------
# from googleapiclient.discovery import build
# from google.oauth2.credentials import Credentials
# creds = Credentials.from_authorized_user_file(os.getenv('GOOGLE_TOKEN_JSON')) # Replace with your credentials file or use .env
# service = build('drive', 'v3', credentials=creds)
# # Call the Drive v3 API
# results = service.files().list(pageSize=10, fields="nextPageToken, files(id, name)").execute()
# items = results.get('files', [])
# if not items:
# print('No files found.')
# else:
# for item in items:
# # Download and read file content
# request = service.files().get_media(fileId=item['id'])
# content = request.execute()
# documents.append(content)
# metadatas.append({"google_drive_id": item['id']})
# ids.append(item['id'])
# ---------------------------
# Fetch from Web
# ---------------------------
# import requests
# urls = ["http://example.com/document1", "http://example.com/document2"] # Replace with your document URLs
# for url in urls:
# response = requests.get(url)
# documents.append(response.text)
# metadatas.append({"url": url})
# ids.append(url) # Or any other unique identifier
# ---------------------------
# OneDrive (requires setup)
# ---------------------------
# from onedrivesdk import HttpProvider, AuthProvider, OneDriveClient
# http_provider = HttpProvider() # Initialize your HttpProvider
# client_id = os.getenv('ONEDRIVE_CLIENT_ID') # Replace with your client ID or use .env
# scopes = ['wl.signin', 'wl.offline_access', 'onedrive.readwrite'] # Define your scopes
# auth_provider = AuthProvider(http_provider, client_id, scopes)
# client = OneDriveClient(os.getenv('ONEDRIVE_API_BASE_URL'), auth_provider, http_provider) # Replace with your API base URL or use .env
# items = client.item(drive='me', id='root').children.get()
# for item in items:
# # Download and read file content
# content = client.item(drive='me', id=item.id).content.request().get()
# documents.append(content)
# metadatas.append({"onedrive_id": item.id})
# ids.append(item.id)
# ---------------------------
# Dropbox (requires setup)
# ---------------------------
# import dropbox
# dbx = dropbox.Dropbox(os.getenv('DROPBOX_ACCESS_TOKEN')) # Replace with your access token or use .env
# for entry in dbx.files_list_folder('').entries:
# # Download and read file content
# metadata, res = dbx.files_download('/' + entry.name)
# content = res.content
# documents.append(content)
# metadatas.append({"dropbox_id": metadata.id})
# ids.append(metadata.id)
# ---------------------------
# Apple's iCloud (requires setup)
# ---------------------------
# from pyicloud import PyiCloudService
# icloud = PyiCloudService(os.getenv('APPLE_ID'), os.getenv('ICLOUD_PASSWORD')) # Replace with your credentials or use .env
# for file in icloud.drive['Documents']:
# # Download and read file content
# content = file.open(stream=True).content
# documents.append(content)
# metadatas.append({"icloud_id": file.id})
# ids.append(file.id)
# ---------------------------
# Database (generic SQL)
# ---------------------------
# import psycopg2
# connection = psycopg2.connect(user=os.getenv('DB_USER'), password=os.getenv('DB_PASSWORD'), host=os.getenv('DB_HOST'), port=os.getenv('DB_PORT'), database=os.getenv('DB_NAME')) # Replace with your credentials or use .env
# cursor = connection.cursor()
# cursor.execute("SELECT document_content, document_id, document_metadata FROM your_table")
# records = cursor.fetchall()
# for record in records:
# documents.append(record[0])
# metadatas.append({"source": "your_database", "additional_metadata": record[2]})
# ids.append(record[1])
# Add documents to the collection
collection.add(
documents=documents,
metadatas=metadatas,
ids=ids
)
# Query the collection (optional, for testing)
results = collection.query(
query_texts=["This is a query document"],
n_results=2
)
print(results)