From 65639460469bec12635c64e84df2aede3e482539 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Thu, 8 Feb 2024 13:54:19 +0530 Subject: [PATCH 01/18] added couchbase document loader --- langchain/package.json | 13 ++++ .../document_loaders/tests/couchbase.test.ts | 12 +++ .../src/document_loaders/web/couchbase.ts | 70 +++++++++++++++++ yarn.lock | 75 +++++++++++++++++++ 4 files changed, 170 insertions(+) create mode 100644 langchain/src/document_loaders/tests/couchbase.test.ts create mode 100644 langchain/src/document_loaders/web/couchbase.ts diff --git a/langchain/package.json b/langchain/package.json index 359579835be5..30aba5dba641 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -460,6 +460,9 @@ "document_loaders/web/confluence.cjs", "document_loaders/web/confluence.js", "document_loaders/web/confluence.d.ts", + "document_loaders/web/couchbase.cjs", + "document_loaders/web/couchbase.js", + "document_loaders/web/couchbase.d.ts", "document_loaders/web/searchapi.cjs", "document_loaders/web/searchapi.js", "document_loaders/web/searchapi.d.ts", @@ -952,6 +955,7 @@ "cheerio": "^1.0.0-rc.12", "chromadb": "^1.5.3", "convex": "^1.3.1", + "couchbase": "^4.2.10", "d3-dsv": "^2.0.0", "dotenv": "^16.0.3", "dpdm": "^3.12.0", @@ -1021,6 +1025,7 @@ "cheerio": "^1.0.0-rc.12", "chromadb": "*", "convex": "^1.3.1", + "couchbase": "^4.2.10", "d3-dsv": "^2.0.0", "epub2": "^3.0.1", "fast-xml-parser": "^4.2.7", @@ -1116,6 +1121,9 @@ "convex": { "optional": true }, + "couchbase": { + "optional": true + }, "d3-dsv": { "optional": true }, @@ -1997,6 +2005,11 @@ "types": "./document_loaders/web/confluence.d.ts", "import": "./document_loaders/web/confluence.js", "require": "./document_loaders/web/confluence.cjs" + }, + "./document_loaders/web/couchbase": { + "types": "./document_loaders/web/couchbase.d.ts", + "import": "./document_loaders/web/couchbase.js", + "require": "./document_loaders/web/couchbase.cjs" }, "./document_loaders/web/searchapi": { "types": 
"./document_loaders/web/searchapi.d.ts", diff --git a/langchain/src/document_loaders/tests/couchbase.test.ts b/langchain/src/document_loaders/tests/couchbase.test.ts new file mode 100644 index 000000000000..7a45ec90e2bd --- /dev/null +++ b/langchain/src/document_loaders/tests/couchbase.test.ts @@ -0,0 +1,12 @@ +import { test, expect } from "@jest/globals"; +import { Cluster } from "couchbase"; +import { CouchbaseDocumentLoader } from "../web/couchbase.js"; + + +test("Test Couchbase Cluster connection ", async ()=>{ + const couchbaseClient = await Cluster.connect(""); + const loader = new CouchbaseDocumentLoader(couchbaseClient, ""); + const doc = await loader.load(); + console.log(doc); + expect(doc.length).toBeGreaterThan(0); +}) \ No newline at end of file diff --git a/langchain/src/document_loaders/web/couchbase.ts b/langchain/src/document_loaders/web/couchbase.ts new file mode 100644 index 000000000000..580fd707b599 --- /dev/null +++ b/langchain/src/document_loaders/web/couchbase.ts @@ -0,0 +1,70 @@ +import { Cluster, QueryResult } from "couchbase"; +import { Document } from "../../document.js"; +import { BaseDocumentLoader, DocumentLoader } from "../base.js"; + +export class CouchbaseDocumentLoader + extends BaseDocumentLoader + implements DocumentLoader +{ + private cluster: Cluster; + + private query: string; + + private pageContentFields?: string[]; + + private metadataFields?: string[]; + + constructor( + client: Cluster, + query: string, + pageContentFields?: string[], + metadataFields?: string[] + ) { + super(); + if (!client) { + throw new Error("Couchbase client cluster must be provided."); + } + this.cluster = client; + this.query = query; + this.pageContentFields = pageContentFields; + this.metadataFields = metadataFields; + } + + async load(): Promise { + const documents: Document[] = []; + for await (const doc of this.lazyLoad()) { + documents.push(doc); + } + return documents; + } + + async *lazyLoad(): AsyncIterable { + // Run SQL++ Query + const 
result: QueryResult = await this.cluster.query(this.query); + for await (const row of result.rows) { + let { metadataFields, pageContentFields } = this; + + if (!pageContentFields) { + pageContentFields = Object.keys(row); + } + + if (!metadataFields) { + metadataFields = []; + } + + const metadata = metadataFields.reduce( + (obj, field) => ({ ...obj, [field]: row[field] }), + {} + ); + + const document = pageContentFields + .map((k) => `${k}: ${row[k]}`) + .join("\n"); + + yield new Document({ + pageContent: document, + metadata, + }); + } + } +} diff --git a/yarn.lock b/yarn.lock index d05202de3006..6174564e68eb 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6558,6 +6558,48 @@ __metadata: languageName: node linkType: hard +"@couchbase/couchbase-darwin-arm64-napi@npm:4.2.10": + version: 4.2.10 + resolution: "@couchbase/couchbase-darwin-arm64-napi@npm:4.2.10" + conditions: os=darwin & cpu=arm64 + languageName: node + linkType: hard + +"@couchbase/couchbase-darwin-x64-napi@npm:4.2.10": + version: 4.2.10 + resolution: "@couchbase/couchbase-darwin-x64-napi@npm:4.2.10" + conditions: os=darwin & cpu=x64 + languageName: node + linkType: hard + +"@couchbase/couchbase-linux-arm64-napi@npm:4.2.10": + version: 4.2.10 + resolution: "@couchbase/couchbase-linux-arm64-napi@npm:4.2.10" + conditions: os=linux & cpu=arm64 + languageName: node + linkType: hard + +"@couchbase/couchbase-linux-x64-napi@npm:4.2.10": + version: 4.2.10 + resolution: "@couchbase/couchbase-linux-x64-napi@npm:4.2.10" + conditions: os=linux & cpu=x64 + languageName: node + linkType: hard + +"@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.10": + version: 4.2.10 + resolution: "@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.10" + conditions: os=linux & cpu=x64 + languageName: node + linkType: hard + +"@couchbase/couchbase-win32-x64-napi@npm:4.2.10": + version: 4.2.10 + resolution: "@couchbase/couchbase-win32-x64-napi@npm:4.2.10" + conditions: os=win32 & cpu=x64 + languageName: node + linkType: hard + 
"@crawlee/types@npm:^3.3.0": version: 3.3.1 resolution: "@crawlee/types@npm:3.3.1" @@ -18005,6 +18047,35 @@ __metadata: languageName: node linkType: hard +"couchbase@npm:^4.2.10": + version: 4.2.10 + resolution: "couchbase@npm:4.2.10" + dependencies: + "@couchbase/couchbase-darwin-arm64-napi": 4.2.10 + "@couchbase/couchbase-darwin-x64-napi": 4.2.10 + "@couchbase/couchbase-linux-arm64-napi": 4.2.10 + "@couchbase/couchbase-linux-x64-napi": 4.2.10 + "@couchbase/couchbase-linuxmusl-x64-napi": 4.2.10 + "@couchbase/couchbase-win32-x64-napi": 4.2.10 + cmake-js: ^7.2.1 + node-addon-api: ^7.0.0 + dependenciesMeta: + "@couchbase/couchbase-darwin-arm64-napi": + optional: true + "@couchbase/couchbase-darwin-x64-napi": + optional: true + "@couchbase/couchbase-linux-arm64-napi": + optional: true + "@couchbase/couchbase-linux-x64-napi": + optional: true + "@couchbase/couchbase-linuxmusl-x64-napi": + optional: true + "@couchbase/couchbase-win32-x64-napi": + optional: true + checksum: 1cc4725c5f16c3173691a9e4f702e479df545473deac694f7a8627f58a63a92718824d018730b51a7d4d6a0a8e125b0ef5f3f81cf995a831b8a3adfa05e9ecc7 + languageName: node + linkType: hard + "create-langchain-integration@workspace:libs/create-langchain-integration": version: 0.0.0-use.local resolution: "create-langchain-integration@workspace:libs/create-langchain-integration" @@ -25203,6 +25274,7 @@ __metadata: cheerio: ^1.0.0-rc.12 chromadb: ^1.5.3 convex: ^1.3.1 + couchbase: ^4.2.10 d3-dsv: ^2.0.0 dotenv: ^16.0.3 dpdm: ^3.12.0 @@ -25284,6 +25356,7 @@ __metadata: cheerio: ^1.0.0-rc.12 chromadb: "*" convex: ^1.3.1 + couchbase: ^4.2.10 d3-dsv: ^2.0.0 epub2: ^3.0.1 fast-xml-parser: ^4.2.7 @@ -25357,6 +25430,8 @@ __metadata: optional: true convex: optional: true + couchbase: + optional: true d3-dsv: optional: true epub2: From 65974da61a6d7dd47210d69dafb962da584ca637 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Thu, 8 Feb 2024 18:21:03 +0530 Subject: [PATCH 02/18] fixed loader to use stringify --- langchain/package.json | 
8 ++++---- langchain/src/document_loaders/web/couchbase.ts | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/langchain/package.json b/langchain/package.json index 30aba5dba641..80a9e78a18bc 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -2006,10 +2006,10 @@ "import": "./document_loaders/web/confluence.js", "require": "./document_loaders/web/confluence.cjs" }, - "./document_loaders/web/couchbase": { - "types": "./document_loaders/web/couchbase.d.ts", - "import": "./document_loaders/web/couchbase.js", - "require": "./document_loaders/web/couchbase.cjs" + "./document_loaders/web/couchbase": { + "types": "./document_loaders/web/couchbase.d.ts", + "import": "./document_loaders/web/couchbase.js", + "require": "./document_loaders/web/couchbase.cjs" }, "./document_loaders/web/searchapi": { "types": "./document_loaders/web/searchapi.d.ts", diff --git a/langchain/src/document_loaders/web/couchbase.ts b/langchain/src/document_loaders/web/couchbase.ts index 580fd707b599..4c90cee5be0c 100644 --- a/langchain/src/document_loaders/web/couchbase.ts +++ b/langchain/src/document_loaders/web/couchbase.ts @@ -1,5 +1,5 @@ import { Cluster, QueryResult } from "couchbase"; -import { Document } from "../../document.js"; +import { Document } from "@langchain/core/documents"; import { BaseDocumentLoader, DocumentLoader } from "../base.js"; export class CouchbaseDocumentLoader @@ -58,7 +58,7 @@ export class CouchbaseDocumentLoader ); const document = pageContentFields - .map((k) => `${k}: ${row[k]}`) + .map((k) => `${k}: ${JSON.stringify(row[k])}`) .join("\n"); yield new Document({ From ba71e7e92f70e21ec2cbb2126b92e170fbb9d949 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Sun, 11 Feb 2024 10:48:03 +0530 Subject: [PATCH 03/18] add doc file --- .../web_loaders/couchbase.mdx | 102 ++++++++++++++++++ langchain/langchain.config.js | 2 + .../tests/couchbase.int.test.ts | 18 ++++ .../src/document_loaders/web/couchbase.ts | 16 +++ 
langchain/src/load/import_constants.ts | 1 + langchain/src/load/import_type.d.ts | 3 + 6 files changed, 142 insertions(+) create mode 100644 docs/core_docs/docs/integrations/document_loaders/web_loaders/couchbase.mdx create mode 100644 langchain/src/document_loaders/tests/couchbase.int.test.ts diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/couchbase.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/couchbase.mdx new file mode 100644 index 000000000000..acf0ed077ad6 --- /dev/null +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/couchbase.mdx @@ -0,0 +1,102 @@ +--- +hide_table_of_contents: true +sidebar_class_name: node-only +--- + +# Couchbase + +[Couchbase](http://couchbase.com/) is an award-winning distributed NoSQL cloud database that delivers unmatched versatility, performance, scalability, and financial value for all of your cloud, mobile, AI, and edge computing applications. + +This guide shows how to use load documents from couchbase database. + +# Installation + +```bash npm2yarn +npm install couchbase +``` + + +## Usage + +### Querying for Documents from Couchbase +For more details on connecting to a Couchbase cluster, please check the [Node.js SDK documentation](https://docs.couchbase.com/nodejs-sdk/current/howtos/managing-connections.html#connection-strings). + +For help with querying for documents using SQL++ (SQL for JSON), please check the [documentation](https://docs.couchbase.com/server/current/n1ql/n1ql-language-reference/index.html). 
+ +```typescript +import { CouchbaseDocumentLoader } from "langchain/document_loaders/web/couchbase"; + +const connectionString = "couchbase://localhost"; // valid couchbase connection string +const dbUsername = "Administrator" // valid database user with read access to the bucket being queried +const dbPassword = "Password" // password for the database user + +// query is a valid SQL++ query +const query = ` + SELECT h.* FROM `travel-sample`.inventory.hotel h + WHERE h.country = 'United States' + LIMIT 1 + `; +``` + +### Connect to Couchbase Cluster + +```typescript + const couchbaseClient = await Cluster.connect(connectionString, { + username: dbUsername, + password: dbPassword, + }); + +``` + +### Create the Loader + +```typescript +const loader = new CouchbaseDocumentLoader( + couchbaseClient, // The connected couchbase cluster client + query, // A valid SQL++ query which will return the required data +); +``` + +### Load Documents + +You can fetch the documents by calling the `load` method of the loader. It will return a list with all the documents. If you want to avoid this blocking call, you can call `lazy_load` method that returns an Iterator. + +```typescript +// using load method +docs = await loader.load(); +console.log(docs) +``` + +```typescript +// using lazy_load +for await (const doc of this.lazyLoad()){ + console.log(doc); + break; // break based on required condition +} +``` + +### Specifying Fields with Content and Metadata +The fields that are part of the Document content can be specified using the `pageContentFields` parameter. +The metadata fields for the Document can be specified using the `metadataFields` parameter. 
+ +```typescript +const loaderWithSelectedFields = new CouchbaseDocumentLoader( + couchbaseClient, + query, + // pageContentFields + [ + "address", + "name", + "city", + "phone", + "country", + "geo", + "description", + "reviews", + ], + ["id"] // metadataFields +); + +const filtered_docs = await loaderWithSelectedFields.load() +console.log(filtered_docs) +``` \ No newline at end of file diff --git a/langchain/langchain.config.js b/langchain/langchain.config.js index 34693bb0251c..2b88a7e0399b 100644 --- a/langchain/langchain.config.js +++ b/langchain/langchain.config.js @@ -183,6 +183,7 @@ export const config = { "document_loaders/web/azure_blob_storage_file": "document_loaders/web/azure_blob_storage_file", "document_loaders/web/cheerio": "document_loaders/web/cheerio", + "document_loaders/web/couchbase": "document_loaders/web/couchbase", "document_loaders/web/puppeteer": "document_loaders/web/puppeteer", "document_loaders/web/playwright": "document_loaders/web/playwright", "document_loaders/web/college_confidential": @@ -627,6 +628,7 @@ export const config = { "document_loaders/web/azure_blob_storage_container", "document_loaders/web/azure_blob_storage_file", "document_loaders/web/cheerio", + "document_loaders/web/couchbase", "document_loaders/web/puppeteer", "document_loaders/web/playwright", "document_loaders/web/college_confidential", diff --git a/langchain/src/document_loaders/tests/couchbase.int.test.ts b/langchain/src/document_loaders/tests/couchbase.int.test.ts new file mode 100644 index 000000000000..573e54a5a5f0 --- /dev/null +++ b/langchain/src/document_loaders/tests/couchbase.int.test.ts @@ -0,0 +1,18 @@ +import { test, expect } from "@jest/globals"; +import { Cluster } from "couchbase"; +import { CouchbaseDocumentLoader } from "../web/couchbase.js"; + +test("Test Couchbase Cluster connection ", async () => { + const couchbaseClient = await Cluster.connect("couchbase://localhost", { + username: "Administrator", + password: "password", + }); + const 
loader = new CouchbaseDocumentLoader( + couchbaseClient, + "Select r.* from `travel-sample`.`inventory`.`route` as r limit 10", + ["airline", "sourceairport"] + ); + const doc = await loader.load(); + console.log(doc); + expect(doc.length).toBeGreaterThan(0); +}); diff --git a/langchain/src/document_loaders/web/couchbase.ts b/langchain/src/document_loaders/web/couchbase.ts index 4c90cee5be0c..805e5fdac01d 100644 --- a/langchain/src/document_loaders/web/couchbase.ts +++ b/langchain/src/document_loaders/web/couchbase.ts @@ -2,6 +2,9 @@ import { Cluster, QueryResult } from "couchbase"; import { Document } from "@langchain/core/documents"; import { BaseDocumentLoader, DocumentLoader } from "../base.js"; +/** + * loader for couchbase document + */ export class CouchbaseDocumentLoader extends BaseDocumentLoader implements DocumentLoader @@ -14,6 +17,13 @@ export class CouchbaseDocumentLoader private metadataFields?: string[]; + /** + * construct Couchbase document loader with a requirement for couchbase cluster client + * @param client { Cluster } [couchbase connected client to connect to database] + * @param query { string } [query to get results from while loading the data] + * @param pageContentFields { Array } [filters fields of the document and shows these only] + * @param metadataFields { Array } [metadata fields required] + */ constructor( client: Cluster, query: string, @@ -30,6 +40,9 @@ export class CouchbaseDocumentLoader this.metadataFields = metadataFields; } + /** + * Function to load document based on query from couchbase + */ async load(): Promise { const documents: Document[] = []; for await (const doc of this.lazyLoad()) { @@ -38,6 +51,9 @@ export class CouchbaseDocumentLoader return documents; } + /** + * Helper function to load each document + */ async *lazyLoad(): AsyncIterable { // Run SQL++ Query const result: QueryResult = await this.cluster.query(this.query); diff --git a/langchain/src/load/import_constants.ts 
b/langchain/src/load/import_constants.ts index e2a54f81c42e..2216953a83d0 100644 --- a/langchain/src/load/import_constants.ts +++ b/langchain/src/load/import_constants.ts @@ -99,6 +99,7 @@ export const optionalImportEntrypoints = [ "langchain/document_loaders/web/s3", "langchain/document_loaders/web/sonix_audio", "langchain/document_loaders/web/confluence", + "langchain/document_loadrs/web/couchbase", "langchain/document_loaders/web/youtube", "langchain/document_loaders/fs/directory", "langchain/document_loaders/fs/buffer", diff --git a/langchain/src/load/import_type.d.ts b/langchain/src/load/import_type.d.ts index d9295b9e5f8c..0fbaf3a75ee4 100644 --- a/langchain/src/load/import_type.d.ts +++ b/langchain/src/load/import_type.d.ts @@ -295,6 +295,9 @@ export interface OptionalImportMap { "langchain/document_loaders/web/confluence"?: | typeof import("../document_loaders/web/confluence.js") | Promise; + "langchain/document_loaders/web/couchbase"?: + | typeof import("../document_loaders/web/couchbase.js") + | Promise; "langchain/document_loaders/web/youtube"?: | typeof import("../document_loaders/web/youtube.js") | Promise; From a3b5262fd5475a3ed04a4075b52d0fad2cace611 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Sun, 11 Feb 2024 11:10:14 +0530 Subject: [PATCH 04/18] updated tests --- .../web_loaders/couchbase.mdx | 2 ++ .../tests/couchbase.int.test.ts | 36 ++++++++++++++----- .../document_loaders/tests/couchbase.test.ts | 12 ------- 3 files changed, 30 insertions(+), 20 deletions(-) delete mode 100644 langchain/src/document_loaders/tests/couchbase.test.ts diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/couchbase.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/couchbase.mdx index acf0ed077ad6..c3fbe5be44f3 100644 --- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/couchbase.mdx +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/couchbase.mdx @@ -25,6 +25,7 @@ For help with querying for 
documents using SQL++ (SQL for JSON), please check th ```typescript import { CouchbaseDocumentLoader } from "langchain/document_loaders/web/couchbase"; +import { Cluster } from "couchbase" const connectionString = "couchbase://localhost"; // valid couchbase connection string const dbUsername = "Administrator" // valid database user with read access to the bucket being queried @@ -44,6 +45,7 @@ const query = ` const couchbaseClient = await Cluster.connect(connectionString, { username: dbUsername, password: dbPassword, + configProfile: "wanDevelopment" }); ``` diff --git a/langchain/src/document_loaders/tests/couchbase.int.test.ts b/langchain/src/document_loaders/tests/couchbase.int.test.ts index 573e54a5a5f0..ea4f147cdec2 100644 --- a/langchain/src/document_loaders/tests/couchbase.int.test.ts +++ b/langchain/src/document_loaders/tests/couchbase.int.test.ts @@ -3,16 +3,36 @@ import { Cluster } from "couchbase"; import { CouchbaseDocumentLoader } from "../web/couchbase.js"; test("Test Couchbase Cluster connection ", async () => { - const couchbaseClient = await Cluster.connect("couchbase://localhost", { - username: "Administrator", - password: "password", + const connectionString = ""; + const databaseUsername = ""; + const databasePassword = ""; + const query = ` + SELECT h.* FROM \`travel-sample\`.inventory.hotel h + WHERE h.country = 'United States' + LIMIT 10 + `; + const validPageContentFields = ["country", "name", "description"]; + const validMetadataFields = ["id"] + + const couchbaseClient = await Cluster.connect(connectionString, { + username: databaseUsername, + password: databasePassword, + configProfile: "wanDevelopment" }); const loader = new CouchbaseDocumentLoader( couchbaseClient, - "Select r.* from `travel-sample`.`inventory`.`route` as r limit 10", - ["airline", "sourceairport"] + query, + validPageContentFields, + validMetadataFields ); - const doc = await loader.load(); - console.log(doc); - expect(doc.length).toBeGreaterThan(0); + const docs = 
await loader.load(); + console.log(docs); + expect(docs.length).toBeGreaterThan(0); + + for (const doc of docs) { + console.log(doc); + expect(doc.pageContent).not.toBe(""); // Assuming valid page content fields + expect(doc.metadata).toHaveProperty('id'); // Assuming metadata has id field + expect(doc.metadata.id).not.toBe(""); + } }); diff --git a/langchain/src/document_loaders/tests/couchbase.test.ts b/langchain/src/document_loaders/tests/couchbase.test.ts deleted file mode 100644 index 7a45ec90e2bd..000000000000 --- a/langchain/src/document_loaders/tests/couchbase.test.ts +++ /dev/null @@ -1,12 +0,0 @@ -import { test, expect } from "@jest/globals"; -import { Cluster } from "couchbase"; -import { CouchbaseDocumentLoader } from "../web/couchbase.js"; - - -test("Test Couchbase Cluster connection ", async ()=>{ - const couchbaseClient = await Cluster.connect(""); - const loader = new CouchbaseDocumentLoader(couchbaseClient, ""); - const doc = await loader.load(); - console.log(doc); - expect(doc.length).toBeGreaterThan(0); -}) \ No newline at end of file From 4cb8591c01a44032504c1bb291c133a7818f99a5 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Sun, 11 Feb 2024 11:20:42 +0530 Subject: [PATCH 05/18] update types as per new requirement --- langchain/.gitignore | 4 ++++ langchain/langchain.config.js | 4 ++-- langchain/package.json | 7 ++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/langchain/.gitignore b/langchain/.gitignore index fe3c2a67a82e..3bf13f3a9c30 100644 --- a/langchain/.gitignore +++ b/langchain/.gitignore @@ -602,6 +602,10 @@ document_loaders/web/confluence.cjs document_loaders/web/confluence.js document_loaders/web/confluence.d.ts document_loaders/web/confluence.d.cts +document_loaders/web/couchbase.cjs +document_loaders/web/couchbase.js +document_loaders/web/couchbase.d.ts +document_loaders/web/couchbase.d.cts document_loaders/web/searchapi.cjs document_loaders/web/searchapi.js document_loaders/web/searchapi.d.ts diff --git 
a/langchain/langchain.config.js b/langchain/langchain.config.js index bb29a1c25c19..6f8d0df50673 100644 --- a/langchain/langchain.config.js +++ b/langchain/langchain.config.js @@ -182,7 +182,6 @@ export const config = { "document_loaders/web/azure_blob_storage_file": "document_loaders/web/azure_blob_storage_file", "document_loaders/web/cheerio": "document_loaders/web/cheerio", - "document_loaders/web/couchbase": "document_loaders/web/couchbase", "document_loaders/web/puppeteer": "document_loaders/web/puppeteer", "document_loaders/web/playwright": "document_loaders/web/playwright", "document_loaders/web/college_confidential": @@ -200,6 +199,7 @@ export const config = { "document_loaders/web/sitemap": "document_loaders/web/sitemap", "document_loaders/web/sonix_audio": "document_loaders/web/sonix_audio", "document_loaders/web/confluence": "document_loaders/web/confluence", + "document_loaders/web/couchbase": "document_loaders/web/couchbase", "document_loaders/web/searchapi": "document_loaders/web/searchapi", "document_loaders/web/serpapi": "document_loaders/web/serpapi", "document_loaders/web/sort_xyz_blockchain": @@ -628,7 +628,6 @@ export const config = { "document_loaders/web/azure_blob_storage_container", "document_loaders/web/azure_blob_storage_file", "document_loaders/web/cheerio", - "document_loaders/web/couchbase", "document_loaders/web/puppeteer", "document_loaders/web/playwright", "document_loaders/web/college_confidential", @@ -645,6 +644,7 @@ export const config = { "document_loaders/web/sitemap", "document_loaders/web/sonix_audio", "document_loaders/web/confluence", + "document_loaders/web/couchbase", "document_loaders/web/youtube", "document_loaders/fs/directory", "document_loaders/fs/buffer", diff --git a/langchain/package.json b/langchain/package.json index a5a63fbd9f66..002c7113f0ba 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -617,6 +617,7 @@ "document_loaders/web/couchbase.cjs", "document_loaders/web/couchbase.js", 
"document_loaders/web/couchbase.d.ts", + "document_loaders/web/couchbase.d.cts", "document_loaders/web/searchapi.cjs", "document_loaders/web/searchapi.js", "document_loaders/web/searchapi.d.ts", @@ -2908,7 +2909,11 @@ "require": "./document_loaders/web/confluence.cjs" }, "./document_loaders/web/couchbase": { - "types": "./document_loaders/web/couchbase.d.ts", + "types": { + "import": "./document_loaders/web/couchbase.d.ts", + "require": "./document_loaders/web/couchbase.d.cts", + "default": "./document_loaders/web/couchbase.d.ts" + }, "import": "./document_loaders/web/couchbase.js", "require": "./document_loaders/web/couchbase.cjs" }, From 6852c2289ed7e7f0321848ce9780a57252ffee3c Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Sun, 11 Feb 2024 11:47:08 +0530 Subject: [PATCH 06/18] update comments for typedoc --- langchain/src/document_loaders/web/couchbase.ts | 12 +++++++----- langchain/src/load/import_constants.ts | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/langchain/src/document_loaders/web/couchbase.ts b/langchain/src/document_loaders/web/couchbase.ts index 805e5fdac01d..a18ccc4c1bbd 100644 --- a/langchain/src/document_loaders/web/couchbase.ts +++ b/langchain/src/document_loaders/web/couchbase.ts @@ -19,10 +19,10 @@ export class CouchbaseDocumentLoader /** * construct Couchbase document loader with a requirement for couchbase cluster client - * @param client { Cluster } [couchbase connected client to connect to database] - * @param query { string } [query to get results from while loading the data] - * @param pageContentFields { Array } [filters fields of the document and shows these only] - * @param metadataFields { Array } [metadata fields required] + * @param client { Cluster } [ couchbase connected client to connect to database ] + * @param query { string } [ query to get results from while loading the data ] + * @param pageContentFields { Array } [ filters fields of the document and shows these only ] + * @param metadataFields { 
Array } [ metadata fields required ] */ constructor( client: Cluster, @@ -42,6 +42,7 @@ export class CouchbaseDocumentLoader /** * Function to load document based on query from couchbase + * @returns {Promise} [ Returns a promise of all the documents as array ] */ async load(): Promise { const documents: Document[] = []; @@ -52,7 +53,8 @@ export class CouchbaseDocumentLoader } /** - * Helper function to load each document + * Function to load documents based on iterator rather than full load + * @returns {AsyncIterable} [ Returns an iterator to fetch documents ] */ async *lazyLoad(): AsyncIterable { // Run SQL++ Query diff --git a/langchain/src/load/import_constants.ts b/langchain/src/load/import_constants.ts index 26d81dfbb127..81b09d272a05 100644 --- a/langchain/src/load/import_constants.ts +++ b/langchain/src/load/import_constants.ts @@ -100,7 +100,7 @@ export const optionalImportEntrypoints: string[] = [ "langchain/document_loaders/web/sitemap", "langchain/document_loaders/web/sonix_audio", "langchain/document_loaders/web/confluence", - "langchain/document_loadrs/web/couchbase", + "langchain/document_loaders/web/couchbase", "langchain/document_loaders/web/youtube", "langchain/document_loaders/fs/directory", "langchain/document_loaders/fs/buffer", From 54fb39e118c12024fe12a8411d46ae939d118d04 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Tue, 13 Feb 2024 15:22:35 +0530 Subject: [PATCH 07/18] fix formatting issues and remove print in tests --- .../document_loaders/web_loaders/couchbase.mdx | 12 ++++++------ .../tests/couchbase.int.test.ts | 18 ++++++++---------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/couchbase.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/couchbase.mdx index c3fbe5be44f3..9cad48ed03ad 100644 --- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/couchbase.mdx +++ 
b/docs/core_docs/docs/integrations/document_loaders/web_loaders/couchbase.mdx @@ -33,20 +33,20 @@ const dbPassword = "Password" // password for the database user // query is a valid SQL++ query const query = ` - SELECT h.* FROM `travel-sample`.inventory.hotel h - WHERE h.country = 'United States' - LIMIT 1 - `; + SELECT h.* FROM \`travel-sample\`.inventory.hotel h + WHERE h.country = 'United States' + LIMIT 1 +`; ``` ### Connect to Couchbase Cluster ```typescript - const couchbaseClient = await Cluster.connect(connectionString, { +const couchbaseClient = await Cluster.connect(connectionString, { username: dbUsername, password: dbPassword, configProfile: "wanDevelopment" - }); +}); ``` diff --git a/langchain/src/document_loaders/tests/couchbase.int.test.ts b/langchain/src/document_loaders/tests/couchbase.int.test.ts index ea4f147cdec2..7147955df9a5 100644 --- a/langchain/src/document_loaders/tests/couchbase.int.test.ts +++ b/langchain/src/document_loaders/tests/couchbase.int.test.ts @@ -7,17 +7,17 @@ test("Test Couchbase Cluster connection ", async () => { const databaseUsername = ""; const databasePassword = ""; const query = ` - SELECT h.* FROM \`travel-sample\`.inventory.hotel h - WHERE h.country = 'United States' - LIMIT 10 - `; + SELECT h.* FROM \`travel-sample\`.inventory.hotel h + WHERE h.country = 'United States' + LIMIT 10 + `; const validPageContentFields = ["country", "name", "description"]; - const validMetadataFields = ["id"] - + const validMetadataFields = ["id"]; + const couchbaseClient = await Cluster.connect(connectionString, { username: databaseUsername, password: databasePassword, - configProfile: "wanDevelopment" + configProfile: "wanDevelopment", }); const loader = new CouchbaseDocumentLoader( couchbaseClient, @@ -26,13 +26,11 @@ test("Test Couchbase Cluster connection ", async () => { validMetadataFields ); const docs = await loader.load(); - console.log(docs); expect(docs.length).toBeGreaterThan(0); for (const doc of docs) { - 
console.log(doc); expect(doc.pageContent).not.toBe(""); // Assuming valid page content fields - expect(doc.metadata).toHaveProperty('id'); // Assuming metadata has id field + expect(doc.metadata).toHaveProperty("id"); // Assuming metadata has id field expect(doc.metadata.id).not.toBe(""); } }); From 4b5209275ff9d85348e298c3114c6cf69dcbb082 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Tue, 27 Feb 2024 14:38:14 +0530 Subject: [PATCH 08/18] add support for couchbase vector search using sdk --- langchain/src/vectorstores/couchbase.ts | 0 libs/langchain-community/package.json | 2 + .../src/vectorstores/couchbase.ts | 320 ++++++++++++++++++ .../src/vectorstores/tests/couchbase.test.ts | 33 ++ yarn.lock | 73 ++++ 5 files changed, 428 insertions(+) create mode 100644 langchain/src/vectorstores/couchbase.ts create mode 100644 libs/langchain-community/src/vectorstores/couchbase.ts create mode 100644 libs/langchain-community/src/vectorstores/tests/couchbase.test.ts diff --git a/langchain/src/vectorstores/couchbase.ts b/langchain/src/vectorstores/couchbase.ts new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 54d6744997b8..123bcca93427 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -121,6 +121,7 @@ "closevector-web": "0.1.6", "cohere-ai": ">=6.0.0", "convex": "^1.3.1", + "couchbase": "^4.2.11-rc.1", "discord.js": "^14.14.1", "dotenv": "^16.0.3", "dpdm": "^3.12.0", @@ -223,6 +224,7 @@ "closevector-web": "0.1.6", "cohere-ai": "*", "convex": "^1.3.1", + "couchbase": "^4.2.11-rc.1", "discord.js": "^14.14.1", "dria": "^0.0.3", "faiss-node": "^0.5.1", diff --git a/libs/langchain-community/src/vectorstores/couchbase.ts b/libs/langchain-community/src/vectorstores/couchbase.ts new file mode 100644 index 000000000000..60f75581b120 --- /dev/null +++ b/libs/langchain-community/src/vectorstores/couchbase.ts @@ -0,0 +1,320 @@ +/* 
eslint-disable @typescript-eslint/no-explicit-any */ +import { Embeddings } from "@langchain/core/embeddings"; +import { VectorStore } from "@langchain/core/vectorstores"; +import { + Bucket, + Cluster, + Collection, + MatchAllSearchQuery, + Scope, + SearchRequest, + VectorQuery, + VectorSearch, +} from "couchbase"; +import { Document, DocumentInterface } from "@langchain/core/documents"; +import { v4 as uuid } from "uuid"; + +export interface AddVectorOptions { + ids?: string[]; + metadata?: string[]; +} + +type CouchbaseVectorStoreFilter = { [key: string]: any }; + +export class CouchbaseVectorSearch extends VectorStore { + declare FilterType: CouchbaseVectorStoreFilter; + + private readonly cluster: Cluster; + + private readonly _bucket: Bucket; + + private readonly _scope: Scope; + + private readonly _collection: Collection; + + // private readonly connectionString: string; + + // private readonly dbUsername: string; + + // private readonly dbPassword: string; + + private readonly bucketName: string; + + private readonly scopeName: string; + + private readonly collectionName: string; + + private readonly indexName: string; + + private readonly textKey: string; + + private readonly embeddingKey: string; + + private readonly scopedIndex: boolean; + + private readonly metadataKey = "metadata"; + + constructor( + cluster: Cluster, + // connectionString: string, + // dbUsername: string, + // dbPassword: string, + bucketName: string, + scopeName: string, + collectionName: string, + embedding: Embeddings, + indexName: string, + textKey = "text", + embeddingKey: string | undefined = undefined, + scopedIndex = true + ) { + super(embedding, embedding); + this.cluster = cluster; + // this.connectionString = connectionString; + // this.dbUsername = dbUsername; + // this.dbPassword = dbPassword; + this.bucketName = bucketName; + this.scopeName = scopeName; + this.collectionName = collectionName; + this.indexName = indexName; + this.textKey = textKey; + if (embeddingKey) { + 
this.embeddingKey = embeddingKey; + } else { + this.embeddingKey = `${textKey}_embedding`; + } + this.scopedIndex = scopedIndex; + + this._bucket = this.cluster.bucket(this.bucketName); + this._scope = this._bucket.scope(this.scopeName); + this._collection = this._scope.collection(this.collectionName); + + void this.verifyIndexes(); + } + + async verifyIndexes() { + if (this.scopedIndex) { + const allIndexes = await this._scope.searchIndexes().getAllIndexes(); + const indexNames = allIndexes.map((index) => index.name); + if (!indexNames.includes(this.indexName)) { + throw new Error( + `Index ${this.indexName} does not exist. Please create the index before searching.` + ); + } + } else { + const allIndexes = await this.cluster.searchIndexes().getAllIndexes(); + const indexNames = allIndexes.map((index) => index.name); + if (!indexNames.includes(this.indexName)) { + throw new Error( + `Index ${this.indexName} does not exist. Please create the index before searching.` + ); + } + } + } + + _vectorstoreType(): string { + return "couchbase"; + } + + public async addVectors( + vectors: number[][], + documents: Document[], + options: AddVectorOptions = {} + ): Promise { + // Get document ids. if ids are not available then use UUIDs for each document + let ids: string[] | undefined = options ? options.ids : undefined; + if (ids === undefined) { + ids = Array.from({ length: documents.length }, () => uuid()); + } + + // Get metadata for each document. if metadata is not available, use empty object for each document + let metadata: any = options ? 
options.metadata : undefined; + if (metadata === undefined) { + metadata = Array.from({ length: documents.length }, () => ({})); + } + + const documentsToInsert = ids.map((id: string, index: number) => ({ + [id]: { + [this.textKey]: documents[index], + [this.embeddingKey]: vectors[index], + [this.metadataKey]: metadata[index], + }, + })); + + const docIds: string[] = []; + for (const document of documentsToInsert) { + try { + const currentDocumentKey = Object.keys(document)[0]; + await this._collection.upsert( + currentDocumentKey, + document[currentDocumentKey] + ); + docIds.push(currentDocumentKey); + } catch (e) { + console.log("error received while upserting document", e); + } + } + + return docIds; + } + + async similaritySearchVectorWithScore( + embeddings: number[], + k = 4, + filter: CouchbaseVectorStoreFilter = {}, + fetchK = 20, + kwargs: { [key: string]: any } = {} + ): Promise<[DocumentInterface>, number][]> { + let { fields } = kwargs; + + if (fields === null) { + fields = [this.textKey, this.metadataKey]; + } + + const searchRequest = new SearchRequest( + new MatchAllSearchQuery() + ).withVectorSearch( + VectorSearch.fromVectorQuery( + new VectorQuery(this.embeddingKey, embeddings).numCandidates(fetchK) + ) + ); + console.log(searchRequest, this.indexName); + + let searchIterator; + const docsWithScore: [DocumentInterface>, number][] = + []; + + try { + if (this.scopedIndex) { + searchIterator = this._scope.search(this.indexName, searchRequest, { + limit: k, + fields: [this.textKey, "metadata"], + raw: filter, + }); + } else { + searchIterator = this.cluster.search(this.indexName, searchRequest, { + limit: k, + fields: [this.textKey, "metadata"], + raw: filter, + }); + } + + const searchRows = (await searchIterator).rows; + for (const row of searchRows) { + console.log(`row: ${JSON.stringify(row)}`); + const text = row.fields[this.textKey]; + delete row.fields[this.textKey]; + const metadataField = row.fields; + const searchScore = row.score; + const 
doc = new Document({ + pageContent: text, + metadata: metadataField, + }); + docsWithScore.push([doc, searchScore]); + } + } catch (err) { + throw new Error(`Search failed with error: ${err}`); + } + return docsWithScore; + } + + async similaritySearchByVector( + embeddings: number[], + k = 4, + filter: CouchbaseVectorStoreFilter = {}, + fetchK = 20, + kwargs: { [key: string]: any } = {} + ): Promise { + const docsWithScore = await this.similaritySearchVectorWithScore( + embeddings, + k, + filter, + fetchK, + kwargs + ); + const docs = []; + for (const doc of docsWithScore) { + docs.push(doc[0]); + } + console.log(docs); + return docs; + } + + async similaritySearch( + query: string, + k = 4, + filter: CouchbaseVectorStoreFilter = {} + ): Promise { + const docsWithScore = await this.similaritySearchWithScore(query,k,filter); + const docs = []; + for (const doc of docsWithScore) { + docs.push(doc[0]); + } + console.log(docs); + return docs; + } + + async similaritySearchWithScore( + query: string, + k = 4, + filter: CouchbaseVectorStoreFilter = {} + ): Promise<[DocumentInterface>, number][]> { + const embeddings = await this.embeddings.embedQuery(query); + const docsWithScore = await this.similaritySearchVectorWithScore( + embeddings, + k, + filter + ); + return docsWithScore; + // const dbHost = this.connectionString + // .split("//") + // .pop() + // ?.split("/")[0] + // .split(":")[0]; + // console.log(dbHost); + // const searchQuery = { + // fields: [this.textKey, "metadata"], + // sort: ["-_score"], + // limit: k, + // query: { match_none: {} }, + // knn: [{ k: k * 10, field: this.embeddingKey, vector: embedding }], + // }; + } + + async textTo2DList(text: string): Promise { + // TODO: Delete before Creating PR + // Split the text into words + const words = text.split(" "); + + // Initialize the 2D list + const numList: number[][] = []; + + // Iterate over each word + for await (const word of words) { + const numWord: number[] = []; + + // Iterate over each 
character in the word + for await (const char of word) { + // Convert the character to its ASCII value and add it to the list + numWord.push(char.charCodeAt(0)); + } + + // Add the numerical word to the 2D list + numList.push(numWord); + } + + return numList; + } + + public async addDocuments( + documents: Document[], + options: AddVectorOptions = {} + ) { + const texts = documents.map(({ pageContent }) => pageContent); + return this.addVectors( + await this.embeddings.embedDocuments(texts), + documents, + options + ); + } +} diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts new file mode 100644 index 000000000000..c9d890fb3a33 --- /dev/null +++ b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts @@ -0,0 +1,33 @@ +import { expect, test } from "@jest/globals"; +import { Cluster } from "couchbase"; +import { OpenAIEmbeddings } from "@langchain/openai"; +// import { faker } from "@faker-js/faker"; +import { CouchbaseVectorSearch } from "../couchbase.js"; + +test("Test Couchbase Cluster connection ", async () => { + const connectionString = "couchbase://3.76.104.168"; + const databaseUsername = "Administrator"; + const databasePassword = "P@ssword1!"; + // const query = ` + // SELECT h.* FROM \`travel-sample\`.inventory.hotel h + // WHERE h.country = 'United States' + // LIMIT 10 + // `; + // const validPageContentFields = ["country", "name", "description"]; + // const validMetadataFields = ["id"]; + + const couchbaseClient = await Cluster.connect(connectionString, { + username: databaseUsername, + password: databasePassword, + configProfile: "wanDevelopment", + }); + + console.log("connected"); + + const embeddings = new OpenAIEmbeddings({openAIApiKey: "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"}); + const couchbaseVectorStore = new CouchbaseVectorSearch(couchbaseClient,"movies-clone","testing", "1024",embeddings,"movies-clone","overview", 
"overview-embeddings") + // const pageContent = faker.lorem.sentence(5); + // await couchbaseVectorStore.addDocuments([{ pageContent, metadata: { foo: "bar" } }]) + const docsWithScore = await couchbaseVectorStore.similaritySearch("Star Wars"); + expect(docsWithScore.length).toBeGreaterThan(0); +}); diff --git a/yarn.lock b/yarn.lock index 31671809b3ec..0cfd4cdf33c5 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6652,6 +6652,13 @@ __metadata: languageName: node linkType: hard +"@couchbase/couchbase-darwin-arm64-napi@npm:4.2.11-rc.1": + version: 4.2.11-rc.1 + resolution: "@couchbase/couchbase-darwin-arm64-napi@npm:4.2.11-rc.1" + conditions: os=darwin & cpu=arm64 + languageName: node + linkType: hard + "@couchbase/couchbase-darwin-x64-napi@npm:4.2.10": version: 4.2.10 resolution: "@couchbase/couchbase-darwin-x64-napi@npm:4.2.10" @@ -6659,6 +6666,13 @@ __metadata: languageName: node linkType: hard +"@couchbase/couchbase-darwin-x64-napi@npm:4.2.11-rc.1": + version: 4.2.11-rc.1 + resolution: "@couchbase/couchbase-darwin-x64-napi@npm:4.2.11-rc.1" + conditions: os=darwin & cpu=x64 + languageName: node + linkType: hard + "@couchbase/couchbase-linux-arm64-napi@npm:4.2.10": version: 4.2.10 resolution: "@couchbase/couchbase-linux-arm64-napi@npm:4.2.10" @@ -6666,6 +6680,13 @@ __metadata: languageName: node linkType: hard +"@couchbase/couchbase-linux-arm64-napi@npm:4.2.11-rc.1": + version: 4.2.11-rc.1 + resolution: "@couchbase/couchbase-linux-arm64-napi@npm:4.2.11-rc.1" + conditions: os=linux & cpu=arm64 + languageName: node + linkType: hard + "@couchbase/couchbase-linux-x64-napi@npm:4.2.10": version: 4.2.10 resolution: "@couchbase/couchbase-linux-x64-napi@npm:4.2.10" @@ -6673,6 +6694,13 @@ __metadata: languageName: node linkType: hard +"@couchbase/couchbase-linux-x64-napi@npm:4.2.11-rc.1": + version: 4.2.11-rc.1 + resolution: "@couchbase/couchbase-linux-x64-napi@npm:4.2.11-rc.1" + conditions: os=linux & cpu=x64 + languageName: node + linkType: hard + 
"@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.10": version: 4.2.10 resolution: "@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.10" @@ -6680,6 +6708,13 @@ __metadata: languageName: node linkType: hard +"@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.11-rc.1": + version: 4.2.11-rc.1 + resolution: "@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.11-rc.1" + conditions: os=linux & cpu=x64 + languageName: node + linkType: hard + "@couchbase/couchbase-win32-x64-napi@npm:4.2.10": version: 4.2.10 resolution: "@couchbase/couchbase-win32-x64-napi@npm:4.2.10" @@ -6687,6 +6722,13 @@ __metadata: languageName: node linkType: hard +"@couchbase/couchbase-win32-x64-napi@npm:4.2.11-rc.1": + version: 4.2.11-rc.1 + resolution: "@couchbase/couchbase-win32-x64-napi@npm:4.2.11-rc.1" + conditions: os=win32 & cpu=x64 + languageName: node + linkType: hard + "@crawlee/types@npm:^3.3.0": version: 3.3.1 resolution: "@crawlee/types@npm:3.3.1" @@ -8981,6 +9023,7 @@ __metadata: closevector-web: 0.1.6 cohere-ai: ">=6.0.0" convex: ^1.3.1 + couchbase: ^4.2.11-rc.1 discord.js: ^14.14.1 dotenv: ^16.0.3 dpdm: ^3.12.0 @@ -9086,6 +9129,7 @@ __metadata: closevector-web: 0.1.6 cohere-ai: "*" convex: ^1.3.1 + couchbase: ^4.2.11-rc.1 discord.js: ^14.14.1 dria: ^0.0.3 faiss-node: ^0.5.1 @@ -18488,6 +18532,35 @@ __metadata: languageName: node linkType: hard +"couchbase@npm:^4.2.11-rc.1": + version: 4.2.11-rc.1 + resolution: "couchbase@npm:4.2.11-rc.1" + dependencies: + "@couchbase/couchbase-darwin-arm64-napi": 4.2.11-rc.1 + "@couchbase/couchbase-darwin-x64-napi": 4.2.11-rc.1 + "@couchbase/couchbase-linux-arm64-napi": 4.2.11-rc.1 + "@couchbase/couchbase-linux-x64-napi": 4.2.11-rc.1 + "@couchbase/couchbase-linuxmusl-x64-napi": 4.2.11-rc.1 + "@couchbase/couchbase-win32-x64-napi": 4.2.11-rc.1 + cmake-js: ^7.2.1 + node-addon-api: ^7.0.0 + dependenciesMeta: + "@couchbase/couchbase-darwin-arm64-napi": + optional: true + "@couchbase/couchbase-darwin-x64-napi": + optional: true + 
"@couchbase/couchbase-linux-arm64-napi": + optional: true + "@couchbase/couchbase-linux-x64-napi": + optional: true + "@couchbase/couchbase-linuxmusl-x64-napi": + optional: true + "@couchbase/couchbase-win32-x64-napi": + optional: true + checksum: ad0bce7bce551e037dc178fac95a071be75a7374828b4c14ca8db6fc45425e50b3118ac0b062b13378dd7906b3c846d02af470ca4ed845cf077ba015174d9cd1 + languageName: node + linkType: hard + "create-langchain-integration@workspace:libs/create-langchain-integration": version: 0.0.0-use.local resolution: "create-langchain-integration@workspace:libs/create-langchain-integration" From 02355334a7b5fbe22d8110684e753d5166ef8fbd Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Tue, 5 Mar 2024 12:58:34 +0530 Subject: [PATCH 09/18] improved the params of couchbase --- .../src/vectorstores/couchbase.ts | 17 ++++++++++------- .../src/vectorstores/tests/couchbase.test.ts | 6 +++--- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/libs/langchain-community/src/vectorstores/couchbase.ts b/libs/langchain-community/src/vectorstores/couchbase.ts index 60f75581b120..9043f7925e71 100644 --- a/libs/langchain-community/src/vectorstores/couchbase.ts +++ b/libs/langchain-community/src/vectorstores/couchbase.ts @@ -5,7 +5,6 @@ import { Bucket, Cluster, Collection, - MatchAllSearchQuery, Scope, SearchRequest, VectorQuery, @@ -171,13 +170,17 @@ export class CouchbaseVectorSearch extends VectorStore { fields = [this.textKey, this.metadataKey]; } + // const searchRequest = new SearchRequest( + // new MatchAllSearchQuery() + // ).withVectorSearch( + // VectorSearch.fromVectorQuery( + // new VectorQuery(this.embeddingKey, embeddings).numCandidates(fetchK) + // ) + // ); const searchRequest = new SearchRequest( - new MatchAllSearchQuery() - ).withVectorSearch( VectorSearch.fromVectorQuery( - new VectorQuery(this.embeddingKey, embeddings).numCandidates(fetchK) - ) - ); + new VectorQuery(this.embeddingKey, + embeddings).numCandidates(fetchK))); 
console.log(searchRequest, this.indexName); let searchIterator; @@ -241,7 +244,7 @@ export class CouchbaseVectorSearch extends VectorStore { } async similaritySearch( - query: string, + query: string, k = 4, filter: CouchbaseVectorStoreFilter = {} ): Promise { diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts index c9d890fb3a33..6faa219c9be2 100644 --- a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts +++ b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts @@ -24,10 +24,10 @@ test("Test Couchbase Cluster connection ", async () => { console.log("connected"); - const embeddings = new OpenAIEmbeddings({openAIApiKey: "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"}); - const couchbaseVectorStore = new CouchbaseVectorSearch(couchbaseClient,"movies-clone","testing", "1024",embeddings,"movies-clone","overview", "overview-embeddings") + const embeddings = new OpenAIEmbeddings({openAIApiKey: "OPEN-AI-API-KEY"}) + const couchbaseVectorStore = new CouchbaseVectorSearch(couchbaseClient,"movies-clone","testing", "1024",embeddings,"movies-clone","overview", "overview_embedding") // const pageContent = faker.lorem.sentence(5); // await couchbaseVectorStore.addDocuments([{ pageContent, metadata: { foo: "bar" } }]) - const docsWithScore = await couchbaseVectorStore.similaritySearch("Star Wars"); + const docsWithScore = await couchbaseVectorStore.similaritySearch("star wars"); expect(docsWithScore.length).toBeGreaterThan(0); }); From feff046aadd9d1fd6a68b6b6e1496e07e3ff43c0 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Tue, 5 Mar 2024 14:18:10 +0530 Subject: [PATCH 10/18] bump couchbase sdk version --- libs/langchain-community/package.json | 4 +- yarn.lock | 75 ++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 
671e82f2b8d4..eed6f1fcc22d 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -121,7 +121,7 @@ "closevector-web": "0.1.6", "cohere-ai": ">=6.0.0", "convex": "^1.3.1", - "couchbase": "^4.2.11-rc.1", + "couchbase": "^4.2.11", "discord.js": "^14.14.1", "dotenv": "^16.0.3", "dpdm": "^3.12.0", @@ -224,7 +224,7 @@ "closevector-web": "0.1.6", "cohere-ai": "*", "convex": "^1.3.1", - "couchbase": "^4.2.11-rc.1", + "couchbase": "^4.2.11", "discord.js": "^14.14.1", "dria": "^0.0.3", "faiss-node": "^0.5.1", diff --git a/yarn.lock b/yarn.lock index 11d5fb591c1f..e9fa20f75b3b 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6652,6 +6652,13 @@ __metadata: languageName: node linkType: hard +"@couchbase/couchbase-darwin-arm64-napi@npm:4.2.11": + version: 4.2.11 + resolution: "@couchbase/couchbase-darwin-arm64-napi@npm:4.2.11" + conditions: os=darwin & cpu=arm64 + languageName: node + linkType: hard + "@couchbase/couchbase-darwin-x64-napi@npm:4.2.10": version: 4.2.10 resolution: "@couchbase/couchbase-darwin-x64-napi@npm:4.2.10" @@ -6659,6 +6666,13 @@ __metadata: languageName: node linkType: hard +"@couchbase/couchbase-darwin-x64-napi@npm:4.2.11": + version: 4.2.11 + resolution: "@couchbase/couchbase-darwin-x64-napi@npm:4.2.11" + conditions: os=darwin & cpu=x64 + languageName: node + linkType: hard + "@couchbase/couchbase-linux-arm64-napi@npm:4.2.10": version: 4.2.10 resolution: "@couchbase/couchbase-linux-arm64-napi@npm:4.2.10" @@ -6666,6 +6680,13 @@ __metadata: languageName: node linkType: hard +"@couchbase/couchbase-linux-arm64-napi@npm:4.2.11": + version: 4.2.11 + resolution: "@couchbase/couchbase-linux-arm64-napi@npm:4.2.11" + conditions: os=linux & cpu=arm64 + languageName: node + linkType: hard + "@couchbase/couchbase-linux-x64-napi@npm:4.2.10": version: 4.2.10 resolution: "@couchbase/couchbase-linux-x64-napi@npm:4.2.10" @@ -6673,6 +6694,13 @@ __metadata: languageName: node linkType: hard 
+"@couchbase/couchbase-linux-x64-napi@npm:4.2.11": + version: 4.2.11 + resolution: "@couchbase/couchbase-linux-x64-napi@npm:4.2.11" + conditions: os=linux & cpu=x64 + languageName: node + linkType: hard + "@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.10": version: 4.2.10 resolution: "@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.10" @@ -6680,6 +6708,13 @@ __metadata: languageName: node linkType: hard +"@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.11": + version: 4.2.11 + resolution: "@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.11" + conditions: os=linux & cpu=x64 + languageName: node + linkType: hard + "@couchbase/couchbase-win32-x64-napi@npm:4.2.10": version: 4.2.10 resolution: "@couchbase/couchbase-win32-x64-napi@npm:4.2.10" @@ -6687,6 +6722,13 @@ __metadata: languageName: node linkType: hard +"@couchbase/couchbase-win32-x64-napi@npm:4.2.11": + version: 4.2.11 + resolution: "@couchbase/couchbase-win32-x64-napi@npm:4.2.11" + conditions: os=win32 & cpu=x64 + languageName: node + linkType: hard + "@crawlee/types@npm:^3.3.0": version: 3.3.1 resolution: "@crawlee/types@npm:3.3.1" @@ -8998,7 +9040,7 @@ __metadata: closevector-web: 0.1.6 cohere-ai: ">=6.0.0" convex: ^1.3.1 - couchbase: ^4.2.11-rc.1 + couchbase: ^4.2.11 discord.js: ^14.14.1 dotenv: ^16.0.3 dpdm: ^3.12.0 @@ -9104,7 +9146,7 @@ __metadata: closevector-web: 0.1.6 cohere-ai: "*" convex: ^1.3.1 - couchbase: ^4.2.11-rc.1 + couchbase: ^4.2.11 discord.js: ^14.14.1 dria: ^0.0.3 faiss-node: ^0.5.1 @@ -18691,6 +18733,35 @@ __metadata: languageName: node linkType: hard +"couchbase@npm:^4.2.11": + version: 4.2.11 + resolution: "couchbase@npm:4.2.11" + dependencies: + "@couchbase/couchbase-darwin-arm64-napi": 4.2.11 + "@couchbase/couchbase-darwin-x64-napi": 4.2.11 + "@couchbase/couchbase-linux-arm64-napi": 4.2.11 + "@couchbase/couchbase-linux-x64-napi": 4.2.11 + "@couchbase/couchbase-linuxmusl-x64-napi": 4.2.11 + "@couchbase/couchbase-win32-x64-napi": 4.2.11 + cmake-js: ^7.2.1 + node-addon-api: ^7.0.0 + 
dependenciesMeta: + "@couchbase/couchbase-darwin-arm64-napi": + optional: true + "@couchbase/couchbase-darwin-x64-napi": + optional: true + "@couchbase/couchbase-linux-arm64-napi": + optional: true + "@couchbase/couchbase-linux-x64-napi": + optional: true + "@couchbase/couchbase-linuxmusl-x64-napi": + optional: true + "@couchbase/couchbase-win32-x64-napi": + optional: true + checksum: f1987b8cbe1763d2f30fcd5a51ab5fb1c6c3828ab3e7fa744bc4911e091f3aac89c918960f4664d539e57ff4b2b30d1a7fe5a69c44dc02f60062794c5d45c6de + languageName: node + linkType: hard + "create-langchain-integration@workspace:libs/create-langchain-integration": version: 0.0.0-use.local resolution: "create-langchain-integration@workspace:libs/create-langchain-integration" From de9da4d5aade2ebfb64e8a2ce00bb54fedd53a5c Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Tue, 5 Mar 2024 15:35:12 +0530 Subject: [PATCH 11/18] remove rest implementation --- .../src/vectorstores/couchbase.ts | 77 +++---------------- .../src/vectorstores/tests/couchbase.test.ts | 19 ++--- 2 files changed, 17 insertions(+), 79 deletions(-) diff --git a/libs/langchain-community/src/vectorstores/couchbase.ts b/libs/langchain-community/src/vectorstores/couchbase.ts index 9043f7925e71..b4d44572ac3d 100644 --- a/libs/langchain-community/src/vectorstores/couchbase.ts +++ b/libs/langchain-community/src/vectorstores/couchbase.ts @@ -31,12 +31,6 @@ export class CouchbaseVectorSearch extends VectorStore { private readonly _collection: Collection; - // private readonly connectionString: string; - - // private readonly dbUsername: string; - - // private readonly dbPassword: string; - private readonly bucketName: string; private readonly scopeName: string; @@ -55,9 +49,6 @@ export class CouchbaseVectorSearch extends VectorStore { constructor( cluster: Cluster, - // connectionString: string, - // dbUsername: string, - // dbPassword: string, bucketName: string, scopeName: string, collectionName: string, @@ -69,9 +60,6 @@ export class 
CouchbaseVectorSearch extends VectorStore { ) { super(embedding, embedding); this.cluster = cluster; - // this.connectionString = connectionString; - // this.dbUsername = dbUsername; - // this.dbPassword = dbPassword; this.bucketName = bucketName; this.scopeName = scopeName; this.collectionName = collectionName; @@ -165,23 +153,15 @@ export class CouchbaseVectorSearch extends VectorStore { kwargs: { [key: string]: any } = {} ): Promise<[DocumentInterface>, number][]> { let { fields } = kwargs; - - if (fields === null) { + if (!fields) { fields = [this.textKey, this.metadataKey]; } - // const searchRequest = new SearchRequest( - // new MatchAllSearchQuery() - // ).withVectorSearch( - // VectorSearch.fromVectorQuery( - // new VectorQuery(this.embeddingKey, embeddings).numCandidates(fetchK) - // ) - // ); const searchRequest = new SearchRequest( VectorSearch.fromVectorQuery( - new VectorQuery(this.embeddingKey, - embeddings).numCandidates(fetchK))); - console.log(searchRequest, this.indexName); + new VectorQuery(this.embeddingKey, embeddings).numCandidates(fetchK) + ) + ); let searchIterator; const docsWithScore: [DocumentInterface>, number][] = @@ -204,7 +184,6 @@ export class CouchbaseVectorSearch extends VectorStore { const searchRows = (await searchIterator).rows; for (const row of searchRows) { - console.log(`row: ${JSON.stringify(row)}`); const text = row.fields[this.textKey]; delete row.fields[this.textKey]; const metadataField = row.fields; @@ -239,21 +218,23 @@ export class CouchbaseVectorSearch extends VectorStore { for (const doc of docsWithScore) { docs.push(doc[0]); } - console.log(docs); return docs; } async similaritySearch( - query: string, + query: string, k = 4, filter: CouchbaseVectorStoreFilter = {} ): Promise { - const docsWithScore = await this.similaritySearchWithScore(query,k,filter); + const docsWithScore = await this.similaritySearchWithScore( + query, + k, + filter + ); const docs = []; for (const doc of docsWithScore) { docs.push(doc[0]); } 
- console.log(docs); return docs; } @@ -269,44 +250,6 @@ export class CouchbaseVectorSearch extends VectorStore { filter ); return docsWithScore; - // const dbHost = this.connectionString - // .split("//") - // .pop() - // ?.split("/")[0] - // .split(":")[0]; - // console.log(dbHost); - // const searchQuery = { - // fields: [this.textKey, "metadata"], - // sort: ["-_score"], - // limit: k, - // query: { match_none: {} }, - // knn: [{ k: k * 10, field: this.embeddingKey, vector: embedding }], - // }; - } - - async textTo2DList(text: string): Promise { - // TODO: Delete before Creating PR - // Split the text into words - const words = text.split(" "); - - // Initialize the 2D list - const numList: number[][] = []; - - // Iterate over each word - for await (const word of words) { - const numWord: number[] = []; - - // Iterate over each character in the word - for await (const char of word) { - // Convert the character to its ASCII value and add it to the list - numWord.push(char.charCodeAt(0)); - } - - // Add the numerical word to the 2D list - numList.push(numWord); - } - - return numList; } public async addDocuments( diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts index 6faa219c9be2..11e452c5a5cb 100644 --- a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts +++ b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts @@ -1,3 +1,4 @@ +/* eslint-disable no-process-env */ import { expect, test } from "@jest/globals"; import { Cluster } from "couchbase"; import { OpenAIEmbeddings } from "@langchain/openai"; @@ -5,16 +6,9 @@ import { OpenAIEmbeddings } from "@langchain/openai"; import { CouchbaseVectorSearch } from "../couchbase.js"; test("Test Couchbase Cluster connection ", async () => { - const connectionString = "couchbase://3.76.104.168"; - const databaseUsername = "Administrator"; - const databasePassword = "P@ssword1!"; - // const query = 
` - // SELECT h.* FROM \`travel-sample\`.inventory.hotel h - // WHERE h.country = 'United States' - // LIMIT 10 - // `; - // const validPageContentFields = ["country", "name", "description"]; - // const validMetadataFields = ["id"]; + const connectionString = process.env.DB_CONN_STR || "localhost"; + const databaseUsername = process.env.DB_USERNAME; + const databasePassword = process.env.DB_PASSWORD; const couchbaseClient = await Cluster.connect(connectionString, { username: databaseUsername, @@ -24,10 +18,11 @@ test("Test Couchbase Cluster connection ", async () => { console.log("connected"); - const embeddings = new OpenAIEmbeddings({openAIApiKey: "OPEN-AI-API-KEY"}) + const embeddings = new OpenAIEmbeddings({openAIApiKey: process.env.OPENAI_API_KEY}) const couchbaseVectorStore = new CouchbaseVectorSearch(couchbaseClient,"movies-clone","testing", "1024",embeddings,"movies-clone","overview", "overview_embedding") // const pageContent = faker.lorem.sentence(5); // await couchbaseVectorStore.addDocuments([{ pageContent, metadata: { foo: "bar" } }]) - const docsWithScore = await couchbaseVectorStore.similaritySearch("star wars"); + const docsWithScore = await couchbaseVectorStore.similaritySearch("Dinosaurs are being artificially created in a park where"); + console.log(docsWithScore) expect(docsWithScore.length).toBeGreaterThan(0); }); From dcb6b3347424da7497bf37c8b5fc2a242a2fc665 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Wed, 6 Mar 2024 16:08:43 +0530 Subject: [PATCH 12/18] add tsdoc --- .../src/vectorstores/couchbase.ts | 113 +++++++++++++++++- .../src/vectorstores/tests/couchbase.test.ts | 29 ++++- 2 files changed, 132 insertions(+), 10 deletions(-) diff --git a/libs/langchain-community/src/vectorstores/couchbase.ts b/libs/langchain-community/src/vectorstores/couchbase.ts index b4d44572ac3d..eb9e144ec9d3 100644 --- a/libs/langchain-community/src/vectorstores/couchbase.ts +++ b/libs/langchain-community/src/vectorstores/couchbase.ts @@ -1,3 +1,4 @@ +/* 
eslint-disable no-param-reassign */ /* eslint-disable @typescript-eslint/no-explicit-any */ import { Embeddings } from "@langchain/core/embeddings"; import { VectorStore } from "@langchain/core/vectorstores"; @@ -13,13 +14,21 @@ import { import { Document, DocumentInterface } from "@langchain/core/documents"; import { v4 as uuid } from "uuid"; +/** + * This interface define the optional fields for adding vector + */ export interface AddVectorOptions { ids?: string[]; - metadata?: string[]; + metadata?: Record[]; } type CouchbaseVectorStoreFilter = { [key: string]: any }; +/** + * Class for interacting with the Couchbase database. It extends the + * VectorStore class and provides methods for adding vectors and + * documents, and searching for similar vectors + */ export class CouchbaseVectorSearch extends VectorStore { declare FilterType: CouchbaseVectorStoreFilter; @@ -47,6 +56,22 @@ export class CouchbaseVectorSearch extends VectorStore { private readonly metadataKey = "metadata"; + /** + * Class for interacting with the Couchbase database. + * It extends the VectorStore class and provides methods + * for adding vectors and documents, and searching for similar vectors. + * This also verifies the index + * + * @param cluster - The Couchbase cluster that the store will interact with. + * @param bucketName - The name of the bucket in the Couchbase cluster. + * @param scopeName - The name of the scope within the bucket. + * @param collectionName - The name of the collection within the scope. + * @param embedding - The embeddings to be used for vector operations. + * @param indexName - The name of the index to be used for vector search. + * @param textKey - The key to be used for text in the documents. Defaults to "text". + * @param embeddingKey - The key to be used for embeddings in the documents. If not provided, defaults to undefined. + * @param scopedIndex - Whether to use a scoped index for vector search. Defaults to true. 
+ */ constructor( cluster: Cluster, bucketName: string, @@ -79,6 +104,12 @@ export class CouchbaseVectorSearch extends VectorStore { void this.verifyIndexes(); } + /** + * An asynchrononous method to verify the search indexes. + * It retrieves all indexes and checks if specified index is present. + * + * @throws {Error} If the specified index does not exist in the database. + */ async verifyIndexes() { if (this.scopedIndex) { const allIndexes = await this._scope.searchIndexes().getAllIndexes(); @@ -103,6 +134,17 @@ export class CouchbaseVectorSearch extends VectorStore { return "couchbase"; } + /** + * Add vectors and corresponding documents to a couchbase collection + * If the document IDs are passed, the existing documents (if any) will be + * overwritten with the new ones. + * @param vectors - The vectors to be added to the collection. + * @param documents - The corresponding documents to be added to the collection. + * @param options - Optional parameters for adding vectors. + * This may include the IDs and metadata of the documents to be added. Defaults to an empty object. + * + * @returns - A promise that resolves to an array of document IDs that were added to the collection. + */ public async addVectors( vectors: number[][], documents: Document[], @@ -145,21 +187,40 @@ export class CouchbaseVectorSearch extends VectorStore { return docIds; } + /** + * Performs a similarity search on the vectors in the Couchbase database and returns the documents and their corresponding scores. + * + * @param embeddings - Embedding vector to look up documents similar to. + * @param k - Number of documents to return. Defaults to 4. + * @param filter - Optional search filter that are passed to Couchbase search. Defaults to empty object + * @param kwargs - Optional list of fields to include in the + * metadata of results. Note that these need to be stored in the index. + * If nothing is specified, defaults to document metadata fields. 
+ * + * @returns - Promise of list of [document, score] that are the most similar to the query vector. + * + * @throws If the search operation fails. + */ async similaritySearchVectorWithScore( embeddings: number[], k = 4, filter: CouchbaseVectorStoreFilter = {}, - fetchK = 20, kwargs: { [key: string]: any } = {} ): Promise<[DocumentInterface>, number][]> { let { fields } = kwargs; + if (!fields) { fields = [this.textKey, this.metadataKey]; } + // Document text field needs to be returned from the search + if (!fields.include(this.textKey)) { + fields.push(this.textKey); + } + const searchRequest = new SearchRequest( VectorSearch.fromVectorQuery( - new VectorQuery(this.embeddingKey, embeddings).numCandidates(fetchK) + new VectorQuery(this.embeddingKey, embeddings).numCandidates(k) ) ); @@ -200,18 +261,28 @@ export class CouchbaseVectorSearch extends VectorStore { return docsWithScore; } + /** + * Return documents that are most similar to the vector embedding. + * + * @param embeddings - Embedding to look up documents similar to. + * @param k - The number of similar documents to return. Defaults to 4. + * @param filter - Optional search options that are passed to Couchbase search. Defaults to empty object. + * @param kwargs - Optional list of fields to include in the metadata of results. + * Note that these need to be stored in the index. + * If nothing is specified, defaults to document text and metadata fields. + * + * @returns - A promise that resolves to an array of documents that match the similarity search. + */ async similaritySearchByVector( embeddings: number[], k = 4, filter: CouchbaseVectorStoreFilter = {}, - fetchK = 20, kwargs: { [key: string]: any } = {} ): Promise { const docsWithScore = await this.similaritySearchVectorWithScore( embeddings, k, filter, - fetchK, kwargs ); const docs = []; @@ -221,6 +292,15 @@ export class CouchbaseVectorSearch extends VectorStore { return docs; } + /** + * Return documents that are most similar to the query. 
+ * + * @param query - Query to look up for similar documents + * @param k - The number of similar documents to return. Defaults to 4. + * @param filter - Optional search options that are passed to Couchbase search. Defaults to empty object. + * + * @returns - Promise of list of documents that are most similar to the query. + */ async similaritySearch( query: string, k = 4, @@ -238,6 +318,15 @@ export class CouchbaseVectorSearch extends VectorStore { return docs; } + /** + * Return documents that are most similar to the query with their scores. + * + * @param query - Query to look up for similar documents + * @param k - The number of similar documents to return. Defaults to 4. + * @param filter - Optional search options that are passed to Couchbase search. Defaults to empty object. + * + * @returns - Promise of list of documents that are most similar to the query. + */ async similaritySearchWithScore( query: string, k = 4, @@ -252,11 +341,25 @@ export class CouchbaseVectorSearch extends VectorStore { return docsWithScore; } + /** + * Run texts through the embeddings and persist in vectorstore. + * If the document IDs are passed, the existing documents (if any) will be + * overwritten with the new ones. + * @param documents - The corresponding documents to be added to the collection. + * @param options - Optional parameters for adding documents. + * This may include the IDs and metadata of the documents to be added. Defaults to an empty object. + * + * @returns - A promise that resolves to an array of document IDs that were added to the collection. 
+ */ public async addDocuments( documents: Document[], options: AddVectorOptions = {} ) { const texts = documents.map(({ pageContent }) => pageContent); + const metadatas = documents.map((doc) => doc.metadata); + if (!options.metadata) { + options.metadata = metadatas; + } return this.addVectors( await this.embeddings.embedDocuments(texts), documents, diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts index 11e452c5a5cb..5c17c14f2930 100644 --- a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts +++ b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts @@ -7,8 +7,15 @@ import { CouchbaseVectorSearch } from "../couchbase.js"; test("Test Couchbase Cluster connection ", async () => { const connectionString = process.env.DB_CONN_STR || "localhost"; - const databaseUsername = process.env.DB_USERNAME; + const databaseUsername = process.env.DB_USERNAME; const databasePassword = process.env.DB_PASSWORD; + const bucketName = "movies-clone"; + const scopeName = "testing"; + const collectionName = "1024"; + const indexName = "movies-clone"; + const textFieldKey = "overview"; + const embeddingFieldKey = "overview_embedding"; + const isScopedIndex = true; const couchbaseClient = await Cluster.connect(connectionString, { username: databaseUsername, @@ -18,11 +25,23 @@ test("Test Couchbase Cluster connection ", async () => { console.log("connected"); - const embeddings = new OpenAIEmbeddings({openAIApiKey: process.env.OPENAI_API_KEY}) - const couchbaseVectorStore = new CouchbaseVectorSearch(couchbaseClient,"movies-clone","testing", "1024",embeddings,"movies-clone","overview", "overview_embedding") + const embeddings = new OpenAIEmbeddings({ + openAIApiKey: process.env.OPENAI_API_KEY, + }); + const couchbaseVectorStore = new CouchbaseVectorSearch( + couchbaseClient, + bucketName, + scopeName, + collectionName, + embeddings, + indexName, + textFieldKey, + 
embeddingFieldKey, + isScopedIndex + ); // const pageContent = faker.lorem.sentence(5); // await couchbaseVectorStore.addDocuments([{ pageContent, metadata: { foo: "bar" } }]) - const docsWithScore = await couchbaseVectorStore.similaritySearch("Dinosaurs are being artificially created in a park where"); - console.log(docsWithScore) + const docsWithScore = await couchbaseVectorStore.similaritySearch("titanic"); + console.log(docsWithScore); expect(docsWithScore.length).toBeGreaterThan(0); }); From e666434a281452fde1b204bead78072343a64b8e Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Sat, 9 Mar 2024 19:03:33 +0530 Subject: [PATCH 13/18] use initialize to create instance of class --- libs/langchain-community/.gitignore | 4 + libs/langchain-community/langchain.config.js | 2 + libs/langchain-community/package.json | 16 + .../src/load/import_constants.ts | 1 + .../src/vectorstores/couchbase.ts | 328 ++++++++++++------ .../src/vectorstores/tests/couchbase.test.ts | 51 ++- yarn.lock | 2 + 7 files changed, 289 insertions(+), 115 deletions(-) diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore index ec7a33d56c2f..5dbd32503780 100644 --- a/libs/langchain-community/.gitignore +++ b/libs/langchain-community/.gitignore @@ -298,6 +298,10 @@ vectorstores/convex.cjs vectorstores/convex.js vectorstores/convex.d.ts vectorstores/convex.d.cts +vectorstores/couchbase.cjs +vectorstores/couchbase.js +vectorstores/couchbase.d.ts +vectorstores/couchbase.d.cts vectorstores/elasticsearch.cjs vectorstores/elasticsearch.js vectorstores/elasticsearch.d.ts diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index 74ba742c0cdb..0c173a71a1c1 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -105,6 +105,7 @@ export const config = { "vectorstores/closevector/web": "vectorstores/closevector/web", "vectorstores/cloudflare_vectorize": 
"vectorstores/cloudflare_vectorize", "vectorstores/convex": "vectorstores/convex", + "vectorstores/couchbase": "vectorstores/couchbase", "vectorstores/elasticsearch": "vectorstores/elasticsearch", "vectorstores/faiss": "vectorstores/faiss", "vectorstores/googlevertexai": "vectorstores/googlevertexai", @@ -266,6 +267,7 @@ export const config = { "vectorstores/closevector/web", "vectorstores/cloudflare_vectorize", "vectorstores/convex", + "vectorstores/couchbase", "vectorstores/elasticsearch", "vectorstores/faiss", "vectorstores/googlevertexai", diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index eed6f1fcc22d..17180b4a1cfd 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -417,6 +417,9 @@ "convex": { "optional": true }, + "couchbase": { + "optional": true + }, "discord.js": { "optional": true }, @@ -1190,6 +1193,15 @@ "import": "./vectorstores/convex.js", "require": "./vectorstores/convex.cjs" }, + "./vectorstores/couchbase": { + "types": { + "import": "./vectorstores/couchbase.d.ts", + "require": "./vectorstores/couchbase.d.cts", + "default": "./vectorstores/couchbase.d.ts" + }, + "import": "./vectorstores/couchbase.js", + "require": "./vectorstores/couchbase.cjs" + }, "./vectorstores/elasticsearch": { "types": { "import": "./vectorstores/elasticsearch.d.ts", @@ -2412,6 +2424,10 @@ "vectorstores/convex.js", "vectorstores/convex.d.ts", "vectorstores/convex.d.cts", + "vectorstores/couchbase.cjs", + "vectorstores/couchbase.js", + "vectorstores/couchbase.d.ts", + "vectorstores/couchbase.d.cts", "vectorstores/elasticsearch.cjs", "vectorstores/elasticsearch.js", "vectorstores/elasticsearch.d.ts", diff --git a/libs/langchain-community/src/load/import_constants.ts b/libs/langchain-community/src/load/import_constants.ts index bf5526878cb5..996fdebec21a 100644 --- a/libs/langchain-community/src/load/import_constants.ts +++ b/libs/langchain-community/src/load/import_constants.ts @@ 
-43,6 +43,7 @@ export const optionalImportEntrypoints: string[] = [ "langchain_community/vectorstores/closevector/web", "langchain_community/vectorstores/cloudflare_vectorize", "langchain_community/vectorstores/convex", + "langchain_community/vectorstores/couchbase", "langchain_community/vectorstores/elasticsearch", "langchain_community/vectorstores/faiss", "langchain_community/vectorstores/googlevertexai", diff --git a/libs/langchain-community/src/vectorstores/couchbase.ts b/libs/langchain-community/src/vectorstores/couchbase.ts index eb9e144ec9d3..6708d90f884b 100644 --- a/libs/langchain-community/src/vectorstores/couchbase.ts +++ b/libs/langchain-community/src/vectorstores/couchbase.ts @@ -1,6 +1,6 @@ /* eslint-disable no-param-reassign */ /* eslint-disable @typescript-eslint/no-explicit-any */ -import { Embeddings } from "@langchain/core/embeddings"; +import { EmbeddingsInterface } from "@langchain/core/embeddings"; import { VectorStore } from "@langchain/core/vectorstores"; import { Bucket, @@ -22,6 +22,18 @@ export interface AddVectorOptions { metadata?: Record[]; } +export interface CouchbaseVectorStoreArgs { + cluster: Cluster; + bucketName: string; + scopeName: string; + collectionName: string; + indexName: string; + textKey: string; + embeddingKey: string | undefined; + scopedIndex: boolean; + addVectorOptions?: AddVectorOptions; +} + type CouchbaseVectorStoreFilter = { [key: string]: any }; /** @@ -29,30 +41,30 @@ type CouchbaseVectorStoreFilter = { [key: string]: any }; * VectorStore class and provides methods for adding vectors and * documents, and searching for similar vectors */ -export class CouchbaseVectorSearch extends VectorStore { +export class CouchbaseVectorStore extends VectorStore { declare FilterType: CouchbaseVectorStoreFilter; - private readonly cluster: Cluster; + private cluster: Cluster; - private readonly _bucket: Bucket; + private _bucket: Bucket; - private readonly _scope: Scope; + private _scope: Scope; - private readonly 
_collection: Collection; + private _collection: Collection; - private readonly bucketName: string; + private bucketName: string; - private readonly scopeName: string; + private scopeName: string; - private readonly collectionName: string; + private collectionName: string; - private readonly indexName: string; + private indexName: string; - private readonly textKey: string; + private textKey: string; - private readonly embeddingKey: string; + private embeddingKey: string; - private readonly scopedIndex: boolean; + private scopedIndex: boolean; private readonly metadataKey = "metadata"; @@ -73,35 +85,65 @@ export class CouchbaseVectorSearch extends VectorStore { * @param scopedIndex - Whether to use a scoped index for vector search. Defaults to true. */ constructor( - cluster: Cluster, - bucketName: string, - scopeName: string, - collectionName: string, - embedding: Embeddings, - indexName: string, - textKey = "text", - embeddingKey: string | undefined = undefined, - scopedIndex = true + embedding: EmbeddingsInterface, + config: CouchbaseVectorStoreArgs + ) { + super(embedding, config); + } + + static async initialize( + embeddings: EmbeddingsInterface, + config: CouchbaseVectorStoreArgs ) { - super(embedding, embedding); - this.cluster = cluster; - this.bucketName = bucketName; - this.scopeName = scopeName; - this.collectionName = collectionName; - this.indexName = indexName; - this.textKey = textKey; + const store = new CouchbaseVectorStore(embeddings, config); + + const { + cluster, + bucketName, + scopeName, + collectionName, + indexName, + textKey, + embeddingKey, + scopedIndex, + } = config; + if (embeddingKey) { - this.embeddingKey = embeddingKey; + store.embeddingKey = embeddingKey; } else { - this.embeddingKey = `${textKey}_embedding`; + store.embeddingKey = `${textKey}_embedding`; } - this.scopedIndex = scopedIndex; - this._bucket = this.cluster.bucket(this.bucketName); - this._scope = this._bucket.scope(this.scopeName); - this._collection = 
this._scope.collection(this.collectionName); + store.cluster = cluster; + store.bucketName = bucketName; + store.scopeName = scopeName; + store.collectionName = collectionName; + store.indexName = indexName; + store.textKey = textKey; + store.scopedIndex = scopedIndex; + + try { + store._bucket = store.cluster.bucket(store.bucketName); + store._scope = store._bucket.scope(store.scopeName); + store._collection = store._scope.collection(store.collectionName); + } catch (err) { + throw new Error( + "Error connecting to couchbase, Please check connection and credentials" + ); + } - void this.verifyIndexes(); + try { + if ( + !(await store.checkBucketExists()) || + !(await store.checkIndexExists()) || + !(await store.checkScopeAndCollectionExists()) + ) { + throw new Error("Error while initializing vector store"); + } + } catch (err) { + throw new Error(`Error while initializing vector store: ${err}`); + } + return store; } /** @@ -109,8 +151,10 @@ export class CouchbaseVectorSearch extends VectorStore { * It retrieves all indexes and checks if specified index is present. * * @throws {Error} If the specified index does not exist in the database. + * + * @returns {Promise} returns promise true if no error is found */ - async verifyIndexes() { + private async checkIndexExists(): Promise { if (this.scopedIndex) { const allIndexes = await this._scope.searchIndexes().getAllIndexes(); const indexNames = allIndexes.map((index) => index.name); @@ -128,63 +172,54 @@ export class CouchbaseVectorSearch extends VectorStore { ); } } + return true; } - _vectorstoreType(): string { - return "couchbase"; + private async checkBucketExists(): Promise { + const bucketManager = this.cluster.buckets(); + try { + await bucketManager.getBucket(this.bucketName); + return true; + } catch (error) { + throw new Error( + `Bucket ${this.bucketName} does not exist. 
Please create the bucket before searching.` + ); + } } - /** - * Add vectors and corresponding documents to a couchbase collection - * If the document IDs are passed, the existing documents (if any) will be - * overwritten with the new ones. - * @param vectors - The vectors to be added to the collection. - * @param documents - The corresponding documents to be added to the collection. - * @param options - Optional parameters for adding vectors. - * This may include the IDs and metadata of the documents to be added. Defaults to an empty object. - * - * @returns - A promise that resolves to an array of document IDs that were added to the collection. - */ - public async addVectors( - vectors: number[][], - documents: Document[], - options: AddVectorOptions = {} - ): Promise { - // Get document ids. if ids are not available then use UUIDs for each document - let ids: string[] | undefined = options ? options.ids : undefined; - if (ids === undefined) { - ids = Array.from({ length: documents.length }, () => uuid()); - } + private async checkScopeAndCollectionExists(): Promise { + const scopeCollectionMap: Record = {}; - // Get metadata for each document. if metadata is not available, use empty object for each document - let metadata: any = options ? 
options.metadata : undefined; - if (metadata === undefined) { - metadata = Array.from({ length: documents.length }, () => ({})); + // Get a list of all scopes in the bucket + const scopes = await this._bucket.collections().getAllScopes(); + for (const scope of scopes) { + scopeCollectionMap[scope.name] = []; + + // Get a list of all the collections in the scope + for (const collection of scope.collections) { + scopeCollectionMap[scope.name].push(collection.name); + } } - const documentsToInsert = ids.map((id: string, index: number) => ({ - [id]: { - [this.textKey]: documents[index], - [this.embeddingKey]: vectors[index], - [this.metadataKey]: metadata[index], - }, - })); + // Check if the scope exists + if (!Object.keys(scopeCollectionMap).includes(this.scopeName)) { + throw new Error( + `Scope ${this.scopeName} not found in Couchbase bucket ${this.bucketName}` + ); + } - const docIds: string[] = []; - for (const document of documentsToInsert) { - try { - const currentDocumentKey = Object.keys(document)[0]; - await this._collection.upsert( - currentDocumentKey, - document[currentDocumentKey] - ); - docIds.push(currentDocumentKey); - } catch (e) { - console.log("error received while upserting document", e); - } + // Check if the collection exists in the scope + if (!scopeCollectionMap[this.scopeName].includes(this.collectionName)) { + throw new Error( + `Collection ${this.collectionName} not found in scope ${this.scopeName} in Couchbase bucket ${this.bucketName}` + ); } - return docIds; + return true; + } + + _vectorstoreType(): string { + return "couchbase"; } /** @@ -195,7 +230,7 @@ export class CouchbaseVectorSearch extends VectorStore { * @param filter - Optional search filter that are passed to Couchbase search. Defaults to empty object * @param kwargs - Optional list of fields to include in the * metadata of results. Note that these need to be stored in the index. - * If nothing is specified, defaults to document metadata fields. 
+ * If nothing is specified, defaults to all the fields stored in the index. * * @returns - Promise of list of [document, score] that are the most similar to the query vector. * @@ -206,45 +241,47 @@ export class CouchbaseVectorSearch extends VectorStore { k = 4, filter: CouchbaseVectorStoreFilter = {}, kwargs: { [key: string]: any } = {} - ): Promise<[DocumentInterface>, number][]> { + ): Promise<[Document, number][]> { let { fields } = kwargs; if (!fields) { - fields = [this.textKey, this.metadataKey]; + fields = ["*"]; } - // Document text field needs to be returned from the search - if (!fields.include(this.textKey)) { + if (!(fields.length === 1 && fields[0] === "*" ) && !fields.includes(this.textKey)) { fields.push(this.textKey); } - + console.log("fields",fields) + console.log(this.embeddingKey) const searchRequest = new SearchRequest( VectorSearch.fromVectorQuery( new VectorQuery(this.embeddingKey, embeddings).numCandidates(k) ) ); - + console.log("here1"); let searchIterator; const docsWithScore: [DocumentInterface>, number][] = []; - + console.log("here2"); try { if (this.scopedIndex) { searchIterator = this._scope.search(this.indexName, searchRequest, { limit: k, - fields: [this.textKey, "metadata"], + fields, raw: filter, }); } else { searchIterator = this.cluster.search(this.indexName, searchRequest, { limit: k, - fields: [this.textKey, "metadata"], + fields, raw: filter, }); } const searchRows = (await searchIterator).rows; for (const row of searchRows) { + console.log("row", row); + const text = row.fields[this.textKey]; delete row.fields[this.textKey]; const metadataField = row.fields; @@ -256,6 +293,7 @@ export class CouchbaseVectorSearch extends VectorStore { docsWithScore.push([doc, searchScore]); } } catch (err) { + console.log("error received"); throw new Error(`Search failed with error: ${err}`); } return docsWithScore; @@ -306,8 +344,9 @@ export class CouchbaseVectorSearch extends VectorStore { k = 4, filter: CouchbaseVectorStoreFilter = {} 
): Promise { - const docsWithScore = await this.similaritySearchWithScore( - query, + const queryEmbeddings = await this.embeddings.embedQuery(query); + const docsWithScore = await this.similaritySearchVectorWithScore( + queryEmbeddings, k, filter ); @@ -331,7 +370,7 @@ export class CouchbaseVectorSearch extends VectorStore { query: string, k = 4, filter: CouchbaseVectorStoreFilter = {} - ): Promise<[DocumentInterface>, number][]> { + ): Promise<[Document, number][]> { const embeddings = await this.embeddings.embedQuery(query); const docsWithScore = await this.similaritySearchVectorWithScore( embeddings, @@ -341,6 +380,59 @@ export class CouchbaseVectorSearch extends VectorStore { return docsWithScore; } + /** + * Add vectors and corresponding documents to a couchbase collection + * If the document IDs are passed, the existing documents (if any) will be + * overwritten with the new ones. + * @param vectors - The vectors to be added to the collection. + * @param documents - The corresponding documents to be added to the collection. + * @param options - Optional parameters for adding vectors. + * This may include the IDs and metadata of the documents to be added. Defaults to an empty object. + * + * @returns - A promise that resolves to an array of document IDs that were added to the collection. + */ + public async addVectors( + vectors: number[][], + documents: Document[], + options: AddVectorOptions = {} + ): Promise { + // Get document ids. if ids are not available then use UUIDs for each document + let ids: string[] | undefined = options ? options.ids : undefined; + if (ids === undefined) { + ids = Array.from({ length: documents.length }, () => uuid()); + } + + // Get metadata for each document. if metadata is not available, use empty object for each document + let metadata: any = options ? 
options.metadata : undefined; + if (metadata === undefined) { + metadata = Array.from({ length: documents.length }, () => ({})); + } + + const documentsToInsert = ids.map((id: string, index: number) => ({ + [id]: { + [this.textKey]: documents[index].pageContent, + [this.embeddingKey]: vectors[index], + [this.metadataKey]: metadata[index], + }, + })); + + const docIds: string[] = []; + for (const document of documentsToInsert) { + try { + const currentDocumentKey = Object.keys(document)[0]; + await this._collection.upsert( + currentDocumentKey, + document[currentDocumentKey] + ); + docIds.push(currentDocumentKey); + } catch (e) { + console.log("error received while upserting document", e); + } + } + + return docIds; + } + /** * Run texts through the embeddings and persist in vectorstore. * If the document IDs are passed, the existing documents (if any) will be @@ -366,4 +458,46 @@ export class CouchbaseVectorSearch extends VectorStore { options ); } + + static async fromDocuments( + documents: Document[], + embeddings: EmbeddingsInterface, + config: CouchbaseVectorStoreArgs + ): Promise { + const store = await this.initialize(embeddings, config); + await store.addDocuments(documents, config.addVectorOptions); + return store; + } + + static async fromTexts( + texts: string[], + metadatas: any, + embeddings: EmbeddingsInterface, + config: CouchbaseVectorStoreArgs + ): Promise { + const docs = []; + + for (let i = 0; i < texts.length; i += 1) { + const metadata = Array.isArray(metadatas) ? 
metadatas[i] : metadatas; + const newDoc = new Document({ + pageContent: texts[i], + metadata, + }); + docs.push(newDoc); + } + return await this.fromDocuments(docs, embeddings, config); + } + + public async delete(ids: string[]): Promise { + for (let i = 0; i < ids.length; i += 1) { + const removeId = ids[i]; + try { + await this._collection.remove(removeId); + } catch (err) { + throw new Error( + `Error while deleting document - Document Id: ${ids[i]}, Error: ${err}` + ); + } + } + } } diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts index 5c17c14f2930..c63032d12940 100644 --- a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts +++ b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts @@ -2,19 +2,27 @@ import { expect, test } from "@jest/globals"; import { Cluster } from "couchbase"; import { OpenAIEmbeddings } from "@langchain/openai"; -// import { faker } from "@faker-js/faker"; -import { CouchbaseVectorSearch } from "../couchbase.js"; +import { faker } from "@faker-js/faker"; +// eslint-disable-next-line import/no-extraneous-dependencies +import { PDFLoader } from "langchain/document_loaders/fs/pdf"; +import { + CouchbaseVectorStore, + CouchbaseVectorStoreArgs, +} from "../couchbase.js"; + + + test("Test Couchbase Cluster connection ", async () => { const connectionString = process.env.DB_CONN_STR || "localhost"; const databaseUsername = process.env.DB_USERNAME; const databasePassword = process.env.DB_PASSWORD; const bucketName = "movies-clone"; - const scopeName = "testing"; - const collectionName = "1024"; - const indexName = "movies-clone"; - const textFieldKey = "overview"; - const embeddingFieldKey = "overview_embedding"; + const scopeName = "test2"; + const collectionName = "col1"; + const indexName = "movies-clone-test"; + const textFieldKey = "text"; + const embeddingFieldKey = "embedding"; const isScopedIndex = true; const 
couchbaseClient = await Cluster.connect(connectionString, { @@ -28,20 +36,27 @@ test("Test Couchbase Cluster connection ", async () => { const embeddings = new OpenAIEmbeddings({ openAIApiKey: process.env.OPENAI_API_KEY, }); - const couchbaseVectorStore = new CouchbaseVectorSearch( - couchbaseClient, + + const couchbaseConfig: CouchbaseVectorStoreArgs = { + cluster: couchbaseClient, bucketName, scopeName, collectionName, - embeddings, indexName, - textFieldKey, - embeddingFieldKey, - isScopedIndex - ); - // const pageContent = faker.lorem.sentence(5); + textKey: textFieldKey, + embeddingKey: embeddingFieldKey, + scopedIndex: isScopedIndex, + }; + + const couchbaseVectorStore = await CouchbaseVectorStore.initialize(embeddings, couchbaseConfig); + + const pageContent = faker.lorem.sentence(5); + console.log(pageContent); + const pdf = new PDFLoader("pdfFile"); + const docs = await pdf.load() // await couchbaseVectorStore.addDocuments([{ pageContent, metadata: { foo: "bar" } }]) - const docsWithScore = await couchbaseVectorStore.similaritySearch("titanic"); - console.log(docsWithScore); - expect(docsWithScore.length).toBeGreaterThan(0); + // await CouchbaseVectorStore.fromDocuments(docs,embeddings, couchbaseConfig) + const docsWithScore = await couchbaseVectorStore.similaritySearchWithScore("titanic is bad for climate",4) + // console.log(docsWithScore); + // expect(docsWithScore.length).toBeGreaterThan(0); }); diff --git a/yarn.lock b/yarn.lock index e9fa20f75b3b..85f40b9a3aae 100644 --- a/yarn.lock +++ b/yarn.lock @@ -9285,6 +9285,8 @@ __metadata: optional: true convex: optional: true + couchbase: + optional: true discord.js: optional: true dria: From de640439ee3976df5bcb97802bde0f8ef519919d Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Mon, 11 Mar 2024 12:57:44 +0530 Subject: [PATCH 14/18] improved tsdocs --- langchain/package.json | 4 +- .../src/vectorstores/couchbase.ts | 202 ++++++++++++------ .../src/vectorstores/tests/couchbase.test.ts | 13 +- yarn.lock | 
75 +------ 4 files changed, 150 insertions(+), 144 deletions(-) diff --git a/langchain/package.json b/langchain/package.json index 01454d8d8dc8..f1a07b6e6e54 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -1255,7 +1255,7 @@ "cheerio": "^1.0.0-rc.12", "chromadb": "^1.5.3", "convex": "^1.3.1", - "couchbase": "^4.2.10", + "couchbase": "^4.2.11", "d3-dsv": "^2.0.0", "dotenv": "^16.0.3", "dpdm": "^3.12.0", @@ -1325,7 +1325,7 @@ "cheerio": "^1.0.0-rc.12", "chromadb": "*", "convex": "^1.3.1", - "couchbase": "^4.2.10", + "couchbase": "^4.2.11", "d3-dsv": "^2.0.0", "epub2": "^3.0.1", "fast-xml-parser": "*", diff --git a/libs/langchain-community/src/vectorstores/couchbase.ts b/libs/langchain-community/src/vectorstores/couchbase.ts index 6708d90f884b..9fb3349c5bce 100644 --- a/libs/langchain-community/src/vectorstores/couchbase.ts +++ b/libs/langchain-community/src/vectorstores/couchbase.ts @@ -11,30 +11,45 @@ import { VectorQuery, VectorSearch, } from "couchbase"; -import { Document, DocumentInterface } from "@langchain/core/documents"; +import { Document } from "@langchain/core/documents"; import { v4 as uuid } from "uuid"; /** * This interface define the optional fields for adding vector + * - `ids` - vector of ids for each document. 
If undefined, then uuid will be used + * - `metadata` - vector of metadata object for each document */ export interface AddVectorOptions { ids?: string[]; metadata?: Record[]; } +/** + * This interface defines the fields required to initialize a vector store + */ export interface CouchbaseVectorStoreArgs { cluster: Cluster; bucketName: string; scopeName: string; collectionName: string; indexName: string; - textKey: string; - embeddingKey: string | undefined; - scopedIndex: boolean; + textKey?: string; + embeddingKey?: string; + scopedIndex?: boolean; addVectorOptions?: AddVectorOptions; } -type CouchbaseVectorStoreFilter = { [key: string]: any }; +/** + * This type defines the search filters used in couchbase vector search + * - `fields`: Optional list of fields to include in the + * metadata of results. Note that these need to be stored in the index. + * If nothing is specified, defaults to all the fields stored in the index. + * - `searchOptions`: Optional search options that are passed to Couchbase search. Defaults to empty object. + */ +type CouchbaseVectorStoreFilter = { + fields?: any, + searchOptions?: any +}; /** * Class for interacting with the Couchbase database. 
It extends the @@ -44,6 +59,12 @@ type CouchbaseVectorStoreFilter = { [key: string]: any }; export class CouchbaseVectorStore extends VectorStore { declare FilterType: CouchbaseVectorStoreFilter; + private readonly metadataKey = "metadata"; + + private readonly defaultTextKey = "text"; + + private readonly defaultScopedIndex = true; + private cluster: Cluster; private _bucket: Bucket; @@ -60,37 +81,34 @@ export class CouchbaseVectorStore extends VectorStore { private indexName: string; - private textKey: string; + private textKey = "text"; private embeddingKey: string; private scopedIndex: boolean; - private readonly metadataKey = "metadata"; + constructor( + embedding: EmbeddingsInterface, + config: CouchbaseVectorStoreArgs + ) { + super(embedding, config); + } /** - * Class for interacting with the Couchbase database. + * initialize class for interacting with the Couchbase database. * It extends the VectorStore class and provides methods * for adding vectors and documents, and searching for similar vectors. - * This also verifies the index + * This also verifies the params * * @param cluster - The Couchbase cluster that the store will interact with. * @param bucketName - The name of the bucket in the Couchbase cluster. * @param scopeName - The name of the scope within the bucket. * @param collectionName - The name of the collection within the scope. - * @param embedding - The embeddings to be used for vector operations. * @param indexName - The name of the index to be used for vector search. * @param textKey - The key to be used for text in the documents. Defaults to "text". * @param embeddingKey - The key to be used for embeddings in the documents. If not provided, defaults to undefined. * @param scopedIndex - Whether to use a scoped index for vector search. Defaults to true. 
*/ - constructor( - embedding: EmbeddingsInterface, - config: CouchbaseVectorStoreArgs - ) { - super(embedding, config); - } - static async initialize( embeddings: EmbeddingsInterface, config: CouchbaseVectorStoreArgs @@ -108,19 +126,28 @@ export class CouchbaseVectorStore extends VectorStore { scopedIndex, } = config; - if (embeddingKey) { - store.embeddingKey = embeddingKey; - } else { - store.embeddingKey = `${textKey}_embedding`; - } - store.cluster = cluster; store.bucketName = bucketName; store.scopeName = scopeName; store.collectionName = collectionName; store.indexName = indexName; - store.textKey = textKey; - store.scopedIndex = scopedIndex; + if (textKey) { + store.textKey = textKey; + } else { + store.textKey = store.defaultTextKey; + } + + if (embeddingKey) { + store.embeddingKey = embeddingKey; + } else { + store.embeddingKey = `${store.textKey}_embedding`; + } + + if (scopedIndex !== undefined) { + store.scopedIndex = scopedIndex; + } else { + store.scopedIndex = store.defaultScopedIndex; + } try { store._bucket = store.cluster.bucket(store.bucketName); @@ -150,9 +177,9 @@ export class CouchbaseVectorStore extends VectorStore { * An asynchrononous method to verify the search indexes. * It retrieves all indexes and checks if specified index is present. * - * @throws {Error} If the specified index does not exist in the database. + * @throws - If the specified index does not exist in the database. * - * @returns {Promise} returns promise true if no error is found + * @returns - returns promise true if no error is found */ private async checkIndexExists(): Promise { if (this.scopedIndex) { @@ -175,6 +202,14 @@ export class CouchbaseVectorStore extends VectorStore { return true; } + /** + * An asynchronous method to verify the existence of a bucket. + * It retrieves the bucket using the bucket manager and checks if the specified bucket is present. + * + * @throws - If the specified bucket does not exist in the database. 
+ * + * @returns - Returns a promise that resolves to true if no error is found, indicating the bucket exists. + */ private async checkBucketExists(): Promise { const bucketManager = this.cluster.buckets(); try { @@ -187,6 +222,14 @@ export class CouchbaseVectorStore extends VectorStore { } } + /** + * An asynchronous method to verify the existence of a scope and a collection within that scope. + * It retrieves all scopes and collections in the bucket, and checks if the specified scope and collection are present. + * + * @throws - If the specified scope does not exist in the bucket, or if the specified collection does not exist in the scope. + * + * @returns - Returns a promise that resolves to true if no error is found, indicating the scope and collection exist. + */ private async checkScopeAndCollectionExists(): Promise { const scopeCollectionMap: Record = {}; @@ -225,56 +268,57 @@ export class CouchbaseVectorStore extends VectorStore { /** * Performs a similarity search on the vectors in the Couchbase database and returns the documents and their corresponding scores. * - * @param embeddings - Embedding vector to look up documents similar to. + * @param queryEmbeddings - Embedding vector to look up documents similar to. * @param k - Number of documents to return. Defaults to 4. - * @param filter - Optional search filter that are passed to Couchbase search. Defaults to empty object - * @param kwargs - Optional list of fields to include in the + * @param filter - Optional search filter that are passed to Couchbase search. Defaults to empty object. + * - `fields`: Optional list of fields to include in the * metadata of results. Note that these need to be stored in the index. * If nothing is specified, defaults to all the fields stored in the index. + * - `searchOptions`: Optional search options that are passed to Couchbase search. Defaults to empty object. * * @returns - Promise of list of [document, score] that are the most similar to the query vector. 
* * @throws If the search operation fails. */ async similaritySearchVectorWithScore( - embeddings: number[], + queryEmbeddings: number[], k = 4, - filter: CouchbaseVectorStoreFilter = {}, - kwargs: { [key: string]: any } = {} + filter: CouchbaseVectorStoreFilter = {} ): Promise<[Document, number][]> { - let { fields } = kwargs; + let { fields } = filter; + const { searchOptions } = filter; if (!fields) { - fields = ["*"]; + fields = ["*"]; } - // Document text field needs to be returned from the search - if (!(fields.length === 1 && fields[0] === "*" ) && !fields.includes(this.textKey)) { + if ( + !(fields.length === 1 && fields[0] === "*") && + !fields.includes(this.textKey) + ) { fields.push(this.textKey); } - console.log("fields",fields) - console.log(this.embeddingKey) + const searchRequest = new SearchRequest( VectorSearch.fromVectorQuery( - new VectorQuery(this.embeddingKey, embeddings).numCandidates(k) + new VectorQuery(this.embeddingKey, queryEmbeddings).numCandidates(k) ) ); - console.log("here1"); + let searchIterator; - const docsWithScore: [DocumentInterface>, number][] = + const docsWithScore: [Document, number][] = []; - console.log("here2"); try { if (this.scopedIndex) { searchIterator = this._scope.search(this.indexName, searchRequest, { limit: k, fields, - raw: filter, + raw: searchOptions, }); } else { searchIterator = this.cluster.search(this.indexName, searchRequest, { limit: k, fields, - raw: filter, + raw: searchOptions, }); } @@ -302,26 +346,25 @@ export class CouchbaseVectorStore extends VectorStore { /** * Return documents that are most similar to the vector embedding. * - * @param embeddings - Embedding to look up documents similar to. + * @param queryEmbeddings - Embedding to look up documents similar to. * @param k - The number of similar documents to return. Defaults to 4. - * @param filter - Optional search options that are passed to Couchbase search. Defaults to empty object. 
- * @param kwargs - Optional list of fields to include in the metadata of results. - * Note that these need to be stored in the index. - * If nothing is specified, defaults to document text and metadata fields. + * @param filter - Optional search filter that are passed to Couchbase search. Defaults to empty object. + * - `fields`: Optional list of fields to include in the + * metadata of results. Note that these need to be stored in the index. + * If nothing is specified, defaults to all the fields stored in the index. + * - `searchOptions`: Optional search options that are passed to Couchbase search. Defaults to empty object. * * @returns - A promise that resolves to an array of documents that match the similarity search. */ async similaritySearchByVector( - embeddings: number[], + queryEmbeddings: number[], k = 4, - filter: CouchbaseVectorStoreFilter = {}, - kwargs: { [key: string]: any } = {} + filter: CouchbaseVectorStoreFilter = {} ): Promise { const docsWithScore = await this.similaritySearchVectorWithScore( - embeddings, + queryEmbeddings, k, - filter, - kwargs + filter ); const docs = []; for (const doc of docsWithScore) { @@ -335,7 +378,11 @@ export class CouchbaseVectorStore extends VectorStore { * * @param query - Query to look up for similar documents * @param k - The number of similar documents to return. Defaults to 4. - * @param filter - Optional search options that are passed to Couchbase search. Defaults to empty object. + * @param filter - Optional search filter that are passed to Couchbase search. Defaults to empty object. + * - `fields`: Optional list of fields to include in the + * metadata of results. Note that these need to be stored in the index. + * If nothing is specified, defaults to all the fields stored in the index. + * - `searchOptions`: Optional search options that are passed to Couchbase search. Defaults to empty object. * * @returns - Promise of list of documents that are most similar to the query. 
*/ @@ -362,7 +409,11 @@ export class CouchbaseVectorStore extends VectorStore { * * @param query - Query to look up for similar documents * @param k - The number of similar documents to return. Defaults to 4. - * @param filter - Optional search options that are passed to Couchbase search. Defaults to empty object. + * @param filter - Optional search filter that are passed to Couchbase search. Defaults to empty object. + * - `fields`: Optional list of fields to include in the + * metadata of results. Note that these need to be stored in the index. + * If nothing is specified, defaults to all the fields stored in the index. + * - `searchOptions`: Optional search options that are passed to Couchbase search. Defaults to empty object. * * @returns - Promise of list of documents that are most similar to the query. */ @@ -371,9 +422,9 @@ export class CouchbaseVectorStore extends VectorStore { k = 4, filter: CouchbaseVectorStoreFilter = {} ): Promise<[Document, number][]> { - const embeddings = await this.embeddings.embedQuery(query); + const queryEmbeddings = await this.embeddings.embedQuery(query); const docsWithScore = await this.similaritySearchVectorWithScore( - embeddings, + queryEmbeddings, k, filter ); @@ -410,7 +461,7 @@ export class CouchbaseVectorStore extends VectorStore { const documentsToInsert = ids.map((id: string, index: number) => ({ [id]: { - [this.textKey]: documents[index].pageContent, + [this.textKey]: documents[index].pageContent, [this.embeddingKey]: vectors[index], [this.metadataKey]: metadata[index], }, @@ -459,6 +510,15 @@ export class CouchbaseVectorStore extends VectorStore { ); } + /** + * Create a new CouchbaseVectorStore from a set of documents. + * This function will initialize a new store, add the documents to it, and then return the store. + * @param documents - The documents to be added to the new store. + * @param embeddings - The embeddings to be used for the documents. 
+ * @param config - The configuration for the new CouchbaseVectorStore. This includes the options for adding vectors. + * + * @returns - A promise that resolves to the new CouchbaseVectorStore that contains the added documents. + */ static async fromDocuments( documents: Document[], embeddings: EmbeddingsInterface, @@ -469,6 +529,18 @@ export class CouchbaseVectorStore extends VectorStore { return store; } + /** + * Create a new CouchbaseVectorStore from a set of texts. + * This function will convert each text and its corresponding metadata into a Document, + * initialize a new store, add the documents to it, and then return the store. + * @param texts - The texts to be converted into Documents and added to the new store. + * @param metadatas - The metadata for each text. If an array is passed, each text will have its corresponding metadata. + * If not, all texts will have the same metadata. + * @param embeddings - The embeddings to be used for the documents. + * @param config - The configuration for the new CouchbaseVectorStore. This includes the options for adding vectors. + * + * @returns - A promise that resolves to the new CouchbaseVectorStore that contains the added documents. + */ static async fromTexts( texts: string[], metadatas: any, @@ -488,6 +560,14 @@ export class CouchbaseVectorStore extends VectorStore { return await this.fromDocuments(docs, embeddings, config); } + /** + * Delete documents from the collection. + * This function will attempt to remove each document in the provided list of IDs from the collection. + * If an error occurs during the deletion of a document, an error will be thrown with the ID of the document and the error message. + * @param ids - An array of document IDs to be deleted from the collection. + * + * @returns - A promise that resolves when all documents have been attempted to be deleted. If a document could not be deleted, an error is thrown. 
+ */ public async delete(ids: string[]): Promise { for (let i = 0; i < ids.length; i += 1) { const removeId = ids[i]; diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts index c63032d12940..734ccae8e233 100644 --- a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts +++ b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts @@ -1,10 +1,9 @@ /* eslint-disable no-process-env */ -import { expect, test } from "@jest/globals"; +import { test } from "@jest/globals"; import { Cluster } from "couchbase"; import { OpenAIEmbeddings } from "@langchain/openai"; import { faker } from "@faker-js/faker"; -// eslint-disable-next-line import/no-extraneous-dependencies -import { PDFLoader } from "langchain/document_loaders/fs/pdf"; + import { CouchbaseVectorStore, CouchbaseVectorStoreArgs, @@ -52,11 +51,9 @@ test("Test Couchbase Cluster connection ", async () => { const pageContent = faker.lorem.sentence(5); console.log(pageContent); - const pdf = new PDFLoader("pdfFile"); - const docs = await pdf.load() - // await couchbaseVectorStore.addDocuments([{ pageContent, metadata: { foo: "bar" } }]) - // await CouchbaseVectorStore.fromDocuments(docs,embeddings, couchbaseConfig) + // // await couchbaseVectorStore.addDocuments([{ pageContent, metadata: { foo: "bar" } }]) + // // await CouchbaseVectorStore.fromDocuments(docs,embeddings, couchbaseConfig) const docsWithScore = await couchbaseVectorStore.similaritySearchWithScore("titanic is bad for climate",4) - // console.log(docsWithScore); + console.log( docsWithScore); // expect(docsWithScore.length).toBeGreaterThan(0); }); diff --git a/yarn.lock b/yarn.lock index 85f40b9a3aae..013c848c52d9 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6645,13 +6645,6 @@ __metadata: languageName: node linkType: hard -"@couchbase/couchbase-darwin-arm64-napi@npm:4.2.10": - version: 4.2.10 - resolution: 
"@couchbase/couchbase-darwin-arm64-napi@npm:4.2.10" - conditions: os=darwin & cpu=arm64 - languageName: node - linkType: hard - "@couchbase/couchbase-darwin-arm64-napi@npm:4.2.11": version: 4.2.11 resolution: "@couchbase/couchbase-darwin-arm64-napi@npm:4.2.11" @@ -6659,13 +6652,6 @@ __metadata: languageName: node linkType: hard -"@couchbase/couchbase-darwin-x64-napi@npm:4.2.10": - version: 4.2.10 - resolution: "@couchbase/couchbase-darwin-x64-napi@npm:4.2.10" - conditions: os=darwin & cpu=x64 - languageName: node - linkType: hard - "@couchbase/couchbase-darwin-x64-napi@npm:4.2.11": version: 4.2.11 resolution: "@couchbase/couchbase-darwin-x64-napi@npm:4.2.11" @@ -6673,13 +6659,6 @@ __metadata: languageName: node linkType: hard -"@couchbase/couchbase-linux-arm64-napi@npm:4.2.10": - version: 4.2.10 - resolution: "@couchbase/couchbase-linux-arm64-napi@npm:4.2.10" - conditions: os=linux & cpu=arm64 - languageName: node - linkType: hard - "@couchbase/couchbase-linux-arm64-napi@npm:4.2.11": version: 4.2.11 resolution: "@couchbase/couchbase-linux-arm64-napi@npm:4.2.11" @@ -6687,13 +6666,6 @@ __metadata: languageName: node linkType: hard -"@couchbase/couchbase-linux-x64-napi@npm:4.2.10": - version: 4.2.10 - resolution: "@couchbase/couchbase-linux-x64-napi@npm:4.2.10" - conditions: os=linux & cpu=x64 - languageName: node - linkType: hard - "@couchbase/couchbase-linux-x64-napi@npm:4.2.11": version: 4.2.11 resolution: "@couchbase/couchbase-linux-x64-napi@npm:4.2.11" @@ -6701,13 +6673,6 @@ __metadata: languageName: node linkType: hard -"@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.10": - version: 4.2.10 - resolution: "@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.10" - conditions: os=linux & cpu=x64 - languageName: node - linkType: hard - "@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.11": version: 4.2.11 resolution: "@couchbase/couchbase-linuxmusl-x64-napi@npm:4.2.11" @@ -6715,13 +6680,6 @@ __metadata: languageName: node linkType: hard 
-"@couchbase/couchbase-win32-x64-napi@npm:4.2.10": - version: 4.2.10 - resolution: "@couchbase/couchbase-win32-x64-napi@npm:4.2.10" - conditions: os=win32 & cpu=x64 - languageName: node - linkType: hard - "@couchbase/couchbase-win32-x64-napi@npm:4.2.11": version: 4.2.11 resolution: "@couchbase/couchbase-win32-x64-napi@npm:4.2.11" @@ -18706,35 +18664,6 @@ __metadata: languageName: node linkType: hard -"couchbase@npm:^4.2.10": - version: 4.2.10 - resolution: "couchbase@npm:4.2.10" - dependencies: - "@couchbase/couchbase-darwin-arm64-napi": 4.2.10 - "@couchbase/couchbase-darwin-x64-napi": 4.2.10 - "@couchbase/couchbase-linux-arm64-napi": 4.2.10 - "@couchbase/couchbase-linux-x64-napi": 4.2.10 - "@couchbase/couchbase-linuxmusl-x64-napi": 4.2.10 - "@couchbase/couchbase-win32-x64-napi": 4.2.10 - cmake-js: ^7.2.1 - node-addon-api: ^7.0.0 - dependenciesMeta: - "@couchbase/couchbase-darwin-arm64-napi": - optional: true - "@couchbase/couchbase-darwin-x64-napi": - optional: true - "@couchbase/couchbase-linux-arm64-napi": - optional: true - "@couchbase/couchbase-linux-x64-napi": - optional: true - "@couchbase/couchbase-linuxmusl-x64-napi": - optional: true - "@couchbase/couchbase-win32-x64-napi": - optional: true - checksum: 1cc4725c5f16c3173691a9e4f702e479df545473deac694f7a8627f58a63a92718824d018730b51a7d4d6a0a8e125b0ef5f3f81cf995a831b8a3adfa05e9ecc7 - languageName: node - linkType: hard - "couchbase@npm:^4.2.11": version: 4.2.11 resolution: "couchbase@npm:4.2.11" @@ -26044,7 +25973,7 @@ __metadata: cheerio: ^1.0.0-rc.12 chromadb: ^1.5.3 convex: ^1.3.1 - couchbase: ^4.2.10 + couchbase: ^4.2.11 d3-dsv: ^2.0.0 dotenv: ^16.0.3 dpdm: ^3.12.0 @@ -26126,7 +26055,7 @@ __metadata: cheerio: ^1.0.0-rc.12 chromadb: "*" convex: ^1.3.1 - couchbase: ^4.2.10 + couchbase: ^4.2.11 d3-dsv: ^2.0.0 epub2: ^3.0.1 fast-xml-parser: "*" From 8fcaca0fab755ff10e17dfffc81b1f5baa26fd98 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Tue, 12 Mar 2024 12:52:36 +0530 Subject: [PATCH 15/18] add tests --- 
.../src/vectorstores/couchbase.ts | 65 ++++++--- .../vectorstores/tests/couchbase.int.test.ts | 135 ++++++++++++++++++ .../src/vectorstores/tests/couchbase.test.ts | 59 -------- 3 files changed, 179 insertions(+), 80 deletions(-) create mode 100644 libs/langchain-community/src/vectorstores/tests/couchbase.int.test.ts delete mode 100644 libs/langchain-community/src/vectorstores/tests/couchbase.test.ts diff --git a/libs/langchain-community/src/vectorstores/couchbase.ts b/libs/langchain-community/src/vectorstores/couchbase.ts index 9fb3349c5bce..c893b5bfbabc 100644 --- a/libs/langchain-community/src/vectorstores/couchbase.ts +++ b/libs/langchain-community/src/vectorstores/couchbase.ts @@ -26,6 +26,16 @@ export interface AddVectorOptions { /** * This interface defines the fields required to initialize a vector store + * These fields are part of the config: + * @property {Cluster} cluster - The Couchbase cluster that the store will interact with. + * @property {string} bucketName - The name of the bucket in the Couchbase cluster. + * @property {string} scopeName - The name of the scope within the bucket. + * @property {string} collectionName - The name of the collection within the scope. + * @property {string} indexName - The name of the index to be used for vector search. + * @property {string} textKey - The key to be used for text in the documents. Defaults to "text". + * @property {string} embeddingKey - The key to be used for embeddings in the documents. If not provided, defaults to `${textKey}_embedding`. + * @property {boolean} scopedIndex - Whether to use a scoped index for vector search. Defaults to true. + * @property {AddVectorOptions} addVectorOptions - Options for adding vectors with specific id/metadata */ export interface CouchbaseVectorStoreArgs { cluster: Cluster; @@ -47,19 +57,20 @@ export interface CouchbaseVectorStoreArgs { * - `searchOptions`: Optional search options that are passed to Couchbase search. Defaults to empty object.
*/ type CouchbaseVectorStoreFilter = { - fields?: any, - searchOptions?: any + fields?: any; + searchOptions?: any; }; /** * Class for interacting with the Couchbase database. It extends the * VectorStore class and provides methods for adding vectors and - * documents, and searching for similar vectors + * documents, and searching for similar vectors. + * Initiate the class using initialize() method. */ export class CouchbaseVectorStore extends VectorStore { declare FilterType: CouchbaseVectorStoreFilter; - private readonly metadataKey = "metadata"; + private metadataKey = "metadata"; private readonly defaultTextKey = "text"; @@ -87,7 +98,13 @@ export class CouchbaseVectorStore extends VectorStore { private scopedIndex: boolean; - constructor( + /** + * The private constructor used to provide embedding to parent class. + * Initialize the class using static initialize() method + * @param embedding - object to generate embedding + * @param config - the fields required to initialize a vector store + */ + private constructor( embedding: EmbeddingsInterface, config: CouchbaseVectorStoreArgs ) { @@ -100,14 +117,8 @@ export class CouchbaseVectorStore extends VectorStore { * for adding vectors and documents, and searching for similar vectors. * This also verifies the params * - * @param cluster - The Couchbase cluster that the store will interact with. - * @param bucketName - The name of the bucket in the Couchbase cluster. - * @param scopeName - The name of the scope within the bucket. - * @param collectionName - The name of the collection within the scope. - * @param indexName - The name of the index to be used for vector search. - * @param textKey - The key to be used for text in the documents. Defaults to "text". - * @param embeddingKey - The key to be used for embeddings in the documents. If not provided, defaults to undefined. - * @param scopedIndex - Whether to use a scoped index for vector search. Defaults to true. 
+ * @param embeddings - object to generate embedding + * @param config - the fields required to initialize a vector store */ static async initialize( embeddings: EmbeddingsInterface, @@ -265,6 +276,22 @@ export class CouchbaseVectorStore extends VectorStore { return "couchbase"; } + /** + * Formats couchbase metadata by removing `metadata.` from initials + * @param fields - all the fields of row + * @returns - formatted metadata fields + */ + private formatMetadata = (fields: any) => { + delete fields[this.textKey]; + const metadataFields: { [key: string]: any } = {}; + // eslint-disable-next-line guard-for-in + for (const key in fields) { + const newKey = key.replace(`${this.metadataKey}.`, ""); + metadataFields[newKey] = fields[key]; + } + return metadataFields; + }; + /** * Performs a similarity search on the vectors in the Couchbase database and returns the documents and their corresponding scores. * @@ -303,10 +330,9 @@ export class CouchbaseVectorStore extends VectorStore { new VectorQuery(this.embeddingKey, queryEmbeddings).numCandidates(k) ) ); - + let searchIterator; - const docsWithScore: [Document, number][] = - []; + const docsWithScore: [Document, number][] = []; try { if (this.scopedIndex) { searchIterator = this._scope.search(this.indexName, searchRequest, { @@ -324,15 +350,12 @@ export class CouchbaseVectorStore extends VectorStore { const searchRows = (await searchIterator).rows; for (const row of searchRows) { - console.log("row", row); - const text = row.fields[this.textKey]; - delete row.fields[this.textKey]; - const metadataField = row.fields; + const metadataFields = this.formatMetadata(row.fields); const searchScore = row.score; const doc = new Document({ pageContent: text, - metadata: metadataField, + metadata: metadataFields, }); docsWithScore.push([doc, searchScore]); } diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase.int.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase.int.test.ts new file mode 
100644 index 000000000000..1e07dc4ec324 --- /dev/null +++ b/libs/langchain-community/src/vectorstores/tests/couchbase.int.test.ts @@ -0,0 +1,135 @@ +/* eslint-disable no-process-env */ +import { describe, test } from "@jest/globals"; +import { Cluster } from "couchbase"; +import { OpenAIEmbeddings } from "@langchain/openai"; +import { Document } from "@langchain/core/documents"; +import { + CouchbaseVectorStore, + CouchbaseVectorStoreArgs, +} from "../couchbase.js"; + +describe("Couchbase vector store", () => { + const connectionString = process.env.DB_CONN_STR || "localhost"; + const databaseUsername = process.env.DB_USERNAME; + const databasePassword = process.env.DB_PASSWORD; + const bucketName = "movies-clone"; + const scopeName = "test2"; + const collectionName = "col1"; + const indexName = "movies-clone-test"; + const textFieldKey = "text"; + const embeddingFieldKey = "embedding"; + const isScopedIndex = true; + let couchbaseClient: Cluster; + + const texts = [ + "Couchbase, built on a key-value store, offers efficient data operations.", + "As a NoSQL database, Couchbase provides scalability and flexibility to handle diverse data types.", + "Couchbase supports N1QL, a SQL-like language, easing the transition for developers familiar with SQL.", + "Couchbase ensures high availability with built-in fault tolerance and automatic multi-master replication.", + "With its memory-first architecture, Couchbase delivers high performance and low latency data access.", + ]; + + const metadata = [ + { id: "101", name: "Efficient Operator" }, + { id: "102", name: "Flexible Storer" }, + { id: "103", name: "Quick Performer" }, + { id: "104", name: "Reliable Guardian" }, + { id: "105", name: "Adaptable Navigator" }, + ]; + + beforeEach(async () => { + couchbaseClient = await Cluster.connect(connectionString, { + username: databaseUsername, + password: databasePassword, + configProfile: "wanDevelopment", + }); + }); + + test("from Texts to vector store", async () => { + const 
embeddings = new OpenAIEmbeddings({ + openAIApiKey: process.env.OPENAI_API_KEY, + }); + + const couchbaseConfig: CouchbaseVectorStoreArgs = { + cluster: couchbaseClient, + bucketName, + scopeName, + collectionName, + indexName, + textKey: textFieldKey, + embeddingKey: embeddingFieldKey, + scopedIndex: isScopedIndex, + }; + + const store = await CouchbaseVectorStore.fromTexts( + texts, + metadata, + embeddings, + couchbaseConfig + ); + const results = await store.similaritySearchWithScore(texts[0], 1); + + expect(results.length).toEqual(1); + expect(results[0][0].pageContent).toEqual(texts[0]); + expect(results[0][0].metadata.name).toEqual(metadata[0].name); + expect(results[0][0].metadata.id).toEqual(metadata[0].id); + }); + + test.skip("Add and delete Documents to vector store", async () => { + const embeddings = new OpenAIEmbeddings({ + openAIApiKey: process.env.OPENAI_API_KEY, + }); + + const couchbaseConfig: CouchbaseVectorStoreArgs = { + cluster: couchbaseClient, + bucketName, + scopeName, + collectionName, + indexName, + textKey: textFieldKey, + embeddingKey: embeddingFieldKey, + scopedIndex: isScopedIndex, + }; + + const documents: Document[] = []; + for (let i = 0; i < texts.length; i += 1) { + documents.push({ + pageContent: texts[i], + metadata: {}, + }); + } + + const store = await CouchbaseVectorStore.initialize( + embeddings, + couchbaseConfig + ); + const ids = await store.addDocuments(documents, { + ids: metadata.map((val) => val.id), + metadata: metadata.map((val) => { + const metadataObj = { + name: val.name, + }; + return metadataObj; + }), + }); + + expect(ids.length).toEqual(texts.length); + for (let i = 0; i < ids.length; i += 1) { + expect(ids[i]).toEqual(metadata[i].id); + } + + const results = await store.similaritySearch(texts[1], 1); + + expect(results.length).toEqual(1); + expect(results[0].pageContent).toEqual(texts[1]); + expect(results[0].metadata.name).toEqual(metadata[1].name); + + await store.delete(ids); + const cbCollection = 
couchbaseClient.bucket(bucketName).scope(scopeName).collection(collectionName) + expect((await cbCollection.exists(ids[0])).exists).toBe(false) + expect((await cbCollection.exists(ids[4])).exists).toBe(false) + + const resultsDeleted = await store.similaritySearch(texts[1], 1); + expect(resultsDeleted.length).not.toEqual(1); + }); +}); diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts deleted file mode 100644 index 734ccae8e233..000000000000 --- a/libs/langchain-community/src/vectorstores/tests/couchbase.test.ts +++ /dev/null @@ -1,59 +0,0 @@ -/* eslint-disable no-process-env */ -import { test } from "@jest/globals"; -import { Cluster } from "couchbase"; -import { OpenAIEmbeddings } from "@langchain/openai"; -import { faker } from "@faker-js/faker"; - -import { - CouchbaseVectorStore, - CouchbaseVectorStoreArgs, -} from "../couchbase.js"; - - - - -test("Test Couchbase Cluster connection ", async () => { - const connectionString = process.env.DB_CONN_STR || "localhost"; - const databaseUsername = process.env.DB_USERNAME; - const databasePassword = process.env.DB_PASSWORD; - const bucketName = "movies-clone"; - const scopeName = "test2"; - const collectionName = "col1"; - const indexName = "movies-clone-test"; - const textFieldKey = "text"; - const embeddingFieldKey = "embedding"; - const isScopedIndex = true; - - const couchbaseClient = await Cluster.connect(connectionString, { - username: databaseUsername, - password: databasePassword, - configProfile: "wanDevelopment", - }); - - console.log("connected"); - - const embeddings = new OpenAIEmbeddings({ - openAIApiKey: process.env.OPENAI_API_KEY, - }); - - const couchbaseConfig: CouchbaseVectorStoreArgs = { - cluster: couchbaseClient, - bucketName, - scopeName, - collectionName, - indexName, - textKey: textFieldKey, - embeddingKey: embeddingFieldKey, - scopedIndex: isScopedIndex, - }; - - const couchbaseVectorStore = await 
CouchbaseVectorStore.initialize(embeddings, couchbaseConfig); - - const pageContent = faker.lorem.sentence(5); - console.log(pageContent); - // // await couchbaseVectorStore.addDocuments([{ pageContent, metadata: { foo: "bar" } }]) - // // await CouchbaseVectorStore.fromDocuments(docs,embeddings, couchbaseConfig) - const docsWithScore = await couchbaseVectorStore.similaritySearchWithScore("titanic is bad for climate",4) - console.log( docsWithScore); - // expect(docsWithScore.length).toBeGreaterThan(0); -}); From 0b44a50ef1d330decbd60aab2e6773ce4d1aaa42 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Tue, 12 Mar 2024 23:46:27 +0530 Subject: [PATCH 16/18] add similarity search in documentation --- .../integrations/vectorstores/couchbase.mdx | 194 ++++++++++++++++++ examples/package.json | 1 + .../couchbase/similaritySearch.ts | 56 +++++ .../vectorstores/tests/couchbase.int.test.ts | 9 +- yarn.lock | 1 + 5 files changed, 258 insertions(+), 3 deletions(-) create mode 100644 docs/core_docs/docs/integrations/vectorstores/couchbase.mdx create mode 100644 examples/src/indexes/vector_stores/couchbase/similaritySearch.ts diff --git a/docs/core_docs/docs/integrations/vectorstores/couchbase.mdx b/docs/core_docs/docs/integrations/vectorstores/couchbase.mdx new file mode 100644 index 000000000000..7fc67b36a267 --- /dev/null +++ b/docs/core_docs/docs/integrations/vectorstores/couchbase.mdx @@ -0,0 +1,194 @@ +--- +hide_table_of_contents: true +sidebar_class_name: node-only +--- + +import CodeBlock from "@theme/CodeBlock"; + +# Couchbase + +:::tip Compatibility +Only available on Node.js. +::: + +[Couchbase](http://couchbase.com/) is an award-winning distributed NoSQL cloud database that delivers unmatched versatility, performance, scalability, and financial value for all of your cloud, mobile, +AI, and edge computing applications. Couchbase embraces AI with coding assistance for developers and vector search for their applications. 
+ +Vector search is a part of the [Full Text Service](https://docs.couchbase.com/server/current/learn/services-and-indexes/services/search-service.html) (FTS) in Couchbase. + +This tutorial explains how to use vector search in Couchbase. You can work with both [Couchbase Capella](https://www.couchbase.com/products/capella/) and your self-managed Couchbase server. + +## Installation + +You will need the `couchbase` and `@langchain/community` packages to use the Couchbase vector store. For this tutorial, we will use OpenAI embeddings. + +```bash npm2yarn +npm install couchbase @langchain/openai @langchain/community +``` + +## Create Couchbase Connection Object + +We create a connection to the Couchbase cluster initially and then pass the cluster object to the Vector Store. Here, we are connecting using the username and password. +You can also connect to your cluster in any other supported way. + +For more information on connecting to the Couchbase cluster, please check the [Node SDK documentation](https://docs.couchbase.com/nodejs-sdk/current/hello-world/start-using-sdk.html#connect). + +```typescript +import { Cluster } from "couchbase"; + +const connectionString = "couchbase://localhost"; // valid couchbase connection string +const dbUsername = "Administrator"; // valid database user with read access to the bucket being queried +const dbPassword = "Password"; // password for the database user + +const couchbaseClient = await Cluster.connect(connectionString, { + username: dbUsername, + password: dbPassword, + configProfile: "wanDevelopment", +}); +``` + +## Create the Vector Index + +Currently, the vector index needs to be created from the Couchbase Capella or Server UI or using the REST interface. + +Let us define a vector index with the name `vector-index` on the `testing` bucket. + +For this example, let us use the Import Index feature of the Full Text Search service in the UI.
+We are defining an index on the `testing` bucket's `_default` scope on the `_default` collection with the vector field set to `embedding` and text field set to `text`. +We are also indexing and storing all the fields under `metadata` in the document dynamically. The similarity metric is set to `dot_product`. + +How to Import an Index to the Full Text Search service? + +- Couchbase Server: Click on Search -> Add Index -> Import + - Copy the following Index definition in the Import screen +- [Couchbase Capella](https://docs.couchbase.com/cloud/search/import-search-index.html) + - Copy the following index definition to a new file `index.json` and import that file in Capella using the instructions in the documentation. + +### Index Definition + +```json +{ + "name": "vector-index", + "type": "fulltext-index", + "params": { + "doc_config": { + "docid_prefix_delim": "", + "docid_regexp": "", + "mode": "type_field", + "type_field": "type" + }, + "mapping": { + "default_analyzer": "standard", + "default_datetime_parser": "dateTimeOptional", + "default_field": "_all", + "default_mapping": { + "dynamic": true, + "enabled": true, + "properties": { + "metadata": { + "dynamic": true, + "enabled": true + }, + "embedding": { + "enabled": true, + "dynamic": false, + "fields": [ + { + "dims": 1536, + "index": true, + "name": "embedding", + "similarity": "dot_product", + "type": "vector", + "vector_index_optimized_for": "recall" + } + ] + }, + "text": { + "enabled": true, + "dynamic": false, + "fields": [ + { + "index": true, + "name": "text", + "store": true, + "type": "text" + } + ] + } + } + }, + "default_type": "_default", + "docvalues_dynamic": false, + "index_dynamic": true, + "store_dynamic": true, + "type_field": "_type" + }, + "store": { + "indexType": "scorch", + "segmentVersion": 16 + } + }, + "sourceType": "gocbcore", + "sourceName": "testing", + "sourceParams": {}, + "planParams": { + "maxPartitionsPerPIndex": 103, + "indexPartitions": 10, + "numReplicas": 0 + } +} 
+``` + +For more details on how to create an FTS index with support for Vector fields, please refer to the documentation: + +- [Couchbase Capella](https://docs.couchbase.com/cloud/search/create-search-indexes.html) +- [Couchbase Server](https://docs.couchbase.com/server/current/search/create-search-indexes.html) + +For using this vector store, CouchbaseVectorStoreArgs needs to be configured. +```typescript +const couchbaseConfig: CouchbaseVectorStoreArgs = { + cluster: couchbaseClient, + bucketName: "testing", + scopeName: "_default", + collectionName: "_default", + indexName: "vector-index", + textKey: "text", + embeddingKey: "embedding", +}; +``` + +## Similarity Search +The following example showcases how to use couchbase vector search and perform similarity search. +For this example, we are going to load the "state_of_the_union.txt" file via the RecursiveCharacterTextSplitter, create langchain documents from the chunks and send to couchbase vector store. +After the data is indexed, we perform a simple query to find the top 4 chunks that are similar to the query "What did president say about Ketanji Brown Jackson". +This example at the end, also shows how to get similarity score + +import SimilaritySearch from "@examples/indexes/vector_stores/couchbase/similaritySearch.ts"; + +{SimilaritySearch} + +## Specifying Fields to Return +You can specify the fields to return from the document using `fields` parameter in the filter during searches. +These fields are returned as part of the `metadata` object. You can fetch any field that is stored in the index. +The `textKey` of the document is returned as part of the document's `pageContent`. + +If you do not specify any fields to be fetched, all the fields stored in the index are returned. + +If you want to fetch one of the fields in the metadata, you need to specify it using `.` +For example, to fetch the `source` field in the metadata, you need to use `metadata.source`. 
+```typescript +const result = await store.similaritySearch(query, 1, { + fields: ["metadata.source"] +}); +console.log(result[0]); +``` + +## Hybrid Search +Couchbase allows you to do hybrid searches by combining vector search results with searches on non-vector fields of the document like the `metadata` object. + +The results will be based on the combination of the results from both vector search and the searches supported by full text search service. +The scores of each of the component searches are added up to get the total score of the result. + +To perform hybrid searches, there is an optional key, `searchOptions` in `fields` parameter that can be passed to all the similarity searches. +The different search/query possibilities for the `searchOptions` can be found [here](https://docs.couchbase.com/server/current/search/search-request-params.html#query-object). + diff --git a/examples/package.json b/examples/package.json index 5e778c67cc97..9aa874c52a3f 100644 --- a/examples/package.json +++ b/examples/package.json @@ -64,6 +64,7 @@ "axios": "^0.26.0", "chromadb": "^1.5.3", "convex": "^1.3.1", + "couchbase": "^4.2.11", "date-fns": "^3.3.1", "exa-js": "^1.0.12", "faiss-node": "^0.5.1", diff --git a/examples/src/indexes/vector_stores/couchbase/similaritySearch.ts b/examples/src/indexes/vector_stores/couchbase/similaritySearch.ts new file mode 100644 index 000000000000..5c2b6b78b7e7 --- /dev/null +++ b/examples/src/indexes/vector_stores/couchbase/similaritySearch.ts @@ -0,0 +1,56 @@ +import { OpenAIEmbeddings } from "@langchain/openai"; +import { + CouchbaseVectorStoreArgs, + CouchbaseVectorStore, +} from "@langchain/community/vectorstores/couchbase"; +import { Cluster } from "couchbase"; +import { readFileSync } from "fs"; +import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; + +const connectionString = process.env.DB_CONN_STR || "localhost"; +const databaseUsername = process.env.DB_USERNAME; +const databasePassword = process.env.DB_PASSWORD; 
+ +const text = readFileSync("state_of_the_union.txt", "utf8"); +const docs = await new RecursiveCharacterTextSplitter().createDocuments([text]); + +const couchbaseClient = await Cluster.connect(connectionString, { + username: databaseUsername, + password: databasePassword, + configProfile: "wanDevelopment", +}); + +const embeddings = new OpenAIEmbeddings({ + openAIApiKey: process.env.OPENAI_API_KEY, +}); + +const couchbaseConfig: CouchbaseVectorStoreArgs = { + cluster: couchbaseClient, + bucketName: "testing", + scopeName: "_default", + collectionName: "_default", + indexName: "vector-index", + textKey: "text", + embeddingKey: "embedding", +}; + +const store = await CouchbaseVectorStore.fromDocuments( + docs, + embeddings, + couchbaseConfig +); + +const query = "What did president say about Ketanji Brown Jackson"; + +const resultsSimilaritySearch = await store.similaritySearch(query); +console.log("resulting documents: ", resultsSimilaritySearch[0]); + +// Similarity Search With Score +const resultsSimilaritySearchWithScore = await store.similaritySearchWithScore(query, 1); +console.log("resulting documents: ", resultsSimilaritySearchWithScore[0][0]); +console.log("resulting scores: ", resultsSimilaritySearchWithScore[0][1]); + +const result = await store.similaritySearch(query, 1, { + fields: ["metadata.source"] +}); +console.log(result[0]); \ No newline at end of file diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase.int.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase.int.test.ts index 1e07dc4ec324..aa0acf28bbd5 100644 --- a/libs/langchain-community/src/vectorstores/tests/couchbase.int.test.ts +++ b/libs/langchain-community/src/vectorstores/tests/couchbase.int.test.ts @@ -125,9 +125,12 @@ describe("Couchbase vector store", () => { expect(results[0].metadata.name).toEqual(metadata[1].name); await store.delete(ids); - const cbCollection = couchbaseClient.bucket(bucketName).scope(scopeName).collection(collectionName) - 
expect((await cbCollection.exists(ids[0])).exists).toBe(false) - expect((await cbCollection.exists(ids[4])).exists).toBe(false) + const cbCollection = couchbaseClient + .bucket(bucketName) + .scope(scopeName) + .collection(collectionName); + expect((await cbCollection.exists(ids[0])).exists).toBe(false); + expect((await cbCollection.exists(ids[4])).exists).toBe(false); const resultsDeleted = await store.similaritySearch(texts[1], 1); expect(resultsDeleted.length).not.toEqual(1); diff --git a/yarn.lock b/yarn.lock index 924d673a1d34..d780f21ec0ec 100644 --- a/yarn.lock +++ b/yarn.lock @@ -21316,6 +21316,7 @@ __metadata: axios: ^0.26.0 chromadb: ^1.5.3 convex: ^1.3.1 + couchbase: ^4.2.11 date-fns: ^3.3.1 dotenv: ^16.0.3 eslint: ^8.33.0 From f369bb1a7f34637daf671131c77d59b61d5713f4 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Wed, 13 Mar 2024 20:49:02 +0530 Subject: [PATCH 17/18] add hybrid search in documentation and tests --- .../integrations/vectorstores/couchbase.mdx | 163 +++++++++++++++++- .../vector_stores/couchbase/.env.example | 13 ++ .../couchbase/similaritySearch.ts | 16 +- .../vectorstores/tests/couchbase.int.test.ts | 134 ++++++++++++-- 4 files changed, 299 insertions(+), 27 deletions(-) create mode 100644 examples/src/indexes/vector_stores/couchbase/.env.example diff --git a/docs/core_docs/docs/integrations/vectorstores/couchbase.mdx b/docs/core_docs/docs/integrations/vectorstores/couchbase.mdx index 7fc67b36a267..588169f00195 100644 --- a/docs/core_docs/docs/integrations/vectorstores/couchbase.mdx +++ b/docs/core_docs/docs/integrations/vectorstores/couchbase.mdx @@ -145,6 +145,7 @@ For more details on how to create an FTS index with support for Vector fields, p - [Couchbase Server](https://docs.couchbase.com/server/current/search/create-search-indexes.html) For using this vector store, CouchbaseVectorStoreArgs needs to be configured. 
+ ```typescript const couchbaseConfig: CouchbaseVectorStoreArgs = { cluster: couchbaseClient, @@ -158,6 +159,7 @@ const couchbaseConfig: CouchbaseVectorStoreArgs = { ``` ## Similarity Search + The following example showcases how to use couchbase vector search and perform similarity search. For this example, we are going to load the "state_of_the_union.txt" file via the RecursiveCharacterTextSplitter, create langchain documents from the chunks and send to couchbase vector store. After the data is indexed, we perform a simple query to find the top 4 chunks that are similar to the query "What did president say about Ketanji Brown Jackson". @@ -168,27 +170,178 @@ import SimilaritySearch from "@examples/indexes/vector_stores/couchbase/similari {SimilaritySearch} ## Specifying Fields to Return -You can specify the fields to return from the document using `fields` parameter in the filter during searches. -These fields are returned as part of the `metadata` object. You can fetch any field that is stored in the index. + +You can specify the fields to return from the document using `fields` parameter in the filter during searches. +These fields are returned as part of the `metadata` object. You can fetch any field that is stored in the index. The `textKey` of the document is returned as part of the document's `pageContent`. If you do not specify any fields to be fetched, all the fields stored in the index are returned. If you want to fetch one of the fields in the metadata, you need to specify it using `.` For example, to fetch the `source` field in the metadata, you need to use `metadata.source`. + ```typescript const result = await store.similaritySearch(query, 1, { - fields: ["metadata.source"] + fields: ["metadata.source"], }); console.log(result[0]); ``` ## Hybrid Search -Couchbase allows you to do hybrid searches by combining vector search results with searches on non-vector fields of the document like the `metadata` object. 
-The results will be based on the combination of the results from both vector search and the searches supported by full text search service. +Couchbase allows you to do hybrid searches by combining vector search results with searches on non-vector fields of the document like the `metadata` object. + +The results will be based on the combination of the results from both vector search and the searches supported by full text search service. The scores of each of the component searches are added up to get the total score of the result. To perform hybrid searches, there is an optional key, `searchOptions` in `fields` parameter that can be passed to all the similarity searches. The different search/query possibilities for the `searchOptions` can be found [here](https://docs.couchbase.com/server/current/search/search-request-params.html#query-object). +### Create Diverse Metadata for Hybrid Search + +In order to simulate hybrid search, let us create some random metadata from the existing documents. +We uniformly add three fields to the metadata, `date` between 2010 & 2020, `rating` between 1 & 5 and `author` set to either John Doe or Jane Doe. +We will also declare few sample queries. + +```typescript +for (let i = 0; i < docs.length; i += 1) { + docs[i].metadata.date = `${2010 + (i % 10)}-01-01`; + docs[i].metadata.rating = 1 + (i % 5); + docs[i].metadata.author = ["John Doe", "Jane Doe"][i % 2]; +} + +const store = await CouchbaseVectorStore.fromDocuments( + docs, + embeddings, + couchbaseConfig +); + +const query = "What did the president say about Ketanji Brown Jackson"; +const independenceQuery = "Any mention about independence?"; +``` + +### Example: Search by Exact Value + +We can search for exact matches on a textual field like the author in the `metadata` object. 
+ +```typescript +const exactValueResult = await store.similaritySearch(query, 4, { + fields: ["metadata.author"], + searchOptions: { + query: { field: "metadata.author", match: "John Doe" }, + }, +}); +console.log(exactValueResult[0]); +``` + +### Example: Search by Partial Match + +We can search for partial matches by specifying a fuzziness for the search. This is useful when you want to search for slight variations or misspellings of a search query. + +Here, "Johny" is close (fuzziness of 1) to "John". + +```typescript +const partialMatchResult = await store.similaritySearch(query, 4, { + fields: ["metadata.author"], + searchOptions: { + query: { field: "metadata.author", match: "Johny", fuzziness: 1 }, + }, +}); +console.log(partialMatchResult[0]); +``` + +### Example: Search by Date Range Query + +We can search for documents that are within a date range query on a date field like `metadata.date`. + +```typescript +const dateRangeResult = await store.similaritySearch(independenceQuery, 4, { + fields: ["metadata.date", "metadata.author"], + searchOptions: { + query: { + start: "2016-12-31", + end: "2017-01-02", + inclusiveStart: true, + inclusiveEnd: false, + field: "metadata.date", + }, + }, +}); +console.log(dateRangeResult[0]); +``` + +### Example: Search by Numeric Range Query + +We can search for documents that are within a range for a numeric field like `metadata.rating`. + +```typescript +const ratingRangeResult = await store.similaritySearch(independenceQuery, 4, { + fields: ["metadata.rating"], + searchOptions: { + query: { + min: 3, + max: 5, + inclusiveMin: false, + inclusiveMax: true, + field: "metadata.rating", + }, + }, +}); +console.log(ratingRangeResult[0]); +``` + +### Example: Combining Multiple Search Conditions + +Different queries can be combined using AND (conjuncts) or OR (disjuncts) operators. + +In this example, we are checking for documents with a rating between 3 & 4 and dated between 2015 & 2018.
+ +```typescript +const multipleConditionsResult = await store.similaritySearch(query, 4, { + fields: ["metadata.rating", "metadata.date"], + searchOptions: { + query: { + conjuncts: [ + { min: 3, max: 4, inclusive_max: true, field: "metadata.rating" }, + { start: "2016-12-31", end: "2017-01-02", field: "metadata.date" }, + ], + }, + }, +}); +console.log(multipleConditionsResult[0]); +``` + +### Other Queries + +Similarly, you can use any of the supported Query methods like Geo Distance, Polygon Search, Wildcard, Regular Expressions, etc. in the `searchOptions` parameter. Please refer to the documentation for more details on the available query methods and their syntax. + +- [Couchbase Capella](https://docs.couchbase.com/cloud/search/search-request-params.html#query-object) +- [Couchbase Server](https://docs.couchbase.com/server/current/search/search-request-params.html#query-object) + +
+
+ +# Frequently Asked Questions + +## Question: Should I create the FTS index before creating the CouchbaseVectorStore object? + +Yes, currently you need to create the FTS index before creating the `CouchbaseVectorStore` object. + +## Question: I am not seeing all the fields that I specified in my search results. + +In Couchbase, we can only return the fields stored in the FTS index. Please ensure that the field that you are trying to access in the search results is part of the index. + +One way to handle this is to store a document's fields dynamically in the index. To do that, you need to select `Store Dynamic Fields` in the Advanced Settings of the FTS index. + +Similarly, if you want to search on dynamic fields, you must index those fields by selecting the option `Index Dynamic Fields` in the FTS index settings. + +Note that these options will increase the size of the index. + +## Question: I am unable to see the metadata object in my search results. + +This is most likely due to the `metadata` field in the document not being indexed by the Couchbase FTS index. In order to index the `metadata` field in the document, you need to add it to the index as a mapping. + +If you choose to map all the fields in the mapping, you will be able to search by all metadata fields. Alternatively, you can select the specific fields inside the `metadata` object to be indexed. You can refer to the docs to learn more about indexing child mappings.
+ +- [Couchbase Capella](https://docs.couchbase.com/cloud/search/create-child-mapping.html) +- [Couchbase Server](https://docs.couchbase.com/server/current/fts/fts-creating-index-from-UI-classic-editor-dynamic.html) diff --git a/examples/src/indexes/vector_stores/couchbase/.env.example b/examples/src/indexes/vector_stores/couchbase/.env.example new file mode 100644 index 000000000000..1e6596582a0c --- /dev/null +++ b/examples/src/indexes/vector_stores/couchbase/.env.example @@ -0,0 +1,13 @@ +# Couchbase connection params +DB_CONN_STR= +DB_USERNAME= +DB_PASSWORD= + +# Couchbase vector store args +DB_BUCKET_NAME= +DB_SCOPE_NAME= +DB_COLLECTION_NAME= +DB_INDEX_NAME= + +# Open AI Key for embeddings +OPENAI_API_KEY= \ No newline at end of file diff --git a/examples/src/indexes/vector_stores/couchbase/similaritySearch.ts b/examples/src/indexes/vector_stores/couchbase/similaritySearch.ts index 5c2b6b78b7e7..ca34556d9733 100644 --- a/examples/src/indexes/vector_stores/couchbase/similaritySearch.ts +++ b/examples/src/indexes/vector_stores/couchbase/similaritySearch.ts @@ -7,9 +7,9 @@ import { Cluster } from "couchbase"; import { readFileSync } from "fs"; import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; -const connectionString = process.env.DB_CONN_STR || "localhost"; -const databaseUsername = process.env.DB_USERNAME; -const databasePassword = process.env.DB_PASSWORD; +const connectionString = process.env.DB_CONN_STR ?? "couchbase://localhost"; +const databaseUsername = process.env.DB_USERNAME ?? "Administrator"; +const databasePassword = process.env.DB_PASSWORD ?? 
"Password"; const text = readFileSync("state_of_the_union.txt", "utf8"); const docs = await new RecursiveCharacterTextSplitter().createDocuments([text]); @@ -20,6 +20,7 @@ const couchbaseClient = await Cluster.connect(connectionString, { configProfile: "wanDevelopment", }); +// Open AI API Key is required to use OpenAIEmbeddings, some other embeddings may also be used const embeddings = new OpenAIEmbeddings({ openAIApiKey: process.env.OPENAI_API_KEY, }); @@ -46,11 +47,14 @@ const resultsSimilaritySearch = await store.similaritySearch(query); console.log("resulting documents: ", resultsSimilaritySearch[0]); // Similarity Search With Score -const resultsSimilaritySearchWithScore = await store.similaritySearchWithScore(query, 1); +const resultsSimilaritySearchWithScore = await store.similaritySearchWithScore( + query, + 1 +); console.log("resulting documents: ", resultsSimilaritySearchWithScore[0][0]); console.log("resulting scores: ", resultsSimilaritySearchWithScore[0][1]); const result = await store.similaritySearch(query, 1, { - fields: ["metadata.source"] + fields: ["metadata.source"], }); -console.log(result[0]); \ No newline at end of file +console.log(result[0]); diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase.int.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase.int.test.ts index aa0acf28bbd5..76f7e8ad266e 100644 --- a/libs/langchain-community/src/vectorstores/tests/couchbase.int.test.ts +++ b/libs/langchain-community/src/vectorstores/tests/couchbase.int.test.ts @@ -1,3 +1,4 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ /* eslint-disable no-process-env */ import { describe, test } from "@jest/globals"; import { Cluster } from "couchbase"; @@ -8,18 +9,19 @@ import { CouchbaseVectorStoreArgs, } from "../couchbase.js"; -describe("Couchbase vector store", () => { - const connectionString = process.env.DB_CONN_STR || "localhost"; - const databaseUsername = process.env.DB_USERNAME; - const databasePassword = 
process.env.DB_PASSWORD; - const bucketName = "movies-clone"; - const scopeName = "test2"; - const collectionName = "col1"; - const indexName = "movies-clone-test"; +describe.skip("Couchbase vector store", () => { + const connectionString = process.env.DB_CONN_STR ?? "couchbase://localhost"; + const databaseUsername = process.env.DB_USERNAME ?? "Administrator"; + const databasePassword = process.env.DB_PASSWORD ?? "Password"; + const bucketName = process.env.DB_BUCKET_NAME ?? "testing"; + const scopeName = process.env.DB_SCOPE_NAME ?? "_default"; + const collectionName = process.env.DB_COLLECTION_NAME ?? "_default"; + const indexName = process.env.DB_INDEX_NAME ?? "vector-index"; const textFieldKey = "text"; const embeddingFieldKey = "embedding"; const isScopedIndex = true; let couchbaseClient: Cluster; + let embeddings: OpenAIEmbeddings; const texts = [ "Couchbase, built on a key-value store, offers efficient data operations.", @@ -43,13 +45,13 @@ describe("Couchbase vector store", () => { password: databasePassword, configProfile: "wanDevelopment", }); - }); - test("from Texts to vector store", async () => { - const embeddings = new OpenAIEmbeddings({ + embeddings = new OpenAIEmbeddings({ openAIApiKey: process.env.OPENAI_API_KEY, }); + }); + test("from Texts to vector store", async () => { const couchbaseConfig: CouchbaseVectorStoreArgs = { cluster: couchbaseClient, bucketName, @@ -75,11 +77,7 @@ describe("Couchbase vector store", () => { expect(results[0][0].metadata.id).toEqual(metadata[0].id); }); - test.skip("Add and delete Documents to vector store", async () => { - const embeddings = new OpenAIEmbeddings({ - openAIApiKey: process.env.OPENAI_API_KEY, - }); - + test("Add and delete Documents to vector store", async () => { const couchbaseConfig: CouchbaseVectorStoreArgs = { cluster: couchbaseClient, bucketName, @@ -135,4 +133,108 @@ describe("Couchbase vector store", () => { const resultsDeleted = await store.similaritySearch(texts[1], 1); 
expect(resultsDeleted.length).not.toEqual(1); }); + + test("hybrid search", async () => { + const couchbaseConfig: CouchbaseVectorStoreArgs = { + cluster: couchbaseClient, + bucketName, + scopeName, + collectionName, + indexName, + textKey: textFieldKey, + embeddingKey: embeddingFieldKey, + scopedIndex: isScopedIndex, + }; + + const query = `Couchbase offers impressive memory-first performance for your important applications.`; + + const hybridSearchMetadata: { [key: string]: any }[] = []; + + // Add More Metadata + for (let i = 0; i < texts.length; i += 1) { + const doc: { [key: string]: any } = {}; + doc.date = `${2020 + (i % 10)}-01-01`; + doc.rating = 1 + (i % 5); + doc.author = ["John Doe", "Jane Doe"][(i + 1) % 2]; + doc.id = (i + 100).toString(); + hybridSearchMetadata.push(doc); + } + const store = await CouchbaseVectorStore.fromTexts( + texts, + hybridSearchMetadata, + embeddings, + couchbaseConfig + ); + + const resultsSimilaritySearch = await store.similaritySearch(query, 1); + expect(resultsSimilaritySearch.length).toEqual(1); + expect(resultsSimilaritySearch[0].metadata.date).not.toEqual(undefined); + + // search by exact value in metadata + const exactValueResult = await store.similaritySearch(query, 4, { + fields: ["metadata.author"], + searchOptions: { + query: { field: "metadata.author", match: "John Doe" }, + }, + }); + + expect(exactValueResult.length).toEqual(4); + expect(exactValueResult[0].metadata.author).toEqual("John Doe"); + + // search by partial match in metadata + const partialMatchResult = await store.similaritySearch(query, 4, { + fields: ["metadata.author"], + searchOptions: { + query: { field: "metadata.author", match: "Johny", fuzziness: 1 }, + }, + }); + + expect(partialMatchResult.length).toEqual(4); + expect(partialMatchResult[0].metadata.author).toEqual("John Doe"); + + // search by date range + const dateRangeResult = await store.similaritySearch(query, 4, { + fields: ["metadata.date", "metadata.author"], + searchOptions: { + 
query: { + start: "2022-12-31", + end: "2023-01-02", + inclusiveStart: true, + inclusiveEnd: false, + field: "metadata.date", + }, + }, + }); + + expect(dateRangeResult.length).toEqual(4); + + // search by rating range + const ratingRangeResult = await store.similaritySearch(texts[0], 4, { + fields: ["metadata.rating"], + searchOptions: { + query: { + min: 3, + max: 5, + inclusiveMin: false, + inclusiveMax: true, + field: "metadata.rating", + }, + }, + }); + expect(ratingRangeResult.length).toEqual(4); + + // multiple search conditions + const multipleConditionsResult = await store.similaritySearch(texts[0], 4, { + fields: ["metadata.rating", "metadata.date"], + searchOptions: { + query: { + conjuncts: [ + { min: 3, max: 4, inclusive_max: true, field: "metadata.rating" }, + { start: "2022-12-31", end: "2023-01-02", field: "metadata.date" }, + ], + }, + }, + }); + expect(multipleConditionsResult.length).toEqual(4); + }); }); From af78ef9d627fca8ccdf1258bd1ab8e5c6a9c0628 Mon Sep 17 00:00:00 2001 From: Lokesh Goel Date: Wed, 13 Mar 2024 21:00:08 +0530 Subject: [PATCH 18/18] remove unwanted files --- langchain/src/vectorstores/couchbase.ts | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 langchain/src/vectorstores/couchbase.ts diff --git a/langchain/src/vectorstores/couchbase.ts b/langchain/src/vectorstores/couchbase.ts deleted file mode 100644 index e69de29bb2d1..000000000000