feat: Add Apify integration (#1)
jirimoravcik authored Apr 26, 2023
1 parent 342ca8c commit 7146bee
Showing 14 changed files with 504 additions and 4 deletions.
48 changes: 48 additions & 0 deletions docs/docs/ecosystem/apify.md
@@ -0,0 +1,48 @@
# Apify

This page covers how to use [Apify](https://apify.com) within LangChain.

## Overview

Apify is a cloud platform for web scraping and data extraction,
which provides an [ecosystem](https://apify.com/store) of more than a thousand
ready-made apps called _Actors_ for various scraping, crawling, and extraction use cases.

[![Apify Actors](/img/ApifyActors.png)](https://apify.com/store)

This integration enables you to run Actors on the Apify platform and load their results into LangChain to feed your vector
indexes with documents and data from the web, e.g. to generate answers from websites with documentation,
blogs, or knowledge bases.

## Installation and Setup

- Install the [Apify API client](https://npmjs.com/package/apify-client) using your favorite package manager:

```bash npm2yarn
npm install apify-client
```

- Get your [Apify API token](https://console.apify.com/account/integrations) and either set it as
an environment variable (`APIFY_API_TOKEN`) or pass it to the `ApifyWrapper` in the constructor.
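As a quick sketch, setting the token as an environment variable in your shell looks like this (the value shown is a placeholder; use the token from your Apify console):

```bash
# Placeholder value; replace with your real Apify API token.
export APIFY_API_TOKEN="your-apify-api-token"
```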

## Wrappers

### Utility

You can use the `ApifyWrapper` to run Actors on the Apify platform.

```ts
import { ApifyWrapper } from "langchain/tools";
```

For a more detailed walkthrough of this wrapper, see [this guide](../modules/agents/tools/integrations/apify.md).

### Loader

You can also use our `ApifyDatasetLoader` to get data from an Apify dataset.

```ts
import { ApifyDatasetLoader } from "langchain/document_loaders/web/apify_dataset";
```

For a more detailed walkthrough of this loader, see [this guide](../modules/indexes/document_loaders/examples/web_loaders/apify_dataset.md).
89 changes: 89 additions & 0 deletions docs/docs/modules/agents/tools/integrations/apify.md
@@ -0,0 +1,89 @@
# Apify

This guide shows how to use the [Apify integration](../../../../ecosystem/apify.md) for LangChain.

[Apify](https://apify.com) is a cloud platform for web scraping and data extraction,
which provides an [ecosystem](https://apify.com/store) of more than a thousand
ready-made apps called _Actors_ for various web scraping, crawling, and data extraction use cases.
For example, you can use it to extract Google Search results, Instagram and Facebook profiles, products from Amazon or Shopify, Google Maps reviews, etc.

In this example, we'll use the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor,
which can deeply crawl websites such as documentation, knowledge bases, help centers, or blogs,
and extract text content from the web pages. Then we feed the documents into a vector index and answer questions from it.

Install the [Apify API client](https://npmjs.com/package/apify-client):

```bash npm2yarn
npm install apify-client
```

First, import `ApifyWrapper` and some other classes into your source code:

```ts
import { OpenAI } from "langchain/llms/openai";
import { RetrievalQAChain } from "langchain/chains";
import { HNSWLib } from "langchain/vectorstores/hnswlib";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { ApifyWrapper } from "langchain/tools";
import { Document } from "langchain/document";
```

Initialize the wrapper using your [Apify API token](https://console.apify.com/account/integrations) and, for the purposes of this example, also your OpenAI API key:

```ts
const OPENAI_API_KEY = "Your OpenAI API key";
const APIFY_API_TOKEN = "Your Apify API token";

const model = new OpenAI({ openAIApiKey: OPENAI_API_KEY });
const apify = new ApifyWrapper(APIFY_API_TOKEN);
```

Then run the Actor, wait for it to finish, and fetch its results from the Apify dataset into a LangChain document loader.

Note that if you already have some results in an Apify dataset, you can load them directly using `ApifyDatasetLoader`, as shown in [this guide](../../../indexes/document_loaders/examples/web_loaders/apify_dataset.md). In that guide, you'll also find the explanation of the `datasetMappingFunction`, which is used to map fields from the Apify dataset records to LangChain `Document` fields.

```ts
const loader = await apify.callActor(
"apify/website-content-crawler",
{ startUrls: [{ url: "https://js.langchain.com/docs/" }] },
(item) =>
new Document({
pageContent: (item.text || "") as string,
metadata: { source: item.url },
})
);
const docs = await loader.load();
```

Initialize the vector index from the crawled documents:

```ts
const vectorStore = await HNSWLib.fromDocuments(
docs,
new OpenAIEmbeddings({ openAIApiKey: OPENAI_API_KEY })
);
```

Next, create the retrieval chain and enter a query:

```ts
const chain = RetrievalQAChain.fromLLM(model, vectorStore.asRetriever(), {
returnSourceDocuments: true,
});
const res = await chain.call({ query: "What is LangChain?" });
```

And finally, output the results:

```ts
console.log(res.text);
console.log(res.sourceDocuments.map((d) => d.metadata.source));
```

```
LangChain is a framework for developing applications powered by language models.
[
'https://js.langchain.com/docs/',
'https://js.langchain.com/docs/modules/chains/',
'https://js.langchain.com/docs/modules/chains/llmchain/',
'https://js.langchain.com/docs/category/functions-4'
]
```
1 change: 1 addition & 0 deletions docs/docs/modules/agents/tools/integrations/index.mdx
@@ -9,6 +9,7 @@ import DocCardList from "@theme/DocCardList";

LangChain provides the following tools you can use out of the box:

- [`ApifyWrapper`](./apify.md) - A wrapper around the Apify platform. Useful for web scraping and data extraction.
- `AWSLambda` - A wrapper around the AWS Lambda API, invoked via the Amazon Web Services Node.js SDK. Useful for invoking serverless functions with any behavior which you need to provide to an Agent.
- `BingSerpAPI` - A wrapper around the Bing Search API. Useful for when you need to answer questions about current events. Input should be a search query.
- `Calculator` - Useful for getting the result of a math expression. The input to this tool should be a valid mathematical expression that could be executed by a simple calculator.
docs/docs/modules/indexes/document_loaders/examples/web_loaders/apify_dataset.md
@@ -0,0 +1,99 @@
# Apify Dataset

This guide shows how to load Apify datasets into LangChain.

[Apify Dataset](https://docs.apify.com/platform/storage/dataset) is a scalable append-only storage with sequential access, built for storing structured web scraping results, such as a list of products or Google SERPs, which can then be exported to various formats like JSON, CSV, or Excel. Datasets are mainly used to save results of [Apify Actors](https://apify.com/store)—serverless cloud programs for various web scraping, crawling, and data extraction use cases.

## Prerequisites

Install the [Apify API client](https://npmjs.com/package/apify-client):

```bash npm2yarn
npm install apify-client
```

You need to have an existing dataset on the Apify platform. If you don't have one, please first check out [this guide](../../../../agents/tools/integrations/apify.md) on how to use Apify to extract content from documentation, knowledge bases, help centers, or blogs.

First, import `ApifyDatasetLoader` into your source code:

```ts
import { ApifyDatasetLoader } from "langchain/document_loaders/web/apify_dataset";
import { Document } from "langchain/document";
```

Then provide a function that maps Apify dataset record fields to LangChain `Document` format.

For example, if your dataset items are structured like this:

```json
{
"url": "https://apify.com",
"text": "Apify is the best web scraping and automation platform."
}
```

The mapping function in the code below converts them to LangChain `Document` format, so that you can use them with any LLM (e.g. for question answering).

```ts
const loader = new ApifyDatasetLoader(
"your-dataset-id",
(item) =>
new Document({
pageContent: (item.text || "") as string,
metadata: { source: item.url },
})
);
const docs = await loader.load();
```

## An example with question answering

In this example, we use data from a dataset to answer a question.

```ts
import { OpenAI } from "langchain/llms/openai";
import { RetrievalQAChain } from "langchain/chains";
import { HNSWLib } from "langchain/vectorstores/hnswlib";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { ApifyDatasetLoader } from "langchain/document_loaders/web/apify_dataset";
import { Document } from "langchain/document";

const OPENAI_API_KEY = "Your OpenAI API key";

// Initialize the LLM to use to answer the question.
const model = new OpenAI({ openAIApiKey: OPENAI_API_KEY });
// Load the data from Apify Dataset
const loader = new ApifyDatasetLoader(
"your-dataset-id",
(item) =>
new Document({
pageContent: (item.text || "") as string,
metadata: { source: item.url },
})
);
const docs = await loader.load();
// Create a vector store from the documents.
const vectorStore = await HNSWLib.fromDocuments(
docs,
new OpenAIEmbeddings({ openAIApiKey: OPENAI_API_KEY })
);

// Create a chain that uses the OpenAI LLM and HNSWLib vector store.
const chain = RetrievalQAChain.fromLLM(model, vectorStore.asRetriever(), {
returnSourceDocuments: true,
});
const res = await chain.call({
query: "What is Apify?",
});
// Output the results
console.log(res.text);
console.log(res.sourceDocuments.map((d: Document) => d.metadata.source));
```

```
Apify is a cloud platform that helps you build reliable web scrapers, fast, and automate anything you can do manually in a web browser.
[
'https://docs.apify.com/platform',
'https://docs.apify.com/platform/integrations',
'https://docs.apify.com/platform/actors/publishing/monetize',
'https://docs.apify.com/platform/security'
]
```
Binary file added docs/static/img/ApifyActors.png
3 changes: 3 additions & 0 deletions langchain/.gitignore
@@ -121,6 +121,9 @@ document_loaders.d.ts
document_loaders/base.cjs
document_loaders/base.js
document_loaders/base.d.ts
document_loaders/web/apify_dataset.cjs
document_loaders/web/apify_dataset.js
document_loaders/web/apify_dataset.d.ts
document_loaders/web/cheerio.cjs
document_loaders/web/cheerio.js
document_loaders/web/cheerio.d.ts
13 changes: 13 additions & 0 deletions langchain/package.json
@@ -133,6 +133,9 @@
"document_loaders/base.cjs",
"document_loaders/base.js",
"document_loaders/base.d.ts",
"document_loaders/web/apify_dataset.cjs",
"document_loaders/web/apify_dataset.js",
"document_loaders/web/apify_dataset.d.ts",
"document_loaders/web/cheerio.cjs",
"document_loaders/web/cheerio.js",
"document_loaders/web/cheerio.d.ts",
@@ -303,6 +306,7 @@
"@types/uuid": "^9",
"@typescript-eslint/eslint-plugin": "^5.51.0",
"@typescript-eslint/parser": "^5.51.0",
"apify-client": "^2.7.1",
"axios": "^0.26.0",
"cheerio": "^1.0.0-rc.12",
"chromadb": "^1.4.0",
@@ -348,6 +352,7 @@
"@pinecone-database/pinecone": "*",
"@supabase/supabase-js": "^2.10.0",
"@zilliz/milvus2-sdk-node": "^2.2.0",
"apify-client": "^2.7.1",
"axios": "*",
"cheerio": "^1.0.0-rc.12",
"chromadb": "^1.4.0",
@@ -392,6 +397,9 @@
"@zilliz/milvus2-sdk-node": {
"optional": true
},
"apify-client": {
"optional": true
},
"axios": {
"optional": true
},
@@ -701,6 +709,11 @@
"import": "./document_loaders/base.js",
"require": "./document_loaders/base.cjs"
},
"./document_loaders/web/apify_dataset": {
"types": "./document_loaders/web/apify_dataset.d.ts",
"import": "./document_loaders/web/apify_dataset.js",
"require": "./document_loaders/web/apify_dataset.cjs"
},
"./document_loaders/web/cheerio": {
"types": "./document_loaders/web/cheerio.d.ts",
"import": "./document_loaders/web/cheerio.js",
1 change: 1 addition & 0 deletions langchain/scripts/create-entrypoints.js
@@ -61,6 +61,7 @@ const entrypoints = {
// document_loaders
document_loaders: "document_loaders/index",
"document_loaders/base": "document_loaders/base",
"document_loaders/web/apify_dataset": "document_loaders/web/apify_dataset",
"document_loaders/web/cheerio": "document_loaders/web/cheerio",
"document_loaders/web/puppeteer": "document_loaders/web/puppeteer",
"document_loaders/web/playwright": "document_loaders/web/playwright",
1 change: 1 addition & 0 deletions langchain/src/document_loaders/index.ts
@@ -10,6 +10,7 @@ export { CollegeConfidentialLoader } from "./web/college_confidential.js";
export { GitbookLoader } from "./web/gitbook.js";
export { HNLoader } from "./web/hn.js";
export { IMSDBLoader } from "./web/imsdb.js";
export { ApifyDatasetLoader } from "./web/apify_dataset.js";
export { DirectoryLoader, UnknownHandling } from "./fs/directory.js";
export { SRTLoader } from "./fs/srt.js";
export { PDFLoader } from "./fs/pdf.js";
45 changes: 45 additions & 0 deletions langchain/src/document_loaders/web/apify_dataset.ts
@@ -0,0 +1,45 @@
import { Document } from "../../document.js";
import { BaseDocumentLoader, DocumentLoader } from "../base.js";

export class ApifyDatasetLoader
extends BaseDocumentLoader
implements DocumentLoader
{
protected datasetId: string;

protected datasetMappingFunction: (
item: Record<string | number, unknown>
) => Document;

constructor(
datasetId: string,
datasetMappingFunction: (item: Record<string | number, unknown>) => Document
) {
super();
this.datasetId = datasetId;
this.datasetMappingFunction = datasetMappingFunction;
}

static async imports(): Promise<{
ApifyClientClass: typeof import("apify-client").ApifyClient;
}> {
try {
const { ApifyClient } = await import("apify-client");
return { ApifyClientClass: ApifyClient };
} catch (e) {
throw new Error(
"Please install apify-client as a dependency with, e.g. `yarn add apify-client`"
);
}
}

async load(): Promise<Document[]> {
const { ApifyClientClass } = await ApifyDatasetLoader.imports();
const apifyClient = new ApifyClientClass();

const datasetItems = (
await apifyClient.dataset(this.datasetId).listItems({ clean: true })
).items;
return datasetItems.map(this.datasetMappingFunction);
}
}