forked from langchain-ai/langchainjs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
342ca8c
commit 7146bee
Showing
14 changed files
with
504 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# Apify | ||
|
||
This page covers how to use [Apify](https://apify.com) within LangChain. | ||
|
||
## Overview | ||
|
||
Apify is a cloud platform for web scraping and data extraction, | ||
which provides an [ecosystem](https://apify.com/store) of more than a thousand | ||
ready-made apps called _Actors_ for various scraping, crawling, and extraction use cases. | ||
|
||
[![Apify Actors](/img/ApifyActors.png)](https://apify.com/store) | ||
|
||
This integration enables you to run Actors on the Apify platform and load their results into LangChain to feed your vector | ||
indexes with documents and data from the web, e.g. to generate answers from websites with documentation, | ||
blogs, or knowledge bases. | ||
|
||
## Installation and Setup | ||
|
||
- Install the [Apify API client](https://npmjs.com/package/apify-client) using your favorite package manager: | ||
|
||
```bash npm2yarn | ||
npm install apify-client | ||
``` | ||
|
||
- Get your [Apify API token](https://console.apify.com/account/integrations) and either set it as | ||
an environment variable (`APIFY_API_TOKEN`) or pass it to the `ApifyWrapper` in the constructor. | ||
|
||
## Wrappers | ||
|
||
### Utility | ||
|
||
You can use the `ApifyWrapper` to run Actors on the Apify platform. | ||
|
||
```ts | ||
import { ApifyWrapper } from "langchain/tools"; | ||
``` | ||
|
||
For a more detailed walkthrough of this wrapper, see [this guide](../modules/agents/tools/integrations/apify.md). | ||
|
||
### Loader | ||
|
||
You can also use our `ApifyDatasetLoader` to get data from an Apify dataset. | ||
|
||
```ts | ||
import { ApifyDatasetLoader } from "langchain/document_loaders/web/apify_dataset"; | ||
``` | ||
|
||
For a more detailed walkthrough of this loader, see [this guide](../modules/indexes/document_loaders/examples/web_loaders/apify_dataset.md). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
# Apify | ||
|
||
This guide shows how to use the [Apify integration](../../../../ecosystem/apify.md) for LangChain. | ||
|
||
[Apify](https://apify.com) is a cloud platform for web scraping and data extraction, | ||
which provides an [ecosystem](https://apify.com/store) of more than a thousand | ||
ready-made apps called _Actors_ for various web scraping, crawling, and data extraction use cases. | ||
For example, you can use it to extract Google Search results, Instagram and Facebook profiles, products from Amazon or Shopify, Google Maps reviews, etc. | ||
|
||
In this example, we'll use the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor, | ||
which can deeply crawl websites such as documentation, knowledge bases, help centers, or blogs, | ||
and extract text content from the web pages. Then we feed the documents into a vector index and answer questions from it. | ||
|
||
```bash npm2yarn | ||
npm install apify-client | ||
``` | ||
|
||
First, import `ApifyWrapper` and some other classes into your source code: | ||
|
||
```ts | ||
import { OpenAI } from "langchain/llms/openai"; | ||
import { RetrievalQAChain } from "langchain/chains"; | ||
import { HNSWLib } from "langchain/vectorstores/hnswlib"; | ||
import { OpenAIEmbeddings } from "langchain/embeddings/openai"; | ||
import { ApifyWrapper } from "langchain/tools"; | ||
import { Document } from "langchain/document"; | ||
``` | ||
|
||
Initialize it using your [Apify API token](https://console.apify.com/account/integrations) and for the purpose of this example, also with your OpenAI API key: | ||
|
||
```ts | ||
const OPENAI_API_KEY = "Your OpenAI API key"; | ||
const APIFY_API_TOKEN = "Your Apify API token"; | ||
|
||
const model = new OpenAI({ openAIApiKey: OPENAI_API_KEY }); | ||
const apify = new ApifyWrapper(APIFY_API_TOKEN); | ||
``` | ||
|
||
Then run the Actor, wait for it to finish, and fetch its results from the Apify dataset into a LangChain document loader. | ||
|
||
Note that if you already have some results in an Apify dataset, you can load them directly using `ApifyDatasetLoader`, as shown in [this guide](../../../indexes/document_loaders/examples/web_loaders/apify_dataset.md). In that guide, you'll also find the explanation of the `datasetMappingFunction`, which is used to map fields from the Apify dataset records to LangChain `Document` fields. | ||
|
||
```ts | ||
const loader = await apify.callActor( | ||
"apify/website-content-crawler", | ||
{ startUrls: [{ url: "https://js.langchain.com/docs/" }] }, | ||
(item) => | ||
new Document({ | ||
pageContent: (item.text || "") as string, | ||
metadata: { source: item.url }, | ||
}) | ||
); | ||
const docs = await loader.load(); | ||
``` | ||
|
||
Initialize the vector index from the crawled documents: | ||
|
||
```ts | ||
const vectorStore = await HNSWLib.fromDocuments( | ||
docs, | ||
new OpenAIEmbeddings({ openAIApiKey: OPENAI_API_KEY }) | ||
); | ||
``` | ||
|
||
Next, create the retrieval chain and enter a query: | ||
|
||
```ts | ||
const chain = RetrievalQAChain.fromLLM(model, vectorStore.asRetriever(), { | ||
returnSourceDocuments: true, | ||
}); | ||
const res = await chain.call({ query: "What is LangChain?" }); | ||
``` | ||
|
||
And finally, output the results: | ||
|
||
```ts | ||
console.log(res.text); | ||
console.log(res.sourceDocuments.map((d) => d.metadata.source)); | ||
``` | ||
|
||
``` | ||
LangChain is a framework for developing applications powered by language models. | ||
[ | ||
'https://js.langchain.com/docs/', | ||
'https://js.langchain.com/docs/modules/chains/', | ||
'https://js.langchain.com/docs/modules/chains/llmchain/', | ||
'https://js.langchain.com/docs/category/functions-4' | ||
] | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
99 changes: 99 additions & 0 deletions
99
docs/docs/modules/indexes/document_loaders/examples/web_loaders/apify_dataset.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
# Apify Dataset | ||
|
||
This guide shows how to load Apify datasets to LangChain. | ||
|
||
[Apify Dataset](https://docs.apify.com/platform/storage/dataset) is a scalable append-only storage with sequential access built for storing structured web scraping results, such as a list of products or Google SERPs, and then exporting them to various formats like JSON, CSV, or Excel. Datasets are mainly used to save results of [Apify Actors](https://apify.com/store)—serverless cloud programs for various web scraping, crawling, and data extraction use cases. | ||
|
||
## Prerequisites | ||
|
||
```bash npm2yarn | ||
npm install apify-client | ||
``` | ||
|
||
You need to have an existing dataset on the Apify platform. If you don't have one, please first check out [this guide](../../../../agents/tools/integrations/apify.md) on how to use Apify to extract content from documentation, knowledge bases, help centers, or blogs. | ||
|
||
First, import `ApifyDatasetLoader` into your source code: | ||
|
||
```ts | ||
import { ApifyDatasetLoader } from "langchain/document_loaders/web/apify_dataset"; | ||
import { Document } from "langchain/document"; | ||
``` | ||
|
||
Then provide a function that maps Apify dataset record fields to LangChain `Document` format. | ||
|
||
For example, if your dataset items are structured like this: | ||
|
||
```json | ||
{ | ||
"url": "https://apify.com", | ||
"text": "Apify is the best web scraping and automation platform." | ||
} | ||
``` | ||
|
||
The mapping function in the code below will convert them to LangChain `Document` format, so that you can use them further with any LLM model (e.g. for question answering). | ||
|
||
```ts | ||
const loader = new ApifyDatasetLoader( | ||
"your-dataset-id", | ||
(item) => | ||
new Document({ | ||
pageContent: (item.text || "") as string, | ||
metadata: { source: item.url }, | ||
}) | ||
); | ||
const docs = await loader.load(); | ||
``` | ||
|
||
## An example with question answering | ||
|
||
In this example, we use data from a dataset to answer a question. | ||
|
||
```ts | ||
import { OpenAI } from "langchain/llms/openai"; | ||
import { RetrievalQAChain } from "langchain/chains"; | ||
import { HNSWLib } from "langchain/vectorstores/hnswlib"; | ||
import { OpenAIEmbeddings } from "langchain/embeddings/openai"; | ||
import { ApifyDatasetLoader } from "langchain/document_loaders/web/apify_dataset"; | ||
import { Document } from "langchain/document"; | ||
|
||
const OPENAI_API_KEY = "Your OpenAI API key"; | ||
|
||
// Initialize the LLM to use to answer the question. | ||
const model = new OpenAI({ openAIApiKey: OPENAI_API_KEY }); | ||
// Load the data from Apify Dataset | ||
const loader = new ApifyDatasetLoader( | ||
"your-dataset-id", | ||
(item) => | ||
new Document({ | ||
pageContent: (item.text || "") as string, | ||
metadata: { source: item.url }, | ||
}) | ||
); | ||
const docs = await loader.load(); | ||
// Create a vector store from the documents. | ||
const vectorStore = await HNSWLib.fromDocuments( | ||
docs, | ||
new OpenAIEmbeddings({ openAIApiKey: OPENAI_API_KEY }) | ||
); | ||
|
||
// Create a chain that uses the OpenAI LLM and HNSWLib vector store. | ||
const chain = RetrievalQAChain.fromLLM(model, vectorStore.asRetriever(), { | ||
returnSourceDocuments: true, | ||
}); | ||
const res = await chain.call({ | ||
query: "What is Apify?", | ||
}); | ||
// Output the results | ||
console.log(res.text); | ||
console.log(res.sourceDocuments.map((d: Document) => d.metadata.source)); | ||
``` | ||
|
||
``` | ||
Apify is a cloud platform that helps you build reliable web scrapers, fast, and automate anything you can do manually in a web browser. | ||
[ | ||
'https://docs.apify.com/platform', | ||
'https://docs.apify.com/platform/integrations', | ||
'https://docs.apify.com/platform/actors/publishing/monetize', | ||
'https://docs.apify.com/platform/security' | ||
] | ||
``` |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import { Document } from "../../document.js"; | ||
import { BaseDocumentLoader, DocumentLoader } from "../base.js"; | ||
|
||
export class ApifyDatasetLoader | ||
extends BaseDocumentLoader | ||
implements DocumentLoader | ||
{ | ||
protected datasetId: string; | ||
|
||
protected datasetMappingFunction: ( | ||
item: Record<string | number, unknown> | ||
) => Document; | ||
|
||
constructor( | ||
datasetId: string, | ||
datasetMappingFunction: (item: Record<string | number, unknown>) => Document | ||
) { | ||
super(); | ||
this.datasetId = datasetId; | ||
this.datasetMappingFunction = datasetMappingFunction; | ||
} | ||
|
||
static async imports(): Promise<{ | ||
ApifyClientClass: typeof import("apify-client").ApifyClient; | ||
}> { | ||
try { | ||
const { ApifyClient } = await import("apify-client"); | ||
return { ApifyClientClass: ApifyClient }; | ||
} catch (e) { | ||
throw new Error( | ||
"Please install apify-client as a dependency with, e.g. `yarn add apify-client`" | ||
); | ||
} | ||
} | ||
|
||
async load(): Promise<Document[]> { | ||
const { ApifyClientClass } = await ApifyDatasetLoader.imports(); | ||
const apifyClient = new ApifyClientClass(); | ||
|
||
const datasetItems = ( | ||
await apifyClient.dataset(this.datasetId).listItems({ clean: true }) | ||
).items; | ||
return datasetItems.map(this.datasetMappingFunction); | ||
} | ||
} |
Oops, something went wrong.