From 2a01739c3534dc313f6b0c149e3721aadecc80a8 Mon Sep 17 00:00:00 2001 From: Shyam Raghuwanshi Date: Fri, 19 Jul 2024 11:28:08 +0530 Subject: [PATCH] adding playwright and updating examples (#399) * adding playwright scraper * adding-playwright * updating-examples * fixing-minor-issue --- JS/edgechains/arakoodev/package.json | 10 +- .../arakooserver/src/tests/hono/hono.test.ts | 2 +- .../src/lib/endpoints/OpenAiEndpoint.ts | 2 +- .../openai/src/tests/openAiEndpoints.test.ts | 17 +- .../arakoodev/src/scraper/src/index.ts | 4 +- .../src/lib/autoPlaywrightPageScrapper.ts | 22 ++ .../src/lib/{webScraper.ts => cheerio.ts} | 2 +- .../src/scraper/src/lib/playwright.ts | 204 ++++++++++++++++++ .../tests/autoPlaywrightPageScrapper.test.ts | 13 ++ .../src/scraper/src/tests/cheerio.test.ts | 12 ++ .../src/scraper/src/tests/playwright.test.ts | 12 ++ .../arakoodev/src/scraper/src/utils/index.ts | 35 +++ .../src/scraper/src/utils/page-parser.ts | 170 +++++++++++++++ .../src/lib/generateResponse.cts | 3 - .../research-agent/jsonnet/main.jsonnet | 7 +- .../examples/research-agent/package.json | 3 +- .../examples/research-agent/src/index.tsx | 7 +- .../research-agent/src/lib/bingWebSearch.cts | 18 +- .../src/lib/generateResponse.cts | 16 +- .../src/lib/scrapPageContent.cts | 18 +- .../examples/summarize-page/.gitignore | 3 +- .../examples/summarize-page/dist/index.js | 12 +- .../dist/lib/generateResponse.cjs | 29 +-- .../dist/lib/getDataFromUrl.cjs | 21 +- .../summarize-page/jsonnet/main.jsonnet | 6 +- .../examples/summarize-page/package.json | 2 +- .../examples/summarize-page/src/index.ts | 14 +- .../src/lib/generateResponse.cts | 30 +-- .../summarize-page/src/lib/getDataFromUrl.cts | 20 +- 29 files changed, 576 insertions(+), 138 deletions(-) create mode 100644 JS/edgechains/arakoodev/src/scraper/src/lib/autoPlaywrightPageScrapper.ts rename JS/edgechains/arakoodev/src/scraper/src/lib/{webScraper.ts => cheerio.ts} (93%) create mode 100644 JS/edgechains/arakoodev/src/scraper/src/lib/playwright.ts create mode 100644 JS/edgechains/arakoodev/src/scraper/src/tests/autoPlaywrightPageScrapper.test.ts create mode 100644 JS/edgechains/arakoodev/src/scraper/src/tests/cheerio.test.ts create mode 100644 JS/edgechains/arakoodev/src/scraper/src/tests/playwright.test.ts create mode 100644 JS/edgechains/arakoodev/src/scraper/src/utils/index.ts create mode 100644 JS/edgechains/arakoodev/src/scraper/src/utils/page-parser.ts diff --git a/JS/edgechains/arakoodev/package.json b/JS/edgechains/arakoodev/package.json index baec87e55..166c55a8b 100644 --- a/JS/edgechains/arakoodev/package.json +++ b/JS/edgechains/arakoodev/package.json @@ -13,13 +13,14 @@ "./arakooserver": "./dist/arakooserver/src/index.js", "./db": "./dist/db/src/index.js", "./scraper": "./dist/scraper/src/index.js", - "./sync-rpc": "./dist/sync-rpc/export.js" + "./sync-rpc": "./dist/sync-rpc/export.js", + "./playwright": "./dist/playwright/src/index.js" }, "scripts": { "build": "rm -rf dist && tsc -b && cp -r src/sync-rpc dist/sync-rpc", "lint": "eslint --ignore-path .eslintignore --ext .js,.ts", "format": "prettier --ignore-path .gitignore --write \"**/*.+(js|ts|json)\"", - "test": "npx jest" + "test": "vitest" }, "dependencies": { "@babel/core": "^7.24.4", @@ -30,15 +31,19 @@ "axios-retry": "^4.1.0", "cheerio": "^1.0.0-rc.12", "cors": "^2.8.5", + "document": "^0.4.7", "dts-bundle-generator": "^9.3.1", "esbuild": "^0.20.2", "eventsource-parser": "^1.1.2", "get-port": "^7.1.0", "hono": "3.9", "jest-environment-jsdom": "^29.7.0", + "jsdom": "^24.1.0", "node-fetch": "^3.3.2", + "node-html-parser": "^6.1.13", "pdf-parse": "^1.1.1", "pg": "^8.11.5", + "playwright": "^1.45.1", "prettier": "^3.2.5", "regenerator-runtime": "^0.14.1", "request": "^2.88.2", @@ -46,6 +51,7 @@ "text-encoding": "^0.7.0", "ts-node": "^10.9.2", "typeorm": "^0.3.20", + "vitest": "^2.0.3", "web-streams-polyfill": "^4.0.0", "youtube-transcript": "^1.2.1", "zod": "^3.23.8", diff --git a/JS/edgechains/arakoodev/src/arakooserver/src/tests/hono/hono.test.ts b/JS/edgechains/arakoodev/src/arakooserver/src/tests/hono/hono.test.ts index 50d731a74..cfea382fd 100644 --- a/JS/edgechains/arakoodev/src/arakooserver/src/tests/hono/hono.test.ts +++ b/JS/edgechains/arakoodev/src/arakooserver/src/tests/hono/hono.test.ts @@ -1,6 +1,6 @@ import { ArakooServer } from "../../../../../dist/arakooserver/src/lib/hono/hono.js"; import { Hono } from "hono"; - +import { describe, expect, it } from 'vitest' describe("ArakooServer", () => { let arakooServer = new ArakooServer(); diff --git a/JS/edgechains/arakoodev/src/openai/src/lib/endpoints/OpenAiEndpoint.ts b/JS/edgechains/arakoodev/src/openai/src/lib/endpoints/OpenAiEndpoint.ts index 7cba7dec9..f0590124f 100644 --- a/JS/edgechains/arakoodev/src/openai/src/lib/endpoints/OpenAiEndpoint.ts +++ b/JS/edgechains/arakoodev/src/openai/src/lib/endpoints/OpenAiEndpoint.ts @@ -199,7 +199,7 @@ export class OpenAI { .post( openAI_url, { - model: chatOptions.model || "gpt-3.5-turbo", + model: chatOptions.model || "gpt-3.5-turbo-16k", messages: [ { role: chatOptions.role || "user", diff --git a/JS/edgechains/arakoodev/src/openai/src/tests/openAiEndpoints.test.ts b/JS/edgechains/arakoodev/src/openai/src/tests/openAiEndpoints.test.ts index 01c5633fe..992b91afe 100644 --- a/JS/edgechains/arakoodev/src/openai/src/tests/openAiEndpoints.test.ts +++ b/JS/edgechains/arakoodev/src/openai/src/tests/openAiEndpoints.test.ts @@ -1,5 +1,5 @@ import axios from "axios"; -import { ChatOpenAi } from "../../../../dist/openai/src/lib/endpoints/OpenAiEndpoint.js"; +import { OpenAI } from "../../../../dist/openai/src/lib/endpoints/OpenAiEndpoint.js"; jest.mock("axios"); @@ -15,8 +15,8 @@ describe("ChatOpenAi", () => { ]; axios.post = jest.fn().mockResolvedValueOnce({ data: { choices: mockResponse } }); - const chatOpenAi = new ChatOpenAi({ openAIApiKey: "test_api_key" }); - const response = await chatOpenAi.generateResponse("test prompt"); + const chatOpenAi = new OpenAI({ apiKey: "test_api_key" }); + const response = await chatOpenAi.chat({prompt:"test prompt"}); expect(response).toEqual("Test response"); }); }); @@ -25,7 +25,7 @@ describe("ChatOpenAi", () => { test("should generate embeddings from OpenAI", async () => { const mockResponse = { embeddings: "Test embeddings" }; axios.post = jest.fn().mockResolvedValue({ data: { data: { choices: mockResponse } } }); - const chatOpenAi = new ChatOpenAi({ openAIApiKey: "test_api_key" }); + const chatOpenAi = new OpenAI({ apiKey: "test_api_key" }); const res = await chatOpenAi.generateEmbeddings("test prompt"); expect(res.choices.embeddings).toEqual("Test embeddings"); }); @@ -46,7 +46,7 @@ describe("ChatOpenAi", () => { }, ]; axios.post = jest.fn().mockResolvedValueOnce({ data: { choices: mockResponse } }); - const chatOpenAi = new ChatOpenAi({ openAIApiKey: "test_api_key" }); + const chatOpenAi = new OpenAI({ apiKey: "test_api_key" }); const chatMessages = [ { role: "user", @@ -57,7 +57,8 @@ describe("ChatOpenAi", () => { content: "message 2", }, ]; - const responses = await chatOpenAi.chatWithAI(chatMessages); + //@ts-ignore + const responses = await chatOpenAi.chat({messages:chatMessages}); expect(responses).toEqual(mockResponse); }); }); @@ -72,8 +73,8 @@ describe("ChatOpenAi", () => { }, ]; axios.post = jest.fn().mockResolvedValueOnce({ data: { choices: mockResponse } }); - const chatOpenAi = new ChatOpenAi({ openAIApiKey: "test_api_key" }); - const response = await chatOpenAi.testResponseGeneration("test prompt"); + const chatOpenAi = new OpenAI({ apiKey: "test_api_key" }); + const response = await chatOpenAi.chat({prompt:"test prompt"}); expect(response).toEqual("Test response"); }); }); diff --git a/JS/edgechains/arakoodev/src/scraper/src/index.ts b/JS/edgechains/arakoodev/src/scraper/src/index.ts index 6835bd510..d31fc149a 100644 --- a/JS/edgechains/arakoodev/src/scraper/src/index.ts +++ b/JS/edgechains/arakoodev/src/scraper/src/index.ts @@ -1 +1,3 @@ -export { WebScraper } from "./lib/webScraper"; +export { Cheerio } from "./lib/cheerio"; +export { AutoPlayWriteWebPageScrapper } from "./lib/autoPlaywrightPageScrapper"; +export { Playwright } from "./lib/playwright"; diff --git a/JS/edgechains/arakoodev/src/scraper/src/lib/autoPlaywrightPageScrapper.ts b/JS/edgechains/arakoodev/src/scraper/src/lib/autoPlaywrightPageScrapper.ts new file mode 100644 index 000000000..12def3a52 --- /dev/null +++ b/JS/edgechains/arakoodev/src/scraper/src/lib/autoPlaywrightPageScrapper.ts @@ -0,0 +1,22 @@ + +import { chromium } from "playwright"; + +export class AutoPlayWriteWebPageScrapper { + + constructor() { } + + async getContent(url: string): Promise { + const browser = await chromium.launch({ + headless: true, + }); + const page = await browser.newPage(); + await page.goto(url, { + waitUntil: "domcontentloaded", + }); + const textContent = await page.innerText('html'); + await browser.close(); + const regex = new RegExp("\n", "g"); + return textContent.replace(regex, "").replace(/\s{2,}/g, ' '); + } + +} diff --git a/JS/edgechains/arakoodev/src/scraper/src/lib/webScraper.ts b/JS/edgechains/arakoodev/src/scraper/src/lib/cheerio.ts similarity index 93% rename from JS/edgechains/arakoodev/src/scraper/src/lib/webScraper.ts rename to JS/edgechains/arakoodev/src/scraper/src/lib/cheerio.ts index 57f76aec6..578c30f16 100644 --- a/JS/edgechains/arakoodev/src/scraper/src/lib/webScraper.ts +++ b/JS/edgechains/arakoodev/src/scraper/src/lib/cheerio.ts @@ -1,7 +1,7 @@ import axios from "axios"; import cheerio from "cheerio"; -export class WebScraper { +export class Cheerio { constructor() {} async getContent(url: string): Promise { const content = await axios(url); diff --git a/JS/edgechains/arakoodev/src/scraper/src/lib/playwright.ts b/JS/edgechains/arakoodev/src/scraper/src/lib/playwright.ts new file mode 100644 index 000000000..a4903136d --- /dev/null +++ b/JS/edgechains/arakoodev/src/scraper/src/lib/playwright.ts @@ -0,0 +1,204 @@ +import { chromium } from "playwright" +import axios from "axios"; +import { + parseArr, + parseSite, + preprocessJsonInput, +} from '../utils/index'; +import retry from "retry"; +import { removeBlankTags } from "../utils/page-parser"; + +export class Playwright { + + constructor() { } + + async #createPrompt({ task, page, }: { task: string, page: any }) { + return ` + You are a Senior SDET tasked with writing Playwright code for testing purposes. Your role involves implementing specific task-based code segments within a larger test file, following the instructions provided closely. Assume that common imports like 'test' and 'expect' from '@playwright/test' are already at the top of the file. + + Context: + - Your computer is a Mac. Cmd is the meta key, META. + - The browser is already open. + - Current page URL: ${await page.evaluate('location.href')}. + - Current page title: ${await page.evaluate('document.title')}. + - Overview of the site in HTML format: + \\\ + ${removeBlankTags((await parseSite(page))).slice(0, 25000)} + \\\ + + Key Points: + - Start directly with Playwright actions as described in the user task, without adding extraneous steps or assertions. + - Include assertions like 'expect' statements or wait functions such as 'waitForLoadState' only when they are specifically requested in the user task. + - Minimal, relevant comments should be used to clarify complex actions or essential aspects of the test's purpose. + - Apply 'frameLocator' for content in nested iframes, as needed based on the task requirements. + - Store the output in a variable and Return the output not log that + + User Task: [Insert the specific user task here, including any detailed instructions related to the execution, waiting for specific conditions, or explicit requests for assertions and waits.] + + Expected Code Format: + \\\ + // [Insert Playwright code based on the task description. Begin with necessary actions directly, and include 'waitForLoadState', assertions, or 'expect' statements only if explicitly requested in the task. Comments should be concise and pertinent, especially for complex actions or decisions.] + \\\ + + The objective is to create Playwright code that is efficient, precise, and perfectly aligned with the task's requirements, integrating seamlessly into the larger test file. All actions and comments should be relevant and necessary, catering to a senior-level professional's understanding of the testing scenario. + + HumanMessage Write Playwright code for this: ${task} + + Examples: + go to hacker news - await page.goto('https://news.ycombinator.com/') + click on the first link - page.click('a[href="https://blog.sbensu.com/posts/demand-for-visual-programming/"]') + give me all the text of this page - await page.waitForLoadState('networkidle') + + + Some Playwright Actions that should use for you reference: + - await page.goto('https://github.com/login'); + - await page.getByLabel('Username or email address').fill('username'); + - await page.getByLabel('Password').fill('password'); + - await page.getByRole('button', { name: 'Sign in' }).click(); + - await page.innerText('html') + - page.getByRole('button', { name: 'submit' }); + - page.getByRole('listitem').filter({ hasText: 'Product 2' }); + - await page.getByRole('listitem').filter({ hasText: 'Product 2' }).getByRole('button', { name: 'Add to cart' }).click(); + - page.locator('button.buttonIcon.episode-actions-later'); + - await expect(page.getByText('welcome')).toBeVisible(); + - await expect(page.getByText('welcome')).toBeVisible(); + - await page.innerText(selector); + - await page.innerText(selector, options); + - const page = await browser.newPage(); + - await page.goto('https://keycode.info'); + - await page.press('body', 'A'); + - await page.screenshot({ path: 'A.png' }); + - await page.press('body', 'ArrowLeft'); + - await page.screenshot({ path: 'ArrowLeft.png' }); + - await page.press('body', 'Shift+O'); + - await page.screenshot({ path: 'O.png' }); + - await browser.close(); + // click on the links, example + - await page.click('a[href="https://blog.sbensu.com/posts/demand-for-visual-programming/"]'); + ` + } + + #createPromptForTaskArr(task: string) { + return ` + Given the following task description: + + ${task} + + Extract the key actions from this task and return them as an array of strings. Each action should be a separate string in the array. If the task description contains syntax errors or you think a command can be improved for better clarity and effectiveness, please make the necessary corrections and improvements. For example: + + Input: + "Go to Hacker News and click on the first link. Then give me all the text of this page." + + Output: + \`\`\` + [ + "Navigate to the Hacker News website by entering the URL 'https://news.ycombinator.com/' in the browser", + "Identify and click on the first link displayed on the Hacker News homepage", + "Extract and return all the text content from the page" + ] + Ensure that each action is specific, clear, and comprehensive to facilitate precise implementation. + \`\`\` + ` + } + + async #openAIRequest({ chatApi, prompt }: { chatApi: string, prompt: string }) { + return new Promise((resolve, reject) => { + const operation = retry.operation({ + retries: 5, + factor: 3, + minTimeout: 1 * 1000, + maxTimeout: 60 * 1000, + randomize: true, + }); + + operation.attempt(async function (currentAttempt) { + await axios + .post( + "https://api.openai.com/v1/chat/completions", + { + model: "gpt-3.5-turbo-16k", + messages: [{ role: "user", content: prompt }], + max_tokens: 1000, + temperature: 0.7, + }, + { + headers: { + Authorization: "Bearer " + chatApi, + "content-type": "application/json", + }, + } + ) + .then((response) => { + resolve(response.data.choices[0].message.content); + }) + .catch((error) => { + if (error.response) { + console.log("Server responded with status code:", error.response.status); + console.log("Response data:", error.response.data); + } else if (error.request) { + console.log("No response received:", error); + } else { + console.log("Error creating request:", error.message, "\n", "Retrying ", currentAttempt); + } + if (operation.retry(error)) { + return; + }; + reject(error); + }); + }) + } + ) + } + + /** + * Get Playwright code for a specific task + * @param chatApi - OpenAI API key + * @param task - Task description + * @param url - URL to navigate to default is https://www.google.com + * @param headless - Run in headless mode default is false + * @returns Playwright code example - page.goto('https://www.google.com') + **/ + async call({ chatApi, task, url, headless = true }: { chatApi: string, task: string, url?: string, headless?: boolean }) { + + const AsyncFunction = async function () { }.constructor; + + const browser = await chromium.launch({ + headless: headless + }); + + const page = await browser.newPage(); + await page.goto(url || "https://www.google.com"); + + const taskPrompt = this.#createPromptForTaskArr(task); + const taskArr: any = parseArr(await this.#openAIRequest({ chatApi, prompt: taskPrompt })) + + let response: string = ""; + + for (let i = 0; i < taskArr.length; i++) { + if (!response) { + const element = taskArr[i]; + const prompt = await this.#createPrompt({ task: element, page }); + let res: any = preprocessJsonInput(await this.#openAIRequest({ chatApi, prompt })); + const dependencies = [ + { param: "page", value: page }, + ]; + + const func = AsyncFunction(...dependencies.map((d) => d.param), res); + const args = dependencies.map((d) => d.value); + + try { + const res = await func(...args); + if (res) { + response = res; + } + + } catch (error: any) { + console.log(error); + } + } + } + + await browser.close(); + return response; + } +} \ No newline at end of file diff --git a/JS/edgechains/arakoodev/src/scraper/src/tests/autoPlaywrightPageScrapper.test.ts b/JS/edgechains/arakoodev/src/scraper/src/tests/autoPlaywrightPageScrapper.test.ts new file mode 100644 index 000000000..085e356eb --- /dev/null +++ b/JS/edgechains/arakoodev/src/scraper/src/tests/autoPlaywrightPageScrapper.test.ts @@ -0,0 +1,13 @@ +import { AutoPlayWriteWebPageScrapper } from "../../../../dist/scraper/src/index.js"; +import { describe, expect, it } from 'vitest' + +describe("should scrape the page", async () => { + it("should scrape the text and return", async () => { + + const url = "https://en.wikipedia.org/wiki/Akbar" + const scrapper = new AutoPlayWriteWebPageScrapper(); + + const result = await scrapper.getContent(url); + expect(`${result}`).contains("Akbar") + }); +}); diff --git a/JS/edgechains/arakoodev/src/scraper/src/tests/cheerio.test.ts b/JS/edgechains/arakoodev/src/scraper/src/tests/cheerio.test.ts new file mode 100644 index 000000000..25717fa7b --- /dev/null +++ b/JS/edgechains/arakoodev/src/scraper/src/tests/cheerio.test.ts @@ -0,0 +1,12 @@ +import { Cheerio } from "../../../../dist/scraper/src/index.js"; +import { describe, expect, it } from 'vitest' +describe("should scrape the page", async () => { + it("should scrape the text and return", async () => { + + const url = "https://en.wikipedia.org/wiki/Akbar" + const cheerio = new Cheerio(); + + const result = await cheerio.getContent(url); + expect(`${result}`).contains("Akbar") + }); +}); diff --git a/JS/edgechains/arakoodev/src/scraper/src/tests/playwright.test.ts b/JS/edgechains/arakoodev/src/scraper/src/tests/playwright.test.ts new file mode 100644 index 000000000..269dfc41f --- /dev/null +++ b/JS/edgechains/arakoodev/src/scraper/src/tests/playwright.test.ts @@ -0,0 +1,12 @@ +import { Playwright } from "../../../../dist/scraper/src/index.js"; +import { describe, expect, it } from 'vitest' + +describe("should scrape the page", async () => { + it("should scrape the text and return", async () => { + + const playwright = new Playwright(); + + const result = await playwright.call({ chatApi: "11111", task: "go to wikipedia and search for Akbar and click on any articial and scrap the hole page text", headless: false }) + expect(result).toContain(String); + }); +}, 1000000); diff --git a/JS/edgechains/arakoodev/src/scraper/src/utils/index.ts b/JS/edgechains/arakoodev/src/scraper/src/utils/index.ts new file mode 100644 index 000000000..c5b4826bb --- /dev/null +++ b/JS/edgechains/arakoodev/src/scraper/src/utils/index.ts @@ -0,0 +1,35 @@ + +const codeRegex = /```(.*)(\r\n|\r|\n)(?[\w\W\n]+)(\r\n|\r|\n)```/; + +export function preprocessJsonInput(text) { + try { + return text.match(codeRegex).groups.code.trim(); + } catch (e) { + return text.trim() + } +} + +export function parseArr(text) { + try { + if (text.startsWith("[") && text.endsWith("]")) { + return JSON.parse(text); + } + return text.match(codeRegex).groups.code.trim(); + } catch (e) { + throw new Error("No code found") + // try { + // const regexPattern = /\[(.*?)\]/g; + // const matches = text.match(regexPattern)[1]; + // console.log({ matches }) + // if (!matches) { + // throw new Error("No code found") + // } + // return matches; + // } catch (error) { + // throw new Error("No code found") + // } + } +} + + +export { parseSite } from './page-parser.js'; diff --git a/JS/edgechains/arakoodev/src/scraper/src/utils/page-parser.ts b/JS/edgechains/arakoodev/src/scraper/src/utils/page-parser.ts new file mode 100644 index 000000000..40f50f8e3 --- /dev/null +++ b/JS/edgechains/arakoodev/src/scraper/src/utils/page-parser.ts @@ -0,0 +1,170 @@ +import { parse } from 'node-html-parser'; +import { JSDOM } from 'jsdom'; +import fs from "fs"; +const { document } = new JSDOM(`...`).window; + +const tagsToLog = [ + 'a', + 'p', + 'span', + 'div', + 'button', + 'label', + 'input', + 'textarea', + 'section', + 'select', + 'option', + 'table', + 'td', + 'th', + 'ul', + 'ol', + 'li', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'iframe', +]; + +function createElement(node) { + const elem = document.createElement(node.tagName); + fs.writeFileSync("node.json", JSON.stringify(node.attributes) + "\n\n\n\n", { flag: "a" }); + + const dataAttributes = Object.entries(node.attributes).filter( + (a) => + (tagsToLog.includes(node.tagName) && + (a[0].startsWith('name') || + a[0].startsWith('value') || + a[0].startsWith('data-component') || + a[0].startsWith('data-name') || + a[0].startsWith('aria-') || + a[0] === 'class' || + a[0] === 'type' || + a[0] === 'role')) || + // always log these + a[0] === 'href' || + a[0] === 'id' + ); + dataAttributes.forEach(([attr, value]) => { + elem.setAttribute(attr, value); + }); + + return elem; +} + +function createTextNode(text) { + return document.createTextNode(text); +} + +function isAdsIframe(node) { + const style = node.getAttribute('style') || ''; + const id = node.getAttribute('id') || ''; + return ( + node.getAttribute('height') === 0 || + style.includes('display: none') || + id.startsWith('google_ads_iframe') + ); +} + +async function dfs(node, parentElem, childFrames = []) { + for (const childNode of node.childNodes) { + if (childNode.nodeType === 1) { + if (childNode.tagName === 'IFRAME') { + // optimize for performance later + for (let { childFrame, attributes } of childFrames) { + if ( + Object.entries(attributes).every( + ([attr, value]) => childNode.getAttribute(attr) === value + ) + ) { + // skip blocks that look like ads + if (isAdsIframe(childNode)) { + continue; + } + + const childElem = createElement(childNode); + parentElem.appendChild(childElem); + const newChildFrame = await toChildFramesWithAttributes(childFrame); + //@ts-ignore + const bodyNode = await childFrame.locator('body', { timeout: 1000 }); + const bodyHtml = await bodyNode.innerHTML(); + await dfs(parseFrame(bodyHtml), childElem, newChildFrame); + + // ignore other matches that might be the same parent + break; + } + } + } else { + const childElem = createElement(childNode); + parentElem.appendChild(childElem); + await dfs(childNode, childElem, childFrames); + } + } else if (childNode.nodeType === 3) { + if (!childNode.isWhitespace) { + const textElem = createTextNode(childNode); + parentElem.appendChild(textElem); + } + } + } +} + +async function toChildFramesWithAttributes(frame) { + const childFramesWithAttributes: any = []; + for (let childFrame of frame.childFrames()) { + const childFrameElement = await childFrame.frameElement(); + const attributes = await childFrameElement.evaluate((node) => { + const attrs = {}; + for (let i = 0; i < node.attributes.length; i++) { + const attr = node.attributes[i]; + attrs[attr.name] = attr.value; + } + return attrs; + }); + childFramesWithAttributes.push({ childFrame, attributes }); + } + return childFramesWithAttributes; +} + +async function getStructure(frame) { + const bodyNode = await frame.locator('body', { timeout: 1000 }); + const bodyHtml = await bodyNode.innerHTML(); + const node = parseFrame(bodyHtml); + + const rootElem = createElement(node); + await dfs(node, rootElem, await toChildFramesWithAttributes(frame)); + return rootElem; +} + +function parseFrame(html) { + return parse(html, { + blockTextElements: { + script: false, + noscript: false, + style: false, + pre: true, // keep text content when parsing + }, + }); +} + +export async function parseSite(page) { + let mainFrame = page.mainFrame(); + const structure = await getStructure(mainFrame); + return structure.innerHTML; +} + + +export function removeBlankTags(text) { + const emptyTagRegex = /<(\w+)([^>]*)><\/\1>|<(\w+)([^>]*)>\s*<\/\3>/g; + + let newText = text; + + while (emptyTagRegex.test(newText)) { + newText = newText.replace(emptyTagRegex, ''); + } + + return newText; +} \ No newline at end of file diff --git a/JS/edgechains/examples/chat-with-llm/src/lib/generateResponse.cts b/JS/edgechains/examples/chat-with-llm/src/lib/generateResponse.cts index 26c4c214b..530892972 100644 --- a/JS/edgechains/examples/chat-with-llm/src/lib/generateResponse.cts +++ b/JS/edgechains/examples/chat-with-llm/src/lib/generateResponse.cts @@ -1,8 +1,5 @@ -const path = require("path"); const { OpenAI } = require("@arakoodev/edgechains.js/openai"); import { z } from "zod"; -const Jsonnet = require("@arakoodev/jsonnet"); -const jsonnet = new Jsonnet(); const schema = z.object({ answer: z.string().describe("The answer to the question"), diff --git a/JS/edgechains/examples/research-agent/jsonnet/main.jsonnet b/JS/edgechains/examples/research-agent/jsonnet/main.jsonnet index f871d7591..f94826b79 100644 --- a/JS/edgechains/examples/research-agent/jsonnet/main.jsonnet +++ b/JS/edgechains/examples/research-agent/jsonnet/main.jsonnet @@ -10,9 +10,12 @@ local PromptTemplate = ||| |||; +local bingKey = std.extVar("BingKey"); +local openAIkey = std.extVar("openAIkey"); + local generatePrompt() = local query = std.extVar("query"); - local getWebSearch = std.parseJson(arakoo.native("bingWebSearch")(query)); + local getWebSearch = std.parseJson(arakoo.native("bingWebSearch")({ query:query, key:bingKey })); local data = ""; local range = std.range(0, std.length(getWebSearch) - 1); @@ -27,7 +30,7 @@ local generatePrompt() = local updatedPromptTemplateWithQuery = std.strReplace(PromptTemplate, "{question}", query); local updatedPromptTemplateWithSummary = std.strReplace(updatedPromptTemplateWithQuery, "{researchSummary}", finalData); - local openAICall = arakoo.native("openAICall")(updatedPromptTemplateWithSummary); + local openAICall = arakoo.native("openAICall")({ prompt:updatedPromptTemplateWithSummary, openAIApiKey:openAIkey }); openAICall; diff --git a/JS/edgechains/examples/research-agent/package.json b/JS/edgechains/examples/research-agent/package.json index 72a893ac6..71294d280 100644 --- a/JS/edgechains/examples/research-agent/package.json +++ b/JS/edgechains/examples/research-agent/package.json @@ -11,8 +11,9 @@ }, "license": "ISC", "dependencies": { - "@arakoodev/edgechains.js": "^0.24.1", + "@arakoodev/edgechains.js": "file:../../arakoodev", "@arakoodev/jsonnet": "^0.24.0", + "axios": "^1.7.2", "file-uri-to-path": "^2.0.0", "hono": "^4.4.5", "path": "^0.12.7", diff --git a/JS/edgechains/examples/research-agent/src/index.tsx b/JS/edgechains/examples/research-agent/src/index.tsx index dfbc03f82..8315dc3b6 100644 --- a/JS/edgechains/examples/research-agent/src/index.tsx +++ b/JS/edgechains/examples/research-agent/src/index.tsx @@ -5,7 +5,6 @@ import Home from "./pages/Home.js"; import Jsonnet from "@arakoodev/jsonnet"; import fileURLToPath from "file-uri-to-path"; import path from "path"; -import { response } from "express"; const server = new ArakooServer(); const jsonnet = new Jsonnet(); @@ -13,6 +12,10 @@ const app = server.createApp(); const __dirname = fileURLToPath(import.meta.url); +const secretsPath = path.join(__dirname, "../../jsonnet/secrets.jsonnet"); +const key = JSON.parse(jsonnet.evaluateFile(secretsPath)).bing_api_key; +const openAIkey = JSON.parse(jsonnet.evaluateFile(secretsPath)).openai_api_key; + const openAICall = createClient(path.join(__dirname, "../lib/generateResponse.cjs")); const bingWebSearch = createClient(path.join(__dirname, "../lib/bingWebSearch.cjs")); const WebScraper = createClient(path.join(__dirname, "../lib/scrapPageContent.cjs")); @@ -25,6 +28,8 @@ app.post("/research", async (c: any) => { console.time("Time taken"); const { query } = await c.req.parseBody(); jsonnet.extString("query", query); + jsonnet.extString("BingKey", key); + jsonnet.extString("openAIkey", openAIkey); jsonnet.javascriptCallback("openAICall", openAICall); jsonnet.javascriptCallback("bingWebSearch", bingWebSearch); jsonnet.javascriptCallback("webScraper", WebScraper); diff --git a/JS/edgechains/examples/research-agent/src/lib/bingWebSearch.cts b/JS/edgechains/examples/research-agent/src/lib/bingWebSearch.cts index 69ebdd7ca..b50198655 100644 --- a/JS/edgechains/examples/research-agent/src/lib/bingWebSearch.cts +++ b/JS/edgechains/examples/research-agent/src/lib/bingWebSearch.cts @@ -1,24 +1,14 @@ -import { WebScraper } from "@arakoodev/edgechains.js/scraper"; import axios from "axios"; -const Jsonnet = require("@arakoodev/jsonnet"); -const path = require("path"); - -const scraper = new WebScraper(); -const jsonnet = new Jsonnet(); -const secretsPath = path.join(__dirname, "../../jsonnet/secrets.jsonnet"); -const key = JSON.parse(jsonnet.evaluateFile(secretsPath)).bing_api_key; - -function bingWebSearch(query: string) { +async function bingWebSearch({ query, key }: { query: string, key: string }) { try { - return axios( + const response = await axios( `https://api.bing.microsoft.com/v7.0/search?q=${encodeURIComponent(query)}&count=10`, { headers: { "Ocp-Apim-Subscription-Key": key }, } - ).then((response) => { - return JSON.stringify(response.data.webPages.value); - }); + ); + return JSON.stringify(response.data.webPages.value); } catch (error) { console.log(error); } diff --git a/JS/edgechains/examples/research-agent/src/lib/generateResponse.cts b/JS/edgechains/examples/research-agent/src/lib/generateResponse.cts index b6e5f56c9..898f15e2a 100644 --- a/JS/edgechains/examples/research-agent/src/lib/generateResponse.cts +++ b/JS/edgechains/examples/research-agent/src/lib/generateResponse.cts @@ -1,18 +1,10 @@ -import path from "path"; import { OpenAI } from "@arakoodev/edgechains.js/openai"; -import Jsonnet from "@arakoodev/jsonnet"; -const jsonnet = new Jsonnet(); -const secretsPath = path.join(__dirname, "../../jsonnet/secrets.jsonnet"); -const openAIApiKey = JSON.parse(jsonnet.evaluateFile(secretsPath)).openai_api_key; - -const openai = new OpenAI({ apiKey: openAIApiKey }); - -function openAICall(prompt: any) { +async function openAICall({ prompt, openAIApiKey }: { prompt: string, openAIApiKey: string }) { try { - return openai.chat({ prompt, max_tokens: 2000 }).then((res: any) => { - return JSON.stringify(res.content); - }); + const openai = new OpenAI({ apiKey: openAIApiKey }); + const response = await openai.chat({ prompt, max_tokens: 2000 }); + return JSON.stringify(response.content); } catch (error) { return error; } diff --git a/JS/edgechains/examples/research-agent/src/lib/scrapPageContent.cts b/JS/edgechains/examples/research-agent/src/lib/scrapPageContent.cts index 7670374ed..84b172ec4 100644 --- a/JS/edgechains/examples/research-agent/src/lib/scrapPageContent.cts +++ b/JS/edgechains/examples/research-agent/src/lib/scrapPageContent.cts @@ -1,18 +1,10 @@ -import { WebScraper } from "@arakoodev/edgechains.js/scraper"; -const scraper = new WebScraper(); +import { AutoPlayWriteWebPageScrapper } from "@arakoodev/edgechains.js/scraper"; -function getContent(url: string) { +const scraper = new AutoPlayWriteWebPageScrapper(); + +async function getContent(url: string) { try { - return scraper - .getContent(url) - .then((res) => { - console.log("Scraped Successfully: " + url); - return res; - }) - .catch((error) => { - console.log("Error Scraping: " + url); - return " "; - }); + return await scraper.getContent(url) } catch (error) { console.log("Error Scraping: " + url); return " "; diff --git a/JS/edgechains/examples/summarize-page/.gitignore b/JS/edgechains/examples/summarize-page/.gitignore index 1521c8b76..76add878f 100644 --- a/JS/edgechains/examples/summarize-page/.gitignore +++ b/JS/edgechains/examples/summarize-page/.gitignore @@ -1 +1,2 @@ -dist +node_modules +dist \ No newline at end of file diff --git a/JS/edgechains/examples/summarize-page/dist/index.js b/JS/edgechains/examples/summarize-page/dist/index.js index 467414a5b..cca17c152 100644 --- a/JS/edgechains/examples/summarize-page/dist/index.js +++ b/JS/edgechains/examples/summarize-page/dist/index.js @@ -1,21 +1,23 @@ import { ArakooServer } from "@arakoodev/edgechains.js/arakooserver"; import Jsonnet from "@arakoodev/jsonnet"; //@ts-ignore -import createClient from "sync-rpc"; +import createClient from "@arakoodev/edgechains.js/sync-rpc"; import fileURLToPath from "file-uri-to-path"; import path from "path"; const server = new ArakooServer(); const app = server.createApp(); const jsonnet = new Jsonnet(); -const __dirname = fileURLToPath(import.meta.url); -const openAICall = createClient(path.join(__dirname, "../lib/generateResponse.cjs")); -const getPageContent = createClient(path.join(__dirname, "../lib/getDataFromUrl.cjs")); +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const openAICall = createClient(path.join(__dirname, "./lib/generateResponse.cjs")); +const getPageContent = createClient(path.join(__dirname, "./lib/getDataFromUrl.cjs")); app.get("/", async (c) => { const pageUrl = c.req.query("pageUrl"); + const key = JSON.parse(jsonnet.evaluateFile(path.join(__dirname, "../jsonnet/secrets.jsonnet"))).openai_api_key; jsonnet.extString("pageUrl", pageUrl || ""); + jsonnet.extString("openai_api_key", key); jsonnet.javascriptCallback("openAICall", openAICall); jsonnet.javascriptCallback("getPageContent", getPageContent); - let response = jsonnet.evaluateFile(path.join(__dirname, "../../jsonnet/main.jsonnet")); + let response = jsonnet.evaluateFile(path.join(__dirname, "../jsonnet/main.jsonnet")); return c.json(response); }); server.listen(3000); diff --git a/JS/edgechains/examples/summarize-page/dist/lib/generateResponse.cjs b/JS/edgechains/examples/summarize-page/dist/lib/generateResponse.cjs index 303138131..3bbe461da 100644 --- a/JS/edgechains/examples/summarize-page/dist/lib/generateResponse.cjs +++ b/JS/edgechains/examples/summarize-page/dist/lib/generateResponse.cjs @@ -1,28 +1,19 @@ "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); -const path = require("path"); const { OpenAI } = require("@arakoodev/edgechains.js/openai"); const zod_1 = require("zod"); -const Jsonnet = require("@arakoodev/jsonnet"); -const jsonnet = new Jsonnet(); -const secretsPath = path.join(__dirname, "../../jsonnet/secrets.jsonnet"); -const openAIApiKey = JSON.parse(jsonnet.evaluateFile(secretsPath)).openai_api_key; const schema = zod_1.z.object({ answer: zod_1.z.string().describe("The answer to the question"), }); -const openai = new OpenAI({ - apiKey: openAIApiKey, - temperature: 0, -}); -function openAICall() { - return function (prompt) { - try { - return openai.zodSchemaResponse({ prompt, schema }).then((res) => { - return JSON.stringify(res); - }); - } catch (error) { - return error; - } - }; +async function openAICall({ prompt, openAIApiKey }) { + try { + const openai = new OpenAI({ apiKey: openAIApiKey }); + let res = await openai.zodSchemaResponse({ prompt, schema: schema }); + console.log({ res }); + return JSON.stringify(res); + } + catch (error) { + return error; + } } module.exports = openAICall; diff --git a/JS/edgechains/examples/summarize-page/dist/lib/getDataFromUrl.cjs b/JS/edgechains/examples/summarize-page/dist/lib/getDataFromUrl.cjs index adab87782..2fdfe4a78 100644 --- a/JS/edgechains/examples/summarize-page/dist/lib/getDataFromUrl.cjs +++ b/JS/edgechains/examples/summarize-page/dist/lib/getDataFromUrl.cjs @@ -1,16 +1,13 @@ "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); -const { WebScraper } = require("@arakoodev/edgechains.js/scraper"); -const scraper = new WebScraper(); -function getPageContent() { - return (url) => { - try { - return scraper.getContent(url).then((res) => { - return res; - }); - } catch (error) { - console.log(error); - } - }; +const scraper_1 = require("@arakoodev/edgechains.js/scraper"); +const scraper = new scraper_1.Playwright(); +async function getPageContent({ pageUrl, openai }) { + try { + return await scraper.call({ chatApi: openai, task: `go to ${pageUrl} and scrap the hole page text`, headless: false }); + } + catch (error) { + console.log(error); + } } module.exports = getPageContent; diff --git a/JS/edgechains/examples/summarize-page/jsonnet/main.jsonnet b/JS/edgechains/examples/summarize-page/jsonnet/main.jsonnet index d8d047804..a07af83c0 100644 --- a/JS/edgechains/examples/summarize-page/jsonnet/main.jsonnet +++ b/JS/edgechains/examples/summarize-page/jsonnet/main.jsonnet @@ -12,13 +12,15 @@ local promptTemplate = ||| local pageUrl = std.extVar("pageUrl"); +local key = std.extVar('openai_api_key'); local getPageContent(pageUrl) = - local pageContent = arakoo.native("getPageContent")(pageUrl); + local content = arakoo.native("getPageContent")({pageUrl:pageUrl, openai:key}); + local pageContent = std.slice(content, 0, 20000, 1); local promptWithPageContent = std.strReplace(promptTemplate,'{content}', pageContent + "\n"); promptWithPageContent; local main(prompt) = - local response = arakoo.native("openAICall")(prompt); +local response = arakoo.native("openAICall")({ prompt: prompt, openAIApiKey: key }); response; main(getPageContent(pageUrl)) \ No newline at end of file diff --git a/JS/edgechains/examples/summarize-page/package.json b/JS/edgechains/examples/summarize-page/package.json index c288cbeaf..966ec1456 100644 --- a/JS/edgechains/examples/summarize-page/package.json +++ b/JS/edgechains/examples/summarize-page/package.json @@ -11,7 +11,7 @@ }, "license": "ISC", "dependencies": { - "@arakoodev/edgechains.js": "^0.23.6", + "@arakoodev/edgechains.js": "file:../../arakoodev", "@arakoodev/jsonnet": "^0.23.6", "file-uri-to-path": "^2.0.0", "path": "^0.12.7", diff --git a/JS/edgechains/examples/summarize-page/src/index.ts b/JS/edgechains/examples/summarize-page/src/index.ts index 111d5896c..2a4c7dfac 100644 --- a/JS/edgechains/examples/summarize-page/src/index.ts +++ b/JS/edgechains/examples/summarize-page/src/index.ts @@ -1,7 +1,7 @@ import { ArakooServer } from "@arakoodev/edgechains.js/arakooserver"; import Jsonnet from "@arakoodev/jsonnet"; //@ts-ignore -import createClient from "sync-rpc"; +import createClient from "@arakoodev/edgechains.js/sync-rpc"; import fileURLToPath from "file-uri-to-path"; import path from "path"; @@ -10,17 +10,21 @@ const server = new ArakooServer(); const app = server.createApp(); const jsonnet = new Jsonnet(); -const __dirname = fileURLToPath(import.meta.url); +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const openAICall = createClient(path.join(__dirname, "./lib/generateResponse.cjs")); +const getPageContent = createClient(path.join(__dirname, "./lib/getDataFromUrl.cjs")); -const openAICall = createClient(path.join(__dirname, "../lib/generateResponse.cjs")); -const getPageContent = createClient(path.join(__dirname, "../lib/getDataFromUrl.cjs")); app.get("/", async (c: any) => { const pageUrl = c.req.query("pageUrl"); + const key = JSON.parse( + jsonnet.evaluateFile(path.join(__dirname, "../jsonnet/secrets.jsonnet")) + ).openai_api_key; jsonnet.extString("pageUrl", pageUrl || ""); + jsonnet.extString("openai_api_key", key); jsonnet.javascriptCallback("openAICall", openAICall); jsonnet.javascriptCallback("getPageContent", getPageContent); - let response = jsonnet.evaluateFile(path.join(__dirname, "../../jsonnet/main.jsonnet")); + let response = jsonnet.evaluateFile(path.join(__dirname, "../jsonnet/main.jsonnet")); return c.json(response); }); diff --git a/JS/edgechains/examples/summarize-page/src/lib/generateResponse.cts b/JS/edgechains/examples/summarize-page/src/lib/generateResponse.cts index 2b2872b61..b6ff11897 100644 --- a/JS/edgechains/examples/summarize-page/src/lib/generateResponse.cts +++ b/JS/edgechains/examples/summarize-page/src/lib/generateResponse.cts @@ -1,31 +1,19 @@ -const path = require("path"); const { OpenAI } = require("@arakoodev/edgechains.js/openai"); import { z } from "zod"; -const Jsonnet = require("@arakoodev/jsonnet"); -const jsonnet = new Jsonnet(); - -const secretsPath = path.join(__dirname, "../../jsonnet/secrets.jsonnet"); -const openAIApiKey = JSON.parse(jsonnet.evaluateFile(secretsPath)).openai_api_key; const schema = z.object({ answer: z.string().describe("The answer to the question"), }); -const openai = new OpenAI({ - apiKey: openAIApiKey, - temperature: 0, -}); - -function openAICall() { - return function (prompt: string) { - try { - return openai.zodSchemaResponse({ prompt, schema }).then((res: any) => { - return JSON.stringify(res); - }); - } catch (error) { - return error; - } - }; +async function openAICall({ prompt, openAIApiKey }: any) { + try { + const openai = new OpenAI({ apiKey: openAIApiKey }); + let res = await openai.zodSchemaResponse({ prompt, schema: schema }); + console.log({ res }) + return JSON.stringify(res); + } catch (error) { + return error; + } } module.exports = openAICall; diff --git a/JS/edgechains/examples/summarize-page/src/lib/getDataFromUrl.cts b/JS/edgechains/examples/summarize-page/src/lib/getDataFromUrl.cts index 93d4167db..6948de625 100644 --- a/JS/edgechains/examples/summarize-page/src/lib/getDataFromUrl.cts +++ b/JS/edgechains/examples/summarize-page/src/lib/getDataFromUrl.cts @@ -1,17 +1,13 @@ -const { WebScraper } = require("@arakoodev/edgechains.js/scraper"); +import { Playwright } from "@arakoodev/edgechains.js/scraper" +const scraper = new Playwright(); -const scraper = new WebScraper(); +async function getPageContent({ pageUrl, openai }: { pageUrl: string, openai: string }) { + try { + return await scraper.call({ chatApi: openai, task: `go to ${pageUrl} and scrap the hole page text`, headless: false }) -function getPageContent() { - return (url: string) => { - try { - return scraper.getContent(url).then((res: any) => { - return res; - }); - } catch (error) { - console.log(error); - } - }; + } catch (error) { + console.log(error); + } } module.exports = getPageContent;