From 68b152b74f4ab15748d601877047ea9b74fc871d Mon Sep 17 00:00:00 2001
From: satoshi toyama
Date: Mon, 28 Oct 2024 22:00:08 +0900
Subject: [PATCH] Add file parsing functionality using Unstructured API

- Add parseFile server action to process uploaded files
- Integrate with Unstructured API to extract structured content
- Generate both JSON and Markdown outputs from parsed content
- Update file processing flow to include structured data URLs
- Store processed results in Vercel Blob storage
---
 .../beta-proto/files/server-actions.ts        |  95 +++++++++++++++
 .../p/[agentId]/beta-proto/files/types.ts     |   4 +-
 .../graph/v2/composition/add-source.ts        | 113 +++++++++++++++++-
 3 files changed, 209 insertions(+), 3 deletions(-)
 create mode 100644 app/(playground)/p/[agentId]/beta-proto/files/server-actions.ts

diff --git a/app/(playground)/p/[agentId]/beta-proto/files/server-actions.ts b/app/(playground)/p/[agentId]/beta-proto/files/server-actions.ts
new file mode 100644
index 00000000..6409dbca
--- /dev/null
+++ b/app/(playground)/p/[agentId]/beta-proto/files/server-actions.ts
@@ -0,0 +1,95 @@
+import { put } from "@vercel/blob";
+import { UnstructuredClient } from "unstructured-client";
+import { Strategy } from "unstructured-client/sdk/models/shared";
+import { elementsToMarkdown } from "../utils/unstructured";
+import type { FileId } from "./types";
+
+type UploadFileInput = {
+	fileId: FileId;
+	file: File;
+};
+/**
+ * Stores `input.file` in Vercel Blob at `files/{fileId}/{file name}` with
+ * public access and returns whatever `put` resolves to.
+ */
+export async function uploadFile({ input }: { input: UploadFileInput }) {
+	const blob = await put(
+		`files/${input.fileId}/${input.file.name}`,
+		input.file,
+		{
+			access: "public",
+			contentType: input.file.type,
+		},
+	);
+	return blob;
+}
+
+type ParseFileInput = {
+	id: FileId;
+	name: string;
+	blobUrl: string;
+};
+/**
+ * Downloads the file at `args.blobUrl`, partitions it with the Unstructured
+ * API, and stores the result twice in Vercel Blob: the elements as JSON at
+ * `files/{id}/partition.json` and a Markdown rendering at
+ * `files/{id}/markdown.md`.
+ *
+ * NOTE(review): this reads a secret from `process.env`, and this module has
+ * no "use server" directive - confirm it is only ever executed on the server.
+ *
+ * @returns What `put` resolves to for the Markdown blob.
+ * @throws If `UNSTRUCTURED_API_KEY` is unset, the download is not OK, or the
+ *   partition call does not report status 200.
+ */
+export async function parseFile(args: ParseFileInput) {
+	if (process.env.UNSTRUCTURED_API_KEY === undefined) {
+		throw new Error("UNSTRUCTURED_API_KEY is not set");
+	}
+	const client = new UnstructuredClient({
+		security: {
+			apiKeyAuth: process.env.UNSTRUCTURED_API_KEY,
+		},
+	});
+	const response = await fetch(args.blobUrl);
+	// Without this check the body of an HTTP error response would be
+	// partitioned as if it were the uploaded file.
+	if (!response.ok) {
+		throw new Error(`Failed to download file: ${response.status}`);
+	}
+	const content = await response.blob();
+	const partitionResponse = await client.general.partition({
+		partitionParameters: {
+			files: {
+				fileName: args.name,
+				content,
+			},
+			strategy: Strategy.Fast,
+			splitPdfPage: false,
+			splitPdfConcurrencyLevel: 1,
+		},
+	});
+	if (partitionResponse.statusCode !== 200) {
+		console.error(partitionResponse.rawResponse);
+		throw new Error(`Failed to parse file: ${partitionResponse.statusCode}`);
+	}
+	// Default once so both outputs agree when no elements come back; a missing
+	// value would otherwise be stored as the text "undefined" in the JSON file.
+	const elements = partitionResponse.elements ?? [];
+	const jsonString = JSON.stringify(elements, null, 2);
+	const blob = new Blob([jsonString], { type: "application/json" });
+
+	await put(`files/${args.id}/partition.json`, blob, {
+		access: "public",
+		contentType: blob.type,
+	});
+
+	const markdown = elementsToMarkdown(elements);
+	const markdownBlob = new Blob([markdown], { type: "text/markdown" });
+	const vercelBlob = await put(`files/${args.id}/markdown.md`, markdownBlob, {
+		access: "public",
+		contentType: markdownBlob.type,
+	});
+
+	return vercelBlob;
+}
diff --git a/app/(playground)/p/[agentId]/beta-proto/files/types.ts b/app/(playground)/p/[agentId]/beta-proto/files/types.ts
index 11577add..bcb9ebeb 100644
--- a/app/(playground)/p/[agentId]/beta-proto/files/types.ts
+++ b/app/(playground)/p/[agentId]/beta-proto/files/types.ts
@@ -15,14 +15,14 @@ type DraftFile = {
 	file: File;
 	status: Extract;
 };
-type UploadedFile = {
+export type UploadedFile = {
 	id: FileId;
 	blobUrl: string;
 	object: "file";
 	name: string;
 	status: Extract;
 };
-type ProcessedFile = {
+export type ProcessedFile = {
 	id: FileId;
 	blobUrl: string;
 	structuredDataBlobUrl: string;
diff --git a/app/(playground)/p/[agentId]/beta-proto/graph/v2/composition/add-source.ts b/app/(playground)/p/[agentId]/beta-proto/graph/v2/composition/add-source.ts
index e83a8eea..71760bc5 100644
--- a/app/(playground)/p/[agentId]/beta-proto/graph/v2/composition/add-source.ts
+++ b/app/(playground)/p/[agentId]/beta-proto/graph/v2/composition/add-source.ts
@@ -1,5 +1,12 @@
 import type { ArtifactReference } from "../../../artifact/types";
 import { buildConnector } from "../../../connector/utils";
+import { parseFile, uploadFile } from "../../../files/server-actions";
+import {
+	type GiselleFile,
+	type ProcessedFile,
+	type StructuredData,
+	fileStatuses,
+} from "../../../files/types";
 import { createStringParameter } from "../../../giselle-node/parameter/factory";
 import type { GiselleNodeId } from "../../../giselle-node/types";
 import type { TextContent } from "../../../text-content/types";
@@ -9,7 +16,7 @@ import type { CompositeAction } from "../../context";
 import { addConnector } from "./add-connector";
 import { updateNode } from "./update-node";
 
-export type Source = ArtifactReference | TextContent | WebSearch;
+export type Source = ArtifactReference | TextContent | WebSearch | GiselleFile;
 type AddSourceInput = {
 	source: Source;
 	/**
@@ -137,3 +144,107 @@ export function connectRelevanceNodes({
 		}
 	};
 }
+
+interface UploadSourceInput {
+	source: Source;
+	/**
+	 * Instruction Node
+	 */
+	nodeId: GiselleNodeId;
+}
+/**
+ * Uploads a file source that is still in the `uploading` status and then
+ * parses it, updating the matching entry in the node's `sources` property
+ * after each step: `processing` with the blob URL once uploaded, then
+ * `processed` with the structured-data URL once parsed. Any other source is
+ * ignored.
+ *
+ * @throws If the node cannot be found or its `sources` property is not an
+ *   array.
+ */
+export function uploadSource({
+	input,
+}: { input: UploadSourceInput }): CompositeAction {
+	return async (dispatch, getState) => {
+		const { nodeId, source } = input;
+		// Only files that have not been uploaded yet need any work.
+		if (source.object !== "file" || source.status !== fileStatuses.uploading) {
+			return;
+		}
+
+		// Looks the node up again on every call. Each update below follows an
+		// await, during which other actions may have changed the node; spreading
+		// an older snapshot would overwrite those changes.
+		const readNode = () => {
+			const node = getState().graph.nodes.find(
+				(candidate) => candidate.id === nodeId,
+			);
+			if (node === undefined) {
+				throw new Error(`Node not found: ${nodeId}`);
+			}
+			const sources = node.properties.sources ?? [];
+			if (!Array.isArray(sources)) {
+				throw new Error(`${node.id}'s sources property is not an array`);
+			}
+			return { node, sources };
+		};
+
+		const fileVercelBlob = await uploadFile({
+			input: {
+				file: source.file,
+				fileId: source.id,
+			},
+		});
+
+		// Record the upload on the matching source entry of the instruction node.
+		const uploaded = readNode();
+		dispatch(
+			updateNode({
+				input: {
+					nodeId,
+					properties: {
+						...uploaded.node.properties,
+						sources: uploaded.sources.map((current) =>
+							current.id === source.id
+								? {
+										...current,
+										blobUrl: fileVercelBlob.url,
+										status: fileStatuses.processing,
+									}
+								: current,
+						),
+					},
+				},
+			}),
+		);
+
+		const structuredDataVercelBlob = await parseFile({
+			id: source.id,
+			name: source.name,
+			blobUrl: fileVercelBlob.url,
+		});
+
+		// Parsing has just been awaited, so start from the node as it is now.
+		const parsed = readNode();
+		dispatch(
+			updateNode({
+				input: {
+					nodeId,
+					properties: {
+						...parsed.node.properties,
+						sources: parsed.sources.map((current) =>
+							current.id === source.id
+								? ({
+										...current,
+										blobUrl: fileVercelBlob.url,
+										structuredDataBlobUrl: structuredDataVercelBlob.url,
+										status: fileStatuses.processed,
+									} satisfies ProcessedFile)
+								: current,
+						),
+					},
+				},
+			}),
+		);
+	};
+}