Skip to content

Commit

Permalink
Add file parsing functionality using Unstructured API
Browse files Browse the repository at this point in the history
- Add parseFile server action to process uploaded files
- Integrate with Unstructured API to extract structured content
- Generate both JSON and Markdown outputs from parsed content
- Update file processing flow to include structured data URLs
- Store processed results in Vercel Blob storage
  • Loading branch information
toyamarinyon committed Oct 29, 2024
1 parent 25926f6 commit 68b152b
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 3 deletions.
70 changes: 70 additions & 0 deletions app/(playground)/p/[agentId]/beta-proto/files/server-actions.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import { put } from "@vercel/blob";
import { UnstructuredClient } from "unstructured-client";
import { Strategy } from "unstructured-client/sdk/models/shared";
import { elementsToMarkdown } from "../utils/unstructured";
import type { FileId } from "./types";

// Input for uploadFile: the client-generated file id (used to namespace the
// blob path) and the browser File object to store.
type UploadFileInput = {
	fileId: FileId;
	file: File;
};
/**
 * Stores an uploaded file in Vercel Blob storage.
 *
 * The blob is placed under `files/<fileId>/<fileName>` with public access and
 * the file's own MIME type, and the resulting blob descriptor (including its
 * public URL) is returned to the caller.
 */
export async function uploadFile({ input }: { input: UploadFileInput }) {
	const { fileId, file } = input;
	const pathname = `files/${fileId}/${file.name}`;
	return await put(pathname, file, {
		access: "public",
		contentType: file.type,
	});
}

// Input for parseFile: the file's id (namespaces the output blobs), its
// original name (passed to the Unstructured API), and the public blob URL
// where the raw uploaded bytes were stored by uploadFile.
type ParseFileInput = {
	id: FileId;
	name: string;
	blobUrl: string;
};
/**
 * Parses an uploaded file with the Unstructured API and stores the results.
 *
 * Fetches the raw file from `args.blobUrl`, runs Unstructured's fast
 * partition strategy over it, then writes two artifacts to Vercel Blob
 * storage under `files/<id>/`: the raw element list as `partition.json`
 * and a rendered `markdown.md`.
 *
 * @returns the Vercel Blob descriptor of the markdown artifact.
 * @throws if UNSTRUCTURED_API_KEY is unset, the blob fetch fails, or the
 *   partition request does not return HTTP 200.
 */
export async function parseFile(args: ParseFileInput) {
	if (process.env.UNSTRUCTURED_API_KEY === undefined) {
		throw new Error("UNSTRUCTURED_API_KEY is not set");
	}
	const client = new UnstructuredClient({
		security: {
			apiKeyAuth: process.env.UNSTRUCTURED_API_KEY,
		},
	});
	const response = await fetch(args.blobUrl);
	// Fail fast on a bad blob URL instead of partitioning an error page.
	if (!response.ok) {
		throw new Error(`Failed to fetch file: ${response.status}`);
	}
	const content = await response.blob();
	const partitionResponse = await client.general.partition({
		partitionParameters: {
			files: {
				fileName: args.name,
				content,
			},
			strategy: Strategy.Fast,
			splitPdfPage: false,
			splitPdfConcurrencyLevel: 1,
		},
	});
	if (partitionResponse.statusCode !== 200) {
		console.error(partitionResponse.rawResponse);
		throw new Error(`Failed to parse file: ${partitionResponse.statusCode}`);
	}
	// `elements` may be undefined even on a 200 response; default to [] so
	// JSON.stringify never produces the literal text "undefined" (the
	// markdown path below already guarded this way).
	const elements = partitionResponse.elements ?? [];
	const jsonString = JSON.stringify(elements, null, 2);
	const blob = new Blob([jsonString], { type: "application/json" });

	await put(`files/${args.id}/partition.json`, blob, {
		access: "public",
		contentType: blob.type,
	});

	const markdown = elementsToMarkdown(elements);
	const markdownBlob = new Blob([markdown], { type: "text/markdown" });
	const vercelBlob = await put(`files/${args.id}/markdown.md`, markdownBlob, {
		access: "public",
		contentType: markdownBlob.type,
	});

	return vercelBlob;
}
4 changes: 2 additions & 2 deletions app/(playground)/p/[agentId]/beta-proto/files/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ type DraftFile = {
file: File;
status: Extract<FileStatus, "uploading">;
};
type UploadedFile = {
export type UploadedFile = {
id: FileId;
blobUrl: string;
object: "file";
name: string;
status: Extract<FileStatus, "processing">;
};
type ProcessedFile = {
export type ProcessedFile = {
id: FileId;
blobUrl: string;
structuredDataBlobUrl: string;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import type { ArtifactReference } from "../../../artifact/types";
import { buildConnector } from "../../../connector/utils";
import { parseFile, uploadFile } from "../../../files/server-actions";
import {
type GiselleFile,
type ProcessedFile,
type StructuredData,
fileStatuses,
} from "../../../files/types";
import { createStringParameter } from "../../../giselle-node/parameter/factory";
import type { GiselleNodeId } from "../../../giselle-node/types";
import type { TextContent } from "../../../text-content/types";
Expand All @@ -9,7 +16,7 @@ import type { CompositeAction } from "../../context";
import { addConnector } from "./add-connector";
import { updateNode } from "./update-node";

export type Source = ArtifactReference | TextContent | WebSearch;
export type Source = ArtifactReference | TextContent | WebSearch | GiselleFile;
type AddSourceInput = {
source: Source;
/**
Expand Down Expand Up @@ -137,3 +144,87 @@ export function connectRelevanceNodes({
}
};
}

// Input for uploadSource: the source to upload and the instruction node
// whose `sources` property tracks its upload/processing status.
interface UploadSourceInput {
	source: Source;
	/**
	 * Instruction Node
	 */
	nodeId: GiselleNodeId;
}
/**
 * Composite action that uploads a file source attached to an instruction
 * node, then parses it with the Unstructured API.
 *
 * Flow: upload the raw file to blob storage → dispatch an update marking the
 * source "processing" with its blob URL → parse the file into structured
 * data → dispatch a second update marking the source "processed" with the
 * structured-data blob URL. Non-file sources (and files not in the
 * "uploading" state) are ignored.
 */
export function uploadSource({
	input,
}: { input: UploadSourceInput }): CompositeAction {
	return async (dispatch, getState) => {
		switch (input.source.object) {
			case "file": {
				// Only sources still awaiting upload are handled here.
				if (input.source.status === fileStatuses.uploading) {
					const fileVercelBlob = await uploadFile({
						input: {
							file: input.source.file,
							fileId: input.source.id,
						},
					});
					// Look up the instruction node in the graph captured at this point.
					const node = getState().graph.nodes.find(
						(node) => node.id === input.nodeId,
					);
					if (node === undefined) {
						throw new Error(`Node not found: ${input.nodeId}`);
					}

					// Add the source to the instruction node property
					const currentSources = node.properties.sources ?? [];
					if (!Array.isArray(currentSources)) {
						throw new Error(`${node.id}'s sources property is not an array`);
					}
					// First update: record the blob URL and flip this source to
					// "processing" so the UI can reflect upload completion.
					dispatch(
						updateNode({
							input: {
								nodeId: input.nodeId,
								properties: {
									...node.properties,
									sources: currentSources.map((source) =>
										source.id === input.source.id
											? {
													...source,
													blobUrl: fileVercelBlob.url,
													status: fileStatuses.processing,
												}
											: source,
									),
								},
							},
						}),
					);

					// Extract structured data (JSON + markdown) from the uploaded file.
					const structuredDataVercelBlob = await parseFile({
						id: input.source.id,
						name: input.source.name,
						blobUrl: fileVercelBlob.url,
					});

					// Second update: mark the source "processed" and attach the
					// structured-data URL.
					// NOTE(review): this dispatch reuses `node.properties` and
					// `currentSources` captured BEFORE the first dispatch and the
					// `await parseFile(...)`; any changes made to this node's
					// properties in the meantime (including by other concurrent
					// actions) would be overwritten here — confirm this is intended,
					// or re-read state after the await.
					dispatch(
						updateNode({
							input: {
								nodeId: input.nodeId,
								properties: {
									...node.properties,
									sources: currentSources.map((source) =>
										source.id === input.source.id
											? ({
													...source,
													blobUrl: fileVercelBlob.url,
													structuredDataBlobUrl: structuredDataVercelBlob.url,
													status: fileStatuses.processed,
												} satisfies ProcessedFile)
											: source,
									),
								},
							},
						}),
					);
				}
			}
		}
	};
}

0 comments on commit 68b152b

Please sign in to comment.