From f426817cf0e29cc26ffebab2cdcd13c5c8905119 Mon Sep 17 00:00:00 2001
From: Julian Bilcke
Date: Wed, 31 Jul 2024 21:41:44 +0200
Subject: [PATCH] add the beta scene analysis

---
 src/components/tasks/useTasks.tsx             |  21 +-
 .../toolbars/top-menu/assistant/index.tsx     |  16 ++
 src/lib/core/constants.ts                     |   2 +-
 .../extractCaptionFromFrameMoondream.ts       |   0
 .../extractCaptionsFromFrames.ts              |  65 ++++--
 .../getDefaultAutocompleteState.ts            |   8 +
 src/services/autocomplete/types.ts            |  20 ++
 src/services/autocomplete/useAutocomplete.ts  | 214 ++++++++++++++++++
 src/services/io/parseFileIntoSegments.ts      |   8 +-
 src/services/io/useIO.ts                      |  34 ---
 10 files changed, 323 insertions(+), 65 deletions(-)
 rename src/services/{io => autocomplete}/extractCaptionFromFrameMoondream.ts (100%)
 rename src/services/{io => autocomplete}/extractCaptionsFromFrames.ts (68%)
 create mode 100644 src/services/autocomplete/getDefaultAutocompleteState.ts
 create mode 100644 src/services/autocomplete/types.ts
 create mode 100644 src/services/autocomplete/useAutocomplete.ts

diff --git a/src/components/tasks/useTasks.tsx b/src/components/tasks/useTasks.tsx
index 8477786f..58d6c682 100644
--- a/src/components/tasks/useTasks.tsx
+++ b/src/components/tasks/useTasks.tsx
@@ -103,15 +103,18 @@ export const useTasks = create((set, get) => ({
   }): Task[] => {
     const { tasks } = get()
 
-    let list = Object.values(tasks)
-
-    if (params?.status) {
-      list = list.filter((t) => t.status === params?.status)
-    }
-
-    if (params?.category) {
-      list = list.filter((t) => t.category === params?.category)
-    }
+    let list = Object.values(tasks).filter((t) => {
+      if (params?.status && t.status !== params.status) {
+        return false
+      }
+      if (params?.category && t.category !== params.category) {
+        return false
+      }
+      if (params?.visibility && t.visibility !== params.visibility) {
+        return false
+      }
+      return true
+    })
 
     return list
   },
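The useTasks change above folds the status, category, and new visibility criteria into a single pass over the task map. A minimal TypeScript sketch of the same pattern, kept outside the patch and using simplified, hypothetical Task and TaskFilter types (the real status/category/visibility enums live in @aitube/clapper-services):

type Task = { id: string; status: string; category: string; visibility: string }

type TaskFilter = Partial<Pick<Task, 'status' | 'category' | 'visibility'>>

// Single-pass filtering: every provided criterion must match,
// and omitted criteria are ignored.
function filterTasks(tasks: Record<string, Task>, params?: TaskFilter): Task[] {
  return Object.values(tasks).filter(
    (t) =>
      (!params?.status || t.status === params.status) &&
      (!params?.category || t.category === params.category) &&
      (!params?.visibility || t.visibility === params.visibility)
  )
}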
diff --git a/src/components/toolbars/top-menu/assistant/index.tsx b/src/components/toolbars/top-menu/assistant/index.tsx
index c5713a33..042cbb0c 100644
--- a/src/components/toolbars/top-menu/assistant/index.tsx
+++ b/src/components/toolbars/top-menu/assistant/index.tsx
@@ -15,9 +15,13 @@ import { useUI } from '@/services/ui'
 import { SettingsCategory } from '@aitube/clapper-services'
 import { AssistantModelList } from '../lists/AssistantModelList'
 import { useVoiceAssistant } from '@/services/assistant/useVoiceAssistant'
+import { useAutocomplete } from '@/services/autocomplete/useAutocomplete'
 
 export function TopMenuAssistant() {
   const setShowSettings = useUI((s) => s.setShowSettings)
+  const storyboardsToStory = useAutocomplete((s) => s.storyboardsToStory)
+
+  const hasBetaAccess = useUI((s) => s.hasBetaAccess)
 
   // this should only be called on and at only one place in the project!
   useVoiceAssistant()
@@ -36,6 +40,18 @@ export function TopMenuAssistant() {
+          {hasBetaAccess && (
+            <>
+              <MenubarSeparator />
+              <MenubarItem
+                onClick={() => {
+                  storyboardsToStory()
+                }}
+              >
+                Storyboards-to-captions (beta, client-side AI)
+              </MenubarItem>
+            </>
+          )}
             Usage and costs: not implemented
diff --git a/src/lib/core/constants.ts b/src/lib/core/constants.ts
index 05f2dac2..862e083b 100644
--- a/src/lib/core/constants.ts
+++ b/src/lib/core/constants.ts
@@ -3,7 +3,7 @@ export const HARD_LIMIT_NB_MAX_ASSETS_TO_GENERATE_IN_PARALLEL = 32
 
 export const APP_NAME = 'Clapper.app'
 
-export const APP_REVISION = '20240730+1240'
+export const APP_REVISION = '20240731+2141'
 
 export const APP_DOMAIN = 'Clapper.app'
 export const APP_LINK = 'https://clapper.app'
diff --git a/src/services/io/extractCaptionFromFrameMoondream.ts b/src/services/autocomplete/extractCaptionFromFrameMoondream.ts
similarity index 100%
rename from src/services/io/extractCaptionFromFrameMoondream.ts
rename to src/services/autocomplete/extractCaptionFromFrameMoondream.ts
diff --git a/src/services/io/extractCaptionsFromFrames.ts b/src/services/autocomplete/extractCaptionsFromFrames.ts
similarity index 68%
rename from src/services/io/extractCaptionsFromFrames.ts
rename to src/services/autocomplete/extractCaptionsFromFrames.ts
index 2017b1ac..461ef790 100644
--- a/src/services/io/extractCaptionsFromFrames.ts
+++ b/src/services/autocomplete/extractCaptionsFromFrames.ts
@@ -5,6 +5,49 @@ import {
   RawImage,
 } from '@xenova/transformers'
 
+export const cache: {
+  model?: Promise<Florence2ForConditionalGeneration>
+  processor?: Promise<AutoProcessor>
+  tokenizer?: Promise<AutoTokenizer>
+} = {}
+
+export async function loadModel(
+  modelId: string,
+  onProgress: (progress: number) => void
+) {
+  onProgress(0)
+  const model = await (cache.model
+    ? cache.model
+    : (cache.model = Florence2ForConditionalGeneration.from_pretrained(
+        modelId,
+        {
+          dtype: 'fp32',
+        }
+      )))
+
+  onProgress(33)
+
+  const processor = await (cache.processor
+    ? cache.processor
+    : (cache.processor = AutoProcessor.from_pretrained(modelId)))
+
+  onProgress(66)
+
+  const tokenizer = await (cache.tokenizer
+    ? cache.tokenizer
+    : (cache.tokenizer = AutoTokenizer.from_pretrained(modelId)))
+
+  onProgress(100)
+
+  return { model, processor, tokenizer }
+}
+
+export function closeModel() {
+  cache.model = undefined
+  cache.processor = undefined
+  cache.tokenizer = undefined
+}
+
 export async function extractCaptionsFromFrames(
   images: string[] = [],
   onProgress: (
@@ -31,34 +74,24 @@ Linux experimental support also requires launching the browser with --enable-fea
   }
 
   let progress = 0
-  onProgress(progress, 0, images.length)
+
   // for code example, see:
   // https://github.com/xenova/transformers.js/pull/545#issuecomment-2183625876
 
   // Load model, processor, and tokenizer
   const model_id = 'onnx-community/Florence-2-base-ft'
-  const model = await Florence2ForConditionalGeneration.from_pretrained(
-    model_id,
-    {
-      dtype: 'fp32',
-    }
-  )
-  onProgress((progress = 5), 0, images.length)
-
-  const processor = await AutoProcessor.from_pretrained(model_id)
-
-  onProgress((progress = 10), 0, images.length)
-
-  const tokenizer = await AutoTokenizer.from_pretrained(model_id)
-
-  onProgress((progress = 15), 0, images.length)
+  const { model, processor, tokenizer } = await loadModel(model_id, (p) => {
+    onProgress((progress = (p / 100) * 15), 0, images.length)
+  })
 
   // not all prompts will work properly, see the official examples:
   // https://huggingface.co/microsoft/Florence-2-base-ft/blob/e7a5acc73559546de6e12ec0319cd7cc1fa2437c/processing_florence2.py#L115-L117
 
   // Prepare text inputs
   const prompts = 'Describe with a paragraph what is shown in the image.'
+  // const prompts = 'Decompose the following video frame into era, genre, location, weather, characters, and action. Give the answer in YAML.'
+
   const text_inputs = tokenizer(prompts)
 
   let i = 1
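loadModel() above caches the pending promises rather than the resolved objects, so a caller that arrives while the first download is still in flight awaits the same request instead of starting a new one, and closeModel() simply drops the references. A minimal, self-contained TypeScript sketch of that pattern (generic, not tied to transformers.js; loadOnce and unload are illustrative names):

const pending = new Map<string, Promise<unknown>>()

// Returns the same in-flight Promise to every caller using the same key,
// so an expensive load (model weights, tokenizer files, ...) only happens once.
function loadOnce<T>(key: string, load: () => Promise<T>): Promise<T> {
  if (!pending.has(key)) {
    pending.set(key, load())
  }
  return pending.get(key) as Promise<T>
}

// Dropping the cached entry lets the loaded resources be garbage collected
// and forces a fresh load on the next call (the closeModel() equivalent).
function unload(key: string): void {
  pending.delete(key)
}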
diff --git a/src/services/autocomplete/getDefaultAutocompleteState.ts b/src/services/autocomplete/getDefaultAutocompleteState.ts
new file mode 100644
index 00000000..619dd765
--- /dev/null
+++ b/src/services/autocomplete/getDefaultAutocompleteState.ts
@@ -0,0 +1,8 @@
+import { AutocompleteState } from './types'
+
+export function getDefaultAutocompleteState(): AutocompleteState {
+  const state: AutocompleteState = {
+    isRunning: false,
+  }
+  return state
+}
diff --git a/src/services/autocomplete/types.ts b/src/services/autocomplete/types.ts
new file mode 100644
index 00000000..fff8be65
--- /dev/null
+++ b/src/services/autocomplete/types.ts
@@ -0,0 +1,20 @@
+export type AutocompleteState = {
+  isRunning: boolean
+}
+export type AutocompleteControls = {
+  /**
+   * Take a range of storyboards and infer the corresponding story
+   *
+   * This will directly update the screenplay and timeline,
+   * creating the appropriate segments, line coordinates etc.
+   *
+   *
+   * @param params
+   * @returns
+   */
+  storyboardsToStory: (params?: {
+    startTimeInMs?: number
+    endTimeInMs?: number
+  }) => Promise<void>
+}
+export type AutocompleteStore = AutocompleteState & AutocompleteControls
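The AutocompleteControls contract above is what the top menu consumes. A minimal usage sketch, assuming the zustand store created in useAutocomplete.ts below; the 60_000 ms range is just an illustrative value:

import { useAutocomplete } from '@/services/autocomplete/useAutocomplete'

// Analyze every storyboard in the project (this is what the menu entry does):
await useAutocomplete.getState().storyboardsToStory()

// Or restrict the analysis to the first minute of the timeline:
await useAutocomplete.getState().storyboardsToStory({
  startTimeInMs: 0,
  endTimeInMs: 60_000,
})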
diff --git a/src/services/autocomplete/useAutocomplete.ts b/src/services/autocomplete/useAutocomplete.ts
new file mode 100644
index 00000000..b0c4846f
--- /dev/null
+++ b/src/services/autocomplete/useAutocomplete.ts
@@ -0,0 +1,214 @@
+'use client'
+
+import { create } from 'zustand'
+import { TaskCategory, TaskVisibility } from '@aitube/clapper-services'
+import {
+  ClapAssetSource,
+  ClapSegmentCategory,
+  ClapSegmentFilteringMode,
+  ClapSegmentStatus,
+  filterSegments,
+  newSegment,
+} from '@aitube/clap'
+import {
+  clapSegmentToTimelineSegment,
+  TimelineSegment,
+  TimelineStore,
+  useTimeline,
+} from '@aitube/timeline'
+
+import { useTasks } from '@/components/tasks/useTasks'
+import { isValidNumber } from '@/lib/utils'
+import { extractCaptionsFromFrames } from './extractCaptionsFromFrames'
+import { getDefaultAutocompleteState } from './getDefaultAutocompleteState'
+import { AutocompleteStore } from './types'
+
+export const useAutocomplete = create<AutocompleteStore>((set, get) => ({
+  ...getDefaultAutocompleteState(),
+
+  storyboardsToStory: async (
+    params: {
+      startTimeInMs?: number
+      endTimeInMs?: number
+    } = {}
+  ): Promise<void> => {
+    const { isRunning } = get()
+
+    if (isRunning) {
+      console.log(`Job is already running..`)
+      return
+    }
+
+    const timeline: TimelineStore = useTimeline.getState()
+
+    const startTimeInMs = isValidNumber(params?.startTimeInMs)
+      ? params?.startTimeInMs!
+      : 0
+    const endTimeInMs = isValidNumber(params?.endTimeInMs)
+      ? params?.endTimeInMs!
+      : timeline.totalDurationInMs
+
+    const range = { startTimeInMs, endTimeInMs }
+
+    const task = useTasks.getState().add({
+      category: TaskCategory.IMPORT,
+      // visibility: TaskVisibility.BLOCKER,
+
+      // since this is a very long task, we can run it in the background
+      visibility: TaskVisibility.BACKGROUND,
+      initialMessage: `Analyzing storyboards..`,
+      successMessage: `Analyzing storyboards.. 100% done`,
+      value: 0,
+    })
+
+    set({ isRunning: true })
+
+    try {
+      const storyboards = filterSegments(
+        ClapSegmentFilteringMode.ANY,
+        range,
+        timeline.segments,
+        ClapSegmentCategory.STORYBOARD
+      ).filter((storyboard) => storyboard.assetUrl.startsWith('data:'))
+
+      let i = 0
+      let progress = 0
+      // to keep things light and in the background, we use an async for loop
+      for (const storyboard of storyboards) {
+        const isStillRunning = get().isRunning
+        if (!isStillRunning) {
+          break
+        }
+
+        try {
+          console.log(`analyzing storyboard:`, storyboard)
+          const frames = [storyboard.assetUrl]
+          const captions = await extractCaptionsFromFrames(
+            frames,
+            (
+              progress: number,
+              storyboardIndex: number,
+              nbStoryboards: number
+            ) => {
+              // this will count from 0 to 100% for each call to extractCaptionsFromFrames()
+              // so TODO @Julian: adjust this for the right calculation
+              // task.setProgress({
+              //   message: `Analyzing storyboards (${progress}%)`,
+              //   value: progress,
+              // })
+            }
+          )
+
+          i++
+
+          const relativeProgress = i / storyboards.length
+
+          progress = relativeProgress * 100
+
+          task.setProgress({
+            message: `Analyzing storyboards (${Math.round(progress)}%)`,
+            value: progress,
+          })
+
+          const caption = `${captions[0] || ''}`
+
+          // the model currently used is a browser-side model,
+          // and isn't smart enough to give the result as YAML with categories etc.
+          console.log(" '--> caption:", caption)
+
+          const sentences = caption
+            .split('. ')
+            .map((x) => x.trim())
+            .filter((x) => x)
+          const categories = {
+            [ClapSegmentCategory.CHARACTER]: [] as string[],
+            [ClapSegmentCategory.LOCATION]: [] as string[],
+            [ClapSegmentCategory.LIGHTING]: [] as string[],
+            [ClapSegmentCategory.STYLE]: [] as string[],
+            [ClapSegmentCategory.ACTION]: [] as string[],
+            [ClapSegmentCategory.GENERIC]: [] as string[],
+          }
+
+          // these regexes are a temporary solution, until we can embed larger models
+          // able to decompose a scene automatically
+          for (const sentence of sentences) {
+            if (sentence.match(/(?:is wearing|wears)/)) {
+              categories.CHARACTER.push(sentence)
+            } else if (sentence.match(/(?:the (?:image|screen) (?:is|has))/)) {
+              categories.STYLE.push(sentence)
+            } else if (
+              sentence.match(
+                /(?:the (?:lighting|lights|light|fire|sun|moon)|bright|dim|neon|candle|lit up)/
+              )
+            ) {
+              categories.LIGHTING.push(sentence)
+            } else if (
+              sentence.match(
+                /(?:the (?:man|woman|kid|child|person|animal|person|robot)|(?:she|he) (?:has|is))/
+              )
+            ) {
+              categories.CHARACTER.push(sentence)
+            } else if (
+              sentence.match(/(?:behind the|background|room|location|place)/)
+            ) {
+              categories.LOCATION.push(sentence)
+            } else {
+              categories.GENERIC.push(sentence)
+            }
+          }
+
+          console.log('categories:', categories)
+
+          const segments: TimelineSegment[] = []
+
+          // this is temporary, we can do better later
+          segments.push(
+            await clapSegmentToTimelineSegment(
+              newSegment({
+                category: ClapSegmentCategory.CAMERA,
+                prompt: 'medium-shot',
+                label: 'medium-shot',
+                startTimeInMs: storyboard.startTimeInMs,
+                endTimeInMs: storyboard.endTimeInMs,
+                status: ClapSegmentStatus.COMPLETED,
+                track: timeline.findFreeTrack({ startTimeInMs, endTimeInMs }), // track row index
+              })
+            )
+          )
+
+          for (const [cat, prompts] of Object.entries(categories)) {
+            const category = cat as ClapSegmentCategory
+            for (const prompt of prompts) {
+              if (!prompt) {
+                continue
+              }
+              const segment = await clapSegmentToTimelineSegment(
+                newSegment({
+                  category,
+                  prompt,
+                  label: prompt,
+                  startTimeInMs: storyboard.startTimeInMs,
+                  endTimeInMs: storyboard.endTimeInMs,
+                  status: ClapSegmentStatus.COMPLETED,
+                  track: timeline.findFreeTrack({ startTimeInMs, endTimeInMs }), // track row index
+                })
+              )
+              segments.push(segment)
+            }
+          }
+
+          await timeline.addSegments({ segments })
+        } catch (err) {
+          console.error(`failed to analyze a storyboard:`, err)
+        }
+
+        // TODO: use a special prompt to get categorized captions
+      }
+    } catch (err) {
+      console.error(`storyboardsToStory(): failed to analyze storyboards:`, err)
+    } finally {
+      task.success()
+      set({ isRunning: false })
+    }
+  },
+}))
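The keyword rules above route each caption sentence to a segment category, with anything unmatched falling back to GENERIC. A self-contained TypeScript sketch of the same heuristic, using plain string category names instead of the ClapSegmentCategory enum; categorizeSentence is an illustrative helper, not part of the patch:

type Category = 'CHARACTER' | 'LOCATION' | 'LIGHTING' | 'STYLE' | 'GENERIC'

// Ordered (pattern, category) rules: the first match wins.
const rules: Array<[RegExp, Category]> = [
  [/(?:is wearing|wears)/, 'CHARACTER'],
  [/(?:the (?:image|screen) (?:is|has))/, 'STYLE'],
  [/(?:the (?:lighting|lights|light|fire|sun|moon)|bright|dim|neon|candle|lit up)/, 'LIGHTING'],
  [/(?:the (?:man|woman|kid|child|person|animal|robot)|(?:she|he) (?:has|is))/, 'CHARACTER'],
  [/(?:behind the|background|room|location|place)/, 'LOCATION'],
]

function categorizeSentence(sentence: string): Category {
  for (const [pattern, category] of rules) {
    if (pattern.test(sentence)) {
      return category
    }
  }
  return 'GENERIC'
}

// categorizeSentence('the man is wearing a red coat') === 'CHARACTER'
// categorizeSentence('the street is lit up by neon signs') === 'LIGHTING'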
diff --git a/src/services/io/parseFileIntoSegments.ts b/src/services/io/parseFileIntoSegments.ts
index a52eca5e..e772c6c7 100644
--- a/src/services/io/parseFileIntoSegments.ts
+++ b/src/services/io/parseFileIntoSegments.ts
@@ -68,7 +68,6 @@ export async function parseFileIntoSegments({
       // I expect people will use AI service providers for sound and voice,
       // maybe in some case music too, but there are also many people
       // who will want to use their own track eg. to create a music video
-      const category = ClapSegmentCategory.STORYBOARD
 
       const assetUrl = await blobToBase64DataUri(file)
 
@@ -122,7 +121,8 @@ export async function parseFileIntoSegments({
         status: ClapSegmentStatus.COMPLETED,
         // track: findFreeTrack({ segments, startTimeInMs, endTimeInMs }), // track row index
         label: `${file.name}`, // a short label to name the segment (optional, can be human or LLM-defined)
-        category,
+        category: ClapSegmentCategory.STORYBOARD,
+
         assetUrl,
         assetDurationInMs: durationInMs,
         assetSourceType: ClapAssetSource.DATA,
@@ -145,10 +145,8 @@ export async function parseFileIntoSegments({
       // we assume we want it to be immediately visible
       storyboard.visibility = SegmentVisibility.VISIBLE
 
-      // console.log("newSegment:", audioSegment)
-
-      // poof! type disappears.. it's magic
       newSegments.push(storyboard)
+
       break
     }
diff --git a/src/services/io/useIO.ts b/src/services/io/useIO.ts
index 469bf872..3b99b9b6 100644
--- a/src/services/io/useIO.ts
+++ b/src/services/io/useIO.ts
@@ -49,7 +49,6 @@ import { sleep } from '@/lib/utils/sleep'
 import { FFMPegAudioInput, FFMPegVideoInput } from './ffmpegUtils'
 import { createFullVideo } from './createFullVideo'
 import { extractScenesFromVideo } from './extractScenesFromVideo'
-import { extractCaptionsFromFrames } from './extractCaptionsFromFrames'
 import { base64DataUriToFile } from '@/lib/utils/base64DataUriToFile'
 import { useUI } from '../ui'
 import { getTypeAndExtension } from '@/lib/utils/getTypeAndExtension'
@@ -234,39 +233,6 @@ export const useIO = create((set, get) => ({
     sceneExtractionTask.success()
 
-    const enableCaptioning = false
-
-    if (enableCaptioning) {
-      const captioningTask = useTasks.getState().add({
-        category: TaskCategory.IMPORT,
-        // visibility: TaskVisibility.BLOCKER,
-
-        // since this is very long task, we can run it in the background
-        visibility: TaskVisibility.BACKGROUND,
-        initialMessage: `Analyzing storyboards..`,
-        successMessage: `Analyzing storyboards.. 100% done`,
-        value: 0,
-      })
-
-      console.log('calling extractCaptionsFromFrames() with:', frames)
-      /*
-      const captions = await extractCaptionsFromFrames(
-        frames,
-        (progress: number, storyboardIndex: number, nbStoryboards: number) => {
-          captioningTask.setProgress({
-            message: `Analyzing storyboards (${progress}%)`,
-            value: progress,
-          })
-        }
-      )
-
-      console.log('captions:', captions)
-      */
-      // TODO: add
-
-      captioningTask.success()
-    }
-
     useUI.getState().setShowWelcomeScreen(false)
   },
   openScreenplay: async (
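Both the inline block removed from useIO.ts above and the new useAutocomplete service report long-running work through the same task store. A condensed TypeScript sketch of that pattern as it appears in the patch; the analyzeInBackground wrapper and the items list are hypothetical, while the add(), setProgress(), and success() calls are the ones visible in the diff:

import { useTasks } from '@/components/tasks/useTasks'
import { TaskCategory, TaskVisibility } from '@aitube/clapper-services'

async function analyzeInBackground(items: string[]) {
  const task = useTasks.getState().add({
    category: TaskCategory.IMPORT,
    // BACKGROUND visibility keeps the UI usable while the job runs
    visibility: TaskVisibility.BACKGROUND,
    initialMessage: `Analyzing storyboards..`,
    successMessage: `Analyzing storyboards.. 100% done`,
    value: 0,
  })

  for (let i = 0; i < items.length; i++) {
    // ... analyze items[i] here ...
    const progress = Math.round(((i + 1) / items.length) * 100)
    task.setProgress({
      message: `Analyzing storyboards (${progress}%)`,
      value: progress,
    })
  }

  task.success()
}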