From f426817cf0e29cc26ffebab2cdcd13c5c8905119 Mon Sep 17 00:00:00 2001
From: Julian Bilcke
Date: Wed, 31 Jul 2024 21:41:44 +0200
Subject: [PATCH] add the beta scene analysis

---
 src/components/tasks/useTasks.tsx             |  21 +-
 .../toolbars/top-menu/assistant/index.tsx     |  16 ++
 src/lib/core/constants.ts                     |   2 +-
 .../extractCaptionFromFrameMoondream.ts       |   0
 .../extractCaptionsFromFrames.ts              |  65 ++++--
 .../getDefaultAutocompleteState.ts            |   8 +
 src/services/autocomplete/types.ts            |  20 ++
 src/services/autocomplete/useAutocomplete.ts  | 214 ++++++++++++++++++
 src/services/io/parseFileIntoSegments.ts      |   8 +-
 src/services/io/useIO.ts                      |  34 ---
 10 files changed, 323 insertions(+), 65 deletions(-)
 rename src/services/{io => autocomplete}/extractCaptionFromFrameMoondream.ts (100%)
 rename src/services/{io => autocomplete}/extractCaptionsFromFrames.ts (68%)
 create mode 100644 src/services/autocomplete/getDefaultAutocompleteState.ts
 create mode 100644 src/services/autocomplete/types.ts
 create mode 100644 src/services/autocomplete/useAutocomplete.ts

diff --git a/src/components/tasks/useTasks.tsx b/src/components/tasks/useTasks.tsx
index 8477786f..58d6c682 100644
--- a/src/components/tasks/useTasks.tsx
+++ b/src/components/tasks/useTasks.tsx
@@ -103,15 +103,18 @@ export const useTasks = create((set, get) => ({
   }): Task[] => {
     const { tasks } = get()
 
-    let list = Object.values(tasks)
-
-    if (params?.status) {
-      list = list.filter((t) => t.status === params?.status)
-    }
-
-    if (params?.category) {
-      list = list.filter((t) => t.category === params?.category)
-    }
+    let list = Object.values(tasks).filter((t) => {
+      if (params?.status && t.status !== params.status) {
+        return false
+      }
+      if (params?.category && t.category !== params.category) {
+        return false
+      }
+      if (params?.visibility && t.visibility !== params.visibility) {
+        return false
+      }
+      return true
+    })
 
     return list
   },
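The useTasks change above folds the status, category, and new visibility criteria into a single pass over the task map. A minimal TypeScript sketch of the same pattern, kept outside the patch and using simplified, hypothetical Task and TaskFilter types (the real status/category/visibility enums live in @aitube/clapper-services):

type Task = { id: string; status: string; category: string; visibility: string }

type TaskFilter = Partial<Pick<Task, 'status' | 'category' | 'visibility'>>

// Single-pass filtering: every provided criterion must match,
// and omitted criteria are ignored.
function filterTasks(tasks: Record<string, Task>, params?: TaskFilter): Task[] {
  return Object.values(tasks).filter(
    (t) =>
      (!params?.status || t.status === params.status) &&
      (!params?.category || t.category === params.category) &&
      (!params?.visibility || t.visibility === params.visibility)
  )
}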
diff --git a/src/components/toolbars/top-menu/assistant/index.tsx b/src/components/toolbars/top-menu/assistant/index.tsx
index c5713a33..042cbb0c 100644
--- a/src/components/toolbars/top-menu/assistant/index.tsx
+++ b/src/components/toolbars/top-menu/assistant/index.tsx
@@ -15,9 +15,13 @@ import { useUI } from '@/services/ui'
 import { SettingsCategory } from '@aitube/clapper-services'
 import { AssistantModelList } from '../lists/AssistantModelList'
 import { useVoiceAssistant } from '@/services/assistant/useVoiceAssistant'
+import { useAutocomplete } from '@/services/autocomplete/useAutocomplete'
 
 export function TopMenuAssistant() {
   const setShowSettings = useUI((s) => s.setShowSettings)
+  const storyboardsToStory = useAutocomplete((s) => s.storyboardsToStory)
+
+  const hasBetaAccess = useUI((s) => s.hasBetaAccess)
 
   // this should only be called on and at only one place in the project!
   useVoiceAssistant()
@@ -36,6 +40,18 @@ export function TopMenuAssistant() {
+          {hasBetaAccess && (
+            <>
+              <MenubarSeparator />
+              <MenubarItem
+                onClick={() => {
+                  storyboardsToStory()
+                }}
+              >
+                Storyboards-to-captions (beta, client-side AI)
+              </MenubarItem>
+            </>
+          )}
             Usage and costs: not implemented
diff --git a/src/lib/core/constants.ts b/src/lib/core/constants.ts
index 05f2dac2..862e083b 100644
--- a/src/lib/core/constants.ts
+++ b/src/lib/core/constants.ts
@@ -3,7 +3,7 @@ export const HARD_LIMIT_NB_MAX_ASSETS_TO_GENERATE_IN_PARALLEL = 32
 
 export const APP_NAME = 'Clapper.app'
 
-export const APP_REVISION = '20240730+1240'
+export const APP_REVISION = '20240731+2141'
 
 export const APP_DOMAIN = 'Clapper.app'
 export const APP_LINK = 'https://clapper.app'
diff --git a/src/services/io/extractCaptionFromFrameMoondream.ts b/src/services/autocomplete/extractCaptionFromFrameMoondream.ts
similarity index 100%
rename from src/services/io/extractCaptionFromFrameMoondream.ts
rename to src/services/autocomplete/extractCaptionFromFrameMoondream.ts
diff --git a/src/services/io/extractCaptionsFromFrames.ts b/src/services/autocomplete/extractCaptionsFromFrames.ts
similarity index 68%
rename from src/services/io/extractCaptionsFromFrames.ts
rename to src/services/autocomplete/extractCaptionsFromFrames.ts
index 2017b1ac..461ef790 100644
--- a/src/services/io/extractCaptionsFromFrames.ts
+++ b/src/services/autocomplete/extractCaptionsFromFrames.ts
@@ -5,6 +5,49 @@ import {
   RawImage,
 } from '@xenova/transformers'
 
+export const cache: {
+  model?: Promise<Florence2ForConditionalGeneration>
+  processor?: Promise<AutoProcessor>
+  tokenizer?: Promise<AutoTokenizer>
+} = {}
+
+export async function loadModel(
+  modelId: string,
+  onProgress: (progress: number) => void
+) {
+  onProgress(0)
+  const model = await (cache.model
+    ? cache.model
+    : (cache.model = Florence2ForConditionalGeneration.from_pretrained(
+        modelId,
+        {
+          dtype: 'fp32',
+        }
+      )))
+
+  onProgress(33)
+
+  const processor = await (cache.processor
+    ? cache.processor
+    : (cache.processor = AutoProcessor.from_pretrained(modelId)))
+
+  onProgress(66)
+
+  const tokenizer = await (cache.tokenizer
+    ? cache.tokenizer
+    : (cache.tokenizer = AutoTokenizer.from_pretrained(modelId)))
+
+  onProgress(100)
+
+  return { model, processor, tokenizer }
+}
+
+export function closeModel() {
+  cache.model = undefined
+  cache.processor = undefined
+  cache.tokenizer = undefined
+}
+
 export async function extractCaptionsFromFrames(
   images: string[] = [],
   onProgress: (
@@ -31,34 +74,24 @@ Linux experimental support also requires launching the browser with --enable-fea
   }
 
   let progress = 0
-  onProgress(progress, 0, images.length)
+
   // for code example, see:
   // https://github.com/xenova/transformers.js/pull/545#issuecomment-2183625876
 
   // Load model, processor, and tokenizer
   const model_id = 'onnx-community/Florence-2-base-ft'
-  const model = await Florence2ForConditionalGeneration.from_pretrained(
-    model_id,
-    {
-      dtype: 'fp32',
-    }
-  )
-  onProgress((progress = 5), 0, images.length)
-
-  const processor = await AutoProcessor.from_pretrained(model_id)
-
-  onProgress((progress = 10), 0, images.length)
-
-  const tokenizer = await AutoTokenizer.from_pretrained(model_id)
-
-  onProgress((progress = 15), 0, images.length)
+  const { model, processor, tokenizer } = await loadModel(model_id, (p) => {
+    onProgress((progress = (p / 100) * 15), 0, images.length)
+  })
 
   // not all prompts will work properly, see the official examples:
   // https://huggingface.co/microsoft/Florence-2-base-ft/blob/e7a5acc73559546de6e12ec0319cd7cc1fa2437c/processing_florence2.py#L115-L117
 
   // Prepare text inputs
   const prompts = 'Describe with a paragraph what is shown in the image.'
+  // const prompts = 'Decompose the following video frame into era, genre, location, weather, characters, and action. Give the answer in YAML.'
+
   const text_inputs = tokenizer(prompts)
 
   let i = 1
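loadModel() above caches the pending promises rather than the resolved objects, so a caller that arrives while the first download is still in flight awaits the same request instead of starting a new one, and closeModel() simply drops the references. A minimal, self-contained TypeScript sketch of that pattern (generic, not tied to transformers.js; loadOnce and unload are illustrative names):

const pending = new Map<string, Promise<unknown>>()

// Returns the same in-flight Promise to every caller using the same key,
// so an expensive load (model weights, tokenizer files, ...) only happens once.
function loadOnce<T>(key: string, load: () => Promise<T>): Promise<T> {
  if (!pending.has(key)) {
    pending.set(key, load())
  }
  return pending.get(key) as Promise<T>
}

// Dropping the cached entry lets the loaded resources be garbage collected
// and forces a fresh load on the next call (the closeModel() equivalent).
function unload(key: string): void {
  pending.delete(key)
}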
diff --git a/src/services/autocomplete/getDefaultAutocompleteState.ts b/src/services/autocomplete/getDefaultAutocompleteState.ts
new file mode 100644
index 00000000..619dd765
--- /dev/null
+++ b/src/services/autocomplete/getDefaultAutocompleteState.ts
@@ -0,0 +1,8 @@
+import { AutocompleteState } from './types'
+
+export function getDefaultAutocompleteState(): AutocompleteState {
+  const state: AutocompleteState = {
+    isRunning: false,
+  }
+  return state
+}
diff --git a/src/services/autocomplete/types.ts b/src/services/autocomplete/types.ts
new file mode 100644
index 00000000..fff8be65
--- /dev/null
+++ b/src/services/autocomplete/types.ts
@@ -0,0 +1,20 @@
+export type AutocompleteState = {
+  isRunning: boolean
+}
+export type AutocompleteControls = {
+  /**
+   * Take a range of storyboards and infer the corresponding story
+   *
+   * This will directly update the screenplay and timeline,
+   * creating the appropriate segments, line coordinates etc.
+   *
+   *
+   * @param params
+   * @returns
+   */
+  storyboardsToStory: (params?: {
+    startTimeInMs?: number
+    endTimeInMs?: number
+  }) => Promise<void>
+}
+export type AutocompleteStore = AutocompleteState & AutocompleteControls
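The AutocompleteControls contract above is what the top menu consumes. A minimal usage sketch, assuming the zustand store created in useAutocomplete.ts below; the 60_000 ms range is just an illustrative value:

import { useAutocomplete } from '@/services/autocomplete/useAutocomplete'

// Analyze every storyboard in the project (this is what the menu entry does):
await useAutocomplete.getState().storyboardsToStory()

// Or restrict the analysis to the first minute of the timeline:
await useAutocomplete.getState().storyboardsToStory({
  startTimeInMs: 0,
  endTimeInMs: 60_000,
})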
diff --git a/src/services/autocomplete/useAutocomplete.ts b/src/services/autocomplete/useAutocomplete.ts
new file mode 100644
index 00000000..b0c4846f
--- /dev/null
+++ b/src/services/autocomplete/useAutocomplete.ts
@@ -0,0 +1,214 @@
+'use client'
+
+import { create } from 'zustand'
+import { TaskCategory, TaskVisibility } from '@aitube/clapper-services'
+import {
+  ClapAssetSource,
+  ClapSegmentCategory,
+  ClapSegmentFilteringMode,
+  ClapSegmentStatus,
+  filterSegments,
+  newSegment,
+} from '@aitube/clap'
+import {
+  clapSegmentToTimelineSegment,
+  TimelineSegment,
+  TimelineStore,
+  useTimeline,
+} from '@aitube/timeline'
+
+import { useTasks } from '@/components/tasks/useTasks'
+import { isValidNumber } from '@/lib/utils'
+import { extractCaptionsFromFrames } from './extractCaptionsFromFrames'
+import { getDefaultAutocompleteState } from './getDefaultAutocompleteState'
+import { AutocompleteStore } from './types'
+
+export const useAutocomplete = create<AutocompleteStore>((set, get) => ({
+  ...getDefaultAutocompleteState(),
+
+  storyboardsToStory: async (
+    params: {
+      startTimeInMs?: number
+      endTimeInMs?: number
+    } = {}
+  ): Promise<void> => {
+    const { isRunning } = get()
+
+    if (isRunning) {
+      console.log(`Job is already running..`)
+      return
+    }
+
+    const timeline: TimelineStore = useTimeline.getState()
+
+    const startTimeInMs = isValidNumber(params?.startTimeInMs)
+      ? params?.startTimeInMs!
+      : 0
+    const endTimeInMs = isValidNumber(params?.endTimeInMs)
+      ? params?.endTimeInMs!
+      : timeline.totalDurationInMs
+
+    const range = { startTimeInMs, endTimeInMs }
+
+    const task = useTasks.getState().add({
+      category: TaskCategory.IMPORT,
+      // visibility: TaskVisibility.BLOCKER,
+
+      // since this is a very long task, we can run it in the background
+      visibility: TaskVisibility.BACKGROUND,
+      initialMessage: `Analyzing storyboards..`,
+      successMessage: `Analyzing storyboards.. 100% done`,
+      value: 0,
+    })
+
+    set({ isRunning: true })
+
+    try {
+      const storyboards = filterSegments(
+        ClapSegmentFilteringMode.ANY,
+        range,
+        timeline.segments,
+        ClapSegmentCategory.STORYBOARD
+      ).filter((storyboard) => storyboard.assetUrl.startsWith('data:'))
+
+      let i = 0
+      let progress = 0
+      // to keep things light and in the background, we use an async for loop
+      for (const storyboard of storyboards) {
+        const isStillRunning = get().isRunning
+        if (!isStillRunning) {
+          break
+        }
+
+        try {
+          console.log(`analyzing storyboard:`, storyboard)
+          const frames = [storyboard.assetUrl]
+          const captions = await extractCaptionsFromFrames(
+            frames,
+            (
+              progress: number,
+              storyboardIndex: number,
+              nbStoryboards: number
+            ) => {
+              // this will count from 0 to 100% for each call to extractCaptionsFromFrames()
+              // so TODO @Julian: adjust this for the right calculation
+              // task.setProgress({
+              //   message: `Analyzing storyboards (${progress}%)`,
+              //   value: progress,
+              // })
+            }
+          )
+
+          i++
+
+          const relativeProgress = i / storyboards.length
+
+          progress = relativeProgress * 100
+
+          task.setProgress({
+            message: `Analyzing storyboards (${Math.round(progress)}%)`,
+            value: progress,
+          })
+
+          const caption = `${captions[0] || ''}`
+
+          // the model currently used is a browser-side model,
+          // and isn't smart enough to give the result as YAML with categories etc.
+          console.log(" '--> caption:", caption)
+
+          const sentences = caption
+            .split('. ')
+            .map((x) => x.trim())
+            .filter((x) => x)
+          const categories = {
+            [ClapSegmentCategory.CHARACTER]: [] as string[],
+            [ClapSegmentCategory.LOCATION]: [] as string[],
+            [ClapSegmentCategory.LIGHTING]: [] as string[],
+            [ClapSegmentCategory.STYLE]: [] as string[],
+            [ClapSegmentCategory.ACTION]: [] as string[],
+            [ClapSegmentCategory.GENERIC]: [] as string[],
+          }
+
+          // these regexes are a temporary solution, until we can embed larger models
+          // able to decompose a scene automatically
+          for (const sentence of sentences) {
+            if (sentence.match(/(?:is wearing|wears)/)) {
+              categories.CHARACTER.push(sentence)
+            } else if (sentence.match(/(?:the (?:image|screen) (?:is|has))/)) {
+              categories.STYLE.push(sentence)
+            } else if (
+              sentence.match(
+                /(?:the (?:lighting|lights|light|fire|sun|moon)|bright|dim|neon|candle|lit up)/
+              )
+            ) {
+              categories.LIGHTING.push(sentence)
+            } else if (
+              sentence.match(
+                /(?:the (?:man|woman|kid|child|person|animal|person|robot)|(?:she|he) (?:has|is))/
+              )
+            ) {
+              categories.CHARACTER.push(sentence)
+            } else if (
+              sentence.match(/(?:behind the|background|room|location|place)/)
+            ) {
+              categories.LOCATION.push(sentence)
+            } else {
+              categories.GENERIC.push(sentence)
+            }
+          }
+
+          console.log('categories:', categories)
+
+          const segments: TimelineSegment[] = []
+
+          // this is temporary, we can do better later
+          segments.push(
+            await clapSegmentToTimelineSegment(
+              newSegment({
+                category: ClapSegmentCategory.CAMERA,
+                prompt: 'medium-shot',
+                label: 'medium-shot',
+                startTimeInMs: storyboard.startTimeInMs,
+                endTimeInMs: storyboard.endTimeInMs,
+                status: ClapSegmentStatus.COMPLETED,
+                track: timeline.findFreeTrack({ startTimeInMs, endTimeInMs }), // track row index
+              })
+            )
+          )
+
+          for (const [cat, prompts] of Object.entries(categories)) {
+            const category = cat as ClapSegmentCategory
+            for (const prompt of prompts) {
+              if (!prompt) {
+                continue
+              }
+              const segment = await clapSegmentToTimelineSegment(
+                newSegment({
+                  category,
+                  prompt,
+                  label: prompt,
+                  startTimeInMs: storyboard.startTimeInMs,
+                  endTimeInMs: storyboard.endTimeInMs,
+                  status: ClapSegmentStatus.COMPLETED,
+                  track: timeline.findFreeTrack({ startTimeInMs, endTimeInMs }), // track row index
+                })
+              )
+              segments.push(segment)
+            }
+          }
+
+          await timeline.addSegments({ segments })
+        } catch (err) {
+          console.error(`failed to analyze a storyboard:`, err)
+        }
+
+        // TODO: use a special prompt to get categorized captions
+      }
+    } catch (err) {
+      console.error(`storyboardsToStory(): failed to analyze storyboards:`, err)
+    } finally {
+      task.success()
+      set({ isRunning: false })
+    }
+  },
+}))
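The keyword rules above route each caption sentence to a segment category, with anything unmatched falling back to GENERIC. A self-contained TypeScript sketch of the same heuristic, using plain string category names instead of the ClapSegmentCategory enum; categorizeSentence is an illustrative helper, not part of the patch:

type Category = 'CHARACTER' | 'LOCATION' | 'LIGHTING' | 'STYLE' | 'GENERIC'

// Ordered (pattern, category) rules: the first match wins.
const rules: Array<[RegExp, Category]> = [
  [/(?:is wearing|wears)/, 'CHARACTER'],
  [/(?:the (?:image|screen) (?:is|has))/, 'STYLE'],
  [/(?:the (?:lighting|lights|light|fire|sun|moon)|bright|dim|neon|candle|lit up)/, 'LIGHTING'],
  [/(?:the (?:man|woman|kid|child|person|animal|robot)|(?:she|he) (?:has|is))/, 'CHARACTER'],
  [/(?:behind the|background|room|location|place)/, 'LOCATION'],
]

function categorizeSentence(sentence: string): Category {
  for (const [pattern, category] of rules) {
    if (pattern.test(sentence)) {
      return category
    }
  }
  return 'GENERIC'
}

// categorizeSentence('the man is wearing a red coat') === 'CHARACTER'
// categorizeSentence('the street is lit up by neon signs') === 'LIGHTING'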
diff --git a/src/services/io/parseFileIntoSegments.ts b/src/services/io/parseFileIntoSegments.ts
index a52eca5e..e772c6c7 100644
--- a/src/services/io/parseFileIntoSegments.ts
+++ b/src/services/io/parseFileIntoSegments.ts
@@ -68,7 +68,6 @@ export async function parseFileIntoSegments({
       // I expect people will use AI service providers for sound and voice,
       // maybe in some case music too, but there are also many people
       // who will want to use their own track eg. to create a music video
-      const category = ClapSegmentCategory.STORYBOARD
 
       const assetUrl = await blobToBase64DataUri(file)
 
@@ -122,7 +121,8 @@ export async function parseFileIntoSegments({
         status: ClapSegmentStatus.COMPLETED,
         // track: findFreeTrack({ segments, startTimeInMs, endTimeInMs }), // track row index
         label: `${file.name}`, // a short label to name the segment (optional, can be human or LLM-defined)
-        category,
+        category: ClapSegmentCategory.STORYBOARD,
+
         assetUrl,
         assetDurationInMs: durationInMs,
         assetSourceType: ClapAssetSource.DATA,
@@ -145,10 +145,8 @@ export async function parseFileIntoSegments({
       // we assume we want it to be immediately visible
       storyboard.visibility = SegmentVisibility.VISIBLE
 
-      // console.log("newSegment:", audioSegment)
-
-      // poof! type disappears.. it's magic
       newSegments.push(storyboard)
+
       break
     }
diff --git a/src/services/io/useIO.ts b/src/services/io/useIO.ts
index 469bf872..3b99b9b6 100644
--- a/src/services/io/useIO.ts
+++ b/src/services/io/useIO.ts
@@ -49,7 +49,6 @@ import { sleep } from '@/lib/utils/sleep'
 import { FFMPegAudioInput, FFMPegVideoInput } from './ffmpegUtils'
 import { createFullVideo } from './createFullVideo'
 import { extractScenesFromVideo } from './extractScenesFromVideo'
-import { extractCaptionsFromFrames } from './extractCaptionsFromFrames'
 import { base64DataUriToFile } from '@/lib/utils/base64DataUriToFile'
 import { useUI } from '../ui'
 import { getTypeAndExtension } from '@/lib/utils/getTypeAndExtension'
@@ -234,39 +233,6 @@ export const useIO = create((set, get) => ({
     sceneExtractionTask.success()
 
-    const enableCaptioning = false
-
-    if (enableCaptioning) {
-      const captioningTask = useTasks.getState().add({
-        category: TaskCategory.IMPORT,
-        // visibility: TaskVisibility.BLOCKER,
-
-        // since this is very long task, we can run it in the background
-        visibility: TaskVisibility.BACKGROUND,
-        initialMessage: `Analyzing storyboards..`,
-        successMessage: `Analyzing storyboards.. 100% done`,
-        value: 0,
-      })
-
-      console.log('calling extractCaptionsFromFrames() with:', frames)
-      /*
-      const captions = await extractCaptionsFromFrames(
-        frames,
-        (progress: number, storyboardIndex: number, nbStoryboards: number) => {
-          captioningTask.setProgress({
-            message: `Analyzing storyboards (${progress}%)`,
-            value: progress,
-          })
-        }
-      )
-
-      console.log('captions:', captions)
-      */
-      // TODO: add
-
-      captioningTask.success()
-    }
-
     useUI.getState().setShowWelcomeScreen(false)
   },
   openScreenplay: async (
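Both the inline block removed from useIO.ts above and the new useAutocomplete service report long-running work through the same task store. A condensed TypeScript sketch of that pattern as it appears in the patch; the analyzeInBackground wrapper and the items list are hypothetical, while the add(), setProgress(), and success() calls are the ones visible in the diff:

import { useTasks } from '@/components/tasks/useTasks'
import { TaskCategory, TaskVisibility } from '@aitube/clapper-services'

async function analyzeInBackground(items: string[]) {
  const task = useTasks.getState().add({
    category: TaskCategory.IMPORT,
    // BACKGROUND visibility keeps the UI usable while the job runs
    visibility: TaskVisibility.BACKGROUND,
    initialMessage: `Analyzing storyboards..`,
    successMessage: `Analyzing storyboards.. 100% done`,
    value: 0,
  })

  for (let i = 0; i < items.length; i++) {
    // ... analyze items[i] here ...
    const progress = Math.round(((i + 1) / items.length) * 100)
    task.setProgress({
      message: `Analyzing storyboards (${progress}%)`,
      value: progress,
    })
  }

  task.success()
}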