<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="description"
content="Goldfish">
<meta name="keywords" content="Long video understanding, vision-language, video question answering">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Goldfish</title>
<meta name="google-site-verification" content="6lbYN1vX7A4sD8SrVniq84UEKyEUSBgxeP7d3FjuuK0" />
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet"
href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<!-- <link rel="icon" href="./static/images/icon.png"> -->
<link rel="stylesheet" href="./static/css/index.css">
<link rel="shortcut icon" href="path/to/favicon.ico" type="image/x-icon">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
<style>
#main{
position: relative;
width: 1200px;
}
.box{
float: left;
padding: 15px 0 0 15px;
/* background-color: red;*/
}
.pic{
width: 500px;
padding: 10px;
border: 1px solid #ccc;
border-radius: 5px;
background-color: #fff;
}
.pic img{
width: 800px;
}
</style>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<div class="title_container">
<img src="repo_imgs\goldfishai_png.png" width="80" height="80">
<h1 class="title is-1 publication-title">Goldfish</h1>
</div>
<h2 class="title is-2 publication-title">Vision-Language Understanding of Arbitrarily Long Videos</h2>
<div class="is-size-5">
<span class="author-block">
<a href="https://scholar.google.com/citations?user=6gRlYHAAAAAJ&hl=en" style="color:#008AD7;font-weight:normal;">Kirolos Ataallah
</a>,
</span>
<span class="author-block">
<a href="https://xiaoqian-shen.github.io/" style="color:#008AD7;font-weight:normal;">Xiaoqian Shen</a>,</span>
<span class="author-block">
<a href="https://eslambakr.github.io/" style="color:#008AD7;font-weight:normal;">Eslam Abdelrahman</a>,
</span>
<span class="author-block">
<a href="https://essamsleiman.com" style="color:#03953b;font-weight:normal;">Essam Sleiman</a>,
</span>
<span class="author-block">
<a href="https://metauto.ai/" style="color:#008AD7;font-weight:normal;">Mingchen Zhuge</a>,
</span>
<span class="author-block">
<a href="https://dingjiansw101.github.io/" style="color:#008AD7;font-weight:normal;">Jian Ding</a>,
</span>
<span class="author-block">
<a href="https://tsutikgiau.github.io/" style="color:#008AD7;font-weight:normal;">Deyao Zhu</a>,
</span>
<span class="author-block">
<a href="https://people.idsia.ch/~juergen/" style="color:#F2A900;font-weight:normal;">Jürgen Schmidhuber</a>,
</span>
<span class="author-block">
<a href="https://www.mohamed-elhoseiny.com/" style="color:#008AD7;font-weight:normal;">Mohamed Elhoseiny</a>
</span>
</div>
<br>
<div class="is-size-5 publication-authors">
<span class="author-block"><b style="color:#008AD7; font-weight:normal">▶ </b> King Abdullah University of Science and Technology </span>
<span class="author-block"><b style="color:#03953b; font-weight:normal">▶ </b>Harvard University</span>
<span class="author-block"><b style="color:#F2A900; font-weight:normal">▶ </b>The Swiss AI Lab IDSIA, USI, SUPSI</span>
</div>
<div class="is-size-5 publication-authors">
</div>
<br>
<div class="column has-text-centered">
<div class="publication-links">
<span class="link-block">
<a href="https://arxiv.org/abs/2407.12679" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>Paper</span>
</a>
</span>
<span class="link-block">
<a href="https://github.com/Vision-CAIR/MiniGPT4-video" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
<span class="link-block">
<a href="#videoDemo" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-youtube"></i>
</span>
<span>Video</span>
</a>
</span>
<span class="link-block">
<a href="https://huggingface.co/datasets/Vision-CAIR/TVQA-Long/tree/main" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fa fa-database"></i>
</span>
<span>Dataset</span>
</a>
</span>
<span class="link-block">
<a href="https://huggingface.co/Vision-CAIR/MiniGPT4-Video" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fa fa-laugh"></i>
</span>
<span>Model</span>
</a>
</span>
<span class="link-block">
<a href="https://goldfish.loophole.site/" target="_blank" class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<img src="repo_imgs\goldfishai_png.png" alt="Demo Icon">
</span>
<span>Demo</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<link rel="stylesheet" type="text/css" href="js/simple_style.css" />
<script type="text/javascript" src="js/simple_swiper.js"></script>
<section class="section">
<div class="container is-max-desktop">
<img src="repo_imgs\teaser_fig_final_final.jpg" alt="teaser figure">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
Most current LLM-based models for video understanding can process videos within minutes. However, they struggle with lengthy videos due to challenges such as "noise and redundancy", as well as "memory and computation" constraints. In this paper, we present Goldfish, a methodology tailored for comprehending videos of arbitrary lengths.
We also introduce the TVQA-long benchmark, specifically designed to evaluate models' capabilities in understanding long videos with questions in both vision and text content.
Goldfish approaches these challenges with an efficient retrieval mechanism that initially gathers the top-k video clips relevant to the instruction before proceeding to provide the desired response. This design of the retrieval mechanism enables Goldfish to efficiently process arbitrarily long video sequences, facilitating its application in contexts such as movies or television series.
To facilitate the retrieval process, we developed MiniGPT4-Video, which generates detailed descriptions for the video clips.
In addressing the scarcity of benchmarks for long video evaluation, we adapted the TVQA short video benchmark for extended content analysis by aggregating questions from entire episodes, thereby shifting the evaluation from partial to full episode comprehension.
We attained a 41.78% accuracy rate on the TVQA-long benchmark, surpassing previous methods by 14.94%. Our MiniGPT4-Video also shows exceptional performance in short video comprehension, exceeding existing state-of-the-art methods by 3.23%, 2.03%, 16.5% and 23.59% on the MSVD, MSRVTT, TGIF, and TVQA short video benchmarks, respectively.
These results indicate that our models have significant improvements in both long and short-video understanding.
</p>
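<p>
As a toy illustration of the TVQA-Long construction described above, the sketch below re-groups clip-level questions by episode so that answering requires searching the full episode. The annotation fields (<code>episode_id</code>, <code>clip</code>, <code>q</code>, <code>a</code>) are illustrative assumptions, not the released TVQA-Long schema.
</p>
<pre><code>
# Toy sketch (not the released preprocessing script): aggregate clip-level
# TVQA questions into episode-level TVQA-Long entries.
from collections import defaultdict

# Hypothetical clip-level annotations; the real TVQA metadata encodes the
# episode inside the clip name rather than in a separate "episode_id" field.
clip_level_qa = [
    {"episode_id": "s01e01", "clip": "clip_00", "q": "Who opens the door?", "a": "Ross"},
    {"episode_id": "s01e01", "clip": "clip_07", "q": "What is Monica holding?", "a": "A tray"},
    {"episode_id": "s01e02", "clip": "clip_03", "q": "Where are they sitting?", "a": "Central Perk"},
]

episodes = defaultdict(list)
for item in clip_level_qa:
    # Re-key every question by its full episode instead of its short clip,
    # shifting the evaluation from partial to full episode comprehension.
    episodes[item["episode_id"]].append({"q": item["q"], "a": item["a"]})

for episode_id, questions in episodes.items():
    print(episode_id, len(questions), "questions")
</code></pre>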
</div>
</div>
</div>
<!--/ Abstract. -->
<br>
<br>
<div class="container">
<h2 class="title has-text-centered">Video Demo</h2>
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<div class="publication-video" id="videoDemo">
<video width="760" height="515" controls>
<source src="goldfish_sample2.mp4" type="video/mp4">
Your browser does not support the video tag.
</video>
</div>
</div>
</div>
</div>
<br>
<br>
<div class="container">
<h2 class="title has-text-centered">Paper summary in 5 min from <a href='https://irepod.com/podcast/ai-breakdown/arxiv-preprint-goldfish-vision-language-understand'>AI Breakdown podcast</a></h2>
<div class="columns is-centered has-text-centered">
<audio id="background-audio" src="paper_summary.mp3" controls autoplay></audio>
</div>
</div>
<br>
<br>
<!-- Paper Model. -->
<div class="columns is-centered has-text-centered">
<div class="column is-six-fifths">
<h2 class="title is-3">Goldfish frame work</h2>
<div class="content has-text-justified">
</div>
<img id="model" width="80%" src="repo_imgs\goldfish_framework.JPG">
<h3 class="subtitle has-text-centered">
<p style="font-family:Times New Roman"><b>Goldfish framework,First break down the long video into clips, then encode them in Video
Descriptor according to their timing and corresponding subtitles, then encode the use query and
retrieve the most related clips in the retrieval module, and finally send the top-K clips information
to the answer module to get the final answer.</b></p>
</h3>
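<p>
As a minimal sketch of the pipeline in the caption above (not the official implementation), the snippet below splits a long video into clips, describes each clip, retrieves the top-K clips for the query, and answers from those clips only. The helpers <code>describe_clip</code>, <code>embed_text</code>, and <code>answer_with_context</code> are placeholders standing in for the Video Descriptor (MiniGPT4-Video), the text encoder, and the answer module.
</p>
<pre><code>
# Illustrative sketch of the Goldfish retrieval flow; helper functions are
# placeholders, not the released API.
import numpy as np

def describe_clip(clip):
    """Placeholder for the Video Descriptor: one detailed text description per clip."""
    return f"description of {clip}"

def embed_text(text):
    """Placeholder text encoder: returns a unit-norm embedding."""
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    v = rng.standard_normal(512)
    return v / np.linalg.norm(v)

def answer_with_context(question, contexts):
    """Placeholder answer module: reasons over the retrieved clip descriptions."""
    return f"answer to {question!r} using {len(contexts)} clips"

def goldfish_answer(clips, question, top_k=3):
    # 1) Video Descriptor: describe every clip (with its timing and subtitles in practice).
    descriptions = [describe_clip(c) for c in clips]
    # 2) Encode the clip descriptions and the user query into one embedding space.
    clip_embs = np.stack([embed_text(d) for d in descriptions])
    query_emb = embed_text(question)
    # 3) Retrieval module: rank clips by cosine similarity to the query.
    scores = clip_embs @ query_emb
    top_idx = np.argsort(scores)[::-1][:top_k]
    # 4) Answer module: generate the final answer from the top-K clips only.
    return answer_with_context(question, [descriptions[i] for i in top_idx])

print(goldfish_answer([f"clip_{i:03d}" for i in range(100)], "Who stole the car?"))
</code></pre>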
<br>
<br>
<h2 class="title is-3">Video Discriptor (MiniGPT4-video)</h2>
<img id="model" width="80%" src="repo_imgs\MiniGPT4-video_fig.jpg">
<h3 class="subtitle has-text-centered">
<p style="font-family:Times New Roman"><b>MiniGPT4-video architecture: For each frame, we use EVA-CLIP to get the visual tokens and
concatenate each adjacent visual token into a singular token then convert these tokens to the language
model space using a linear layer and get the language token from LLM tokenizer. Concatenate both
the visual and subtitle text tokens together and do this for all the sampled frames and appending
the instruction tokens at the end of the input sequence.</b></p>
</h3>
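<p>
The interleaving described in the caption can be sketched as follows; the hidden sizes, the pair-wise token merge via a single linear layer, and the module names are illustrative assumptions rather than the exact MiniGPT4-Video code.
</p>
<pre><code>
# Illustrative sketch of the interleaved visual-textual input: per-frame visual
# tokens are merged pair-wise, projected into the LLM space, interleaved with
# that frame's subtitle tokens, and the instruction tokens are appended last.
import torch
import torch.nn as nn

vision_dim, llm_dim = 1408, 4096               # assumed EVA-CLIP / LLM hidden sizes
project = nn.Linear(vision_dim * 2, llm_dim)   # two adjacent visual tokens -> one LLM token

def frame_to_llm_tokens(frame_tokens):
    """frame_tokens: (num_visual_tokens, vision_dim) from the vision encoder."""
    pairs = frame_tokens.reshape(-1, 2 * vision_dim)  # concatenate each adjacent pair
    return project(pairs)                             # (num_visual_tokens // 2, llm_dim)

def build_input(frames, subtitle_embeds, instruction_embeds):
    """Interleave projected visual tokens with subtitle tokens frame by frame,
    then append the instruction tokens at the end of the sequence."""
    parts = []
    for frame_tokens, sub_tokens in zip(frames, subtitle_embeds):
        parts.append(frame_to_llm_tokens(frame_tokens))
        parts.append(sub_tokens)
    parts.append(instruction_embeds)
    return torch.cat(parts, dim=0)                    # (seq_len, llm_dim)

# Toy usage: 4 sampled frames, 256 visual tokens each, a few subtitle/instruction tokens.
frames = [torch.randn(256, vision_dim) for _ in range(4)]
subs = [torch.randn(8, llm_dim) for _ in range(4)]
instr = torch.randn(12, llm_dim)
print(build_input(frames, subs, instr).shape)         # torch.Size([556, 4096])
</code></pre>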
<br>
<br>
</div>
</div>
<br>
<!-- Paper Model. -->
<div class="columns is-centered has-text-centered">
<div class="column is-six-fifths">
<h2 class="title is-3">Results</h2>
<div class="content has-text-justified">
<h2>Long Video Benchmarking</h2>
<p>
<b>We evaluate the efficacy of our proposed framework, Goldfish, across several well-established benchmarks, specifically the LLama-Vid, MovieChat, Movie QA, and TVQA-Long datasets.
To thoroughly examine our framework's capabilities, we analyze input modalities in two configurations: vision-only (V) and vision combined with input subtitles (V+T).
Our findings, detailed in Table 3, indicate that our framework surpasses all existing long video baselines in the vision modality. We establish state-of-the-art (SOTA) performance on these challenging benchmarks. This achievement holds true even under an unfair comparison against LLama-Vid [20], which benefits from training on the MovieNet dataset, whose movies appear in both the LLama-Vid benchmark and Movie QA. Despite this advantage, our results significantly outperform the competition.
Incorporating both video frames and aligned subtitles into our model leads to an average performance boost of 8% across the benchmarks. As highlighted in Table 3, this enhanced approach enables us to outperform LLama-Vid on the TVQA benchmark, providing a fair comparison since LLama-Vid utilizes the other benchmarks during its training phase.</b>
</p>
<img id="model" width="100%" src="repo_imgs\Goldfish_results_table.JPG">
<br>
<br>
<h2>Short Video Benchmarking</h2>
<p>
<b>On short-video understanding, we continue to secure state-of-the-art (SOTA) results, outperforming contemporaneous works, including LLama-Vid. To validate our framework's proficiency in short-video analysis, we conducted evaluations against current SOTA methodologies across an extensive suite of five benchmarks: Video ChatGPT, MSVD, MSRVTT, TGIF, and TVQA. These benchmarks collectively offer a comprehensive platform for assessing short-video comprehension capabilities, with four focusing on open-ended questions and TVQA featuring multiple-choice questions.
Our results, presented in Tables 4 and 5, demonstrate our framework's superiority over competing methods by a significant margin, affirming our considerable advancements across a varied and demanding collection of benchmarks. To thoroughly evaluate our approach, we devised two variations of our framework: one analyzing purely visual elements and another incorporating subtitles. The performance enhancements achieved with these models are noteworthy, registering gains of 3.23%, 2.03%, 16.5% and 23.59% on the MSVD, MSRVTT, TGIF, and TVQA benchmarks respectively. This underscores our framework's ability to achieve SOTA results across the board, markedly elevating performance in the domain of short-video understanding.</b></p>
<img id="model" width="100%" src="repo_imgs\short_video_results.JPG">
</div>
</div>
</div>
</div>
</section>
<script src="js/Underscore-min.js"></script>
<script src="js/index.js"></script>
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-six-fifths">
<h2 class="title is-3">Examples</h2>
<div>
<img src="repo_imgs\demo_1.JPG" alt="">
<img src="repo_imgs\demo_2.JPG" alt="">
<img src="images/2.png" alt="">
<img src="images/3.png" alt="">
<img src="images/4.png" alt="">
<img src="images/5.png" alt="">
<img src="images/6.png" alt="">
<img src="images/7.png" alt="">
<img src="images/8.png" alt="">
</div>
</div>
</div>
</div>
</section>
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>
@misc{ataallah2024goldfishvisionlanguageunderstandingarbitrarily,
title={Goldfish: Vision-Language Understanding of Arbitrarily Long Videos},
author={Kirolos Ataallah and Xiaoqian Shen and Eslam Abdelrahman and Essam Sleiman and Mingchen Zhuge and Jian Ding and Deyao Zhu and Jürgen Schmidhuber and Mohamed Elhoseiny},
year={2024},
eprint={2407.12679},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2407.12679},
}
@misc{ataallah2024minigpt4videoadvancingmultimodalllms,
title={MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with Interleaved Visual-Textual Tokens},
author={Kirolos Ataallah and Xiaoqian Shen and Eslam Abdelrahman and Essam Sleiman and Deyao Zhu and Jian Ding and Mohamed Elhoseiny},
year={2024},
eprint={2404.03413},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2404.03413},
}
</code></pre>
</div>
</section>
<section class="section" id="Acknowledgement">
<div class="container is-max-desktop content">
<h2 class="title">Acknowledgement</h2>
<p>
This website is adapted from <a
href="https://github.com/nerfies/nerfies.github.io">Nerfies</a>, licensed under a <a rel="license"
href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
Commons Attribution-ShareAlike 4.0 International License</a>.
</p>
</div>
</section>
</body>
</html>