Spaces:

LanguageBind
/

Video-LLaVA

Runtime error

App Files Files Community

LinB203 committed on Nov 18, 2023

Commit

0e023c7

1 Parent(s): 514c1e1

update

Browse files

Files changed (4) hide show

LICENSE +201 -0
TRAIN_AND_VALIDATE.md +279 -0
app.py +257 -0
pyproject.toml +36 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

TRAIN_AND_VALIDATE.md ADDED Viewed

	@@ -0,0 +1,279 @@

+## Data preparation
+### data for training
+- The images pretraining dataset is from [LLaVA](https://github.com/haotian-liu/LLaVA).
+- The images tuning dataset is from [LLaVA](https://github.com/haotian-liu/LLaVA).
+- The videos pretraining dataset is from [Valley](https://github.com/RupertLuo/Valley).
+- The videos tuning dataset is from [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT).
+- Download the training annotations from [Baidu Disk](https://pan.baidu.com/s/1BipI3_f--GRWqaWTGYp-Jg?pwd=wkl0), [Google Disk](https://drive.google.com/file/d/11-1NBXNeiNQE2wPbue1dFph_Na_EHRYG/view?usp=drive_link) or [Peking University Disk](https://disk.pku.edu.cn:443/link/84783AB54553DFA150C1C5E82C16EB29).
+We also provide the processed data as follows.
+<div align="center">
+<table border="1" width="100%">
+    <tr align="center">
+        <th>Datasets</th><th>Baidu Disk</th>
+    </tr>
+    <tr align="center">
+        <td>Image pretraining</td><td><a href="">Link</a></td>
+    </tr>
+    <tr align="center">
+        <td>Image tuning</td><td><a href="">Link</a></td>
+    </tr>
+    <tr align="center">
+        <td>Video pretraining</td><td><a href="">Link</a></td>
+    </tr>
+    <tr align="center">
+        <td>Video tuning</td><td><a href="">Link</a></td>
+    </tr>
+</table>
+</div>
+After downloading all of them, organize the data as follows in ```DATA_ROOT```.
+```Shell
+DATA_ROOT
+├── llava_image
+├── llava_image_tune
+├── valley
+└── videochatgpt_tune
+```
+### data for validating
+- For image, follow LLaVA's instructions. ***You MUST first download [eval.zip](https://drive.google.com/file/d/1atZSBBrAX54yYpxtVVW33zFvcnaHeFPy/view?usp=sharing)**. It contains custom annotations, scripts, and the prediction files with LLaVA v1.5. Extract to `eval`. This also provides a general structure for all datasets.*
+- For video, videos and annotations can be downloaded from Video-ChatGPT. We also provide the processed data as follows.
+<div align="center">
+<table border="1" width="100%">
+    <tr align="center">
+        <th>Datasets</th><th>Baidu Disk</th><th>Google Disk</th><th>Peking University Disk</th>
+    </tr>
+    <tr align="center">
+        <td>Activitynet_Zero_Shot_QA</td><td><a href="https://pan.baidu.com/s/1d_AVx9Mz_57nA3exhQZGyA?pwd=9amr">Link</a></td><td>-</td><td>-</td>
+    </tr>
+    <tr align="center">
+        <td>MSRVTT_Zero_Shot_QA</td><td><a href="https://pan.baidu.com/s/1QHUtwHXm4Vc-Wc12XFCFsA?pwd=1rj8">Link</a></td><td><a href="https://drive.google.com/file/d/1yXh9lz7flQ5Ui2IRSd6Qi6RqSEeUJwl3/view?usp=drive_link">Link</a></td><td>-</td>
+    </tr>
+    <tr align="center">
+        <td>MSVD_Zero_Shot_QA</td><td><a href="https://pan.baidu.com/s/1PJSHkjHG2BPl_ddUnBj9AA?pwd=jj34">Link</a></td><td><a href="https://drive.google.com/file/d/1_q4eiSdb7i8P3Hmh4lCfgY1uBGyzU_7X/view?usp=drive_link">Link</a></td><td><a href="https://disk.pku.edu.cn:443/link/8B0D01747D8AA65534820B7E60CBFEFC">Link</a></td>
+    </tr>
+    <tr align="center">
+        <td>TGIF_Zero_Shot_QA</td><td><a href="https://pan.baidu.com/s/11ubtWbTtubyBmN9UPvAyow?pwd=98yr">Link</a></td><td><a href="https://drive.google.com/file/d/1so6L9rg_gdC8Segur7rKML-ffd4Ix_I6/view?usp=drive_link">Link</a></td><td><a href="https://disk.pku.edu.cn:443/link/B9AB387EFE8817158F181FF3D7A97163">Link</a></td>
+    </tr>
+</table>
+</div>
+After downloading all of them, organize the data as follows in `eval`.
+```Shell
+eval
+├── GPT_Zero_Shot_QA
+│   ├── Activitynet_Zero_Shot_QA
+│   ├── MSRVTT_Zero_Shot_QA
+│   ├── MSVD_Zero_Shot_QA
+│   └── TGIF_Zero_Shot_QA
+├── gqa
+│   ├── answers
+│   ├── data
+│   └── llava_gqa_testdev_balanced.jsonl
+├── llava-bench-in-the-wild
+│   ├── answers
+│   ├── answers_gpt4.jsonl
+│   ├── bard_0718.jsonl
+│   ├── bing_chat_0629.jsonl
+│   ├── context.jsonl
+│   ├── images
+│   ├── questions.jsonl
+│   ├── README.md
+│   └── reviews
+├── mmbench
+│   ├── answers
+│   ├── answers_upload
+│   ├── mmbench_dev_20230712.tsv
+│   └── mmbench_dev_en_20231003.tsv
+├── MME
+│   ├── answers
+│   ├── convert_answer_to_mme.py
+│   └── llava_mme.jsonl
+├── mm-vet
+│   ├── answers
+│   ├── bard_set.json
+│   ├── convert_answers.py
+│   ├── images
+│   ├── llava-mm-vet.jsonl
+│   ├── mm-vet.json
+│   └── results
+├── pope
+│   ├── answers
+│   ├── coco
+│   ├── llava_pope_test.jsonl
+│   └── val2014
+├── scienceqa
+│   ├── answers
+│   ├── images
+│   ├── llava_test_CQM-A.json
+│   ├── pid_splits.json
+│   └── problems.json
+├── seed_bench
+│   ├── answers
+│   ├── answers_upload
+│   ├── extract_video_frames.py
+│   └── llava-seed-bench.jsonl
+├── textvqa
+│   ├── answers
+│   ├── llava_textvqa_val_v051_ocr.jsonl
+│   ├── TextVQA_0.5.1_val.json
+│   └── train_images
+├── vizwiz
+│   ├── answers
+│   ├── answers_upload
+│   ├── llava_test.jsonl
+│   ├── test
+│   ├── test.json
+│   ├── train.json
+│   └── val.json
+└── vqav2
+    ├── answers
+    ├── answers_upload
+    ├── llava_vqav2_mscoco_test2015.jsonl
+    ├── llava_vqav2_mscoco_test-dev2015.jsonl
+    └── test2015
+```
+## Training
+Specify your `DATA_ROOT` according to the data preparation.
+- Stage 1 pretraining script: [pretrain.sh](scripts/v1_5/pretrain.sh).
+- Stage 2 tuning script: [finetune.sh](scripts/v1_5/finetune.sh).
+## Validating
+Our image validation code comes from LLaVA and our video validation code comes from Video-ChatGPT; thanks to both projects for their contributions!
+You can refer to the official repository for validation, but we also provide [off-the-shelf](scripts/v1_5/eval) scripts.
+### MSRVTT-QA
+1. Inference to get the result.
+```Shell
+CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/run_qa_msrvtt.sh
+```
+2. GPT-Assistant evaluation.
+```Shell
+bash scripts/v1_5/eval/eval_qa_msrvtt.sh
+```
+### MSVD-QA
+1. Inference to get the result.
+```Shell
+CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/run_qa_msvd.sh
+```
+2. GPT-Assistant evaluation.
+```Shell
+bash scripts/v1_5/eval/eval_qa_msvd.sh
+```
+### TGIF-QA
+1. Inference to get the result.
+```Shell
+CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/run_qa_tgif.sh
+```
+2. GPT-Assistant evaluation.
+```Shell
+bash scripts/v1_5/eval/eval_qa_tgif.sh
+```
+### ActivityNet-QA
+1. Inference to get the result.
+```Shell
+CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/run_qa_activitynet.sh
+```
+2. GPT-Assistant evaluation.
+```Shell
+bash scripts/v1_5/eval/eval_qa_activitynet.sh
+```
+### VQAv2
+1. Download [`test2015`](http://images.cocodataset.org/zips/test2015.zip) and put it under `eval/vqav2`.
+2. Multi-GPU inference.
+```Shell
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/eval_image_vqav2.sh
+```
+3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/830/my-submission): `eval/vqav2/answers_upload`.
+### GQA
+1. Download the data following the official instructions [here](https://cs.stanford.edu/people/dorarad/gqa/download.html) and put under `eval/gqa/data`.
+2. Multi-GPU inference.
+```Shell
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/eval_image_gqa.sh
+```
+### VizWiz
+1. Download [`test.json`](https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip) and extract [`test.zip`](https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip) to `test`. Put them under `eval/vizwiz`.
+2. Single-GPU inference.
+```Shell
+CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/eval_image_vizwiz.sh
+```
+3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/1911/my-submission): `eval/vizwiz/answers_upload`.
+### ScienceQA
+1. Under `eval/scienceqa`, download `images`, `pid_splits.json`, `problems.json` from the `data/scienceqa` folder of the ScienceQA [repo](https://github.com/lupantech/ScienceQA).
+2. Single-GPU inference and evaluate.
+```Shell
+CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/eval_image_sqa.sh
+```
+### TextVQA
+1. Download [`TextVQA_0.5.1_val.json`](https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json) and [images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip) and extract to `eval/textvqa`.
+2. Single-GPU inference and evaluate.
+```Shell
+CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/eval_image_textvqa.sh
+```
+### POPE
+1. Download `coco` from [POPE](https://github.com/AoiDragon/POPE/tree/e3e39262c85a6a83f26cf5094022a782cb0df58d/output/coco) and put under `eval/pope`.
+2. Single-GPU inference and evaluate.
+```Shell
+CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/eval_image_pope.sh
+```
+### MMBench
+1. Download [`mmbench_dev_20230712.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv) and put under `eval/mmbench`.
+2. Single-GPU inference.
+```Shell
+CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/eval_image_mmbench.sh
+```
+3. Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `eval/mmbench/answers_upload/mmbench_dev_20230712`.
+### LLaVA-Bench-in-the-Wild
+1. Extract contents of [`llava-bench-in-the-wild`](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild) to `eval/llava-bench-in-the-wild`.
+2. Single-GPU inference and evaluate.
+```Shell
+CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/eval_image_llavabench.sh
+```
+### MM-Vet
+1. Extract [`mm-vet.zip`](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) to `eval/mm-vet`.
+2. Single-GPU inference.
+```Shell
+CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/eval_image_mmvet.sh
+```

app.py ADDED Viewed

	@@ -0,0 +1,257 @@

+import shutil
+import subprocess
+import torch
+import gradio as gr
+from fastapi import FastAPI
+import os
+from PIL import Image
+import tempfile
+from decord import VideoReader, cpu
+from transformers import TextStreamer
+from llava.constants import DEFAULT_X_TOKEN, X_TOKEN_INDEX
+from llava.conversation import conv_templates, SeparatorStyle, Conversation
+from llava.serve.gradio_utils import Chat, tos_markdown, learn_more_markdown, title_markdown, block_css
def save_image_to_local(image):
    """Save a copy of an input image as a JPEG inside the ``temp`` directory.

    Args:
        image: Path (or file object) of the image; it is opened with
            ``PIL.Image.open``.

    Returns:
        The relative path ``temp/<random>.jpg`` of the saved copy.
    """
    import uuid  # local import: only needed to build a unique file name

    # Do not rely on the directory having been created at start-up.
    os.makedirs('temp', exist_ok=True)
    # uuid4 replaces the private ``tempfile._get_candidate_names`` helper.
    filename = os.path.join('temp', uuid.uuid4().hex + '.jpg')
    # The context manager closes the source file once the copy is written.
    with Image.open(image) as img:
        # The target is always a JPEG, which cannot store alpha or palette
        # data; saving e.g. an RGBA PNG fails unless it is converted first.
        if img.mode not in ('RGB', 'L'):
            img = img.convert('RGB')
        img.save(filename)
    return filename
def save_video_to_local(video_path):
    """Copy a video file into the ``temp`` directory under a unique name.

    Args:
        video_path: Path of the video file to copy.

    Returns:
        The relative path ``temp/<random>.mp4`` of the copy.
    """
    import uuid  # local import: only needed to build a unique file name

    # Do not rely on the directory having been created at start-up.
    os.makedirs('temp', exist_ok=True)
    # uuid4 replaces the private ``tempfile._get_candidate_names`` helper.
    filename = os.path.join('temp', uuid.uuid4().hex + '.mp4')
    shutil.copyfile(video_path, filename)
    return filename
def generate(image1, video, textbox_in, first_run, state, state_, images_tensor):
    """Run one chat turn and return the new values for the wired UI outputs.

    Args:
        image1: File path of the input image, or a falsy value when unset.
        video: File path of the input video, or a falsy value when unset.
        textbox_in: Instruction text. When empty, the last message stored in
            ``state_`` is popped and reused as the instruction; this is what
            happens in the call chained after ``regenerate``.
        first_run: Ignored on input; it is recomputed from ``state``.
        state: Conversation rendered in the chatbot, or a non-Conversation
            value before the first turn.
        state_: Conversation handed to ``handler.generate``.
        images_tensor: Two parallel lists ``[tensors, kinds]`` where each
            kind is ``'image'`` or ``'video'``.

    Returns:
        An 8-tuple ``(state, state_, chatbot_messages, first_run,
        textbox_update, images_tensor, image_update, video_update)``.

    Raises:
        gr.Error: If no instruction was entered and there is no earlier
            message to reuse.
    """
    # Create the conversations before touching them: until the first turn the
    # state values are not Conversation objects, so reading
    # ``state_.messages`` below would fail.
    if not isinstance(state, Conversation):
        state = conv_templates[conv_mode].copy()
        state_ = conv_templates[conv_mode].copy()
        images_tensor = [[], []]

    append_user_turn = True
    if not textbox_in:
        if not state_.messages:
            # A bare string return would not match the eight outputs wired to
            # this handler, so report the problem through gr.Error instead.
            raise gr.Error("Please enter instruction")
        # Reuse (and remove) the previous message as the instruction.
        textbox_in = state_.messages[-1][1]
        state_.messages.pop(-1)
        append_user_turn = False

    image1 = image1 if image1 else "none"
    video = video if video else "none"
    has_image = os.path.exists(image1)
    has_video = os.path.exists(video)

    first_run = not state.messages
    text_en_in = textbox_in.replace("picture", "image")

    # When both inputs are present the video tensor is stored first, in the
    # same order as the tokens in the prompt built below.
    if has_video:
        tensor = handler.video_processor(video, return_tensors='pt')['pixel_values'][0]
        tensor = tensor.to(handler.model.device, dtype=dtype)
        images_tensor[0] = images_tensor[0] + [tensor]
        images_tensor[1] = images_tensor[1] + ['video']
    if has_image:
        tensor = handler.image_processor.preprocess(image1, return_tensors='pt')['pixel_values'][0]
        tensor = tensor.to(handler.model.device, dtype=dtype)
        images_tensor[0] = images_tensor[0] + [tensor]
        images_tensor[1] = images_tensor[1] + ['image']

    if has_image and has_video:
        text_en_in = DEFAULT_X_TOKEN['VIDEO'] + '\n' + text_en_in + '\n' + DEFAULT_X_TOKEN['IMAGE']
    elif has_video:
        text_en_in = DEFAULT_X_TOKEN['VIDEO'] + '\n' + text_en_in
    elif has_image:
        text_en_in = DEFAULT_X_TOKEN['IMAGE'] + '\n' + text_en_in

    text_en_out, state_ = handler.generate(images_tensor, text_en_in, first_run=first_run, state=state_)
    state_.messages[-1] = (state_.roles[1], text_en_out)

    # Only the text before the first '#' is shown in the chatbot.
    textbox_out = text_en_out.split('#')[0]

    # Local copies of the media are embedded as HTML in the user's message.
    show_images = ""
    if has_image:
        filename = save_image_to_local(image1)
        show_images += f'<img src="./file={filename}" style="display: inline-block;width: 250px;max-height: 400px;">'
    if has_video:
        filename = save_video_to_local(video)
        show_images += f'<video controls playsinline width="500" style="display: inline-block;"  src="./file={filename}"></video>'

    # When the instruction was reused, the user's turn is not appended again.
    if append_user_turn:
        state.append_message(state.roles[0], textbox_in + "\n" + show_images)
    state.append_message(state.roles[1], textbox_out)

    return (
        state,
        state_,
        state.to_gradio_chatbot(),
        False,
        gr.update(value=None, interactive=True),
        images_tensor,
        gr.update(value=image1 if has_image else None, interactive=True),
        gr.update(value=video if has_video else None, interactive=True),
    )
def regenerate(state, state_):
    """Drop the most recent message from both conversations.

    Args:
        state: Conversation rendered in the chatbot, or ``None`` when no turn
            has happened yet.
        state_: Conversation handed to the model, or ``None`` likewise.

    Returns:
        ``(state, state_, chatbot_messages, first_run)`` where ``first_run``
        is ``True`` when ``state`` has no messages left.
    """
    if state is None or state_ is None:
        # Nothing has been generated yet, so there is nothing to drop.
        return (state, state_, [], True)
    # Guard the pops so an empty conversation does not raise IndexError.
    if state.messages:
        state.messages.pop(-1)
    if state_.messages:
        state_.messages.pop(-1)
    return (state, state_, state.to_gradio_chatbot(), not state.messages)
def clear_history(state, state_):
    """Start over with fresh conversations and blank input widgets.

    Args:
        state: Current displayed conversation (discarded).
        state_: Current model conversation (discarded).

    Returns:
        An 8-tuple: three widget updates that clear their values, ``True``,
        the two fresh conversations, the chatbot messages of the fresh
        displayed conversation, and an empty ``[[], []]`` tensor store.
    """
    fresh_state = conv_templates[conv_mode].copy()
    fresh_state_ = conv_templates[conv_mode].copy()
    cleared = [gr.update(value=None, interactive=True) for _ in range(3)]
    return (
        *cleared,
        True,
        fresh_state,
        fresh_state_,
        fresh_state.to_gradio_chatbot(),
        [[], []],
    )
# Model and runtime configuration used by the handlers above.
conv_mode = "llava_v1"
model_path = 'LanguageBind/Video-LLaVA-7B'
device = 'cuda'
load_8bit = False
load_4bit = True
dtype = torch.float16
# Each quantisation flag goes to its own parameter. ``load_8bit`` used to be
# passed for both, so the ``load_4bit`` setting above had no effect.
handler = Chat(model_path, conv_mode=conv_mode, load_8bit=load_8bit, load_4bit=load_4bit, device=device)
# handler.model.to(dtype=dtype)
# Directory that save_image_to_local / save_video_to_local write into.
os.makedirs("temp", exist_ok=True)
app = FastAPI()
# The textbox is created up front so the example galleries can target it; it
# is placed into the layout later with ``textbox.render()``.
textbox = gr.Textbox(
    show_label=False, placeholder="Enter text and press ENTER", container=False
)
# Example media files are resolved relative to this file.
cur_dir = os.path.dirname(os.path.abspath(__file__))
with gr.Blocks(title='Video-LLaVA🚀', theme=gr.themes.Default(), css=block_css) as demo:
    gr.Markdown(title_markdown)
    # Values passed into and returned from the handlers.
    state = gr.State()
    state_ = gr.State()
    first_run = gr.State()
    images_tensor = gr.State()

    with gr.Row():
        # Left column: media inputs and image-only examples.
        with gr.Column(scale=3):
            image1 = gr.Image(label="Input Image", type="filepath")
            video = gr.Video(label="Input Video")
            gr.Examples(
                examples=[
                    [
                        f"{cur_dir}/examples/extreme_ironing.jpg",
                        "What is unusual about this image?",
                    ],
                    [
                        f"{cur_dir}/examples/waterview.jpg",
                        "What are the things I should be cautious about when I visit here?",
                    ],
                    [
                        f"{cur_dir}/examples/glove.jpg",
                        "What happens when the glove drops?",
                    ],
                    [
                        f"{cur_dir}/examples/desert.jpg",
                        "If there are factual errors in the questions, point it out; if not, proceed answering the question. What’s happening in the desert?",
                    ],
                ],
                inputs=[image1, textbox],
            )

        # Right column: chat window, text entry and action buttons.
        with gr.Column(scale=7):
            chatbot = gr.Chatbot(label="Video-LLaVA", bubble_full_width=True).style(height=850)
            with gr.Row():
                with gr.Column(scale=8):
                    textbox.render()
                with gr.Column(scale=1, min_width=50):
                    submit_btn = gr.Button(
                        value="Send", variant="primary", interactive=True
                    )
            with gr.Row(elem_id="buttons") as button_row:
                # NOTE: the vote and flag buttons are displayed, but no
                # handlers are attached to them in this file.
                upvote_btn = gr.Button(value="👍  Upvote", interactive=True)
                downvote_btn = gr.Button(value="👎  Downvote", interactive=True)
                flag_btn = gr.Button(value="⚠️  Flag", interactive=True)
                # stop_btn = gr.Button(value="⏹️  Stop Generation", interactive=False)
                regenerate_btn = gr.Button(value="🔄  Regenerate", interactive=True)
                clear_btn = gr.Button(value="🗑️  Clear history", interactive=True)

    with gr.Row():
        # Examples that combine an image with a video.
        gr.Examples(
            examples=[
                [
                    f"{cur_dir}/examples/sample_img_22.png",
                    f"{cur_dir}/examples/sample_demo_22.mp4",
                    "Are the instruments in the pictures used in the video?",
                ],
                [
                    f"{cur_dir}/examples/sample_img_13.png",
                    f"{cur_dir}/examples/sample_demo_13.mp4",
                    "Does the flag in the image appear in the video?",
                ],
                [
                    f"{cur_dir}/examples/sample_img_8.png",
                    f"{cur_dir}/examples/sample_demo_8.mp4",
                    "Are the image and the video depicting the same place?",
                ],
            ],
            inputs=[image1, video, textbox],
        )
        # Video-only examples.
        gr.Examples(
            examples=[
                [
                    f"{cur_dir}/examples/sample_demo_1.mp4",
                    "Why is this video funny?",
                ],
                [
                    f"{cur_dir}/examples/sample_demo_3.mp4",
                    "Can you identify any safety hazards in this video?"
                ],
                [
                    f"{cur_dir}/examples/sample_demo_9.mp4",
                    "Describe the video.",
                ],
                [
                    f"{cur_dir}/examples/sample_demo_22.mp4",
                    "Describe the activity in the video.",
                ],
            ],
            inputs=[video, textbox],
        )

    gr.Markdown(tos_markdown)
    gr.Markdown(learn_more_markdown)

    # Every route into ``generate`` shares the same inputs and outputs.
    generate_inputs = [image1, video, textbox, first_run, state, state_, images_tensor]
    generate_outputs = [state, state_, chatbot, first_run, textbox, images_tensor, image1, video]

    submit_btn.click(generate, generate_inputs, generate_outputs)
    # The placeholder tells users to press ENTER, so ENTER must trigger the
    # same handler as the Send button.
    textbox.submit(generate, generate_inputs, generate_outputs)
    # Regenerate first drops the last message, then re-runs ``generate``.
    regenerate_btn.click(regenerate, [state, state_], [state, state_, chatbot, first_run]).then(
        generate, generate_inputs, generate_outputs)
    clear_btn.click(clear_history, [state, state_],
                    [image1, video, textbox, first_run, state, state_, chatbot, images_tensor])

# app = gr.mount_gradio_app(app, demo, path="/")
demo.launch()
# uvicorn llava.serve.gradio_web_server:app

pyproject.toml ADDED Viewed

	@@ -0,0 +1,36 @@

+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "llava"
+version = "1.1.3"
+description = "Towards GPT-4 like large language and visual assistant."
+readme = "README.md"
+requires-python = ">=3.8"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: Apache Software License",
+]
+dependencies = [
+    "torch==2.0.1", "torchvision==0.15.2",
+    "transformers==4.31.0", "tokenizers>=0.12.1,<0.14", "sentencepiece==0.1.99", "shortuuid",
+    "accelerate==0.21.0", "peft==0.4.0", "bitsandbytes==0.41.0",
+    "pydantic<2,>=1", "markdown2[all]", "numpy", "scikit-learn==1.2.2",
+    "gradio==3.35.2", "gradio_client==0.2.9",
+    "requests", "httpx==0.24.0", "uvicorn", "fastapi",
+    "einops==0.6.1", "einops-exts==0.0.4", "timm==0.6.13",
+]
+[project.optional-dependencies]
+train = ["deepspeed==0.9.5", "ninja", "wandb"]
+[project.urls]
+"Homepage" = "https://llava-vl.github.io"
+"Bug Tracker" = "https://github.com/haotian-liu/LLaVA/issues"
+[tool.setuptools.packages.find]
+exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
+[tool.wheel]
+exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]