Video-Text-to-Text
Transformers
English
video
video-question-answering
multimodal
vision-language
qwen3-vl
inference-time
frame-selection
clip
Instructions to use commandeaw/DW-KhotTaeVL-2B-QueryFrames with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use commandeaw/DW-KhotTaeVL-2B-QueryFrames with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "commandeaw/DW-KhotTaeVL-2B-QueryFrames", dtype="auto"
)
```
- Notebooks
- Google Colab
- Kaggle
"""Example: run DW-KhotTaeVL-2B-QueryFrames on a single video MCQ.

Requirements::

    pip install torch transformers pillow decord huggingface_hub

This script loads the QueryFrames wrapper, samples 32 candidate frames
from the input video, picks the 8 most relevant to the question via
CLIP-ViT-L/14, and answers via stock Qwen3-VL-2B-Instruct.
"""

# Project-local wrapper that pairs CLIP-based frame scoring with a
# Qwen3-VL backbone (ships with the model repository).
from dw_queryframes import QueryFrames
def main() -> None:
    """Run the QueryFrames demo: one plain MCQ and one task-aware MCQ."""
    # Build the wrapper: 32 candidate frames are sampled per video and the
    # 8 highest CLIP-scored ones are handed to the Qwen3-VL backbone.
    selector = QueryFrames(
        base_model="Qwen/Qwen3-VL-2B-Instruct",
        clip_model="openai/clip-vit-large-patch14",
        device="auto",
        n_frames=8,
        n_candidates=32,
    )

    # ---- Default MCQ mode: no task taxonomy supplied -------------------
    chef_options = [
        "Chops fresh green herbs",
        "Pours broth into the pot",
        "Stirs the oil in the pot",
        "Adds salt to the pot",
    ]
    answer = selector.answer_mcq(
        video_path="example.mp4",
        question="What does the chef do after pouring the oil into the pot?",
        options=chef_options,
    )
    print("[MCQ mode (no task_type)]")
    print(f" pred : {answer['pred']}")
    print(f" raw output : {answer['raw']!r}")
    print(f" frames used : {answer['frames_used']}")
    print(f" CLIP latency : {answer['latency_clip_s']} s")
    print(f" GEN latency : {answer['latency_gen_s']} s")

    # ---- Task-aware MCQ mode: a task taxonomy is available (e.g. from
    # Video-MME or an operational workflow). ------------------------------
    cabbage_options = [
        "It is being stirred",
        "It is being chopped",
        "It is being served",
        "It is being washed",
    ]
    task_answer = selector.answer_mcq(
        video_path="example.mp4",
        question="What is happening to the cabbage in the frying pan?",
        options=cabbage_options,
        task_type="Object Reasoning",  # → uniform-fallback path
    )
    print("\n[Task-aware MCQ mode]")
    print(f" pred : {task_answer['pred']}")
    print(f" frames used : {task_answer['frames_used']}")  # 'uniform_fallback'
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()