"""Example: run DW-KhotTaeVL-2B-QueryFrames on a single video MCQ. Requirements:: pip install torch transformers pillow decord huggingface_hub This script loads the QueryFrames wrapper, samples 32 candidate frames from the input video, picks the 8 most relevant to the question via CLIP-ViT-L/14, and answers via stock Qwen3-VL-2B-Instruct. """ from dw_queryframes import QueryFrames def main() -> None: fv = QueryFrames( base_model="Qwen/Qwen3-VL-2B-Instruct", clip_model="openai/clip-vit-large-patch14", device="auto", n_frames=8, n_candidates=32, ) # MCQ mode (no task_type) — default. result = fv.answer_mcq( video_path="example.mp4", question="What does the chef do after pouring the oil into the pot?", options=[ "Chops fresh green herbs", "Pours broth into the pot", "Stirs the oil in the pot", "Adds salt to the pot", ], ) print("[MCQ mode (no task_type)]") print(f" pred : {result['pred']}") print(f" raw output : {result['raw']!r}") print(f" frames used : {result['frames_used']}") print(f" CLIP latency : {result['latency_clip_s']} s") print(f" GEN latency : {result['latency_gen_s']} s") # Task-aware MCQ mode (when a task taxonomy is supplied, e.g. by # Video-MME or by an operational workflow). result2 = fv.answer_mcq( video_path="example.mp4", question="What is happening to the cabbage in the frying pan?", options=[ "It is being stirred", "It is being chopped", "It is being served", "It is being washed", ], task_type="Object Reasoning", # → uniform-fallback path ) print("\n[Task-aware MCQ mode]") print(f" pred : {result2['pred']}") print(f" frames used : {result2['frames_used']}") # 'uniform_fallback' if __name__ == "__main__": main()