"""Example: run DW-KhotTaeVL-2B-QueryFrames on a single video MCQ.
Requirements::
pip install torch transformers pillow decord huggingface_hub
This script loads the QueryFrames wrapper, samples 32 candidate frames
from the input video, picks the 8 most relevant to the question via
CLIP-ViT-L/14, and answers via stock Qwen3-VL-2B-Instruct.
"""
from dw_queryframes import QueryFrames
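

# The frame-selection step described in the docstring can be sketched roughly as
# follows. This is an illustrative sketch only, not the wrapper's internal code:
# `rank_frames_by_question` is a hypothetical helper (it is not exported by
# dw_queryframes and is not called by main()). It scores candidate frames by
# CLIP image-text similarity to the question and keeps the top-k in temporal order.
def rank_frames_by_question(frames, question, k=8):
    """Score PIL frames against the question with CLIP-ViT-L/14 and keep the top-k."""
    import torch
    from transformers import CLIPModel, CLIPProcessor

    model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
    inputs = processor(text=[question], images=frames, return_tensors="pt", padding=True)
    with torch.no_grad():
        # logits_per_image has shape (n_candidates, 1); squeeze to a 1-D score vector.
        scores = model(**inputs).logits_per_image.squeeze(-1)
    # Take the k highest-scoring frames, then re-sort indices to preserve temporal order.
    keep = scores.topk(min(k, len(frames))).indices.sort().values
    return [frames[i] for i in keep.tolist()]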


def main() -> None:
    fv = QueryFrames(
        base_model="Qwen/Qwen3-VL-2B-Instruct",
        clip_model="openai/clip-vit-large-patch14",
        device="auto",
        n_frames=8,       # frames passed to the VLM after ranking
        n_candidates=32,  # frames sampled from the video before ranking
    )

    # MCQ mode (no task_type), the default.
    result = fv.answer_mcq(
        video_path="example.mp4",
        question="What does the chef do after pouring the oil into the pot?",
        options=[
            "Chops fresh green herbs",
            "Pours broth into the pot",
            "Stirs the oil in the pot",
            "Adds salt to the pot",
        ],
    )
    print("[MCQ mode (no task_type)]")
    print(f"  pred         : {result['pred']}")
    print(f"  raw output   : {result['raw']!r}")
    print(f"  frames used  : {result['frames_used']}")
    print(f"  CLIP latency : {result['latency_clip_s']} s")
    print(f"  GEN latency  : {result['latency_gen_s']} s")

    # Task-aware MCQ mode: pass a task taxonomy label (e.g. from Video-MME or
    # from an operational workflow) so the wrapper can route the question.
    result2 = fv.answer_mcq(
        video_path="example.mp4",
        question="What is happening to the cabbage in the frying pan?",
        options=[
            "It is being stirred",
            "It is being chopped",
            "It is being served",
            "It is being washed",
        ],
        task_type="Object Reasoning",  # routed to the uniform-fallback path
    )
    print("\n[Task-aware MCQ mode]")
    print(f"  pred         : {result2['pred']}")
    print(f"  frames used  : {result2['frames_used']}")  # 'uniform_fallback'


if __name__ == "__main__":
    main()