Video-Text-to-Text
Transformers
English
video
video-question-answering
multimodal
vision-language
qwen3-vl
inference-time
frame-selection
clip
Instructions to use commandeaw/DW-KhotTaeVL-2B-QueryFrames with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use commandeaw/DW-KhotTaeVL-2B-QueryFrames with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "commandeaw/DW-KhotTaeVL-2B-QueryFrames", dtype="auto"
)
```
- Notebooks
- Google Colab
- Kaggle
"""Example: run DW-KhotTaeVL-2B-QueryFrames on a single video MCQ.

Requirements::

    pip install torch transformers pillow decord huggingface_hub

This script loads the QueryFrames wrapper, samples 32 candidate frames
from the input video, picks the 8 most relevant to the question via
CLIP-ViT-L/14, and answers via stock Qwen3-VL-2B-Instruct.
"""

# Project-local wrapper that pairs CLIP-based frame scoring with a
# Qwen3-VL backbone (ships with the model repository).
from dw_queryframes import QueryFrames
def main() -> None:
    """Run the QueryFrames demo: one plain MCQ and one task-aware MCQ."""
    # Build the wrapper: 32 candidate frames are sampled per video and the
    # 8 highest CLIP-scored ones are handed to the Qwen3-VL backbone.
    selector = QueryFrames(
        base_model="Qwen/Qwen3-VL-2B-Instruct",
        clip_model="openai/clip-vit-large-patch14",
        device="auto",
        n_frames=8,
        n_candidates=32,
    )

    # ---- Default MCQ mode: no task taxonomy supplied -------------------
    chef_options = [
        "Chops fresh green herbs",
        "Pours broth into the pot",
        "Stirs the oil in the pot",
        "Adds salt to the pot",
    ]
    answer = selector.answer_mcq(
        video_path="example.mp4",
        question="What does the chef do after pouring the oil into the pot?",
        options=chef_options,
    )
    print("[MCQ mode (no task_type)]")
    print(f" pred : {answer['pred']}")
    print(f" raw output : {answer['raw']!r}")
    print(f" frames used : {answer['frames_used']}")
    print(f" CLIP latency : {answer['latency_clip_s']} s")
    print(f" GEN latency : {answer['latency_gen_s']} s")

    # ---- Task-aware MCQ mode: a task taxonomy is available (e.g. from
    # Video-MME or an operational workflow). ------------------------------
    cabbage_options = [
        "It is being stirred",
        "It is being chopped",
        "It is being served",
        "It is being washed",
    ]
    task_answer = selector.answer_mcq(
        video_path="example.mp4",
        question="What is happening to the cabbage in the frying pan?",
        options=cabbage_options,
        task_type="Object Reasoning",  # → uniform-fallback path
    )
    print("\n[Task-aware MCQ mode]")
    print(f" pred : {task_answer['pred']}")
    print(f" frames used : {task_answer['frames_used']}")  # 'uniform_fallback'
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()