Instructions to use GD-ML/Code2World with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use GD-ML/Code2World with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# Sample image and question sent to the model as a single user turn.
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"
question = "What animal is on the candy?"

pipe = pipeline("image-text-to-text", model="GD-ML/Code2World")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": image_url},
            {"type": "text", "text": question},
        ],
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("GD-ML/Code2World")
model = AutoModelForImageTextToText.from_pretrained("GD-ML/Code2World")

# One user turn made of an image part followed by a text part.
image_part = {
    "type": "image",
    "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
}
text_part = {"type": "text", "text": "What animal is on the candy?"}
messages = [{"role": "user", "content": [image_part, text_part]}]

# Apply the chat template, tokenize, and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)

# Decode only what follows the first `input_ids`-length tokens of the output.
prompt_length = inputs["input_ids"].shape[-1]
print(processor.decode(outputs[0][prompt_length:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use GD-ML/Code2World with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "GD-ML/Code2World"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "GD-ML/Code2World",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

docker model run hf.co/GD-ML/Code2World

SGLang

How to use GD-ML/Code2World with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "GD-ML/Code2World" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "GD-ML/Code2World",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "GD-ML/Code2World" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "GD-ML/Code2World",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use GD-ML/Code2World with Docker Model Runner:
```
docker model run hf.co/GD-ML/Code2World
```

Code2World / visual_hint.py

yhzheng1031

Upload folder using huggingface_hub

4da1734 verified about 1 month ago

raw

history blame contribute delete

3.45 kB

	import math
	from PIL import Image, ImageDraw


	def draw_arrow_refined(draw, start, end, color=(255, 0, 0, 255), width=15, arrow_len=50):
	    """Draw a straight arrow from ``start`` to ``end`` using ``draw``.

	    The arrow is a line (the shaft) plus a filled triangle (the head) whose
	    tip sits exactly on ``end``.

	    Args:
	        draw: Object exposing ``line(points, fill=..., width=...)`` and
	            ``polygon(points, fill=...)``; the caller in this file passes a
	            PIL ``ImageDraw`` instance.
	        start: ``(x, y)`` of the arrow tail.
	        end: ``(x, y)`` of the arrow tip.
	        color: Fill used for both the shaft and the head.
	        width: Width passed to ``draw.line`` for the shaft.
	        arrow_len: Length of each side of the head, measured from the tip.

	    Returns:
	        None. Nothing is drawn when ``start`` and ``end`` (nearly) coincide,
	        because the arrow direction is undefined in that case.
	    """
	    x1, y1 = start
	    x2, y2 = end

	    length = math.hypot(x2 - x1, y2 - y1)
	    if length < 1e-5:
	        return

	    angle = math.atan2(y2 - y1, x2 - x1)

	    # The two rear corners of the head lie `arrow_len` behind the tip,
	    # rotated 30 degrees (pi / 6) to either side of the arrow direction.
	    p1_x = x2 - arrow_len * math.cos(angle - math.pi / 6)
	    p1_y = y2 - arrow_len * math.sin(angle - math.pi / 6)
	    p2_x = x2 - arrow_len * math.cos(angle + math.pi / 6)
	    p2_y = y2 - arrow_len * math.sin(angle + math.pi / 6)

	    # End the shaft a little before the tip (presumably so the thick line
	    # does not poke out past the head). Clamp the back-off to the segment
	    # length: without the clamp, an arrow shorter than 0.8 * arrow_len gets
	    # a shaft endpoint *behind* `start`, i.e. a line drawn the wrong way.
	    back_off = min(arrow_len * 0.8, length)
	    line_end_x = x2 - (back_off / length) * (x2 - x1)
	    line_end_y = y2 - (back_off / length) * (y2 - y1)

	    draw.line([start, (line_end_x, line_end_y)], fill=color, width=width)
	    draw.polygon([(x2, y2), (p1_x, p1_y), (p2_x, p2_y)], fill=color)


	def build_visual_hint(image, action):
	    """Overlay a visual hint for ``action`` on ``image`` and return the result.

	    What is drawn depends on ``action["action_type"]``:

	    - ``click`` / ``long_press`` / ``input_text`` / ``open_app``: a
	      semi-transparent red circle centred on ``(x, y)`` (radius 30 for
	      ``input_text``, 40 otherwise). Nothing is drawn if ``x`` or ``y`` is
	      missing.
	    - ``scroll`` / ``swipe``: a red arrow from ``(x1, y1)`` to ``(x2, y2)``.
	      If any of those four values is missing, a 300-pixel arrow centred on
	      the image is drawn instead, oriented by ``direction`` (default and
	      fallback: ``"down"``).
	    - anything else: no overlay is drawn.

	    Example actions::

	        {"action_type": "click", "x": 540, "y": 1470}

	        {"action_type": "scroll", "direction": "down",
	         "x1": 540, "y1": 1600, "x2": 540, "y2": 900}

	    Args:
	        image: PIL image to annotate; it is not modified in place.
	        action: Mapping describing the action (see the examples above).

	    Returns:
	        A new RGB image with the hint composited on top of ``image``.
	    """
	    image = image.convert("RGBA")
	    # Draw on a fully transparent layer so the hint can be alpha-blended.
	    overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
	    draw = ImageDraw.Draw(overlay)

	    width, height = image.size
	    action_type = action.get("action_type", "")

	    fill_color = (255, 0, 0, 100)
	    outline_color = (255, 0, 0, 255)

	    if action_type in ("scroll", "swipe"):
	        coords = [action.get(key) for key in ("x1", "y1", "x2", "y2")]

	        if all(value is not None for value in coords):
	            x1, y1, x2, y2 = (int(value) for value in coords)
	            start_point, end_point = (x1, y1), (x2, y2)
	        else:
	            # `direction` is only needed for this fallback. Tolerate an
	            # explicit None or a non-string value instead of raising on
	            # `.lower()`.
	            direction = str(action.get("direction") or "down").lower()

	            cx, cy = width // 2, height // 2
	            half = 300 // 2

	            # (start offset, end offset) from the image centre. Each arrow
	            # runs opposite to the named direction, e.g. "down" goes from
	            # y = cy + half to y = cy - half.
	            offsets = {
	                "down": ((0, half), (0, -half)),
	                "up": ((0, -half), (0, half)),
	                "right": ((half, 0), (-half, 0)),
	                "left": ((-half, 0), (half, 0)),
	            }
	            (sdx, sdy), (edx, edy) = offsets.get(direction, offsets["down"])
	            start_point = (cx + sdx, cy + sdy)
	            end_point = (cx + edx, cy + edy)

	        draw_arrow_refined(
	            draw,
	            start_point,
	            end_point,
	            color=outline_color,
	            width=15,
	            arrow_len=50,
	        )

	    elif action_type in ("click", "long_press", "input_text", "open_app"):
	        x = action.get("x")
	        y = action.get("y")

	        if x is not None and y is not None:
	            x = int(x)
	            y = int(y)
	            radius = 30 if action_type == "input_text" else 40

	            draw.ellipse(
	                (x - radius, y - radius, x + radius, y + radius),
	                fill=fill_color,
	                outline=outline_color,
	                width=5,
	            )

	    return Image.alpha_composite(image, overlay).convert("RGB")