Instructions to use GD-ML/Code2World with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use GD-ML/Code2World with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# Build an "image-text-to-text" pipeline backed by the GD-ML/Code2World checkpoint.
pipe = pipeline("image-text-to-text", model="GD-ML/Code2World")
# A single user turn whose content mixes one image (referenced by URL) with one
# text question about that image.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
# Run the pipeline on the chat messages. The return value is not stored or
# printed here; assign it to a variable if you need the generated output.
pipe(text=messages)

# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

# The processor and the model are both loaded from the same checkpoint name.
processor = AutoProcessor.from_pretrained("GD-ML/Code2World")
model = AutoModelForImageTextToText.from_pretrained("GD-ML/Code2World")
# A single user turn whose content mixes one image (referenced by URL) with one
# text question about that image.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
# Render the messages through the processor's chat template, asking for a
# tokenized dict of tensors ("pt" -- presumably PyTorch) with the generation
# prompt appended, then move the result to the device the model lives on.
inputs = processor.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

# Generate at most 40 new tokens.
outputs = model.generate(**inputs, max_new_tokens=40)
# Skip the first len(input_ids) positions of the first output sequence before
# decoding -- presumably so only the newly generated text is printed, not the
# echoed prompt.
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use GD-ML/Code2World with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "GD-ML/Code2World"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "GD-ML/Code2World",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

docker model run hf.co/GD-ML/Code2World

SGLang

How to use GD-ML/Code2World with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "GD-ML/Code2World" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "GD-ML/Code2World",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "GD-ML/Code2World" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "GD-ML/Code2World",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use GD-ML/Code2World with Docker Model Runner:
```
docker model run hf.co/GD-ML/Code2World
```

Code2World

File size: 3,450 Bytes

4da1734

import math
from PIL import Image, ImageDraw


def draw_arrow_refined(draw, start, end, color=(255, 0, 0, 255), width=15, arrow_len=50):
    """Draw a straight arrow from ``start`` to ``end`` with a filled triangular head.

    The head is a triangle whose tip sits exactly on ``end``; its two other
    corners lie ``arrow_len`` away from the tip, each rotated 30 degrees
    (``pi / 6``) to either side of the shaft direction. The shaft is a line
    from ``start`` that stops ``0.8 * arrow_len`` short of the tip, so it ends
    inside the head instead of running all the way to the point.

    Args:
        draw: Drawing object providing ``line(xy, fill=..., width=...)`` and
            ``polygon(xy, fill=...)`` (e.g. a PIL ``ImageDraw.Draw``).
        start: ``(x, y)`` of the arrow tail.
        end: ``(x, y)`` of the arrow tip.
        color: Fill used for both the shaft and the head.
        width: Shaft thickness, forwarded to ``draw.line``.
        arrow_len: Side length of the head triangle.

    Returns:
        None. Nothing is drawn when ``start`` and ``end`` are closer than
        ``1e-5``, because the direction of the arrow is then undefined.
    """
    x1, y1 = start
    x2, y2 = end
    dx = x2 - x1
    dy = y2 - y1

    length = math.hypot(dx, dy)
    if length < 1e-5:
        return

    angle = math.atan2(dy, dx)
    half_spread = math.pi / 6

    # Two rear corners of the head, one on each side of the shaft.
    head = [(x2, y2)]
    for side_angle in (angle - half_spread, angle + half_spread):
        head.append(
            (
                x2 - arrow_len * math.cos(side_angle),
                y2 - arrow_len * math.sin(side_angle),
            )
        )

    back_off = arrow_len * 0.8
    # Only draw the shaft when there is room for it. If the whole arrow is
    # shorter than the back-off distance, the shaft end would land *behind*
    # ``start`` and the line would be drawn pointing away from the tip.
    if back_off < length:
        shaft_end = (
            x2 - (back_off / length) * dx,
            y2 - (back_off / length) * dy,
        )
        draw.line([start, shaft_end], fill=color, width=width)

    draw.polygon(head, fill=color)


def build_visual_hint(image, action):
    """
    Overlay a visual hint for ``action`` on ``image`` and return the result.

    The hint is drawn on a fully transparent RGBA layer, which is then
    alpha-composited over an RGBA conversion of ``image``. The composite is
    returned in RGB mode.

    - click / long_press / input_text / open_app: semi-transparent red circle
      centred on (x, y); radius 30 for input_text, 40 otherwise
    - scroll / swipe: red arrow

    Any other ``action_type``, or a circle-type action missing ``x`` or ``y``,
    draws nothing, so the result is just ``image`` converted to RGB.

    Examples of supported action formats:
    1) click:
        {
            "action_type": "click",
            "x": 540,
            "y": 1470
        }

    2) scroll:
        {
            "action_type": "scroll",
            "direction": "down",
            "x1": 540,
            "y1": 1600,
            "x2": 540,
            "y2": 900
        }

    For scroll / swipe the arrow runs from (x1, y1) to (x2, y2) when all four
    are present. Otherwise a 300-pixel arrow centred on the image is chosen
    from ``direction``; a missing, non-string or unrecognised direction is
    treated as "down".
    """
    image = image.convert("RGBA")
    overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)

    width, height = image.size
    action_type = action.get("action_type", "")

    fill_color = (255, 0, 0, 100)
    outline_color = (255, 0, 0, 255)

    if action_type in ("scroll", "swipe"):
        coords = [action.get(key) for key in ("x1", "y1", "x2", "y2")]

        if all(value is not None for value in coords):
            x1, y1, x2, y2 = (int(value) for value in coords)
            start_point, end_point = (x1, y1), (x2, y2)
        else:
            # The direction only matters for this fallback, so read it here.
            # Guard against an explicit None / non-string value, which would
            # otherwise fail on ``.lower()``.
            raw_direction = action.get("direction")
            direction = raw_direction.lower() if isinstance(raw_direction, str) else "down"

            cx, cy = width // 2, height // 2
            half = 300 // 2

            # Each arrow runs opposite to the named direction: e.g. "down"
            # goes from cy + half to cy - half.
            centred_arrows = {
                "down": ((cx, cy + half), (cx, cy - half)),
                "up": ((cx, cy - half), (cx, cy + half)),
                "right": ((cx + half, cy), (cx - half, cy)),
                "left": ((cx - half, cy), (cx + half, cy)),
            }
            start_point, end_point = centred_arrows.get(direction, centred_arrows["down"])

        draw_arrow_refined(
            draw,
            start_point,
            end_point,
            color=outline_color,
            width=15,
            arrow_len=50,
        )

    elif action_type in ("click", "long_press", "input_text", "open_app"):
        x = action.get("x")
        y = action.get("y")

        # Without a full coordinate pair there is nowhere to put the circle.
        if x is not None and y is not None:
            x = int(x)
            y = int(y)
            radius = 30 if action_type == "input_text" else 40

            draw.ellipse(
                (x - radius, y - radius, x + radius, y + radius),
                fill=fill_color,
                outline=outline_color,
                width=5,
            )

    return Image.alpha_composite(image, overlay).convert("RGB")