Instructions to use GD-ML/Code2World with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use GD-ML/Code2World with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="GD-ML/Code2World")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)
```
```python
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("GD-ML/Code2World")
model = AutoModelForImageTextToText.from_pretrained("GD-ML/Code2World")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use GD-ML/Code2World with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "GD-ML/Code2World"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "GD-ML/Code2World",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
```
Use Docker
docker model run hf.co/GD-ML/Code2World
- SGLang
How to use GD-ML/Code2World with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "GD-ML/Code2World" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "GD-ML/Code2World",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "GD-ML/Code2World" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "GD-ML/Code2World",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
```
- Docker Model Runner
How to use GD-ML/Code2World with Docker Model Runner:
docker model run hf.co/GD-ML/Code2World
| import math | |
| from PIL import Image, ImageDraw | |
def draw_arrow_refined(draw, start, end, color=(255, 0, 0, 255), width=15, arrow_len=50):
    """Draw a straight arrow from ``start`` to ``end`` on ``draw``.

    The arrow is a line (the shaft) plus a filled triangular head whose tip
    sits exactly on ``end``. The two rear corners of the head lie at the
    head length from the tip, 30 degrees either side of the shaft.

    Args:
        draw: Object exposing ``line(xy, fill=..., width=...)`` and
            ``polygon(xy, fill=...)``, e.g. a ``PIL.ImageDraw.ImageDraw``.
        start: ``(x, y)`` of the arrow tail.
        end: ``(x, y)`` of the arrow tip.
        color: Fill colour passed to both drawing calls.
        width: Shaft line width.
        arrow_len: Maximum side length of the head. It is capped at the
            distance between ``start`` and ``end`` so that a short arrow
            never extends behind ``start``.

    Returns:
        None. Nothing is drawn when ``start`` and ``end`` are closer than
        1e-5 apart, because the direction is undefined.
    """
    x1, y1 = start
    x2, y2 = end
    dx, dy = x2 - x1, y2 - y1
    length = math.hypot(dx, dy)
    if length < 1e-5:
        return

    # Cap the head at the arrow length. With a fixed-size head, an arrow
    # shorter than 0.8 * arrow_len got a shaft drawn backwards past `start`.
    head_len = min(arrow_len, length)

    angle = math.atan2(dy, dx)
    spread = math.pi / 6
    corner_a = (
        x2 - head_len * math.cos(angle - spread),
        y2 - head_len * math.sin(angle - spread),
    )
    corner_b = (
        x2 - head_len * math.cos(angle + spread),
        y2 - head_len * math.sin(angle + spread),
    )

    # Stop the shaft inside the head so the wide line's square end does not
    # poke out past the pointed tip.
    back_off = head_len * 0.8
    shaft_end = (
        x2 - (back_off / length) * dx,
        y2 - (back_off / length) * dy,
    )

    draw.line([start, shaft_end], fill=color, width=width)
    draw.polygon([(x2, y2), corner_a, corner_b], fill=color)
def build_visual_hint(image, action):
    """Overlay a visual hint for ``action`` on ``image``.

    - ``click`` / ``long_press`` / ``input_text`` / ``open_app``: a red
      circle centred on ``(x, y)``; nothing is drawn if either is missing.
    - ``scroll`` / ``swipe``: a red arrow from ``(x1, y1)`` to ``(x2, y2)``.
      If any of the four coordinates is missing, a 300-pixel arrow centred
      on the image is drawn according to ``direction`` instead.
    - Any other ``action_type``: no hint is drawn.

    Examples of supported ``action`` formats:

    1) click:
        {
            "action_type": "click",
            "x": 540,
            "y": 1470
        }

    2) scroll:
        {
            "action_type": "scroll",
            "direction": "down",
            "x1": 540,
            "y1": 1600,
            "x2": 540,
            "y2": 900
        }

    Args:
        image: Source image; it is converted to RGBA before compositing.
        action: Mapping describing the action, as in the examples above.

    Returns:
        A new RGB image with the hint composited on top of ``image``.
    """
    image = image.convert("RGBA")
    # Draw on a fully transparent layer so the translucent fill blends with
    # the screenshot when composited.
    overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)
    width, height = image.size
    action_type = action.get("action_type", "")
    fill_color = (255, 0, 0, 100)
    outline_color = (255, 0, 0, 255)

    if action_type in ("scroll", "swipe"):
        coords = [action.get(key) for key in ("x1", "y1", "x2", "y2")]
        if all(value is not None for value in coords):
            x1, y1, x2, y2 = (int(value) for value in coords)
            start_point = (x1, y1)
            end_point = (x2, y2)
        else:
            # `direction` may be absent, None, or not a string; all of these
            # fall back to "down" instead of raising on `.lower()`.
            direction = str(action.get("direction") or "down").lower()
            cx, cy = width // 2, height // 2
            half = 300 // 2
            # Offset of the arrow's start from the centre; the end is the
            # mirror image. A "down" scroll is drawn as an upward arrow.
            start_offsets = {
                "down": (0, half),
                "up": (0, -half),
                "right": (half, 0),
                "left": (-half, 0),
            }
            off_x, off_y = start_offsets.get(direction, start_offsets["down"])
            start_point = (cx + off_x, cy + off_y)
            end_point = (cx - off_x, cy - off_y)
        draw_arrow_refined(
            draw,
            start_point,
            end_point,
            color=outline_color,
            width=15,
            arrow_len=50,
        )
    elif action_type in ("click", "long_press", "input_text", "open_app"):
        x = action.get("x")
        y = action.get("y")
        if x is not None and y is not None:
            x = int(x)
            y = int(y)
            radius = 30 if action_type == "input_text" else 40
            draw.ellipse(
                (x - radius, y - radius, x + radius, y + radius),
                fill=fill_color,
                outline=outline_color,
                width=5,
            )

    return Image.alpha_composite(image, overlay).convert("RGB")