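# Hugging Face page header (not part of the YAML document):
# Tags: Robotics, English, MobileManipulation
# File: MolmoBot-RBY1Multitask / config.yaml
# Uploaded by: snehal-allenai
# Commit message: Upload MolmoBot RBY1 Multitask weights (step74000 unsharded)
# Commit: 0786434 (verified)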
# Training-run configuration dump.
# NOTE(review): every key in this copy sits at column 0, so parent/child nesting
# cannot be read from indentation — confirm the structure against the original file.
# run_name is the same string that appears as the wandb `name` and as the final
# path component of `save_folder` further down in this file.
run_name: molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5
# Model section header. The launch command recorded in this file addresses
# settings as `model.mm_preprocessor.*`, `model.same_noise_per_time` and
# `model.num_flow_timestamps`, so those keys are children of this section.
model:
model_name: molmoact
# Prompt / message formatting options.
# NOTE(review): keys are flattened to column 0 in this copy; exactly which of
# them belong under data_formatter cannot be confirmed from indentation alone.
data_formatter:
prompt_templates: uber_model_v2
# qwen3 message format; the tokenizer identifier configured in this file is
# also a Qwen3 one (Qwen/Qwen3-4B-Instruct-2507).
message_format: qwen3
system_prompt: demo_or_style_v2
always_start_with_space: false
default_inference_len: 65
select_answer: best
debug: false
image_last: false
format_message_list: null
p_one_message: 0.0
eval_system_prompt_mapping: null
p_choice_content_in_mc: 1.0
template_video_mc_questions: true
pointing_format: html-v2
points_decimal_places: 1
# NOTE(review): "seperate" is the spelling used by this key (and by
# use_seperate_count_without_pointing_style below); presumably the consumer
# expects exactly this spelling — do not correct it here without checking.
use_seperate_non_pointing_qa_style: false
timestamp_mode: 50-percent-seconds
output_timestamp_mode: seconds
seconds_decimal_places: 1
p_multi_point_all_image: 0.5
use_seperate_count_without_pointing_style: false
sample_random_initial_point: true
# Language-model backbone hyperparameters.
# The init_path and tokenizer identifier below both name a Qwen3-4B-Instruct model.
llm:
d_model: 2560
# n_heads (32) x head_dim (128) = 4096, which is not equal to d_model (2560).
# n_kv_heads (8) is smaller than n_heads (32): 4 query heads per KV head.
n_heads: 32
n_kv_heads: 8
head_dim: 128
qkv_bias: false
clip_qkv: null
n_layers: 36
# mlp_ratio x d_model = 4 x 2560 = 10240, which differs from mlp_hidden_size
# (19456); presumably mlp_hidden_size takes precedence — TODO confirm.
mlp_ratio: 4
mlp_hidden_size: 19456
activation_type: swiglu
block_type: sequential
# Rotary position settings: rope_type is `default` and every scaling-related
# rope_* option below is null.
rope: true
rope_full_precision: true
rope_theta: 5000000.0
rope_type: default
rope_factor: null
rope_high_freq_factor: null
rope_low_freq_factor: null
rope_original_max_position_embeddings: null
rope_attention_factor: null
rope_beta_fast: null
rope_beta_slow: null
rope_mscale: null
rope_mscale_all_dim: null
rope_truncate: null
attention_type: sdpa
full_attention_layers: null
sliding_attention_rope_scaling: false
float32_attention: true
attention_dropout: 0.0
attention_layer_norm: true
attention_layer_norm_type: qwen3
# Non-zero residual dropout (0.1); response_residual_dropout is 0.0.
residual_dropout: 0.1
response_residual_dropout: 0.0
layer_norm_type: rms
layer_norm_with_affine: true
layer_norm_eps: 1.0e-06
attention_layer_norm_with_affine: true
max_sequence_length: 8192
max_position_embeddings: null
include_bias: false
bias_for_layer_norm: null
norm_after: false
# moe_* keys are present although block_type above is `sequential`;
# presumably these are unused defaults for this model — TODO confirm.
moe_num_experts: 8
moe_top_k: 2
moe_mlp_impl: sparse
moe_log_expert_assignment: false
moe_shared_expert: false
moe_lbl_in_fp32: false
moe_interleave: false
moe_loss_weight: 0.1
moe_zloss_weight: null
moe_dropless: true
moe_capacity_factor: 1.25
embedding_dropout: 0.0
scale_logits: false
# vocab_size and embedding_size are both 151936; additional_vocab_size adds 128.
vocab_size: 151936
additional_vocab_size: 128
weight_tying: true
embedding_size: 151936
use_position_ids: true
tokenizer:
identifier: Qwen/Qwen3-4B-Instruct-2507
tokenizer_dir: null
# Cluster-local path to pretrained LLM weights.
init_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen3-4b-instruct.pt
init_incremental: null
new_embedding_init_range: 0.02
initializer_range: 0.02
normalize_input_embeds: false
activation_checkpoint: whole_layer
compile: blocks
fix_pad_tokenizer: false
init_std: 0.02
init_fn: normal
init_cutoff_factor: null
# Vision encoder (SigLIP-type ViT) and image-feature connector settings.
# NOTE(review): where the `vit` sub-section ends cannot be read from this
# flattened copy — confirm against the original file.
vision_backbone:
vit:
image_model_type: siglip
# 378 x 378 input with patch size 14 gives 27 x 27 = 729 patches,
# which equals image_num_pos below.
image_default_input_size:
- 378
- 378
image_patch_size: 14
image_pos_patch_size: 14
# image_num_heads (16) x image_head_dim (72) = 1152 = image_emb_dim.
image_emb_dim: 1152
image_num_heads: 16
image_num_key_value_heads: 16
image_num_layers: 27
image_head_dim: 72
image_mlp_dim: 4304
image_mlp_activations: gelu_pytorch_tanh
image_dropout_rate: 0.0
image_num_pos: 729
image_norm_eps: 1.0e-06
attention_dropout: 0.0
residual_dropout: 0.0
initializer_range: 0.02
float32_attention: true
attention_type: sdpa
sdpa_backend: all
activation_checkpointing: true
# Cluster-local path to pretrained image-encoder weights (file name mentions
# siglip2-so400m-14-384, while the configured input size above is 378).
init_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
resize_mode: siglip
pad_value: 0.0
normalize: siglip
image_pooling_2d: attention_meanq
pooling_attention_mask: true
image_projector: mlp
image_padding_embed: null
# Negative layer indices; presumably counted from the last ViT layer — TODO confirm.
vit_layers:
- -3
- -9
skip_unused_layers: true
use_deepstack: false
share_connector: false
image_feature_dropout: 0.0
connector_activation_checkpointing: true
compile_vit: blocks
pool_size_embeds: null
compile_connector: null
normalize_on_gpu: true
use_image_augmentation: true
use_resize_bottleneck: false
# Multimodal preprocessing options (video-frame sampling and image cropping).
# The launch command recorded in this file overrides
# model.mm_preprocessor.use_frame_special_tokens, .max_subtitle_tokens,
# .max_frames, .image.max_images and .image.crop_mode, which confirms that this
# section lives under `model` and that `image` is a sub-section of it.
mm_preprocessor:
max_answer_len: null
last_message_loss_only: false
max_text_tokens: null
loss_token_weighting: root_subsegments_root_tokens
# Set from the command line (--model.mm_preprocessor.max_frames=1).
max_frames: 1
frame_sample_mode: uniform_last_frame
candidate_sampling_fps:
- 0.25
- 0.5
- 1.0
- 2.0
- 4.0
- 6.0
- 8.0
- 16.0
cache_videos: true
loading_method: torchcodec_exact
max_fps:
- 2.0
time_sampling: true
time_mode: per-frame-compact
subtitle_mode: frame_1
max_crops: 1
overlap_margins:
- 4.0
- 4.0
use_col_tokens: false
periodic_high_res_frame: null
high_low_train_mode: local_rnd
high_res_frame_sample_options: null
# Integer keys 4 and 3 map to lists of 4 and 3 weights respectively; each list
# sums to 1.0. Presumably sampling probabilities — TODO confirm.
periodic_sample_rate_training:
4:
- 0.9
- 0.03
- 0.03
- 0.04
3:
- 0.6
- 0.2
- 0.2
skip_low_res_in_high_low: false
pooling_w: 3
pooling_h: 3
high_res_pooling_w: null
high_res_pooling_h: null
query_based_resolution_selection: false
max_queries_for_resolution_selection: 8
# Set from the command line (--model.mm_preprocessor.use_frame_special_tokens=True).
use_frame_special_tokens: true
frame_sel_clip_identifier: google/siglip2-so400m-patch14-384
image_padding_mask: false
# Set from the command line (--model.mm_preprocessor.max_subtitle_tokens=null).
max_subtitle_tokens: null
# Image-specific preprocessing. crop_mode and max_images are set from the
# command line (--model.mm_preprocessor.image.crop_mode=resize, .max_images=5).
image:
crop_mode: resize
use_col_tokens: true
max_crops: 8
high_res_max_crops: 24
p_high_res: 0.0
pooling_w: 2
pooling_h: 2
overlap_margins:
- 4
- 4
max_images: 5
max_multi_image_crops: 8
multi_image_pooling_w: 2
multi_image_pooling_h: 2
use_single_crop_col_tokens: false
use_single_crop_start_token: true
# Remaining model-level settings: attention/context-parallel flags, action
# chunk sizes, the action-expert network and flow-matching options.
# NOTE(review): the parent of the first few keys (topk ... apply_cp_to_vision_backbone)
# cannot be determined from this flattened copy.
topk: null
prune_from_frame: 0
bi_directional_attn: image_tokens
shared_low_high_embedding: true
debug: null
cp_enabled: false
apply_cp_to_vision_backbone: false
# action_dim (20) matches the 20-entry q01/q99 action statistics listed in the
# robot_preprocessor / robot_postprocessor sections of this file.
action_dim: 20
# action_horizon 16 with n_action_steps 8; presumably 16 steps are predicted
# and 8 are executed per inference — TODO confirm.
action_horizon: 16
n_action_steps: 8
n_obs_steps: 1
action_expert:
# max_horizon (32) is larger than action_horizon (16) above.
max_horizon: 32
action_dim: 20
hidden_size: 768
# Same layer count as the LLM (n_layers: 36), with layer mode `per_layer` below.
num_layers: 36
num_heads: 8
mlp_ratio: 4.0
timestep_embed_dim: 256
dropout: 0.0
attn_dropout: 0.0
context_layer_norm: true
action_expert_layer_mode: per_layer
flow_matching_num_steps: 10
flow_matching_cutoff: 0.999
flow_matching_beta_alpha: 1.0
flow_matching_beta_beta: 1.5
# num_flow_timestamps and same_noise_per_time are set from the command line
# (--model.num_flow_timestamps=8, --model.same_noise_per_time=False).
num_flow_timestamps: 8
same_noise_per_time: false
# Normalization statistics and key names for robot state/action inputs.
# The launch command in this file passes
# --stats_path=/weka/prior/datasets/robomolmo/rby1_multitask_norm_stats.yaml;
# presumably these numbers were loaded from that file — TODO confirm.
robot_preprocessor:
stats_by_repo:
synthmanip:
# State statistics: min and max each list 22 values.
observation.state:
min:
- -4.904874324798584
- -4.564780235290527
- -3.5160739421844482
- -2.356419563293457
- -0.47234979271888733
- -2.0865397453308105
- -3.343071222305298
- -5.8824052810668945
- -1.7488995790481567
- -2.967109203338623
- -0.11299018561840057
- -2.3546268939971924
- -3.1416664123535156
- -2.0946199893951416
- -3.2890703678131104
- -6.282893657684326
- -1.7483078241348267
- -2.967064142227173
- -0.12049419432878494
- -1.778153419494629
- -1.7587945461273193
- -1.5871200561523438
max:
- 17.08185577392578
- 33.73189163208008
- 3.2411913871765137
- 2.356658697128296
- 3.1416971683502197
- 2.1008245944976807
- 0.07229717075824738
- 6.270575523376465
- 2.0102994441986084
- 2.9668161869049072
- 0.021467044949531555
- 2.3977394104003906
- 0.34489157795906067
- 2.0900635719299316
- 0.07242166996002197
- 6.27663516998291
- 2.0076160430908203
- 2.9636759757995605
- 0.04509617015719414
- 0.919683575630188
- 1.6717331409454346
- 1.1039749383926392
# Action statistics: q01 and q99 each list 20 values (action_dim is 20).
# The 11th and 19th entries are -100.0 / 100.0 in q01 / q99; presumably
# placeholder bounds for dimensions that are not quantile-scaled — TODO confirm.
action:
q01:
- -0.04400388523936272
- -0.044572047889232635
- -0.05000000074505806
- -0.05000000074505806
- -0.037506889551877975
- -0.03562070056796074
- -0.05000000074505806
- -0.05000000074505806
- -0.04800133779644966
- -0.05000000074505806
- -100.0
- -0.05000000074505806
- -0.05000000074505806
- -0.04927435144782066
- -0.05000000074505806
- -0.05000000074505806
- -0.0456085205078125
- -0.05000000074505806
- -100.0
- -0.025820335373282433
q99:
- 0.04579437896609306
- 0.04565873369574547
- 0.05000000074505806
- 0.05000000074505806
- 0.05000000074505806
- 0.03847877308726311
- 0.05000000074505806
- 0.05000000074505806
- 0.05000000074505806
- 0.05000000074505806
- 100.0
- 0.05000000074505806
- 0.03608553484082222
- 0.04896605759859085
- 0.05000000074505806
- 0.05000000074505806
- 0.05000000074505806
- 0.05000000074505806
- 100.0
- 0.7379999756813049
default_repo_id: synthmanip
action_key: action
state_keys:
- observation.state
# Actions use the quantile statistics (q01/q99) and state uses min/max,
# matching the statistic names provided above.
action_norm_mode: quantiles
state_norm_mode: min_max
# Statistics and key names for the robot output side.
# The values below repeat the ones listed under robot_preprocessor in this
# file; keep the two sections in sync if either is edited by hand.
robot_postprocessor:
stats_by_repo:
synthmanip:
# State statistics: min and max each list 22 values.
observation.state:
min:
- -4.904874324798584
- -4.564780235290527
- -3.5160739421844482
- -2.356419563293457
- -0.47234979271888733
- -2.0865397453308105
- -3.343071222305298
- -5.8824052810668945
- -1.7488995790481567
- -2.967109203338623
- -0.11299018561840057
- -2.3546268939971924
- -3.1416664123535156
- -2.0946199893951416
- -3.2890703678131104
- -6.282893657684326
- -1.7483078241348267
- -2.967064142227173
- -0.12049419432878494
- -1.778153419494629
- -1.7587945461273193
- -1.5871200561523438
max:
- 17.08185577392578
- 33.73189163208008
- 3.2411913871765137
- 2.356658697128296
- 3.1416971683502197
- 2.1008245944976807
- 0.07229717075824738
- 6.270575523376465
- 2.0102994441986084
- 2.9668161869049072
- 0.021467044949531555
- 2.3977394104003906
- 0.34489157795906067
- 2.0900635719299316
- 0.07242166996002197
- 6.27663516998291
- 2.0076160430908203
- 2.9636759757995605
- 0.04509617015719414
- 0.919683575630188
- 1.6717331409454346
- 1.1039749383926392
# Action statistics: q01 and q99 each list 20 values; the 11th and 19th
# entries are -100.0 / 100.0 (presumably placeholder bounds — TODO confirm).
action:
q01:
- -0.04400388523936272
- -0.044572047889232635
- -0.05000000074505806
- -0.05000000074505806
- -0.037506889551877975
- -0.03562070056796074
- -0.05000000074505806
- -0.05000000074505806
- -0.04800133779644966
- -0.05000000074505806
- -100.0
- -0.05000000074505806
- -0.05000000074505806
- -0.04927435144782066
- -0.05000000074505806
- -0.05000000074505806
- -0.0456085205078125
- -0.05000000074505806
- -100.0
- -0.025820335373282433
q99:
- 0.04579437896609306
- 0.04565873369574547
- 0.05000000074505806
- 0.05000000074505806
- 0.05000000074505806
- 0.03847877308726311
- 0.05000000074505806
- 0.05000000074505806
- 0.05000000074505806
- 0.05000000074505806
- 100.0
- 0.05000000074505806
- 0.03608553484082222
- 0.04896605759859085
- 0.05000000074505806
- 0.05000000074505806
- 0.05000000074505806
- 0.05000000074505806
- 100.0
- 0.7379999756813049
default_repo_id: synthmanip
action_key: action
state_keys:
- observation.state
action_norm_mode: quantiles
state_norm_mode: min_max
# Distributed-parallelism layout. Context-parallel and tensor-parallel degrees
# are both 1; the data-parallel config is named `fsdp`.
# NOTE(review): whether this section sits under `model` or at the top level
# cannot be determined from this flattened copy.
parallelism:
data_parallel_replicate_degree: 1
enable_compiled_autograd: false
# -1 is presumably a "derive from world size" sentinel — TODO confirm.
data_parallel_shard_degree: -1
fsdp_reshard_after_forward: default
context_parallel_config:
degree: 1
attention_type: ulysses
load_balancer: ulysses
head_stride: 1
tensor_parallel_config:
degree: 1
enable_async: false
data_parallel_config:
name: fsdp
param_dtype: null
reduce_dtype: float32
num_replicas: null
shard_degree: null
wrapping_strategy: full
prefetch_factor: 0
context_parallel_rotate_method: allgather
# Run seed and switches selecting which components are fine-tuned.
# (A separate `seed` key with a different value appears in the data section.)
seed: 6198
epoch: null
dry_run: false
# ft_llm is also passed explicitly on the recorded command line (--ft_llm=True);
# the ViT and connector switches are false.
ft_llm: true
ft_vit: false
ft_connector: false
ft_embedding: lm_head
# AdamW optimizer settings with per-component learning rate, weight decay,
# betas and eps.
optimizer:
name: adamw
learning_rate: 0.0001
weight_decay: 0.01
betas:
- 0.9
- 0.95
# The base eps (1.0e-05) differs from the per-component *_eps values (1.0e-06) below.
eps: 1.0e-05
# connector/vit learning rates are listed although ft_connector and ft_vit are
# false in this file; presumably unused for this run — TODO confirm.
connector_learning_rate: 5.0e-06
vit_learning_rate: 5.0e-06
# Set from the command line (--optimizer.llm_learning_rate=1e-5); the run name
# ends in `llmlr_1e-5`.
llm_learning_rate: 1.0e-05
frame_selector_learning_rate: 0.0001
temporal_token_scorer_learning_rate: 0.0001
action_expert_learning_rate: 0.0001
connector_weight_decay: 0.0
vit_weight_decay: 0.0
llm_weight_decay: 0.0
frame_selector_weight_decay: 0.01
temporal_token_scorer_weight_decay: 0.01
action_expert_weight_decay: 0.0
# All per-component betas are (0.9, 0.95), identical to the base betas above.
connector_betas:
- 0.9
- 0.95
vit_betas:
- 0.9
- 0.95
llm_betas:
- 0.9
- 0.95
frame_selector_betas:
- 0.9
- 0.95
temporal_token_scorer_betas:
- 0.9
- 0.95
action_expert_betas:
- 0.9
- 0.95
connector_eps: 1.0e-06
vit_eps: 1.0e-06
llm_eps: 1.0e-06
frame_selector_eps: 1.0e-06
temporal_token_scorer_eps: 1.0e-06
action_expert_eps: 1.0e-06
metrics_log_interval: -1
# Learning-rate schedule with per-component warmup lengths, measured in steps
# (units: steps).
scheduler:
name: multimodal
units: steps
t_warmup: 100
t_max: null
alpha_f: 0.1
connector_t_warmup: 200
vit_t_warmup: 200
# Set from the command line (--scheduler.llm_t_warmup=2000); ten times the
# 200-step warmup used for the other components.
llm_t_warmup: 2000
frame_selector_t_warmup: 200
temporal_token_scorer_t_warmup: 200
action_expert_t_warmup: 200
grad_clip_warmup_steps: null
grad_clip_warmup_factor: null
warmup_min_lr: 0.0
# Training-data loader configuration.
data:
dataset: null
# Ten mixture entries (synthmanip/task_0 .. task_9), each weighted 1.0. The
# recorded command line lists ten --data_paths entries and ten
# --dataset_sample_rates values of 1.0; presumably task_i corresponds to the
# i-th data path — TODO confirm.
mixture:
synthmanip/task_0: 1.0
synthmanip/task_1: 1.0
synthmanip/task_2: 1.0
synthmanip/task_3: 1.0
synthmanip/task_4: 1.0
synthmanip/task_5: 1.0
synthmanip/task_6: 1.0
synthmanip/task_7: 1.0
synthmanip/task_8: 1.0
synthmanip/task_9: 1.0
root_size_mixture: null
kwargs_mixture: null
split: train
# Data seed; differs from the run-level seed (6198) defined earlier in this file.
seed: 50189
pad: to_max
# Matches --seq_len=1024 on the recorded command line.
sequence_length: 1024
max_text_seq_len: null
shuffle: true
start_index: 0
# Set from the command line (--data.packing=null).
packing: null
enable_variable_sized_token_pooling: true
# Set from the command line (--data.num_workers=4, --prefetch_factor=4).
num_workers: 4
drop_last: true
pin_memory: true
prefetch_factor: 4
persistent_workers: false
timeout: 300
action_data: null
action_loader_rate: null
action_batch_interval: 1
restore_dataloader: true
fast_forward_batches: null
# Evaluation, checkpointing and batch-size settings.
# Both evaluator lists are empty and eval_interval is 0; the recorded command
# line includes --no_val.
evaluators: []
eval_interval: 0
inf_evaluators: []
inf_eval_interval: 1000
eval_on_last_step: true
eval_on_load: false
eval_on: []
# Cluster-local output directory; its last path component equals run_name.
save_folder: /weka/oe-training-default/snehalj/synthmanip_checkpoints/molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5
checkpointer_config:
save_thread_count: null
load_thread_count: null
pre_download: false
work_dir: null
throttle_uploads: false
canceled_check_interval: 50
# Checkpoint cadence as passed on the command line: --save_interval=2000,
# --save_num_checkpoints_to_keep=3, --checkpoint_retention_frequency=10000.
save_interval: 2000
save_at: null
save_final_optim: false
save_num_checkpoints_to_keep: 3
checkpoint_retention_frequency: 10000
save_final_unsharded_checkpoint: false
save_interval_ephemeral: null
save_overwrite: true
load_path: null
reset_optimizer_state: false
reset_trainer_state: false
# Same path as the first positional argument of the recorded launch command.
initial_model_checkpoint: /weka/oe-training-default/hqfang/molmo2_checkpoints/4b-cp/step2000-unsharded/
allow_resume: true
# Equals stop_at (100000) further down in this file.
max_duration: 100000
# world_size (128, from runtime_data) x device_train_microbatch_size (8) = 1024
# = global_train_batch_size, i.e. one microbatch per rank per step if every
# rank is a data-parallel rank — TODO confirm.
global_train_batch_size: 1024
device_train_microbatch_size: 8
max_grad_norm: 1.0
multi_component_grad_norm: true
batch_divisor: global_batch
max_grad_norm_ratio: null
precision: amp_bf16
# Weights & Biases logging settings. project/entity/name match the
# --wandb.project, --wandb.entity and --wandb.name command-line overrides and
# the wandb_url recorded under runtime_data.
wandb:
project: whirl-molmoflow-rby1
entity: prior-ai2
group: null
# Same string as run_name.
name: molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5
tags:
- watching
log_artifacts: false
rank_zero_only: true
# Matches --log_interval=20 on the recorded command line.
log_interval: 20
# An `allow_resume` key also appears earlier in this file among the
# checkpointing settings; in this flattened copy the two cannot be told apart
# by indentation.
allow_resume: true
finish_on_sigterm: true
beaker_log_interval: 50
# Throughput monitoring, torch compile, FSDP and auxiliary-loss settings.
speed_monitor:
window_size: 20
gpu_flops_available: null
console_log_interval: 20
enable_timing_logs: false
gen1_gc_interval: 1
# Compile options: inductor backend, default mode, no fullgraph, static shapes
# (dynamic: false).
compile:
mode: default
fullgraph: false
dynamic: false
backend: inductor
activation_checkpointing: true
fsdp:
fsdp2: true
# This `precision: pure` is distinct from the `precision: amp_bf16` key that
# appears earlier in this file; in this flattened copy only position tells
# them apart.
precision: pure
use_orig_params: true
wrapping_strategy: null
sharding_strategy: FULL_SHARD
hybrid_sharding_num_model_replicas: null
softmax_auxiliary_loss: false
softmax_auxiliary_loss_scale: 0.0001
response_logits_only: true
saliency_score_loss_wt: null
frame_score_loss_wt: null
frame_score_loss_type: mse
frame_score_loss_target: 0.7
time_limit: null
extra_steps_after_cancel: 0
python_profiling: false
torch_profiling: false
# Equals max_duration (100000).
stop_at: 100000
stop_after: null
fused_loss: false
compile_loss: true
# Metadata captured when the run was launched: the command line, host,
# timestamp, process count and experiment-tracker identifiers.
runtime_data:
# Full launch command as ONE string. Written as a folded block scalar (`>-`):
# the indented lines below are joined with single spaces and no trailing
# newline is added. This replaces a multi-line plain scalar whose continuation
# lines sat at the same column as the key, which is not a valid continuation.
# The dotted overrides (e.g. --model.mm_preprocessor.image.max_images=5) name
# the config keys they set elsewhere in this file.
args: >-
  launch_scripts/train_synthmanip.py /weka/oe-training-default/hqfang/molmo2_checkpoints/4b-cp/step2000-unsharded/
  --data_paths /weka/prior/datasets/robomolmo/feb12_franka_and_rby1/DoorOpeningDataGenConfig
  /weka/prior/datasets/robomolmo/feb15_franka_and_rby1/DoorOpeningDataGenConfig
  /weka/prior/datasets/robomolmo/feb12_franka_and_rby1/RBY1PickDataGenConfig /weka/prior/datasets/robomolmo/feb15_franka_and_rby1/RBY1PickDataGenConfig
  /weka/prior/datasets/robomolmo/feb21_franka_and_rby1/RBY1PickDataGenConfig /weka/prior/datasets/robomolmo/feb12_franka_and_rby1/RBY1PickAndPlaceDataGenConfig
  /weka/prior/datasets/robomolmo/feb15_franka_and_rby1/RBY1PickAndPlaceDataGenConfig
  /weka/prior/datasets/robomolmo/feb21_franka_and_rby1/RBY1PickAndPlaceDataGenConfig
  /weka/prior/datasets/robomolmo/feb23_open_datagen/RBY1OpenDataGenConfig /weka/prior/datasets/robomolmo/feb23_open_datagen_obja/RBY1OpenDataGenConfig
  --no_val --dataset_sample_rates 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 --stats_path=/weka/prior/datasets/robomolmo/rby1_multitask_norm_stats.yaml
  --action_preset RBY1_multitask --camera_preset RBY1_full_with_head_gopro --wandb.name=molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5
  --wandb.entity=prior-ai2 --wandb.project=whirl-molmoflow-rby1 --seq_len=1024 --max_duration=100000
  --device_batch_size=8 --global_batch_size=1024 --log_interval=20 --model.mm_preprocessor.use_frame_special_tokens=True
  --model.mm_preprocessor.max_subtitle_tokens=null --data.num_workers=4 --prefetch_factor=4
  --save_interval=2000 --save_num_checkpoints_to_keep=3 --checkpoint_retention_frequency=10000
  --save_folder=/weka/oe-training-default/snehalj/synthmanip_checkpoints/molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5
  --exp_name=molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5
  --data.packing=null --model.mm_preprocessor.image.max_images=5 --model.mm_preprocessor.image.crop_mode=resize
  --model.mm_preprocessor.max_frames=1 --model.same_noise_per_time=False --model.num_flow_timestamps=8
  --use_point_prompts_per_dataset 1 1 0 0 0 0 0 0 1 1 --randomize_prompts --point_prompt_camera=head_camera
  --max_points_in_conditioning_frame=1 --conditioning_frame=random_first_10 --cameras_to_warp
  head_camera --img_aug --ft_llm=True --scheduler.llm_t_warmup=2000 --optimizer.llm_learning_rate=1e-5
hostname: jupiter-cs-aus-147.reviz.ai2.in
date: 03/05/2026, 22:36
# 128 processes; 128 x device_train_microbatch_size (8) equals
# global_train_batch_size (1024) configured earlier in this file.
world_size: 128
# null: this launch did not resume from an earlier checkpoint.
resuming_from: null
beaker_experiment_id: 01KK0212A2CKWNFJEJHT7AZMW5
beaker_experiment_url: https://beaker.org/ex/01KK0212A2CKWNFJEJHT7AZMW5
# wandb_id is the final path component of wandb_url.
wandb_id: t57qc9vl
wandb_url: https://wandb.ai/prior-ai2/whirl-molmoflow-rby1/runs/t57qc9vl
# Settings for distributed online evaluation. The feature is switched off
# (distributed_eval_enabled: false).
# NOTE(review): the benchmark path and config class below name a Franka
# benchmark while this run's data and presets are RBY1; presumably inert
# defaults while the feature is disabled — confirm before enabling.
distributed_eval_enabled: false
distributed_eval_benchmark_path: /weka/oe/rohunt/robo-bench/FrankaPickandPlaceDroidBench_5ep_json_benchmark
distributed_eval_config_cls: launch_scripts.synthvla.configure_mujoco_thor:FrankaState8ClampConfig
distributed_eval_task_horizon: 300
distributed_eval_num_worker_jobs: 1
distributed_eval_wandb_project: mjthor-online-eval
distributed_eval_workspace: ai2/robo-molmo
distributed_eval_clusters:
- ai2/saturn
- ai2/neptune
- ai2/rhea
- ai2/ceres
distributed_eval_priority: high
distributed_eval_preemptible: true