run_name: molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5 model: model_name: molmoact data_formatter: prompt_templates: uber_model_v2 message_format: qwen3 system_prompt: demo_or_style_v2 always_start_with_space: false default_inference_len: 65 select_answer: best debug: false image_last: false format_message_list: null p_one_message: 0.0 eval_system_prompt_mapping: null p_choice_content_in_mc: 1.0 template_video_mc_questions: true pointing_format: html-v2 points_decimal_places: 1 use_seperate_non_pointing_qa_style: false timestamp_mode: 50-percent-seconds output_timestamp_mode: seconds seconds_decimal_places: 1 p_multi_point_all_image: 0.5 use_seperate_count_without_pointing_style: false sample_random_initial_point: true llm: d_model: 2560 n_heads: 32 n_kv_heads: 8 head_dim: 128 qkv_bias: false clip_qkv: null n_layers: 36 mlp_ratio: 4 mlp_hidden_size: 19456 activation_type: swiglu block_type: sequential rope: true rope_full_precision: true rope_theta: 5000000.0 rope_type: default rope_factor: null rope_high_freq_factor: null rope_low_freq_factor: null rope_original_max_position_embeddings: null rope_attention_factor: null rope_beta_fast: null rope_beta_slow: null rope_mscale: null rope_mscale_all_dim: null rope_truncate: null attention_type: sdpa full_attention_layers: null sliding_attention_rope_scaling: false float32_attention: true attention_dropout: 0.0 attention_layer_norm: true attention_layer_norm_type: qwen3 residual_dropout: 0.1 response_residual_dropout: 0.0 layer_norm_type: rms layer_norm_with_affine: true layer_norm_eps: 1.0e-06 attention_layer_norm_with_affine: true max_sequence_length: 8192 max_position_embeddings: null include_bias: false bias_for_layer_norm: null norm_after: false moe_num_experts: 8 moe_top_k: 2 moe_mlp_impl: sparse moe_log_expert_assignment: false moe_shared_expert: false moe_lbl_in_fp32: false moe_interleave: false moe_loss_weight: 0.1 moe_zloss_weight: null moe_dropless: true moe_capacity_factor: 1.25 embedding_dropout: 0.0 scale_logits: false vocab_size: 151936 additional_vocab_size: 128 weight_tying: true embedding_size: 151936 use_position_ids: true tokenizer: identifier: Qwen/Qwen3-4B-Instruct-2507 tokenizer_dir: null init_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen3-4b-instruct.pt init_incremental: null new_embedding_init_range: 0.02 initializer_range: 0.02 normalize_input_embeds: false activation_checkpoint: whole_layer compile: blocks fix_pad_tokenizer: false init_std: 0.02 init_fn: normal init_cutoff_factor: null vision_backbone: vit: image_model_type: siglip image_default_input_size: - 378 - 378 image_patch_size: 14 image_pos_patch_size: 14 image_emb_dim: 1152 image_num_heads: 16 image_num_key_value_heads: 16 image_num_layers: 27 image_head_dim: 72 image_mlp_dim: 4304 image_mlp_activations: gelu_pytorch_tanh image_dropout_rate: 0.0 image_num_pos: 729 image_norm_eps: 1.0e-06 attention_dropout: 0.0 residual_dropout: 0.0 initializer_range: 0.02 float32_attention: true attention_type: sdpa sdpa_backend: all activation_checkpointing: true init_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt resize_mode: siglip pad_value: 0.0 normalize: siglip image_pooling_2d: attention_meanq pooling_attention_mask: true image_projector: mlp image_padding_embed: null vit_layers: - -3 - -9 skip_unused_layers: true use_deepstack: false share_connector: false image_feature_dropout: 0.0 connector_activation_checkpointing: true compile_vit: blocks pool_size_embeds: null compile_connector: null normalize_on_gpu: true use_image_augmentation: true use_resize_bottleneck: false mm_preprocessor: max_answer_len: null last_message_loss_only: false max_text_tokens: null loss_token_weighting: root_subsegments_root_tokens max_frames: 1 frame_sample_mode: uniform_last_frame candidate_sampling_fps: - 0.25 - 0.5 - 1.0 - 2.0 - 4.0 - 6.0 - 8.0 - 16.0 cache_videos: true loading_method: torchcodec_exact max_fps: - 2.0 time_sampling: true time_mode: per-frame-compact subtitle_mode: frame_1 max_crops: 1 overlap_margins: - 4.0 - 4.0 use_col_tokens: false periodic_high_res_frame: null high_low_train_mode: local_rnd high_res_frame_sample_options: null periodic_sample_rate_training: 4: - 0.9 - 0.03 - 0.03 - 0.04 3: - 0.6 - 0.2 - 0.2 skip_low_res_in_high_low: false pooling_w: 3 pooling_h: 3 high_res_pooling_w: null high_res_pooling_h: null query_based_resolution_selection: false max_queries_for_resolution_selection: 8 use_frame_special_tokens: true frame_sel_clip_identifier: google/siglip2-so400m-patch14-384 image_padding_mask: false max_subtitle_tokens: null image: crop_mode: resize use_col_tokens: true max_crops: 8 high_res_max_crops: 24 p_high_res: 0.0 pooling_w: 2 pooling_h: 2 overlap_margins: - 4 - 4 max_images: 5 max_multi_image_crops: 8 multi_image_pooling_w: 2 multi_image_pooling_h: 2 use_single_crop_col_tokens: false use_single_crop_start_token: true topk: null prune_from_frame: 0 bi_directional_attn: image_tokens shared_low_high_embedding: true debug: null cp_enabled: false apply_cp_to_vision_backbone: false action_dim: 20 action_horizon: 16 n_action_steps: 8 n_obs_steps: 1 action_expert: max_horizon: 32 action_dim: 20 hidden_size: 768 num_layers: 36 num_heads: 8 mlp_ratio: 4.0 timestep_embed_dim: 256 dropout: 0.0 attn_dropout: 0.0 context_layer_norm: true action_expert_layer_mode: per_layer flow_matching_num_steps: 10 flow_matching_cutoff: 0.999 flow_matching_beta_alpha: 1.0 flow_matching_beta_beta: 1.5 num_flow_timestamps: 8 same_noise_per_time: false robot_preprocessor: stats_by_repo: synthmanip: observation.state: min: - -4.904874324798584 - -4.564780235290527 - -3.5160739421844482 - -2.356419563293457 - -0.47234979271888733 - -2.0865397453308105 - -3.343071222305298 - -5.8824052810668945 - -1.7488995790481567 - -2.967109203338623 - -0.11299018561840057 - -2.3546268939971924 - -3.1416664123535156 - -2.0946199893951416 - -3.2890703678131104 - -6.282893657684326 - -1.7483078241348267 - -2.967064142227173 - -0.12049419432878494 - -1.778153419494629 - -1.7587945461273193 - -1.5871200561523438 max: - 17.08185577392578 - 33.73189163208008 - 3.2411913871765137 - 2.356658697128296 - 3.1416971683502197 - 2.1008245944976807 - 0.07229717075824738 - 6.270575523376465 - 2.0102994441986084 - 2.9668161869049072 - 0.021467044949531555 - 2.3977394104003906 - 0.34489157795906067 - 2.0900635719299316 - 0.07242166996002197 - 6.27663516998291 - 2.0076160430908203 - 2.9636759757995605 - 0.04509617015719414 - 0.919683575630188 - 1.6717331409454346 - 1.1039749383926392 action: q01: - -0.04400388523936272 - -0.044572047889232635 - -0.05000000074505806 - -0.05000000074505806 - -0.037506889551877975 - -0.03562070056796074 - -0.05000000074505806 - -0.05000000074505806 - -0.04800133779644966 - -0.05000000074505806 - -100.0 - -0.05000000074505806 - -0.05000000074505806 - -0.04927435144782066 - -0.05000000074505806 - -0.05000000074505806 - -0.0456085205078125 - -0.05000000074505806 - -100.0 - -0.025820335373282433 q99: - 0.04579437896609306 - 0.04565873369574547 - 0.05000000074505806 - 0.05000000074505806 - 0.05000000074505806 - 0.03847877308726311 - 0.05000000074505806 - 0.05000000074505806 - 0.05000000074505806 - 0.05000000074505806 - 100.0 - 0.05000000074505806 - 0.03608553484082222 - 0.04896605759859085 - 0.05000000074505806 - 0.05000000074505806 - 0.05000000074505806 - 0.05000000074505806 - 100.0 - 0.7379999756813049 default_repo_id: synthmanip action_key: action state_keys: - observation.state action_norm_mode: quantiles state_norm_mode: min_max robot_postprocessor: stats_by_repo: synthmanip: observation.state: min: - -4.904874324798584 - -4.564780235290527 - -3.5160739421844482 - -2.356419563293457 - -0.47234979271888733 - -2.0865397453308105 - -3.343071222305298 - -5.8824052810668945 - -1.7488995790481567 - -2.967109203338623 - -0.11299018561840057 - -2.3546268939971924 - -3.1416664123535156 - -2.0946199893951416 - -3.2890703678131104 - -6.282893657684326 - -1.7483078241348267 - -2.967064142227173 - -0.12049419432878494 - -1.778153419494629 - -1.7587945461273193 - -1.5871200561523438 max: - 17.08185577392578 - 33.73189163208008 - 3.2411913871765137 - 2.356658697128296 - 3.1416971683502197 - 2.1008245944976807 - 0.07229717075824738 - 6.270575523376465 - 2.0102994441986084 - 2.9668161869049072 - 0.021467044949531555 - 2.3977394104003906 - 0.34489157795906067 - 2.0900635719299316 - 0.07242166996002197 - 6.27663516998291 - 2.0076160430908203 - 2.9636759757995605 - 0.04509617015719414 - 0.919683575630188 - 1.6717331409454346 - 1.1039749383926392 action: q01: - -0.04400388523936272 - -0.044572047889232635 - -0.05000000074505806 - -0.05000000074505806 - -0.037506889551877975 - -0.03562070056796074 - -0.05000000074505806 - -0.05000000074505806 - -0.04800133779644966 - -0.05000000074505806 - -100.0 - -0.05000000074505806 - -0.05000000074505806 - -0.04927435144782066 - -0.05000000074505806 - -0.05000000074505806 - -0.0456085205078125 - -0.05000000074505806 - -100.0 - -0.025820335373282433 q99: - 0.04579437896609306 - 0.04565873369574547 - 0.05000000074505806 - 0.05000000074505806 - 0.05000000074505806 - 0.03847877308726311 - 0.05000000074505806 - 0.05000000074505806 - 0.05000000074505806 - 0.05000000074505806 - 100.0 - 0.05000000074505806 - 0.03608553484082222 - 0.04896605759859085 - 0.05000000074505806 - 0.05000000074505806 - 0.05000000074505806 - 0.05000000074505806 - 100.0 - 0.7379999756813049 default_repo_id: synthmanip action_key: action state_keys: - observation.state action_norm_mode: quantiles state_norm_mode: min_max parallelism: data_parallel_replicate_degree: 1 enable_compiled_autograd: false data_parallel_shard_degree: -1 fsdp_reshard_after_forward: default context_parallel_config: degree: 1 attention_type: ulysses load_balancer: ulysses head_stride: 1 tensor_parallel_config: degree: 1 enable_async: false data_parallel_config: name: fsdp param_dtype: null reduce_dtype: float32 num_replicas: null shard_degree: null wrapping_strategy: full prefetch_factor: 0 context_parallel_rotate_method: allgather seed: 6198 epoch: null dry_run: false ft_llm: true ft_vit: false ft_connector: false ft_embedding: lm_head optimizer: name: adamw learning_rate: 0.0001 weight_decay: 0.01 betas: - 0.9 - 0.95 eps: 1.0e-05 connector_learning_rate: 5.0e-06 vit_learning_rate: 5.0e-06 llm_learning_rate: 1.0e-05 frame_selector_learning_rate: 0.0001 temporal_token_scorer_learning_rate: 0.0001 action_expert_learning_rate: 0.0001 connector_weight_decay: 0.0 vit_weight_decay: 0.0 llm_weight_decay: 0.0 frame_selector_weight_decay: 0.01 temporal_token_scorer_weight_decay: 0.01 action_expert_weight_decay: 0.0 connector_betas: - 0.9 - 0.95 vit_betas: - 0.9 - 0.95 llm_betas: - 0.9 - 0.95 frame_selector_betas: - 0.9 - 0.95 temporal_token_scorer_betas: - 0.9 - 0.95 action_expert_betas: - 0.9 - 0.95 connector_eps: 1.0e-06 vit_eps: 1.0e-06 llm_eps: 1.0e-06 frame_selector_eps: 1.0e-06 temporal_token_scorer_eps: 1.0e-06 action_expert_eps: 1.0e-06 metrics_log_interval: -1 scheduler: name: multimodal units: steps t_warmup: 100 t_max: null alpha_f: 0.1 connector_t_warmup: 200 vit_t_warmup: 200 llm_t_warmup: 2000 frame_selector_t_warmup: 200 temporal_token_scorer_t_warmup: 200 action_expert_t_warmup: 200 grad_clip_warmup_steps: null grad_clip_warmup_factor: null warmup_min_lr: 0.0 data: dataset: null mixture: synthmanip/task_0: 1.0 synthmanip/task_1: 1.0 synthmanip/task_2: 1.0 synthmanip/task_3: 1.0 synthmanip/task_4: 1.0 synthmanip/task_5: 1.0 synthmanip/task_6: 1.0 synthmanip/task_7: 1.0 synthmanip/task_8: 1.0 synthmanip/task_9: 1.0 root_size_mixture: null kwargs_mixture: null split: train seed: 50189 pad: to_max sequence_length: 1024 max_text_seq_len: null shuffle: true start_index: 0 packing: null enable_variable_sized_token_pooling: true num_workers: 4 drop_last: true pin_memory: true prefetch_factor: 4 persistent_workers: false timeout: 300 action_data: null action_loader_rate: null action_batch_interval: 1 restore_dataloader: true fast_forward_batches: null evaluators: [] eval_interval: 0 inf_evaluators: [] inf_eval_interval: 1000 eval_on_last_step: true eval_on_load: false eval_on: [] save_folder: /weka/oe-training-default/snehalj/synthmanip_checkpoints/molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5 checkpointer_config: save_thread_count: null load_thread_count: null pre_download: false work_dir: null throttle_uploads: false canceled_check_interval: 50 save_interval: 2000 save_at: null save_final_optim: false save_num_checkpoints_to_keep: 3 checkpoint_retention_frequency: 10000 save_final_unsharded_checkpoint: false save_interval_ephemeral: null save_overwrite: true load_path: null reset_optimizer_state: false reset_trainer_state: false initial_model_checkpoint: /weka/oe-training-default/hqfang/molmo2_checkpoints/4b-cp/step2000-unsharded/ allow_resume: true max_duration: 100000 global_train_batch_size: 1024 device_train_microbatch_size: 8 max_grad_norm: 1.0 multi_component_grad_norm: true batch_divisor: global_batch max_grad_norm_ratio: null precision: amp_bf16 wandb: project: whirl-molmoflow-rby1 entity: prior-ai2 group: null name: molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5 tags: - watching log_artifacts: false rank_zero_only: true log_interval: 20 allow_resume: true finish_on_sigterm: true beaker_log_interval: 50 speed_monitor: window_size: 20 gpu_flops_available: null console_log_interval: 20 enable_timing_logs: false gen1_gc_interval: 1 compile: mode: default fullgraph: false dynamic: false backend: inductor activation_checkpointing: true fsdp: fsdp2: true precision: pure use_orig_params: true wrapping_strategy: null sharding_strategy: FULL_SHARD hybrid_sharding_num_model_replicas: null softmax_auxiliary_loss: false softmax_auxiliary_loss_scale: 0.0001 response_logits_only: true saliency_score_loss_wt: null frame_score_loss_wt: null frame_score_loss_type: mse frame_score_loss_target: 0.7 time_limit: null extra_steps_after_cancel: 0 python_profiling: false torch_profiling: false stop_at: 100000 stop_after: null fused_loss: false compile_loss: true runtime_data: args: launch_scripts/train_synthmanip.py /weka/oe-training-default/hqfang/molmo2_checkpoints/4b-cp/step2000-unsharded/ --data_paths /weka/prior/datasets/robomolmo/feb12_franka_and_rby1/DoorOpeningDataGenConfig /weka/prior/datasets/robomolmo/feb15_franka_and_rby1/DoorOpeningDataGenConfig /weka/prior/datasets/robomolmo/feb12_franka_and_rby1/RBY1PickDataGenConfig /weka/prior/datasets/robomolmo/feb15_franka_and_rby1/RBY1PickDataGenConfig /weka/prior/datasets/robomolmo/feb21_franka_and_rby1/RBY1PickDataGenConfig /weka/prior/datasets/robomolmo/feb12_franka_and_rby1/RBY1PickAndPlaceDataGenConfig /weka/prior/datasets/robomolmo/feb15_franka_and_rby1/RBY1PickAndPlaceDataGenConfig /weka/prior/datasets/robomolmo/feb21_franka_and_rby1/RBY1PickAndPlaceDataGenConfig /weka/prior/datasets/robomolmo/feb23_open_datagen/RBY1OpenDataGenConfig /weka/prior/datasets/robomolmo/feb23_open_datagen_obja/RBY1OpenDataGenConfig --no_val --dataset_sample_rates 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 --stats_path=/weka/prior/datasets/robomolmo/rby1_multitask_norm_stats.yaml --action_preset RBY1_multitask --camera_preset RBY1_full_with_head_gopro --wandb.name=molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5 --wandb.entity=prior-ai2 --wandb.project=whirl-molmoflow-rby1 --seq_len=1024 --max_duration=100000 --device_batch_size=8 --global_batch_size=1024 --log_interval=20 --model.mm_preprocessor.use_frame_special_tokens=True --model.mm_preprocessor.max_subtitle_tokens=null --data.num_workers=4 --prefetch_factor=4 --save_interval=2000 --save_num_checkpoints_to_keep=3 --checkpoint_retention_frequency=10000 --save_folder=/weka/oe-training-default/snehalj/synthmanip_checkpoints/molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5 --exp_name=molmoflow-rby1-multitask-16node-03-05-23-31-19_bs_1024_dbs_8_step_100000_llmlr_1e-5 --data.packing=null --model.mm_preprocessor.image.max_images=5 --model.mm_preprocessor.image.crop_mode=resize --model.mm_preprocessor.max_frames=1 --model.same_noise_per_time=False --model.num_flow_timestamps=8 --use_point_prompts_per_dataset 1 1 0 0 0 0 0 0 1 1 --randomize_prompts --point_prompt_camera=head_camera --max_points_in_conditioning_frame=1 --conditioning_frame=random_first_10 --cameras_to_warp head_camera --img_aug --ft_llm=True --scheduler.llm_t_warmup=2000 --optimizer.llm_learning_rate=1e-5 hostname: jupiter-cs-aus-147.reviz.ai2.in date: 03/05/2026, 22:36 world_size: 128 resuming_from: null beaker_experiment_id: 01KK0212A2CKWNFJEJHT7AZMW5 beaker_experiment_url: https://beaker.org/ex/01KK0212A2CKWNFJEJHT7AZMW5 wandb_id: t57qc9vl wandb_url: https://wandb.ai/prior-ai2/whirl-molmoflow-rby1/runs/t57qc9vl distributed_eval_enabled: false distributed_eval_benchmark_path: /weka/oe/rohunt/robo-bench/FrankaPickandPlaceDroidBench_5ep_json_benchmark distributed_eval_config_cls: launch_scripts.synthvla.configure_mujoco_thor:FrankaState8ClampConfig distributed_eval_task_horizon: 300 distributed_eval_num_worker_jobs: 1 distributed_eval_wandb_project: mjthor-online-eval distributed_eval_workspace: ai2/robo-molmo distributed_eval_clusters: - ai2/saturn - ai2/neptune - ai2/rhea - ai2/ceres distributed_eval_priority: high distributed_eval_preemptible: true