OLMo2-7B-base-token5T / params.json
Rano23's picture
Add files using upload-large-folder tool
f29092a verified
{
"name": "olmo3-lingua",
"dump_dir": "logs/debug",
"seed": 777,
"grad_acc_steps": 1,
"gc_collect_freq": 1000,
"probe_freq": null,
"steps": 4882812500,
"data": {
"root_dir": "data",
"sources": {
"dclm-baseline_shuffled": 1.0
},
"batch_size": 32,
"seq_len": 4096,
"n_views": 2,
"seed": 42,
"add_bos": true,
"add_eos": true,
"load_async": true,
"prefetch_size": 1024,
"tokenizer": {
"name": "huggingface",
"path": "/data/rsadhukh/checkpoints/olmo2-1b-base-token4T"
}
},
"optim": {
"lr": 0.0003,
"weight_decay": 0.1,
"epsilon": 1e-08,
"beta1": 0.9,
"beta2": 0.95,
"clip": 1.0,
"scheduler": "cosine",
"warmup": 8192000,
"lr_min_ratio": 0.1,
"cycle_length": 1.0,
"cosine_theta": 1.0,
"annealing_step": 0,
"decay_fraction": 0.1,
"exp_factor": 0.5
},
"distributed": {
"dp_shard": 1,
"dp_replicate": 1,
"tp_size": 1,
"selective_activation_checkpointing": false,
"compile": true,
"fsdp_type": "full_shard",
"model_dtype": "bf16",
"float8_recipe": null,
"float8_filter": "layers\\.[0-9]+\\.",
"matmul_allow_tf32": false,
"detect_anomaly": false,
"compile_cache_size_limit": 8,
"spawn_method": "forkserver"
},
"checkpoint": {
"dump": {
"every": 10000,
"keep": 1
},
"eval": {
"every": 10000,
"keep": 1
},
"path": "logs/debug/checkpoints",
"init_ckpt_path": "",
"continue_training_from_init": false
},
"logging": {
"freq": 10,
"acc_freq": null,
"wandb": null
},
"model": {
"dim": 4096,
"n_layers": 32,
"head_dim": 128,
"n_heads": 32,
"n_kv_heads": 32,
"ffn_dim_multiplier": null,
"multiple_of": 256,
"norm_eps": 1e-06,
"rope_theta": 500000,
"rope_scaling": null,
"init_base_std": 0.02,
"init_std_factor": "disabled",
"max_seqlen": 4096,
"seed": 42,
"vocab_size": 100352,
"weight_tying": false,
"sliding_window": null
}
}