---
# Training / inference configuration.
#
# NOTE(review): this file had been collapsed onto a single line, which is not
# parseable as a block mapping and let the first `#` swallow every later key.
# The nesting below is reconstructed from key order and from keys that repeat
# per section (`batch_size`, `amp`, `weight_dtype`) -- confirm against the
# consumer's config schema.

weights_format: sft

models:
  - name: "classifier"
    tokens: 0
    len: 6
    dim: 512
    resnet: 34

# Optional `loras` section, currently disabled (kept for reference).
# loras:
#   - name: "lora"
#     rank: 128
#     alpha: 128
#     training: True
#     rvq_levels: []

hyperparameters:
  batch_size: 256
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  warmup_steps: 10

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: True

  # Empty string = no scheduler named here; "ScheduleFree" is the alternative
  # noted by the original author.
  scheduler: ""  # ScheduleFree
  torch_scheduler: True

evaluation:
  batch_size: 64
  frequency: 100
  size: 64

  steps: 450
  temperature: 0.0

trainer:
  # Underscore digit grouping is YAML 1.1 integer syntax; a strict YAML 1.2
  # loader would read this as a string -- assumes a 1.1 loader, TODO confirm.
  iterations: 1_000_000
  save_frequency: 100
  keep_last_checkpoints: 32

  check_for_oom: False
  gradient_checkpointing: True

  weight_dtype: bfloat16
  amp: True

  backend: deepspeed
  deepspeed:
    inferencing: False
    # Distinct from `trainer.amp` above: this one is scoped to `deepspeed`.
    amp: False

inference:
  backend: local
  weight_dtype: bfloat16
  amp: True

optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False

dataset:
  use_hdf5: True
  hdf5_flag: r
  workers: 1
  cache: True

  training: ["./data/images/"]
  validation: ["./data/validation/"]