---
# Top-level job type for this configuration.
job_type: speechlm

# Per-modality I/O settings: a text tokenizer, a discrete audio codec, and a
# continuous audio encoder.
multimodal_io:
    text:
        tokenizer_name: Qwen/Qwen3-8B-Base
    discrete_audio:
        codec_choice: Xcodec
        codec_hf_model_tag: hf-audio/xcodec-hubert-general
        # No SSL model is configured alongside the codec.
        ssl_choice: null
        ssl_hf_model_tag: null
        delay_interleave: true
        # Eight equal weights summing to 1.0.
        # NOTE(review): presumably one weight per codec stream -- confirm
        # against the consumer.
        stream_weights:
            - 0.125
            - 0.125
            - 0.125
            - 0.125
            - 0.125
            - 0.125
            - 0.125
            - 0.125
    continuous_audio:
        encoder_choice: huggingface
        encoder_hf_model_tag: Qwen/Qwen3-Omni-30B-A3B-Instruct
        # Same attention backend and dtype as model.model_conf.
        attn_implementation: flash_attention_3
        dtype: bfloat16

# Model settings.
model:
    model_choice: parallel
    # Same tag as multimodal_io.text.tokenizer_name.
    model_hf_tag: Qwen/Qwen3-8B-Base
    model_conf:
        # Same attention backend and dtype as multimodal_io.continuous_audio.
        attn_implementation: flash_attention_3
        dtype: bfloat16
        compile_transformer_body: false
        freeze_text_embeddings: false
    activation_checkpointing: true

# Preprocessing settings.
preprocessor:
    # The values below name the continuous_audio / discrete_audio entries
    # defined under multimodal_io.
    audio_input: continuous_audio
    audio_output: discrete_audio
    # NOTE(review): presumably restricts the loss to assistant turns -- confirm.
    loss_region: assistant
    # NOTE(review): the meaning of audio_cfg is not visible in this file
    # (possibly a classifier-free-guidance rate) -- confirm.
    audio_cfg: 0.05

# Data loader settings.
data_loading:
    batchfy_method: pack
    # NOTE(review): with batchfy_method "pack" this is presumably a
    # token/frame budget rather than an example count -- confirm the unit.
    batch_size: 20000
    save_loader_state: false
    seed: 7
    num_workers: 6

# Trainer settings.
trainer:
    deepspeed_config: conf/ds_acc4_tot500k.json
    # NOTE(review): presumably the parameter groups kept frozen during
    # training; the suffixes match the discrete_audio / continuous_audio
    # entries under multimodal_io -- confirm the "multimodal_io_dict" prefix
    # against the consumer.
    freeze_param: [multimodal_io_dict.discrete_audio, multimodal_io_dict.continuous_audio]
    # Written without a digit separator on purpose: "356_000" resolves to an
    # integer only under YAML 1.1 resolvers; YAML 1.2 core-schema parsers
    # read it as the string "356_000".
    max_step: 356000
    # inistep: 350000
    save_interval: 3000
    log_interval: 1
