# ============================================================================
# Bagpiper — Inference Configuration
# Decoding hyperparameters for audio and text generation
# ============================================================================

# Global settings, declared outside any per-task section.
# dtype is a plain string; presumably the numeric precision used at
# inference time — confirm against the code that loads this file.
dtype: bfloat16
# presumably the number of hypotheses generated per input — TODO confirm
num_hypo: 1

# Settings under the caption_extraction key. enforce_modality lists only
# "text", and the decoding parameters live in the matching `text` sub-map.
caption_extraction:
  enforce_modality:
    - "text"
  text:
    # Sampling parameters (presumably softmax temperature and top-k cutoff —
    # confirm with the consumer).
    temperature: 0.6
    topk: 20
    # presumably a classifier-free guidance scale — TODO confirm
    cfg: 1
    # presumably an upper bound on the number of decoding steps — TODO confirm
    max_step: 2048

# Settings under the single_turn key. enforce_modality lists only "audio",
# and the decoding parameters live in the matching `audio` sub-map.
single_turn:
  enforce_modality:
    - "audio"
  audio:
    # Sampling parameters (presumably softmax temperature and top-k cutoff —
    # confirm with the consumer).
    temperature: 0.8
    topk: 20
    # presumably a classifier-free guidance scale — TODO confirm
    cfg: 3
    # presumably upper/lower bounds on the number of decoding steps —
    # TODO confirm
    max_step: 2048
    min_step: 50
    # Disabled here. Original note: "enabled for multi-turn".
    # NOTE(review): the multi_turn section of this file does not set this
    # key, so that behaviour must come from the loader's default — confirm.
    add_generation_prompt: false

# Settings under the multi_turn key. enforce_modality lists only "audio",
# and the decoding parameters live in the matching `audio` sub-map.
# NOTE(review): unlike single_turn, no min_step or add_generation_prompt is
# set here; whatever the loader defaults to applies — confirm this is intended.
multi_turn:
  enforce_modality:
    - "audio"
  audio:
    # Sampling parameters (presumably softmax temperature and top-k cutoff —
    # confirm with the consumer).
    temperature: 0.8
    topk: 20
    # presumably a classifier-free guidance scale — TODO confirm
    cfg: 3
    # presumably an upper bound on the number of decoding steps — TODO confirm
    max_step: 2048