# AudioTextHTDemucs / config.yaml
# jacob1576's picture
# Updated pip requirements and added code to load model from HF hub
# cffc5b3
# Dataset locations and audio segmenting settings.
data:
# Path to the train subfolder of the MUSDB18 dataset.
train_dir: /home/jacob/datasets/musdb18/train
# Path to the test subfolder of the MUSDB18 dataset.
test_dir: /home/jacob/datasets/musdb18/test
# Length of the audio segments used for training [s].
segment_seconds: 6.0
# Fraction of the full data to use for training (e.g. 0.2 = 20%);
# with all of the data, 1 epoch takes ~15 hrs.
pct_train: 0.2
# Fraction of the full data to use for testing (e.g. 0.1 = 10%).
pct_test: 0.1
# Overlap between segments for chunked inference [s].
# NOTE(review): confirm the unit is seconds and not a fraction of the segment.
overlap: 0.1
# Sample rate of the audio files [Hz].
sample_rate: 44100
# Number of audio channels (1 = mono, 2 = stereo).
channels: 2
# Whether to use random segments during training.
random_segments: false
# Whether to use data augmentation (gain adjustment and channel swapping).
augment: true
# Model architecture and runtime settings.
model:
# Model name.
name: Audio-Text-HTDemucs
# Model dimension.
model_dim: 384
# Text embedding dimension (laion/clap-htsat-unfused is 512).
text_dim: 512
# Number of attention heads for the text cross-attention layer.
num_heads: 8
# Device to use for training (cuda for GPU, or cpu).
device: cpu
# Whether to use automatic mixed precision (AMP) during training.
# WORK IN PROGRESS.
use_amp: false
# Training loop, optimizer and loss settings.
training:
# Batch size for training.
batch_size: 8
# Number of DataLoader workers.
num_workers: 0
# Number of training epochs.
num_epochs: 20
optimizer:
name: AdamW
# Learning rate. Written with an explicit decimal point because YAML 1.1
# loaders such as PyYAML read a bare `1e-4` as a string, not a float.
lr: 1.0e-4
# Weight decay for the optimizer (explicit decimal point for the same reason).
weight_decay: 1.0e-2
# Gradient clipping value (set to null to disable).
grad_clip: 5.0
# Total loss is (sdr * SDR loss) + (sisdr_weight * SI-SDR loss).
# NOTE(review): the key names are inconsistent (`sdr` vs `sisdr_weight`);
# renaming either one requires a matching change in the code that reads them.
loss_weights:
# Weight for the SDR loss.
sdr: 0.9
# Weight for the SI-SDR loss.
sisdr_weight: 0.1
# Whether to use the L1 combination loss.
use_L1_comb_loss: false
L1_comb_loss:
# Weight for SDR in the L1 combination loss.
sdr_weight: 1.0
# Weight for the L1 loss in the L1 combination loss.
l1_weight: 0.1
# Path to a checkpoint to resume training from (set to null to train from
# scratch).
# resume_from: null
resume_from: checkpoints/2025_11_30_batch4/best_model.pt
# Experiment tracking, logging cadence and output locations.
wandb:
# Whether to use Weights & Biases for experiment tracking.
use_wandb: false
# Wandb project name.
project: audio-text-htdemucs
# Wandb run name.
# NOTE(review): presumably null lets wandb choose a name - confirm.
run_name: null
# Log to wandb every N batches.
log_every: 50
# Validate every N epochs.
validate_every: 1
# Save a model checkpoint every N epochs.
save_every: 5
# Directory to save model checkpoints.
checkpoint_dir: checkpoints/2025_12_06/
# Directory to save inference results.
output_dir: results/2025_12_06