jacob1576 committed on
Commit
7417a6a
·
1 Parent(s): 6c92aac

Add application file and dependencies

app.py ADDED
@@ -0,0 +1,393 @@
1
+ """
2
+ Gradio Demo for AudioTextHTDemucs - Text-Conditioned Stem Separation
3
+
4
+ Upload an audio file, enter a text prompt (e.g., "drums", "extract bass", "vocals"),
5
+ and the model will separate that stem from the mixture.
6
+ """
7
+
8
+ import os
9
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
10
+
11
+ import gradio as gr
12
+ import torch
13
+ import torch.nn.functional as F
14
+ import torchaudio
15
+ import numpy as np
16
+ import matplotlib.pyplot as plt
17
+ from pathlib import Path
18
+
19
+ from demucs import pretrained
20
+ from transformers import ClapModel, AutoTokenizer
21
+
22
+ from src.models.stem_separation.ATHTDemucs_v2 import AudioTextHTDemucs
23
+ from utils import load_config, plot_spectrogram
24
+
25
+ # ============================================================================
26
+ # Configuration
27
+ # ============================================================================
28
+
29
+ cfg = load_config("config.yaml")
30
+ CHECKPOINT_PATH = cfg["training"]["resume_from"] # Change as needed
31
+ SAMPLE_RATE = cfg["data"]["sample_rate"]
32
+ SEGMENT_SECONDS = cfg["data"]["segment_seconds"]
33
+ OVERLAP = cfg["data"]["overlap"]
34
+
35
+ # Auto-detect device
36
+ if torch.cuda.is_available():
37
+ DEVICE = "cuda"
38
+ elif torch.backends.mps.is_available():
39
+ DEVICE = "mps"
40
+ else:
41
+ DEVICE = "cpu"
42
+ # DEVICE = "cpu"
43
+
44
+
45
+ # ============================================================================
46
+ # Model Loading
47
+ # ============================================================================
48
+
49
+ print(f"Loading model on device: {DEVICE}")
50
+ print("Loading HTDemucs...")
51
+ htdemucs = pretrained.get_model('htdemucs').models[0]
52
+
53
+ print("Loading CLAP...")
54
+ clap = ClapModel.from_pretrained("laion/clap-htsat-unfused")
55
+ tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
56
+
57
+ print("Building AudioTextHTDemucs...")
58
+ model = AudioTextHTDemucs(htdemucs, clap, tokenizer)
59
+
60
+ print(f"Loading checkpoint from {CHECKPOINT_PATH}...")
61
+ checkpoint = torch.load(CHECKPOINT_PATH, map_location="cpu")
62
+ model.load_state_dict(checkpoint["model_state_dict"], strict=False)
63
+ print(f"Loaded checkpoint from epoch {checkpoint.get('epoch', '?')}")
64
+
65
+ model = model.to(DEVICE)
66
+ model.eval()
67
+ print("Model ready!")
68
+
69
+
70
+ # ============================================================================
71
+ # Helper Functions
72
+ # ============================================================================
73
+
74
+ def create_spectrogram(audio, sr=SAMPLE_RATE, title="Spectrogram"):
75
+ """Create a spectrogram visualization."""
76
+ fig, ax = plt.subplots(figsize=(10, 4))
77
+
78
+ # Convert to mono for visualization if stereo
79
+ if audio.shape[0] == 2:
80
+ audio_mono = audio.mean(dim=0)
81
+ else:
82
+ audio_mono = audio.squeeze()
83
+
84
+ # Compute spectrogram
85
+ n_fft = 2048
86
+ hop_length = 512
87
+ spec = torch.stft(
88
+ audio_mono,
89
+ n_fft=n_fft,
90
+ hop_length=hop_length,
91
+ return_complex=True
92
+ )
93
+ spec_mag = torch.abs(spec)
94
+ spec_db = 20 * torch.log10(spec_mag + 1e-8)
95
+
96
+ # Plot
97
+ im = ax.imshow(
98
+ spec_db.cpu().numpy(),
99
+ aspect='auto',
100
+ origin='lower',
101
+ cmap='viridis',
102
+ interpolation='nearest'
103
+ )
104
+ ax.set_xlabel('Time (frames)')
105
+ ax.set_ylabel('Frequency (bins)')
106
+ ax.set_title(title)
107
+ plt.colorbar(im, ax=ax, format='%+2.0f dB')
108
+ plt.tight_layout()
109
+
110
+ return fig
111
+
112
+
113
+ def load_audio(audio_path, target_sr=SAMPLE_RATE):
114
+ """Load audio file and resample if necessary."""
115
+ waveform, sr = torchaudio.load(audio_path)
116
+
117
+ # Resample if necessary
118
+ if sr != target_sr:
119
+ resampler = torchaudio.transforms.Resample(sr, target_sr)
120
+ waveform = resampler(waveform)
121
+
122
+ # Convert to stereo if mono
123
+ if waveform.shape[0] == 1:
124
+ waveform = waveform.repeat(2, 1)
125
+
126
+ return waveform, target_sr
127
+
128
+
129
+ def chunked_inference(mixture, prompt):
130
+ """Run chunked inference for a single stem."""
131
+ C, T = mixture.shape
132
+ chunk_len = int(SAMPLE_RATE * SEGMENT_SECONDS)
133
+ overlap_frames = int(OVERLAP * SAMPLE_RATE)
134
+
135
+ output = torch.zeros(C, T, device=DEVICE)
136
+ weight = torch.zeros(T, device=DEVICE)
137
+
138
+ start = 0
139
+ while start < T:
140
+ end = min(start + chunk_len, T)
141
+ chunk = mixture[:, start:end].unsqueeze(0).to(DEVICE) # (1, C, chunk_len)
142
+
143
+ # Pad if needed
144
+ if chunk.shape[-1] < chunk_len:
145
+ pad_amount = chunk_len - chunk.shape[-1]
146
+ chunk = F.pad(chunk, (0, pad_amount))
147
+
148
+ with torch.no_grad():
149
+ out = model(chunk, [prompt]) # (1, C, chunk_len)
150
+
151
+ # Ensure output is on the correct device
152
+ out = out.to(DEVICE).squeeze(0) # (C, chunk_len)
153
+
154
+ # Trim padding if we added any
155
+ actual_len = end - start
156
+ out = out[:, :actual_len]
157
+
158
+ # Create fade weights for overlap-add
159
+ fade_len = min(overlap_frames, actual_len // 2)
160
+ chunk_weight = torch.ones(actual_len, device=DEVICE)
161
+ if start > 0 and fade_len > 0:
162
+ # Fade in
163
+ chunk_weight[:fade_len] = torch.linspace(0, 1, fade_len, device=DEVICE)
164
+ if end < T and fade_len > 0:
165
+ # Fade out
166
+ chunk_weight[-fade_len:] = torch.linspace(1, 0, fade_len, device=DEVICE)
167
+
168
+ output[:, start:end] += out * chunk_weight
169
+ weight[start:end] += chunk_weight
170
+
171
+ # Move to next chunk with overlap
172
+ start += chunk_len - overlap_frames
173
+
174
+ # Normalize by weights
175
+ weight = weight.clamp(min=1e-8)
176
+ output = output / weight
177
+
178
+ return output
179
+
180
+ def download_youtube_audio(yt_link):
181
+ """Download audio from a YouTube link using yt-dlp."""
182
+ try:
183
+ import yt_dlp
184
+ if os.path.exists("temp/yt_audio.webm"):
+     os.remove("temp/yt_audio.webm")
185
+
186
+ ydl_opts = {
187
+ 'format': 'bestaudio/best',
188
+ 'quiet': True,
189
+ 'outtmpl': 'temp/yt_audio.webm',
190
+ }
191
+
192
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
193
+ ydl.download([yt_link])
194
+
195
+ mixture, sr = load_audio("temp/yt_audio.webm", target_sr=SAMPLE_RATE)
196
+ return (sr, mixture.T.numpy())
197
+ except Exception as e:
198
+ return f"Error downloading audio from YouTube: {str(e)}"
199
+
200
+
201
+ # ============================================================================
202
+ # Gradio Interface Functions
203
+ # ============================================================================
204
+
205
+ def process_audio(audio_file, yt_link, text_prompt):
206
+ """Main processing function for the Gradio interface."""
207
+ if audio_file is None and (yt_link is None or yt_link.strip() == ""):
208
+ return None, None, None, None, "Please upload an audio file or provide a YouTube link."
209
+
210
+ if not text_prompt or text_prompt.strip() == "":
211
+ return None, None, None, None, "Please enter a text prompt."
212
+
213
+ if yt_link and yt_link.strip() != "":
214
+ try:
215
+ download_youtube_audio(yt_link)
216
+ except Exception as e:
217
+ return None, None, None, None, str(e)
218
+
219
+ try:
220
+ # Load audio
221
+ mixture, sr = load_audio(audio_file if audio_file else "temp/yt_audio.webm", target_sr=SAMPLE_RATE)
222
+ print(f"Loaded audio: {mixture.shape}, sr={sr}")
223
+
224
+ # Create input spectrogram
225
+ input_spec_fig = create_spectrogram(mixture, sr, title="Input Mixture Spectrogram")
226
+ #input_spec_fig = plot_spectrogram(mixture, sr, title="Input Mixture Spectrogram")
227
+
228
+ # Run separation
229
+ print(f"Running separation with prompt: '{text_prompt}'")
230
+ separated = chunked_inference(mixture.to(DEVICE), text_prompt.strip())
231
+ separated = separated.cpu()
232
+
233
+ # Debug: Check if output is non-zero
234
+ print(f"Separated audio shape: {separated.shape}")
235
+ print(f"Separated audio range: [{separated.min():.4f}, {separated.max():.4f}]")
236
+ print(f"Separated audio mean abs: {separated.abs().mean():.4f}")
237
+
238
+ # Create output spectrogram
239
+ output_spec_fig = create_spectrogram(separated, sr, title=f"Separated: {text_prompt}")
240
+
241
+ # Convert to audio format for Gradio
242
+ # Gradio Audio expects tuple: (sample_rate, numpy_array)
243
+ # numpy_array shape should be (samples, channels) for stereo
244
+ input_audio = (sr, mixture.T.numpy()) # (sr, (T, 2))
245
+ output_audio = (sr, separated.T.numpy()) # (sr, (T, 2))
246
+
247
+ status = f"✓ Successfully separated '{text_prompt}' from the mixture!"
248
+
249
+ return input_audio, output_audio, input_spec_fig, output_spec_fig, status
250
+
251
+ except Exception as e:
252
+ error_msg = f"Error: {str(e)}"
253
+ print(error_msg)
254
+ import traceback
255
+ traceback.print_exc()
256
+ return None, None, None, None, error_msg
257
+
258
+
259
+ # ============================================================================
260
+ # Gradio Interface
261
+ # ============================================================================
262
+
263
+ def create_demo():
264
+ """Create the Gradio interface."""
265
+
266
+ with gr.Blocks(title="AudioTextHTDemucs Demo") as demo:
267
+ gr.Markdown(
268
+ """
269
+ # 🎵 AudioTextHTDemucs - Text-Conditioned Stem Separation
270
+
271
+ Upload an audio file and enter a text prompt to separate specific stems from the mixture.
272
+
273
+ **Example prompts:**
274
+ - `drums` - Extract drum sounds
275
+ - `bass` - Extract bass guitar
276
+ - `vocals` - Extract singing voice
277
+ - `other` - Extract other instruments
278
+ - Or any natural language description like "extract the guitar" or "piano sound"
279
+ """
280
+ )
281
+
282
+ with gr.Row():
283
+ with gr.Column():
284
+ gr.Markdown("### Input")
285
+ audio_input = gr.Audio(
286
+ label="Upload Audio File",
287
+ type="filepath",
288
+ sources=["upload"]
289
+ )
290
+ yt_link_input = gr.Textbox(
291
+ label="YouTube Video URL (optional)",
292
+ placeholder="Provide a YouTube link to fetch audio",
293
+ lines=1
294
+ )
295
+ text_input = gr.Textbox(
296
+ label="Text Prompt",
297
+ placeholder="Enter what you want to extract (e.g., 'drums', 'vocals', 'bass')",
298
+ lines=1
299
+ )
300
+ gr.Examples(
301
+ examples=[
302
+ ["drums"],
303
+ ["bass"],
304
+ ["vocals"],
305
+ ["other"],
306
+ ["extract the drums"],
307
+ ["guitar sound"],
308
+ ],
309
+ inputs=text_input,
310
+ label="Click to use example prompts"
311
+ )
312
+
313
+ with gr.Row():
314
+ clear_btn = gr.Button("Clear", variant="secondary")
315
+ submit_btn = gr.Button("Separate Audio", variant="primary")
316
+
317
+ status_output = gr.Textbox(label="Status", interactive=False)
318
+ yt_link_input.change(download_youtube_audio, inputs=[yt_link_input], outputs=[audio_input])
319
+
320
+ with gr.Row():
321
+ with gr.Column():
322
+ gr.Markdown("### Input Mixture")
323
+ input_audio_player = gr.Audio(
324
+ label="Input Audio (Original Mix)",
325
+ type="numpy",
326
+ interactive=False
327
+ )
328
+ input_spec_plot = gr.Plot(label="Input Spectrogram")
329
+
330
+ with gr.Column():
331
+ gr.Markdown("### Separated Output")
332
+ output_audio_player = gr.Audio(
333
+ label="Separated Audio",
334
+ type="numpy",
335
+ interactive=False
336
+ )
337
+ output_spec_plot = gr.Plot(label="Output Spectrogram")
338
+
339
+ # Button actions
340
+ submit_btn.click(
341
+ fn=process_audio,
342
+ inputs=[audio_input, yt_link_input, text_input],
343
+ outputs=[
344
+ input_audio_player,
345
+ output_audio_player,
346
+ input_spec_plot,
347
+ output_spec_plot,
348
+ status_output
349
+ ]
350
+ )
351
+
352
+ def clear_all():
353
+ return None, "", None, None, None, None, None, ""
354
+
355
+ clear_btn.click(
356
+ fn=clear_all,
357
+ outputs=[
358
+ audio_input,
359
+ text_input,
360
+ yt_link_input,
361
+ input_audio_player,
362
+ output_audio_player,
363
+ input_spec_plot,
364
+ output_spec_plot,
365
+ status_output
366
+ ]
367
+ )
368
+
369
+ gr.Markdown(
370
+ """
371
+ ---
372
+ ### Notes
373
+ - The model works best with music audio sampled at 44.1kHz
374
+ - Processing time depends on audio length (segments processed in 6-second chunks)
375
+ - The model was trained on stems: drums, bass, vocals, and other instruments
376
+ - You can use natural language descriptions thanks to CLAP text embeddings
377
+ """
378
+ )
379
+
380
+ return demo
381
+
382
+
383
+ # ============================================================================
384
+ # Launch
385
+ # ============================================================================
386
+
387
+ if __name__ == "__main__":
388
+ demo = create_demo()
389
+ demo.launch(
390
+ share=False, # Set to True to create a public link
391
+ server_name="0.0.0.0", # Allow external connections
392
+ server_port=7860
393
+ )
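The helpers above can also be used without the Gradio UI. A minimal sketch, assuming the config.yaml and checkpoint referenced at the top of app.py are present (importing app triggers the model loading); "my_song.wav" and "drums_estimate.wav" are placeholder filenames:

# Hypothetical offline use of the helpers defined in app.py above.
# Importing app runs the model/checkpoint loading at module import time.
import torchaudio
from app import load_audio, chunked_inference, SAMPLE_RATE

mixture, sr = load_audio("my_song.wav", target_sr=SAMPLE_RATE)  # (2, T) stereo tensor
separated = chunked_inference(mixture, "drums").cpu()           # (2, T) separated stem
torchaudio.save("drums_estimate.wav", separated, sr)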
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (149 Bytes)
src/__pycache__/dataloader.cpython-313.pyc ADDED
Binary file (8.6 kB)
src/__pycache__/loss.cpython-313.pyc ADDED
Binary file (5.93 kB)
src/__pycache__/train.cpython-313.pyc ADDED
Binary file (21.5 kB)
src/dataloader.py ADDED
@@ -0,0 +1,179 @@
1
+ import random
2
+ from pathlib import Path
3
+ from typing import Dict, List, Tuple
4
+ import torch
5
+ from torch.utils.data import Dataset
6
+ import stempeg
7
+ import soundfile as sf
8
+ import math
9
+ import numpy as np
10
+
11
+ # ============================================================================
12
+ # Data Loader
13
+ # ============================================================================
14
+
15
+ def get_random_prompt(stem_name: str) -> str:
16
+ """Get a random text prompt for a given stem."""
17
+ return random.choice(STEM_PROMPTS[stem_name])
18
+
19
+
20
+ # Text Prompt Templates
21
+ STEM_PROMPTS: Dict[str, List[str]] = {
22
+ "drums": ["drums", "drum kit", "percussion", "the drums"],
23
+ "bass": ["bass", "bass guitar", "the bass", "bass line"],
24
+ "other": ["other instruments", "accompaniment", "instruments"],
25
+ "vocals": ["vocals", "voice", "singing", "the vocals"],
26
+ }
27
+
28
+ PROMPT_TO_STEM: Dict[str, str] = {
29
+ prompt: stem
30
+ for stem, prompts in STEM_PROMPTS.items()
31
+ for prompt in prompts
32
+ }
33
+
34
+ STEM_NAME_TO_INDEX = {"drums": 0, "bass": 1, "other": 2, "vocals": 3}
35
+
36
+
37
+ class MusDBStemDataset(Dataset):
38
+ def __init__(
39
+ self,
40
+ root_dir: str,
41
+ segment_samples: int,
42
+ sample_rate: int = 44100,
43
+ channels: int = 2,
44
+ random_segments: bool = True,
45
+ augment: bool = True,
46
+ ):
47
+ self.root_dir = Path(root_dir)
48
+ self.segment_samples = segment_samples
49
+ self.sample_rate = sample_rate
50
+ self.channels = channels
51
+ self.random_segments = random_segments
52
+ self.augment = augment
53
+
54
+ self.stem_names = ["drums", "bass", "other", "vocals"]
55
+
56
+ self.files = list(self.root_dir.glob("*.stem.mp4"))
57
+ if not self.files:
58
+ raise ValueError(f"No .stem.mp4 files found in {root_dir}")
59
+
60
+ # Compute number of examples
61
+ self.index_map = [] # (file_idx, stem_idx, segment_idx)
62
+ #self.sample_lengths = [0] * len(self.files) # total samples per file
63
+ for file_idx, file in enumerate(self.files):
64
+ info = stempeg.Info(str(file))
65
+ total_samples = info.duration(0) * info.sample_rate(0) # 0 - using mixture stream as reference
66
+ #self.sample_lengths[file_idx] = int(total_samples)
67
+ num_segments = math.ceil(total_samples / segment_samples)
68
+
69
+ # Build index map: for each stem, each segment
70
+ for stem_idx in range(len(self.stem_names)):
71
+ for seg in range(num_segments):
72
+ self.index_map.append((file_idx, stem_idx, seg))
73
+
74
+ print(f"Found {len(self.files)} tracks, total dataset items: {len(self.index_map)}")
75
+
76
+ def __len__(self) -> int:
77
+ return len(self.index_map)
78
+
79
+ def _load_stems(self, filepath: Path) -> np.ndarray:
80
+ """Load all stems from a .stem.mp4 file."""
81
+ stems, rate = stempeg.read_stems(str(filepath))
82
+ # stems shape: (num_stems, samples, channels)
83
+ # [mix, drums, bass, other, vocals]
84
+ return stems
85
+
86
+ def _extract_random_segment(self, stems: np.ndarray) -> np.ndarray:
87
+ """Extract the same random segment from all stems."""
88
+ total_samples = stems.shape[1] # stems: (num_stems, samples, channels)
89
+
90
+ if total_samples <= self.segment_samples:
91
+ # Pad if too short
92
+ pad_amount = self.segment_samples - total_samples
93
+ stems = np.pad(stems, ((0, 0), (0, pad_amount), (0, 0)), mode='constant')
94
+ else:
95
+ # Random start position (same for all stems)
96
+ if self.random_segments:
97
+ start = random.randint(0, total_samples - self.segment_samples)
98
+ else:
99
+ start = 0
100
+ stems = stems[:, start:start + self.segment_samples, :]
101
+
102
+ return stems
103
+
104
+ def _extract_segment(self, stems: np.ndarray, seg_idx: int) -> np.ndarray:
105
+ total_samples = stems.shape[1]
106
+
107
+ if self.random_segments:
108
+ # fallback to random segment extractor
109
+ return self._extract_random_segment(stems)
110
+
111
+ start = seg_idx * self.segment_samples
112
+ end = start + self.segment_samples
113
+
114
+ if end <= total_samples:
115
+ return stems[:, start:end, :]
116
+ else:
117
+ # Last segment may need padding
118
+ pad_amount = end - total_samples
119
+ seg = stems[:, start:, :]
120
+ seg = np.pad(seg, ((0, 0),(0, pad_amount), (0, 0)), mode="constant")
121
+ return seg
122
+
123
+ def _augment(self, mixture: np.ndarray, target: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
124
+ """Apply data augmentation."""
125
+ if random.random() < 0.5:
126
+ gain = random.uniform(0.7, 1.3)
127
+ mixture = mixture * gain
128
+ target = target * gain
129
+
130
+ if random.random() < 0.3 and mixture.shape[-1] == 2:
131
+ mixture = mixture[:, ::-1].copy()
132
+ target = target[:, ::-1].copy()
133
+
134
+ return mixture, target
135
+
136
+ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor | str]:
137
+ file_idx, stem_idx, seg_idx = self.index_map[idx]
138
+
139
+ filepath = self.files[file_idx]
140
+ stems = self._load_stems(filepath)
141
+
142
+ # deterministic segment selection
143
+ stems = self._extract_segment(stems, seg_idx)
144
+
145
+ mixture = stems[0] # (T, C)
146
+ target = stems[stem_idx+1] # (T, C)
147
+
148
+ if self.augment:
149
+ mixture, target = self._augment(mixture, target)
150
+
151
+ # -> (C, T)
152
+ mixture = torch.from_numpy(mixture.T).float()
153
+ target = torch.from_numpy(target.T).float()
154
+
155
+ # ensure stereo
156
+ if mixture.shape[0] == 1:
157
+ mixture = mixture.repeat(2, 1)
158
+ target = target.repeat(2, 1)
159
+
160
+ prompt = get_random_prompt(self.stem_names[stem_idx])
161
+
162
+ return {
163
+ "mixture": mixture,
164
+ "target": target,
165
+ "prompt": prompt,
166
+ "stem_name": self.stem_names[stem_idx],
167
+ "file_idx": file_idx,
168
+ "segment_idx": seg_idx,
169
+ }
170
+
171
+
172
+ def collate_fn(batch: List[Dict]) -> Dict[str, torch.Tensor | List[str]]:
173
+ """Custom collate function."""
174
+ return {
175
+ "mixture": torch.stack([item["mixture"] for item in batch]),
176
+ "target": torch.stack([item["target"] for item in batch]),
177
+ "prompt": [item["prompt"] for item in batch],
178
+ "stem_name": [item["stem_name"] for item in batch],
179
+ }
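A minimal sketch of feeding MusDBStemDataset into a PyTorch DataLoader with the collate_fn above; the dataset path, batch size, and segment length are illustrative assumptions:

# Hypothetical usage of MusDBStemDataset / collate_fn defined above.
from torch.utils.data import DataLoader
from src.dataloader import MusDBStemDataset, collate_fn

dataset = MusDBStemDataset(
    root_dir="data/musdb18/train",  # assumed location of the .stem.mp4 files
    segment_samples=44100 * 6,      # 6-second segments at 44.1 kHz
    random_segments=True,
    augment=True,
)
loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

batch = next(iter(loader))
# batch["mixture"] / batch["target"] are (B, 2, T); batch["prompt"] is a list of strings.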
src/loss.py ADDED
@@ -0,0 +1,162 @@
1
+ from typing import Dict, Tuple
2
+ import torch
3
+
4
+
5
+ # ============================================================================
6
+ # Loss Functions
7
+ # ============================================================================
8
+
9
+ def sdr_loss(estimated, target):
10
+ """
11
+ Compute negative SDR loss.
12
+ Based on the definition from Vincent et al. 2006.
13
+ """
14
+ # Flatten to [batch, -1] to ensure compatible shapes
15
+ est_flat = estimated.reshape(estimated.shape[0], -1)
16
+ tgt_flat = target.reshape(target.shape[0], -1)
17
+
18
+ # Compute SDR: 10 * log10(||target||^2 / ||target - estimated||^2)
19
+ delta = 1e-8 # Small constant for numerical stability
20
+
21
+ num = torch.sum(tgt_flat ** 2, dim=-1)
22
+ den = torch.sum((tgt_flat - est_flat) ** 2, dim=-1)
23
+
24
+ # Avoid division by zero
25
+ sdr = 10 * torch.log10((num + delta) / (den + delta))
26
+
27
+ # Clamp to reasonable range to avoid extreme values
28
+ sdr = torch.clamp(sdr, min=-30, max=30)
29
+
30
+ return -sdr.mean() # Return negative for minimization
31
+
32
+
33
+ def sisdr_loss(estimated, target):
34
+ """
35
+ Compute negative SI-SDR (Scale-Invariant SDR) loss.
36
+ This is more robust to scaling differences between estimate and target.
37
+ """
38
+ # Flatten to [batch, -1]
39
+ est_flat = estimated.reshape(estimated.shape[0], -1)
40
+ tgt_flat = target.reshape(target.shape[0], -1)
41
+
42
+ # Zero-mean normalization (critical for SI-SDR)
43
+ est_flat = est_flat - est_flat.mean(dim=-1, keepdim=True)
44
+ tgt_flat = tgt_flat - tgt_flat.mean(dim=-1, keepdim=True)
45
+
46
+ # SI-SDR calculation
47
+ # Project estimate onto target: s_target = <s', s> / ||s||^2 * s
48
+ delta = 1e-8
49
+
50
+ dot = torch.sum(est_flat * tgt_flat, dim=-1, keepdim=True)
51
+ s_target_norm_sq = torch.sum(tgt_flat ** 2, dim=-1, keepdim=True)
52
+
53
+ # Scaled target
54
+ s_target = (dot / (s_target_norm_sq + delta)) * tgt_flat
55
+
56
+ # Noise is the orthogonal component
57
+ e_noise = est_flat - s_target
58
+
59
+ # SI-SDR = 10 * log10(||s_target||^2 / ||e_noise||^2)
60
+ s_target_energy = torch.sum(s_target ** 2, dim=-1)
61
+ e_noise_energy = torch.sum(e_noise ** 2, dim=-1)
62
+
63
+ sisdr = 10 * torch.log10((s_target_energy + delta) / (e_noise_energy + delta))
64
+
65
+ # Clamp to reasonable range
66
+ sisdr = torch.clamp(sisdr, min=-30, max=30)
67
+
68
+ return -sisdr.mean() # Return negative for minimization
69
+
70
+
71
+ def new_sdr_metric(estimated: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
72
+ """
73
+ Compute the SDR according to the MDX challenge definition (positive values).
74
+ This is for evaluation/logging, not for loss.
75
+
76
+ Args:
77
+ estimated: (batch, channels, time)
78
+ target: (batch, channels, time)
79
+
80
+ Returns:
81
+ SDR scores per batch item (batch,)
82
+ """
83
+ delta = 1e-8
84
+ num = torch.sum(target ** 2, dim=(1, 2))
85
+ den = torch.sum((target - estimated) ** 2, dim=(1, 2))
86
+ scores = 10 * torch.log10((num + delta) / (den + delta))
87
+ return scores
88
+
89
+
90
+ def combined_loss(
91
+ estimated: torch.Tensor,
92
+ target: torch.Tensor,
93
+ sdr_weight: float = 0.9,
94
+ sisdr_weight: float = 0.1
95
+ ) -> Tuple[torch.Tensor, Dict[str, float]]:
96
+ """
97
+ Combined SDR and SI-SDR loss.
98
+
99
+ Args:
100
+ estimated: Estimated audio (batch, channels, time)
101
+ target: Target audio (batch, channels, time)
102
+ sdr_weight: Weight for SDR loss (default 0.9)
103
+ sisdr_weight: Weight for SI-SDR loss (default 0.1)
104
+
105
+ Returns:
106
+ total_loss: Combined loss for backpropagation
107
+ metrics: Dictionary of metrics for logging
108
+ """
109
+ sdr = sdr_loss(estimated, target)
110
+ sisdr = sisdr_loss(estimated, target)
111
+
112
+ total = sdr_weight * sdr + sisdr_weight * sisdr
113
+
114
+ # For logging, also compute positive SDR metric
115
+ with torch.no_grad():
116
+ pos_sdr = new_sdr_metric(estimated, target).mean()
117
+
118
+ metrics = {
119
+ "loss/total": total.item(),
120
+ "loss/sdr": sdr.item(),
121
+ "loss/sisdr": sisdr.item(),
122
+ "metrics/sdr": -sdr.item(), # Positive SDR for logging
123
+ "metrics/sisdr": -sisdr.item(), # Positive SI-SDR for logging
124
+ "metrics/new_sdr": pos_sdr.item(), # MDX-style SDR
125
+ }
126
+
127
+ return total, metrics
128
+
129
+
130
+ def combined_L1_sdr_loss(
131
+ estimated: torch.Tensor,
132
+ target: torch.Tensor,
133
+ sdr_weight: float = 1.0,
134
+ l1_weight: float = 0.05
135
+ ) -> Tuple[torch.Tensor, Dict[str, float]]:
136
+ """
137
+ Combined SDR and L1 loss.
138
+
139
+ Args:
140
+ estimated: Estimated audio (batch, channels, time)
141
+ target: Target audio (batch, channels, time)
142
+ sdr_weight: Weight for SDR loss (default 1.0)
143
+ l1_weight: Weight for L1 loss (default 0.05)
144
+ Returns:
145
+ total_loss: Combined loss for backpropagation
146
+ metrics: Dictionary of metrics for logging
147
+ """
148
+ sdr = sdr_loss(estimated, target)
149
+ sisdr = sisdr_loss(estimated, target)
150
+ l1 = torch.nn.functional.l1_loss(estimated, target)
151
+
152
+ total = sdr_weight * sdr + l1_weight * l1
153
+
154
+ metrics = {
155
+ "loss/total": total.item(),
156
+ "loss/sdr": sdr.item(),
157
+ "loss/sisdr": sisdr.item(),
158
+ "metrics/sdr": -sdr.item(), # Positive SDR for logging
159
+ "metrics/sisdr": -sisdr.item(), # Positive SI-SDR for logging
160
+ }
161
+
162
+ return total, metrics
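A quick sanity check of the losses above on random tensors (shapes are illustrative; assumes the repository root is on PYTHONPATH so src.loss is importable):

# Hypothetical sanity check of the loss functions defined above.
import torch
from src.loss import combined_loss, new_sdr_metric

target = torch.randn(4, 2, 44100)                    # (batch, channels, time)
estimate = target + 0.1 * torch.randn_like(target)   # slightly noisy "prediction"

loss, metrics = combined_loss(estimate, target)
print(f"loss={loss.item():.3f}, SDR={metrics['metrics/sdr']:.2f} dB")
print(new_sdr_metric(estimate, target))              # per-item MDX-style SDR, shape (4,)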
src/models/__init__.py ADDED
File without changes
src/models/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (156 Bytes)
src/models/stem_separation/ATHTDemucs_v2.py ADDED
@@ -0,0 +1,348 @@
1
+ """
2
+ AudioTextHTDemucs v2 - Text-conditioned source separation.
3
+
4
+ Changes from v1:
5
+ - Custom trainable decoder that outputs 1 source (not 4)
6
+ - HTDemucs encoder kept (frozen)
7
+ - CLAP text encoder (frozen)
8
+ - Cross-attention conditioning at bottleneck
9
+ """
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from typing import List, Any
15
+ from fractions import Fraction
16
+ from einops import rearrange
17
+
18
+ from demucs.htdemucs import HTDemucs
19
+ from transformers import ClapModel, ClapTextModelWithProjection, RobertaTokenizerFast
20
+
21
+ class TextCrossAttention(nn.Module):
22
+ """Cross-attention: audio features attend to text embeddings."""
23
+
24
+ def __init__(self, feat_dim, text_dim, n_heads=8, dropout=0.0):
25
+ super().__init__()
26
+ self.q_proj = nn.Linear(feat_dim, feat_dim)
27
+ self.k_proj = nn.Linear(text_dim, feat_dim)
28
+ self.v_proj = nn.Linear(text_dim, feat_dim)
29
+ self.attn = nn.MultiheadAttention(feat_dim, n_heads, batch_first=True, dropout=dropout)
30
+ self.out_mlp = nn.Sequential(
31
+ nn.Linear(feat_dim, feat_dim),
32
+ nn.GELU(),
33
+ nn.Linear(feat_dim, feat_dim),
34
+ )
35
+ self.norm_q = nn.LayerNorm(feat_dim)
36
+ self.norm_out = nn.LayerNorm(feat_dim)
37
+
38
+ def forward_attend(self, queries, text_emb):
39
+ q = self.norm_q(queries)
40
+ if text_emb.dim() == 2:
41
+ text_emb = text_emb.unsqueeze(1)
42
+ k = self.k_proj(text_emb)
43
+ v = self.v_proj(text_emb)
44
+ q_proj = self.q_proj(q)
45
+ attn_out, _ = self.attn(query=q_proj, key=k, value=v)
46
+ out = queries + attn_out
47
+ out = out + self.out_mlp(out)
48
+ return self.norm_out(out)
49
+
50
+ def forward(self, x, xt, text_emb):
51
+ B, C, Fq, T = x.shape  # use Fq so we don't shadow the torch.nn.functional alias F
52
+ x_seq = rearrange(x, "b c f t -> b (f t) c")
53
+ xt_seq = rearrange(xt, "b c t -> b t c")
54
+ x_seq = self.forward_attend(x_seq, text_emb)
55
+ xt_seq = self.forward_attend(xt_seq, text_emb)
56
+ x = rearrange(x_seq, "b (f t) c -> b c f t", f=Fq, t=T)
57
+ xt = rearrange(xt_seq, "b t c -> b c t")
58
+ return x, xt
59
+
60
+
61
+ class FreqDecoder(nn.Module):
62
+ """Frequency-domain decoder: mirrors HTDemucs encoder structure but outputs 1 source."""
63
+
64
+ def __init__(self, channels: List[int], kernel_size: int = 8, stride: int = 4):
65
+ """
66
+ channels: List of channel dims from bottleneck to output, e.g. [384, 192, 96, 48, 2]
67
+ """
68
+ super().__init__()
69
+ self.layers = nn.ModuleList()
70
+
71
+ for i in range(len(channels) - 1):
72
+ in_ch = channels[i]
73
+ out_ch = channels[i + 1]
74
+ is_last = (i == len(channels) - 2)
75
+
76
+ self.layers.append(nn.Sequential(
77
+ nn.ConvTranspose2d(in_ch, out_ch, kernel_size=(kernel_size, 1), stride=(stride, 1), padding=(kernel_size//4, 0)),
78
+ nn.GroupNorm(1, out_ch) if not is_last else nn.Identity(),
79
+ nn.GELU() if not is_last else nn.Identity(),
80
+ ))
81
+
82
+ def forward(self, x, skips: List[torch.Tensor], target_lengths: List[int]):
83
+ """
84
+ x: (B, C, F, T) bottleneck features
85
+ skips: encoder skip connections (reversed order)
86
+ target_lengths: target frequency dimensions for each layer
87
+ """
88
+ for i, layer in enumerate(self.layers):
89
+ x = layer(x)
90
+ # Match target size
91
+ if i < len(target_lengths):
92
+ target_f = target_lengths[i]
93
+ if x.shape[2] != target_f:
94
+ x = F.interpolate(x, size=(target_f, x.shape[3]), mode='bilinear', align_corners=False)
95
+ # Add skip connection if available
96
+ if i < len(skips):
97
+ skip = skips[i]
98
+ # Project skip to match channels if needed
99
+ if skip.shape[1] != x.shape[1]:
100
+ skip = skip[:, :x.shape[1]] # Simple channel truncation
101
+ if skip.shape[2:] != x.shape[2:]:
102
+ skip = F.interpolate(skip, size=x.shape[2:], mode='bilinear', align_corners=False)
103
+ x = x + skip * 0.1 # Scaled residual
104
+ return x
105
+
106
+
107
+ class TimeDecoder(nn.Module):
108
+ """Time-domain decoder: outputs 1 source waveform."""
109
+
110
+ def __init__(self, channels: List[int], kernel_size: int = 8, stride: int = 4):
111
+ super().__init__()
112
+ self.layers = nn.ModuleList()
113
+
114
+ for i in range(len(channels) - 1):
115
+ in_ch = channels[i]
116
+ out_ch = channels[i + 1]
117
+ is_last = (i == len(channels) - 2)
118
+
119
+ self.layers.append(nn.Sequential(
120
+ nn.ConvTranspose1d(in_ch, out_ch, kernel_size, stride, padding=kernel_size//4),
121
+ nn.GroupNorm(1, out_ch) if not is_last else nn.Identity(),
122
+ nn.GELU() if not is_last else nn.Identity(),
123
+ ))
124
+
125
+ def forward(self, x, skips: List[torch.Tensor], target_lengths: List[int]):
126
+ for i, layer in enumerate(self.layers):
127
+ x = layer(x)
128
+ if i < len(target_lengths):
129
+ target_t = target_lengths[i]
130
+ if x.shape[2] != target_t:
131
+ x = F.interpolate(x, size=target_t, mode='linear', align_corners=False)
132
+ if i < len(skips):
133
+ skip = skips[i]
134
+ if skip.shape[1] != x.shape[1]:
135
+ skip = skip[:, :x.shape[1]]
136
+ if skip.shape[2] != x.shape[2]:
137
+ skip = F.interpolate(skip, size=x.shape[2], mode='linear', align_corners=False)
138
+ x = x + skip * 0.1
139
+ return x
140
+
141
+
142
+ class AudioTextHTDemucs(nn.Module):
143
+ """
144
+ Text-conditioned source separation.
145
+ - HTDemucs encoder (frozen): extracts multi-scale audio features
146
+ - CLAP (frozen): text embeddings
147
+ - Cross-attention: conditions audio on text at bottleneck
148
+ - Custom decoder (trainable): outputs single source
149
+ """
150
+
151
+ def __init__(
152
+ self,
153
+ htdemucs_model: HTDemucs,
154
+ clap_encoder: ClapModel | ClapTextModelWithProjection,
155
+ clap_tokenizer: RobertaTokenizerFast,
156
+ model_dim: int = 384,
157
+ text_dim: int = 512,
158
+ num_heads: int = 8,
159
+ sample_rate: int = 44100,
160
+ segment: float = 7.8,
161
+ ):
162
+ super().__init__()
163
+
164
+ self.htdemucs = htdemucs_model
165
+ self.clap = clap_encoder
166
+ self.tokenizer = clap_tokenizer
167
+ self.sample_rate = sample_rate
168
+ self.segment = segment
169
+
170
+ # Freeze HTDemucs encoder
171
+ for param in self.htdemucs.parameters():
172
+ param.requires_grad = False
173
+
174
+ # Freeze CLAP
175
+ for param in self.clap.parameters():
176
+ param.requires_grad = False
177
+
178
+ # Text cross-attention at bottleneck
179
+ self.text_attn = TextCrossAttention(model_dim, text_dim, num_heads)
180
+
181
+ # Custom decoders (trainable) - output 1 source with 2 channels (stereo)
182
+ # Channel progression: 384 -> 192 -> 96 -> 48 -> 4 (will be reshaped to 2 channels)
183
+ self.freq_decoder = FreqDecoder([384, 192, 96, 48, 4])
184
+ self.time_decoder = TimeDecoder([384, 192, 96, 48, 4])
185
+
186
+ # Final projection to stereo
187
+ self.freq_out = nn.Conv2d(4, 2, 1)
188
+ self.time_out = nn.Conv1d(4, 2, 1)
189
+
190
+ def _encode(self, x, xt):
191
+ """Run HTDemucs encoder, save skip connections."""
192
+ saved = []
193
+ saved_t = []
194
+ lengths = []
195
+ lengths_t = []
196
+
197
+ for idx, encode in enumerate(self.htdemucs.encoder):
198
+ lengths.append(x.shape[-1])
199
+ inject = None
200
+
201
+ if idx < len(self.htdemucs.tencoder):
202
+ lengths_t.append(xt.shape[-1])
203
+ tenc = self.htdemucs.tencoder[idx]
204
+ xt = tenc(xt)
205
+ if not tenc.empty:
206
+ saved_t.append(xt)
207
+ else:
208
+ inject = xt
209
+
210
+ x = encode(x, inject)
211
+
212
+ if idx == 0 and self.htdemucs.freq_emb is not None:
213
+ frs = torch.arange(x.shape[-2], device=x.device)
214
+ emb = self.htdemucs.freq_emb(frs).t()[None, :, :, None].expand_as(x)
215
+ x = x + self.htdemucs.freq_emb_scale * emb
216
+
217
+ saved.append(x)
218
+
219
+ # Cross-transformer at bottleneck
220
+ if self.htdemucs.crosstransformer:
221
+ if self.htdemucs.bottom_channels:
222
+ b, c, f, t = x.shape
223
+ x = rearrange(x, "b c f t -> b c (f t)")
224
+ x = self.htdemucs.channel_upsampler(x)
225
+ x = rearrange(x, "b c (f t) -> b c f t", f=f)
226
+ xt = self.htdemucs.channel_upsampler_t(xt)
227
+
228
+ x, xt = self.htdemucs.crosstransformer(x, xt)
229
+
230
+ if self.htdemucs.bottom_channels:
231
+ x = rearrange(x, "b c f t -> b c (f t)")
232
+ x = self.htdemucs.channel_downsampler(x)
233
+ x = rearrange(x, "b c (f t) -> b c f t", f=f)
234
+ xt = self.htdemucs.channel_downsampler_t(xt)
235
+
236
+ return x, xt, saved, saved_t, lengths, lengths_t
237
+
238
+ def _get_clap_embeddings(self, text: List[str], device):
239
+ inputs = self.tokenizer(text, padding=True, return_tensors="pt")
240
+ inputs = {k: v.to(device) for k, v in inputs.items()}
241
+ if isinstance(self.clap, ClapModel):
242
+ # Use get_text_features for ClapModel
243
+ with torch.no_grad():
244
+ return self.clap.get_text_features(**inputs)
245
+ else:
246
+ # Use forward pass for ClapTextModelWithProjection
247
+ with torch.no_grad():
248
+ return self.clap.forward(**inputs).text_embeds
249
+
250
+ def forward(self, wav, text):
251
+ """
252
+ wav: (B, 2, T) stereo mixture
253
+ text: List[str] prompts
254
+ Returns: (B, 2, T) separated stereo source
255
+ """
256
+ device = wav.device
257
+ B = wav.shape[0]
258
+ original_length = wav.shape[-1]
259
+
260
+ # Compute spectrogram (ensure all on same device)
261
+ z = self.htdemucs._spec(wav).to(device)
262
+ mag = self.htdemucs._magnitude(z).to(device)
263
+ x = mag
264
+
265
+ B, C, Fq, T_spec = x.shape
266
+
267
+ # Normalize
268
+ mean = x.mean(dim=(1, 2, 3), keepdim=True)
269
+ std = x.std(dim=(1, 2, 3), keepdim=True)
270
+ x = (x - mean) / (1e-5 + std)
271
+
272
+ xt = wav
273
+ meant = xt.mean(dim=(1, 2), keepdim=True)
274
+ stdt = xt.std(dim=(1, 2), keepdim=True)
275
+ xt = (xt - meant) / (1e-5 + stdt)
276
+
277
+ # Encode (frozen)
278
+ with torch.no_grad():
279
+ x_enc, xt_enc, saved, saved_t, lengths, lengths_t = self._encode(x, xt)
280
+
281
+ # Text conditioning via cross-attention (trainable)
282
+ text_emb = self._get_clap_embeddings(text, device)
283
+ x_cond, xt_cond = self.text_attn(x_enc, xt_enc, text_emb)
284
+
285
+ # Decode with custom decoder (trainable)
286
+ # Reverse skips for decoder
287
+ saved_rev = saved[::-1]
288
+ saved_t_rev = saved_t[::-1]
289
+ lengths_rev = lengths[::-1]
290
+ lengths_t_rev = lengths_t[::-1]
291
+
292
+ # Frequency decoder
293
+ x_dec = self.freq_decoder(x_cond, saved_rev, lengths_rev)
294
+ x_dec = self.freq_out(x_dec) # (B, 2, F, T)
295
+
296
+ # Interpolate to match original spectrogram size
297
+ x_dec = F.interpolate(x_dec, size=(Fq, T_spec), mode='bilinear', align_corners=False)
298
+
299
+ # Apply as mask and invert spectrogram
300
+ mask = torch.sigmoid(x_dec) # (B, 2, F, T) in [0, 1]
301
+
302
+ # mag is (B, C, F, T) from htdemucs - take first 2 channels for stereo
303
+ mag_stereo = mag[:, :2, :, :] # (B, 2, F, T)
304
+ masked_spec = mag_stereo * mask
305
+
306
+ # z is complex (B, C, F, T) - take stereo channels
307
+ z_stereo = z[:, :2, :, :] # (B, 2, F, T)
308
+ phase = z_stereo / (mag_stereo + 1e-8) # Complex phase
309
+ masked_z = masked_spec * phase # Apply mask while preserving phase
310
+ freq_wav = self.htdemucs._ispec(masked_z, original_length).to(device)
311
+
312
+ # Time decoder
313
+ xt_dec = self.time_decoder(xt_cond, saved_t_rev, lengths_t_rev)
314
+ xt_dec = self.time_out(xt_dec) # (B, 2, T)
315
+
316
+ # Interpolate to original length
317
+ if xt_dec.shape[-1] != original_length:
318
+ xt_dec = F.interpolate(xt_dec, size=original_length, mode='linear', align_corners=False)
319
+
320
+ # Denormalize time output
321
+ xt_dec = xt_dec * stdt + meant
322
+
323
+ # Combine frequency and time branches
324
+ output = freq_wav + xt_dec
325
+
326
+ return output
327
+
328
+
329
+ if __name__ == "__main__":
330
+ from demucs import pretrained
331
+
332
+ htdemucs = pretrained.get_model('htdemucs').models[0]
333
+ clap = ClapModel.from_pretrained("laion/clap-htsat-unfused")
334
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
335
+
336
+ model = AudioTextHTDemucs(htdemucs, clap, tokenizer)
337
+
338
+ # Count params
339
+ total = sum(p.numel() for p in model.parameters())
340
+ trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
341
+ print(f"Total params: {total:,}")
342
+ print(f"Trainable params: {trainable:,}")
343
+
344
+ # Test forward
345
+ wav = torch.randn(2, 2, 44100 * 3)
346
+ prompts = ["drums", "bass"]
347
+ out = model(wav, prompts)
348
+ print(f"Input: {wav.shape} -> Output: {out.shape}")
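A small standalone shape check for the TextCrossAttention block defined above, using random tensors shaped like an HTDemucs bottleneck (the exact dimensions are assumptions for illustration):

# Hypothetical shape check for TextCrossAttention (defined above).
import torch
from src.models.stem_separation.ATHTDemucs_v2 import TextCrossAttention

attn = TextCrossAttention(feat_dim=384, text_dim=512, n_heads=8)
x = torch.randn(2, 384, 8, 336)   # (B, C, F, T) spectral bottleneck features
xt = torch.randn(2, 384, 336)     # (B, C, T) time-branch bottleneck features
text_emb = torch.randn(2, 512)    # CLAP-style text embeddings, one per batch item

x_out, xt_out = attn(x, xt, text_emb)
print(x_out.shape, xt_out.shape)  # shapes are preserved by the conditioning block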
src/models/stem_separation/AudioTextDemucsV2.txt ADDED
@@ -0,0 +1,237 @@
1
+ Model:
2
+ TextConditionedSeparator(
3
+ (clap): ClapModel(
4
+ (text_model): ClapTextModel(
5
+ (embeddings): ClapTextEmbeddings(
6
+ (word_embeddings): Embedding(50265, 768, padding_idx=1)
7
+ (position_embeddings): Embedding(514, 768, padding_idx=1)
8
+ (token_type_embeddings): Embedding(1, 768)
9
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
10
+ (dropout): Dropout(p=0.1, inplace=False)
11
+ )
12
+ (encoder): ClapTextEncoder(
13
+ (layer): ModuleList(
14
+ (0-11): 12 x ClapTextLayer(
15
+ (attention): ClapTextAttention(
16
+ (self): ClapTextSelfAttention(
17
+ (query): Linear(in_features=768, out_features=768, bias=True)
18
+ (key): Linear(in_features=768, out_features=768, bias=True)
19
+ (value): Linear(in_features=768, out_features=768, bias=True)
20
+ (dropout): Dropout(p=0.1, inplace=False)
21
+ )
22
+ (output): ClapTextSelfOutput(
23
+ (dense): Linear(in_features=768, out_features=768, bias=True)
24
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
25
+ (dropout): Dropout(p=0.1, inplace=False)
26
+ )
27
+ )
28
+ (intermediate): ClapTextIntermediate(
29
+ (dense): Linear(in_features=768, out_features=3072, bias=True)
30
+ (intermediate_act_fn): GELUActivation()
31
+ )
32
+ (output): ClapTextOutput(
33
+ (dense): Linear(in_features=3072, out_features=768, bias=True)
34
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
35
+ (dropout): Dropout(p=0.1, inplace=False)
36
+ )
37
+ )
38
+ )
39
+ )
40
+ (pooler): ClapTextPooler(
41
+ (dense): Linear(in_features=768, out_features=768, bias=True)
42
+ (activation): Tanh()
43
+ )
44
+ )
45
+ (text_projection): ClapProjectionLayer(
46
+ (linear1): Linear(in_features=768, out_features=512, bias=True)
47
+ (activation): ReLU()
48
+ (linear2): Linear(in_features=512, out_features=512, bias=True)
49
+ )
50
+ (audio_model): ClapAudioModel(
51
+ (audio_encoder): ClapAudioEncoder(
52
+ (patch_embed): ClapAudioPatchEmbed(
53
+ (proj): Conv2d(1, 96, kernel_size=(4, 4), stride=(4, 4))
54
+ (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
55
+ )
56
+ (layers): ModuleList(
57
+ (0): ClapAudioStage(
58
+ (blocks): ModuleList(
59
+ (0-1): 2 x ClapAudioLayer(
60
+ (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
61
+ (attention): ClapAudioAttention(
62
+ (self): ClapAudioSelfAttention(
63
+ (query): Linear(in_features=96, out_features=96, bias=True)
64
+ (key): Linear(in_features=96, out_features=96, bias=True)
65
+ (value): Linear(in_features=96, out_features=96, bias=True)
66
+ (dropout): Dropout(p=0.0, inplace=False)
67
+ )
68
+ (output): ClapAudioSelfOutput(
69
+ (dense): Linear(in_features=96, out_features=96, bias=True)
70
+ (dropout): Dropout(p=0.0, inplace=False)
71
+ )
72
+ )
73
+ (drop_path): Identity()
74
+ (layernorm_after): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
75
+ (intermediate): ClapAudioIntermediate(
76
+ (dense): Linear(in_features=96, out_features=384, bias=True)
77
+ (intermediate_act_fn): GELUActivation()
78
+ )
79
+ (output): ClapAudioOutput(
80
+ (dense): Linear(in_features=384, out_features=96, bias=True)
81
+ (dropout): Dropout(p=0.1, inplace=False)
82
+ )
83
+ )
84
+ )
85
+ (downsample): ClapAudioPatchMerging(
86
+ (reduction): Linear(in_features=384, out_features=192, bias=False)
87
+ (norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
88
+ )
89
+ )
90
+ (1): ClapAudioStage(
91
+ (blocks): ModuleList(
92
+ (0-1): 2 x ClapAudioLayer(
93
+ (layernorm_before): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
94
+ (attention): ClapAudioAttention(
95
+ (self): ClapAudioSelfAttention(
96
+ (query): Linear(in_features=192, out_features=192, bias=True)
97
+ (key): Linear(in_features=192, out_features=192, bias=True)
98
+ (value): Linear(in_features=192, out_features=192, bias=True)
99
+ (dropout): Dropout(p=0.0, inplace=False)
100
+ )
101
+ (output): ClapAudioSelfOutput(
102
+ (dense): Linear(in_features=192, out_features=192, bias=True)
103
+ (dropout): Dropout(p=0.0, inplace=False)
104
+ )
105
+ )
106
+ (drop_path): Identity()
107
+ (layernorm_after): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
108
+ (intermediate): ClapAudioIntermediate(
109
+ (dense): Linear(in_features=192, out_features=768, bias=True)
110
+ (intermediate_act_fn): GELUActivation()
111
+ )
112
+ (output): ClapAudioOutput(
113
+ (dense): Linear(in_features=768, out_features=192, bias=True)
114
+ (dropout): Dropout(p=0.1, inplace=False)
115
+ )
116
+ )
117
+ )
118
+ (downsample): ClapAudioPatchMerging(
119
+ (reduction): Linear(in_features=768, out_features=384, bias=False)
120
+ (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
121
+ )
122
+ )
123
+ (2): ClapAudioStage(
124
+ (blocks): ModuleList(
125
+ (0-5): 6 x ClapAudioLayer(
126
+ (layernorm_before): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
127
+ (attention): ClapAudioAttention(
128
+ (self): ClapAudioSelfAttention(
129
+ (query): Linear(in_features=384, out_features=384, bias=True)
130
+ (key): Linear(in_features=384, out_features=384, bias=True)
131
+ (value): Linear(in_features=384, out_features=384, bias=True)
132
+ (dropout): Dropout(p=0.0, inplace=False)
133
+ )
134
+ (output): ClapAudioSelfOutput(
135
+ (dense): Linear(in_features=384, out_features=384, bias=True)
136
+ (dropout): Dropout(p=0.0, inplace=False)
137
+ )
138
+ )
139
+ (drop_path): Identity()
140
+ (layernorm_after): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
141
+ (intermediate): ClapAudioIntermediate(
142
+ (dense): Linear(in_features=384, out_features=1536, bias=True)
143
+ (intermediate_act_fn): GELUActivation()
144
+ )
145
+ (output): ClapAudioOutput(
146
+ (dense): Linear(in_features=1536, out_features=384, bias=True)
147
+ (dropout): Dropout(p=0.1, inplace=False)
148
+ )
149
+ )
150
+ )
151
+ (downsample): ClapAudioPatchMerging(
152
+ (reduction): Linear(in_features=1536, out_features=768, bias=False)
153
+ (norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
154
+ )
155
+ )
156
+ (3): ClapAudioStage(
157
+ (blocks): ModuleList(
158
+ (0-1): 2 x ClapAudioLayer(
159
+ (layernorm_before): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
160
+ (attention): ClapAudioAttention(
161
+ (self): ClapAudioSelfAttention(
162
+ (query): Linear(in_features=768, out_features=768, bias=True)
163
+ (key): Linear(in_features=768, out_features=768, bias=True)
164
+ (value): Linear(in_features=768, out_features=768, bias=True)
165
+ (dropout): Dropout(p=0.0, inplace=False)
166
+ )
167
+ (output): ClapAudioSelfOutput(
168
+ (dense): Linear(in_features=768, out_features=768, bias=True)
169
+ (dropout): Dropout(p=0.0, inplace=False)
170
+ )
171
+ )
172
+ (drop_path): Identity()
173
+ (layernorm_after): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
174
+ (intermediate): ClapAudioIntermediate(
175
+ (dense): Linear(in_features=768, out_features=3072, bias=True)
176
+ (intermediate_act_fn): GELUActivation()
177
+ )
178
+ (output): ClapAudioOutput(
179
+ (dense): Linear(in_features=3072, out_features=768, bias=True)
180
+ (dropout): Dropout(p=0.1, inplace=False)
181
+ )
182
+ )
183
+ )
184
+ )
185
+ )
186
+ (batch_norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
187
+ (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
188
+ (avgpool): AdaptiveAvgPool1d(output_size=1)
189
+ )
190
+ )
191
+ (audio_projection): ClapProjectionLayer(
192
+ (linear1): Linear(in_features=768, out_features=512, bias=True)
193
+ (activation): ReLU()
194
+ (linear2): Linear(in_features=512, out_features=512, bias=True)
195
+ )
196
+ )
197
+ (z_encoder): PatchConv1d(
198
+ (conv): Conv1d(1, 256, kernel_size=(16,), stride=(8,))
199
+ )
200
+ (text_proj): Linear(in_features=512, out_features=256, bias=True)
201
+ (z_proj): Linear(in_features=256, out_features=256, bias=True)
202
+ (cross): CrossAttention(
203
+ (attn): MultiheadAttention(
204
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
205
+ )
206
+ (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
207
+ (ff): MLP(
208
+ (fc1): Linear(in_features=256, out_features=1024, bias=True)
209
+ (fc2): Linear(in_features=1024, out_features=256, bias=True)
210
+ (act): GELU(approximate='none')
211
+ )
212
+ (ln2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
213
+ )
214
+ (transformer): TransformerEncoder(
215
+ (layers): ModuleList(
216
+ (0-5): 6 x TransformerEncoderLayer(
217
+ (self_attn): MultiheadAttention(
218
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
219
+ )
220
+ (linear1): Linear(in_features=256, out_features=1024, bias=True)
221
+ (dropout): Dropout(p=0.1, inplace=False)
222
+ (linear2): Linear(in_features=1024, out_features=256, bias=True)
223
+ (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
224
+ (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
225
+ (dropout1): Dropout(p=0.1, inplace=False)
226
+ (dropout2): Dropout(p=0.1, inplace=False)
227
+ )
228
+ )
229
+ )
230
+ (spec_decoder): Sequential(
231
+ (0): Linear(in_features=256, out_features=256, bias=True)
232
+ (1): GELU(approximate='none')
233
+ (2): Linear(in_features=256, out_features=2049, bias=True)
234
+ )
235
+ )
236
+ output waveform shape: torch.Size([2, 1, 48000])
237
+ output spectrogram shape: torch.Size([2, 12001, 2049])
src/models/stem_separation/AudioTextHTDemucs_Full.txt ADDED
@@ -0,0 +1,889 @@
1
+ Model Summary:
2
+ AudioTextHTDemucs(
3
+ (htdemucs): HTDemucs(
4
+ (encoder): ModuleList(
5
+ (0): HEncLayer(
6
+ (conv): Conv2d(4, 48, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
7
+ (norm1): Identity()
8
+ (rewrite): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1))
9
+ (norm2): Identity()
10
+ (dconv): DConv(
11
+ (layers): ModuleList(
12
+ (0): Sequential(
13
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(1,))
14
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
15
+ (2): GELU(approximate='none')
16
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
17
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
18
+ (5): GLU(dim=1)
19
+ (6): LayerScale()
20
+ )
21
+ (1): Sequential(
22
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
23
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
24
+ (2): GELU(approximate='none')
25
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
26
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
27
+ (5): GLU(dim=1)
28
+ (6): LayerScale()
29
+ )
30
+ )
31
+ )
32
+ )
33
+ (1): HEncLayer(
34
+ (conv): Conv2d(48, 96, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
35
+ (norm1): Identity()
36
+ (rewrite): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1))
37
+ (norm2): Identity()
38
+ (dconv): DConv(
39
+ (layers): ModuleList(
40
+ (0): Sequential(
41
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(1,))
42
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
43
+ (2): GELU(approximate='none')
44
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
45
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
46
+ (5): GLU(dim=1)
47
+ (6): LayerScale()
48
+ )
49
+ (1): Sequential(
50
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
51
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
52
+ (2): GELU(approximate='none')
53
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
54
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
55
+ (5): GLU(dim=1)
56
+ (6): LayerScale()
57
+ )
58
+ )
59
+ )
60
+ )
61
+ (2): HEncLayer(
62
+ (conv): Conv2d(96, 192, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
63
+ (norm1): Identity()
64
+ (rewrite): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1))
65
+ (norm2): Identity()
66
+ (dconv): DConv(
67
+ (layers): ModuleList(
68
+ (0): Sequential(
69
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(1,))
70
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
71
+ (2): GELU(approximate='none')
72
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
73
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
74
+ (5): GLU(dim=1)
75
+ (6): LayerScale()
76
+ )
77
+ (1): Sequential(
78
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
79
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
80
+ (2): GELU(approximate='none')
81
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
82
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
83
+ (5): GLU(dim=1)
84
+ (6): LayerScale()
85
+ )
86
+ )
87
+ )
88
+ )
89
+ (3): HEncLayer(
90
+ (conv): Conv2d(192, 384, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
91
+ (norm1): Identity()
92
+ (rewrite): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1))
93
+ (norm2): Identity()
94
+ (dconv): DConv(
95
+ (layers): ModuleList(
96
+ (0): Sequential(
97
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(1,))
98
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
99
+ (2): GELU(approximate='none')
100
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
101
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
102
+ (5): GLU(dim=1)
103
+ (6): LayerScale()
104
+ )
105
+ (1): Sequential(
106
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
107
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
108
+ (2): GELU(approximate='none')
109
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
110
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
111
+ (5): GLU(dim=1)
112
+ (6): LayerScale()
113
+ )
114
+ )
115
+ )
116
+ )
117
+ )
118
+ (decoder): ModuleList(
119
+ (0): HDecLayer(
120
+ (conv_tr): ConvTranspose2d(384, 192, kernel_size=(8, 1), stride=(4, 1))
121
+ (norm2): Identity()
122
+ (rewrite): Conv2d(384, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
123
+ (norm1): Identity()
124
+ (dconv): DConv(
125
+ (layers): ModuleList(
126
+ (0): Sequential(
127
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(1,))
128
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
129
+ (2): GELU(approximate='none')
130
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
131
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
132
+ (5): GLU(dim=1)
133
+ (6): LayerScale()
134
+ )
135
+ (1): Sequential(
136
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
137
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
138
+ (2): GELU(approximate='none')
139
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
140
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
141
+ (5): GLU(dim=1)
142
+ (6): LayerScale()
143
+ )
144
+ )
145
+ )
146
+ )
147
+ (1): HDecLayer(
148
+ (conv_tr): ConvTranspose2d(192, 96, kernel_size=(8, 1), stride=(4, 1))
149
+ (norm2): Identity()
150
+ (rewrite): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
151
+ (norm1): Identity()
152
+ (dconv): DConv(
153
+ (layers): ModuleList(
154
+ (0): Sequential(
155
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(1,))
156
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
157
+ (2): GELU(approximate='none')
158
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
159
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
160
+ (5): GLU(dim=1)
161
+ (6): LayerScale()
162
+ )
163
+ (1): Sequential(
164
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
165
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
166
+ (2): GELU(approximate='none')
167
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
168
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
169
+ (5): GLU(dim=1)
170
+ (6): LayerScale()
171
+ )
172
+ )
173
+ )
174
+ )
175
+ (2): HDecLayer(
176
+ (conv_tr): ConvTranspose2d(96, 48, kernel_size=(8, 1), stride=(4, 1))
177
+ (norm2): Identity()
178
+ (rewrite): Conv2d(96, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
179
+ (norm1): Identity()
180
+ (dconv): DConv(
181
+ (layers): ModuleList(
182
+ (0): Sequential(
183
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(1,))
184
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
185
+ (2): GELU(approximate='none')
186
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
187
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
188
+ (5): GLU(dim=1)
189
+ (6): LayerScale()
190
+ )
191
+ (1): Sequential(
192
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
193
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
194
+ (2): GELU(approximate='none')
195
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
196
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
197
+ (5): GLU(dim=1)
198
+ (6): LayerScale()
199
+ )
200
+ )
201
+ )
202
+ )
203
+ (3): HDecLayer(
204
+ (conv_tr): ConvTranspose2d(48, 16, kernel_size=(8, 1), stride=(4, 1))
205
+ (norm2): Identity()
206
+ (rewrite): Conv2d(48, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
207
+ (norm1): Identity()
208
+ (dconv): DConv(
209
+ (layers): ModuleList(
210
+ (0): Sequential(
211
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(1,))
212
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
213
+ (2): GELU(approximate='none')
214
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
215
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
216
+ (5): GLU(dim=1)
217
+ (6): LayerScale()
218
+ )
219
+ (1): Sequential(
220
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
221
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
222
+ (2): GELU(approximate='none')
223
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
224
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
225
+ (5): GLU(dim=1)
226
+ (6): LayerScale()
227
+ )
228
+ )
229
+ )
230
+ )
231
+ )
232
+ (tencoder): ModuleList(
233
+ (0): HEncLayer(
234
+ (conv): Conv1d(2, 48, kernel_size=(8,), stride=(4,), padding=(2,))
235
+ (norm1): Identity()
236
+ (rewrite): Conv1d(48, 96, kernel_size=(1,), stride=(1,))
237
+ (norm2): Identity()
238
+ (dconv): DConv(
239
+ (layers): ModuleList(
240
+ (0): Sequential(
241
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(1,))
242
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
243
+ (2): GELU(approximate='none')
244
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
245
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
246
+ (5): GLU(dim=1)
247
+ (6): LayerScale()
248
+ )
249
+ (1): Sequential(
250
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
251
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
252
+ (2): GELU(approximate='none')
253
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
254
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
255
+ (5): GLU(dim=1)
256
+ (6): LayerScale()
257
+ )
258
+ )
259
+ )
260
+ )
261
+ (1): HEncLayer(
262
+ (conv): Conv1d(48, 96, kernel_size=(8,), stride=(4,), padding=(2,))
263
+ (norm1): Identity()
264
+ (rewrite): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
265
+ (norm2): Identity()
266
+ (dconv): DConv(
267
+ (layers): ModuleList(
268
+ (0): Sequential(
269
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(1,))
270
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
271
+ (2): GELU(approximate='none')
272
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
273
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
274
+ (5): GLU(dim=1)
275
+ (6): LayerScale()
276
+ )
277
+ (1): Sequential(
278
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
279
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
280
+ (2): GELU(approximate='none')
281
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
282
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
283
+ (5): GLU(dim=1)
284
+ (6): LayerScale()
285
+ )
286
+ )
287
+ )
288
+ )
289
+ (2): HEncLayer(
290
+ (conv): Conv1d(96, 192, kernel_size=(8,), stride=(4,), padding=(2,))
291
+ (norm1): Identity()
292
+ (rewrite): Conv1d(192, 384, kernel_size=(1,), stride=(1,))
293
+ (norm2): Identity()
294
+ (dconv): DConv(
295
+ (layers): ModuleList(
296
+ (0): Sequential(
297
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(1,))
298
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
299
+ (2): GELU(approximate='none')
300
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
301
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
302
+ (5): GLU(dim=1)
303
+ (6): LayerScale()
304
+ )
305
+ (1): Sequential(
306
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
307
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
308
+ (2): GELU(approximate='none')
309
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
310
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
311
+ (5): GLU(dim=1)
312
+ (6): LayerScale()
313
+ )
314
+ )
315
+ )
316
+ )
317
+ (3): HEncLayer(
318
+ (conv): Conv1d(192, 384, kernel_size=(8,), stride=(4,), padding=(2,))
319
+ (norm1): Identity()
320
+ (rewrite): Conv1d(384, 768, kernel_size=(1,), stride=(1,))
321
+ (norm2): Identity()
322
+ (dconv): DConv(
323
+ (layers): ModuleList(
324
+ (0): Sequential(
325
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(1,))
326
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
327
+ (2): GELU(approximate='none')
328
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
329
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
330
+ (5): GLU(dim=1)
331
+ (6): LayerScale()
332
+ )
333
+ (1): Sequential(
334
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
335
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
336
+ (2): GELU(approximate='none')
337
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
338
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
339
+ (5): GLU(dim=1)
340
+ (6): LayerScale()
341
+ )
342
+ )
343
+ )
344
+ )
345
+ )
346
+ (tdecoder): ModuleList(
347
+ (0): HDecLayer(
348
+ (conv_tr): ConvTranspose1d(384, 192, kernel_size=(8,), stride=(4,))
349
+ (norm2): Identity()
350
+ (rewrite): Conv1d(384, 768, kernel_size=(3,), stride=(1,), padding=(1,))
351
+ (norm1): Identity()
352
+ (dconv): DConv(
353
+ (layers): ModuleList(
354
+ (0): Sequential(
355
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(1,))
356
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
357
+ (2): GELU(approximate='none')
358
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
359
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
360
+ (5): GLU(dim=1)
361
+ (6): LayerScale()
362
+ )
363
+ (1): Sequential(
364
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
365
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
366
+ (2): GELU(approximate='none')
367
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
368
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
369
+ (5): GLU(dim=1)
370
+ (6): LayerScale()
371
+ )
372
+ )
373
+ )
374
+ )
375
+ (1): HDecLayer(
376
+ (conv_tr): ConvTranspose1d(192, 96, kernel_size=(8,), stride=(4,))
377
+ (norm2): Identity()
378
+ (rewrite): Conv1d(192, 384, kernel_size=(3,), stride=(1,), padding=(1,))
379
+ (norm1): Identity()
380
+ (dconv): DConv(
381
+ (layers): ModuleList(
382
+ (0): Sequential(
383
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(1,))
384
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
385
+ (2): GELU(approximate='none')
386
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
387
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
388
+ (5): GLU(dim=1)
389
+ (6): LayerScale()
390
+ )
391
+ (1): Sequential(
392
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
393
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
394
+ (2): GELU(approximate='none')
395
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
396
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
397
+ (5): GLU(dim=1)
398
+ (6): LayerScale()
399
+ )
400
+ )
401
+ )
402
+ )
403
+ (2): HDecLayer(
404
+ (conv_tr): ConvTranspose1d(96, 48, kernel_size=(8,), stride=(4,))
405
+ (norm2): Identity()
406
+ (rewrite): Conv1d(96, 192, kernel_size=(3,), stride=(1,), padding=(1,))
407
+ (norm1): Identity()
408
+ (dconv): DConv(
409
+ (layers): ModuleList(
410
+ (0): Sequential(
411
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(1,))
412
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
413
+ (2): GELU(approximate='none')
414
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
415
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
416
+ (5): GLU(dim=1)
417
+ (6): LayerScale()
418
+ )
419
+ (1): Sequential(
420
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
421
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
422
+ (2): GELU(approximate='none')
423
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
424
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
425
+ (5): GLU(dim=1)
426
+ (6): LayerScale()
427
+ )
428
+ )
429
+ )
430
+ )
431
+ (3): HDecLayer(
432
+ (conv_tr): ConvTranspose1d(48, 8, kernel_size=(8,), stride=(4,))
433
+ (norm2): Identity()
434
+ (rewrite): Conv1d(48, 96, kernel_size=(3,), stride=(1,), padding=(1,))
435
+ (norm1): Identity()
436
+ (dconv): DConv(
437
+ (layers): ModuleList(
438
+ (0): Sequential(
439
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(1,))
440
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
441
+ (2): GELU(approximate='none')
442
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
443
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
444
+ (5): GLU(dim=1)
445
+ (6): LayerScale()
446
+ )
447
+ (1): Sequential(
448
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
449
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
450
+ (2): GELU(approximate='none')
451
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
452
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
453
+ (5): GLU(dim=1)
454
+ (6): LayerScale()
455
+ )
456
+ )
457
+ )
458
+ )
459
+ )
460
+ (freq_emb): ScaledEmbedding(
461
+ (embedding): Embedding(512, 48)
462
+ )
463
+ (channel_upsampler): Conv1d(384, 512, kernel_size=(1,), stride=(1,))
464
+ (channel_downsampler): Conv1d(512, 384, kernel_size=(1,), stride=(1,))
465
+ (channel_upsampler_t): Conv1d(384, 512, kernel_size=(1,), stride=(1,))
466
+ (channel_downsampler_t): Conv1d(512, 384, kernel_size=(1,), stride=(1,))
467
+ (crosstransformer): CrossTransformerEncoder(
468
+ (norm_in): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
469
+ (norm_in_t): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
470
+ (layers): ModuleList(
471
+ (0): MyTransformerEncoderLayer(
472
+ (self_attn): MultiheadAttention(
473
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
474
+ )
475
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
476
+ (dropout): Dropout(p=0.02, inplace=False)
477
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
478
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
479
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
480
+ (dropout1): Dropout(p=0.02, inplace=False)
481
+ (dropout2): Dropout(p=0.02, inplace=False)
482
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
483
+ (gamma_1): LayerScale()
484
+ (gamma_2): LayerScale()
485
+ )
486
+ (1): CrossTransformerEncoderLayer(
487
+ (cross_attn): MultiheadAttention(
488
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
489
+ )
490
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
491
+ (dropout): Dropout(p=0.02, inplace=False)
492
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
493
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
494
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
495
+ (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
496
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
497
+ (gamma_1): LayerScale()
498
+ (gamma_2): LayerScale()
499
+ (dropout1): Dropout(p=0.02, inplace=False)
500
+ (dropout2): Dropout(p=0.02, inplace=False)
501
+ )
502
+ (2): MyTransformerEncoderLayer(
503
+ (self_attn): MultiheadAttention(
504
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
505
+ )
506
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
507
+ (dropout): Dropout(p=0.02, inplace=False)
508
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
509
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
510
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
511
+ (dropout1): Dropout(p=0.02, inplace=False)
512
+ (dropout2): Dropout(p=0.02, inplace=False)
513
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
514
+ (gamma_1): LayerScale()
515
+ (gamma_2): LayerScale()
516
+ )
517
+ (3): CrossTransformerEncoderLayer(
518
+ (cross_attn): MultiheadAttention(
519
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
520
+ )
521
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
522
+ (dropout): Dropout(p=0.02, inplace=False)
523
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
524
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
525
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
526
+ (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
527
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
528
+ (gamma_1): LayerScale()
529
+ (gamma_2): LayerScale()
530
+ (dropout1): Dropout(p=0.02, inplace=False)
531
+ (dropout2): Dropout(p=0.02, inplace=False)
532
+ )
533
+ (4): MyTransformerEncoderLayer(
534
+ (self_attn): MultiheadAttention(
535
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
536
+ )
537
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
538
+ (dropout): Dropout(p=0.02, inplace=False)
539
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
540
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
541
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
542
+ (dropout1): Dropout(p=0.02, inplace=False)
543
+ (dropout2): Dropout(p=0.02, inplace=False)
544
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
545
+ (gamma_1): LayerScale()
546
+ (gamma_2): LayerScale()
547
+ )
548
+ )
549
+ (layers_t): ModuleList(
550
+ (0): MyTransformerEncoderLayer(
551
+ (self_attn): MultiheadAttention(
552
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
553
+ )
554
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
555
+ (dropout): Dropout(p=0.02, inplace=False)
556
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
557
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
558
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
559
+ (dropout1): Dropout(p=0.02, inplace=False)
560
+ (dropout2): Dropout(p=0.02, inplace=False)
561
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
562
+ (gamma_1): LayerScale()
563
+ (gamma_2): LayerScale()
564
+ )
565
+ (1): CrossTransformerEncoderLayer(
566
+ (cross_attn): MultiheadAttention(
567
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
568
+ )
569
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
570
+ (dropout): Dropout(p=0.02, inplace=False)
571
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
572
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
573
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
574
+ (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
575
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
576
+ (gamma_1): LayerScale()
577
+ (gamma_2): LayerScale()
578
+ (dropout1): Dropout(p=0.02, inplace=False)
579
+ (dropout2): Dropout(p=0.02, inplace=False)
580
+ )
581
+ (2): MyTransformerEncoderLayer(
582
+ (self_attn): MultiheadAttention(
583
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
584
+ )
585
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
586
+ (dropout): Dropout(p=0.02, inplace=False)
587
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
588
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
589
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
590
+ (dropout1): Dropout(p=0.02, inplace=False)
591
+ (dropout2): Dropout(p=0.02, inplace=False)
592
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
593
+ (gamma_1): LayerScale()
594
+ (gamma_2): LayerScale()
595
+ )
596
+ (3): CrossTransformerEncoderLayer(
597
+ (cross_attn): MultiheadAttention(
598
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
599
+ )
600
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
601
+ (dropout): Dropout(p=0.02, inplace=False)
602
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
603
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
604
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
605
+ (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
606
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
607
+ (gamma_1): LayerScale()
608
+ (gamma_2): LayerScale()
609
+ (dropout1): Dropout(p=0.02, inplace=False)
610
+ (dropout2): Dropout(p=0.02, inplace=False)
611
+ )
612
+ (4): MyTransformerEncoderLayer(
613
+ (self_attn): MultiheadAttention(
614
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
615
+ )
616
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
617
+ (dropout): Dropout(p=0.02, inplace=False)
618
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
619
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
620
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
621
+ (dropout1): Dropout(p=0.02, inplace=False)
622
+ (dropout2): Dropout(p=0.02, inplace=False)
623
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
624
+ (gamma_1): LayerScale()
625
+ (gamma_2): LayerScale()
626
+ )
627
+ )
628
+ )
629
+ )
630
+ (clap): ClapModel(
631
+ (text_model): ClapTextModel(
632
+ (embeddings): ClapTextEmbeddings(
633
+ (word_embeddings): Embedding(50265, 768, padding_idx=1)
634
+ (position_embeddings): Embedding(514, 768, padding_idx=1)
635
+ (token_type_embeddings): Embedding(1, 768)
636
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
637
+ (dropout): Dropout(p=0.1, inplace=False)
638
+ )
639
+ (encoder): ClapTextEncoder(
640
+ (layer): ModuleList(
641
+ (0-11): 12 x ClapTextLayer(
642
+ (attention): ClapTextAttention(
643
+ (self): ClapTextSelfAttention(
644
+ (query): Linear(in_features=768, out_features=768, bias=True)
645
+ (key): Linear(in_features=768, out_features=768, bias=True)
646
+ (value): Linear(in_features=768, out_features=768, bias=True)
647
+ (dropout): Dropout(p=0.1, inplace=False)
648
+ )
649
+ (output): ClapTextSelfOutput(
650
+ (dense): Linear(in_features=768, out_features=768, bias=True)
651
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
652
+ (dropout): Dropout(p=0.1, inplace=False)
653
+ )
654
+ )
655
+ (intermediate): ClapTextIntermediate(
656
+ (dense): Linear(in_features=768, out_features=3072, bias=True)
657
+ (intermediate_act_fn): GELUActivation()
658
+ )
659
+ (output): ClapTextOutput(
660
+ (dense): Linear(in_features=3072, out_features=768, bias=True)
661
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
662
+ (dropout): Dropout(p=0.1, inplace=False)
663
+ )
664
+ )
665
+ )
666
+ )
667
+ (pooler): ClapTextPooler(
668
+ (dense): Linear(in_features=768, out_features=768, bias=True)
669
+ (activation): Tanh()
670
+ )
671
+ )
672
+ (text_projection): ClapProjectionLayer(
673
+ (linear1): Linear(in_features=768, out_features=512, bias=True)
674
+ (activation): ReLU()
675
+ (linear2): Linear(in_features=512, out_features=512, bias=True)
676
+ )
677
+ (audio_model): ClapAudioModel(
678
+ (audio_encoder): ClapAudioEncoder(
679
+ (patch_embed): ClapAudioPatchEmbed(
680
+ (proj): Conv2d(1, 96, kernel_size=(4, 4), stride=(4, 4))
681
+ (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
682
+ )
683
+ (layers): ModuleList(
684
+ (0): ClapAudioStage(
685
+ (blocks): ModuleList(
686
+ (0-1): 2 x ClapAudioLayer(
687
+ (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
688
+ (attention): ClapAudioAttention(
689
+ (self): ClapAudioSelfAttention(
690
+ (query): Linear(in_features=96, out_features=96, bias=True)
691
+ (key): Linear(in_features=96, out_features=96, bias=True)
692
+ (value): Linear(in_features=96, out_features=96, bias=True)
693
+ (dropout): Dropout(p=0.0, inplace=False)
694
+ )
695
+ (output): ClapAudioSelfOutput(
696
+ (dense): Linear(in_features=96, out_features=96, bias=True)
697
+ (dropout): Dropout(p=0.0, inplace=False)
698
+ )
699
+ )
700
+ (drop_path): Identity()
701
+ (layernorm_after): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
702
+ (intermediate): ClapAudioIntermediate(
703
+ (dense): Linear(in_features=96, out_features=384, bias=True)
704
+ (intermediate_act_fn): GELUActivation()
705
+ )
706
+ (output): ClapAudioOutput(
707
+ (dense): Linear(in_features=384, out_features=96, bias=True)
708
+ (dropout): Dropout(p=0.1, inplace=False)
709
+ )
710
+ )
711
+ )
712
+ (downsample): ClapAudioPatchMerging(
713
+ (reduction): Linear(in_features=384, out_features=192, bias=False)
714
+ (norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
715
+ )
716
+ )
717
+ (1): ClapAudioStage(
718
+ (blocks): ModuleList(
719
+ (0-1): 2 x ClapAudioLayer(
720
+ (layernorm_before): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
721
+ (attention): ClapAudioAttention(
722
+ (self): ClapAudioSelfAttention(
723
+ (query): Linear(in_features=192, out_features=192, bias=True)
724
+ (key): Linear(in_features=192, out_features=192, bias=True)
725
+ (value): Linear(in_features=192, out_features=192, bias=True)
726
+ (dropout): Dropout(p=0.0, inplace=False)
727
+ )
728
+ (output): ClapAudioSelfOutput(
729
+ (dense): Linear(in_features=192, out_features=192, bias=True)
730
+ (dropout): Dropout(p=0.0, inplace=False)
731
+ )
732
+ )
733
+ (drop_path): Identity()
734
+ (layernorm_after): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
735
+ (intermediate): ClapAudioIntermediate(
736
+ (dense): Linear(in_features=192, out_features=768, bias=True)
737
+ (intermediate_act_fn): GELUActivation()
738
+ )
739
+ (output): ClapAudioOutput(
740
+ (dense): Linear(in_features=768, out_features=192, bias=True)
741
+ (dropout): Dropout(p=0.1, inplace=False)
742
+ )
743
+ )
744
+ )
745
+ (downsample): ClapAudioPatchMerging(
746
+ (reduction): Linear(in_features=768, out_features=384, bias=False)
747
+ (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
748
+ )
749
+ )
750
+ (2): ClapAudioStage(
751
+ (blocks): ModuleList(
752
+ (0-5): 6 x ClapAudioLayer(
753
+ (layernorm_before): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
754
+ (attention): ClapAudioAttention(
755
+ (self): ClapAudioSelfAttention(
756
+ (query): Linear(in_features=384, out_features=384, bias=True)
757
+ (key): Linear(in_features=384, out_features=384, bias=True)
758
+ (value): Linear(in_features=384, out_features=384, bias=True)
759
+ (dropout): Dropout(p=0.0, inplace=False)
760
+ )
761
+ (output): ClapAudioSelfOutput(
762
+ (dense): Linear(in_features=384, out_features=384, bias=True)
763
+ (dropout): Dropout(p=0.0, inplace=False)
764
+ )
765
+ )
766
+ (drop_path): Identity()
767
+ (layernorm_after): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
768
+ (intermediate): ClapAudioIntermediate(
769
+ (dense): Linear(in_features=384, out_features=1536, bias=True)
770
+ (intermediate_act_fn): GELUActivation()
771
+ )
772
+ (output): ClapAudioOutput(
773
+ (dense): Linear(in_features=1536, out_features=384, bias=True)
774
+ (dropout): Dropout(p=0.1, inplace=False)
775
+ )
776
+ )
777
+ )
778
+ (downsample): ClapAudioPatchMerging(
779
+ (reduction): Linear(in_features=1536, out_features=768, bias=False)
780
+ (norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
781
+ )
782
+ )
783
+ (3): ClapAudioStage(
784
+ (blocks): ModuleList(
785
+ (0-1): 2 x ClapAudioLayer(
786
+ (layernorm_before): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
787
+ (attention): ClapAudioAttention(
788
+ (self): ClapAudioSelfAttention(
789
+ (query): Linear(in_features=768, out_features=768, bias=True)
790
+ (key): Linear(in_features=768, out_features=768, bias=True)
791
+ (value): Linear(in_features=768, out_features=768, bias=True)
792
+ (dropout): Dropout(p=0.0, inplace=False)
793
+ )
794
+ (output): ClapAudioSelfOutput(
795
+ (dense): Linear(in_features=768, out_features=768, bias=True)
796
+ (dropout): Dropout(p=0.0, inplace=False)
797
+ )
798
+ )
799
+ (drop_path): Identity()
800
+ (layernorm_after): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
801
+ (intermediate): ClapAudioIntermediate(
802
+ (dense): Linear(in_features=768, out_features=3072, bias=True)
803
+ (intermediate_act_fn): GELUActivation()
804
+ )
805
+ (output): ClapAudioOutput(
806
+ (dense): Linear(in_features=3072, out_features=768, bias=True)
807
+ (dropout): Dropout(p=0.1, inplace=False)
808
+ )
809
+ )
810
+ )
811
+ )
812
+ )
813
+ (batch_norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
814
+ (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
815
+ (avgpool): AdaptiveAvgPool1d(output_size=1)
816
+ )
817
+ )
818
+ (audio_projection): ClapProjectionLayer(
819
+ (linear1): Linear(in_features=768, out_features=512, bias=True)
820
+ (activation): ReLU()
821
+ (linear2): Linear(in_features=512, out_features=512, bias=True)
822
+ )
823
+ )
824
+ (text_attn): TextCrossAttention(
825
+ (q_proj): Linear(in_features=384, out_features=384, bias=True)
826
+ (k_proj): Linear(in_features=512, out_features=384, bias=True)
827
+ (v_proj): Linear(in_features=512, out_features=384, bias=True)
828
+ (attn): MultiheadAttention(
829
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True)
830
+ )
831
+ (out_mlp): Sequential(
832
+ (0): Linear(in_features=384, out_features=384, bias=True)
833
+ (1): GELU(approximate='none')
834
+ (2): Linear(in_features=384, out_features=384, bias=True)
835
+ )
836
+ (norm_q): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
837
+ (norm_out): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
838
+ )
839
+ (freq_decoder): FreqDecoder(
840
+ (layers): ModuleList(
841
+ (0): Sequential(
842
+ (0): ConvTranspose2d(384, 192, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
843
+ (1): GroupNorm(1, 192, eps=1e-05, affine=True)
844
+ (2): GELU(approximate='none')
845
+ )
846
+ (1): Sequential(
847
+ (0): ConvTranspose2d(192, 96, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
848
+ (1): GroupNorm(1, 96, eps=1e-05, affine=True)
849
+ (2): GELU(approximate='none')
850
+ )
851
+ (2): Sequential(
852
+ (0): ConvTranspose2d(96, 48, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
853
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
854
+ (2): GELU(approximate='none')
855
+ )
856
+ (3): Sequential(
857
+ (0): ConvTranspose2d(48, 4, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
858
+ (1): Identity()
859
+ (2): Identity()
860
+ )
861
+ )
862
+ )
863
+ (time_decoder): TimeDecoder(
864
+ (layers): ModuleList(
865
+ (0): Sequential(
866
+ (0): ConvTranspose1d(384, 192, kernel_size=(8,), stride=(4,), padding=(2,))
867
+ (1): GroupNorm(1, 192, eps=1e-05, affine=True)
868
+ (2): GELU(approximate='none')
869
+ )
870
+ (1): Sequential(
871
+ (0): ConvTranspose1d(192, 96, kernel_size=(8,), stride=(4,), padding=(2,))
872
+ (1): GroupNorm(1, 96, eps=1e-05, affine=True)
873
+ (2): GELU(approximate='none')
874
+ )
875
+ (2): Sequential(
876
+ (0): ConvTranspose1d(96, 48, kernel_size=(8,), stride=(4,), padding=(2,))
877
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
878
+ (2): GELU(approximate='none')
879
+ )
880
+ (3): Sequential(
881
+ (0): ConvTranspose1d(48, 4, kernel_size=(8,), stride=(4,), padding=(2,))
882
+ (1): Identity()
883
+ (2): Identity()
884
+ )
885
+ )
886
+ )
887
+ (freq_out): Conv2d(4, 2, kernel_size=(1, 1), stride=(1, 1))
888
+ (time_out): Conv1d(4, 2, kernel_size=(1,), stride=(1,))
889
+ )
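The text_attn block printed above is where the text conditioning enters the separator: queries come from the 384-dim HTDemucs bottleneck features, while keys and values are projected down from the 512-dim CLAP text embedding. Below is a minimal sketch with the same printed shapes (q_proj 384->384, k_proj/v_proj 512->384, out_mlp 384->384->384); the head count, batch_first layout, and residual wiring are assumptions for illustration, not taken from the repository code.

# Hedged sketch of a cross-attention block matching the shapes of `text_attn` above.
# Head count, residual connections, and batch_first choice are assumptions.
import torch
import torch.nn as nn

class TextCrossAttentionSketch(nn.Module):
    def __init__(self, feat_dim=384, text_dim=512, num_heads=8):
        super().__init__()
        self.norm_q = nn.LayerNorm(feat_dim)
        self.q_proj = nn.Linear(feat_dim, feat_dim)
        self.k_proj = nn.Linear(text_dim, feat_dim)
        self.v_proj = nn.Linear(text_dim, feat_dim)
        self.attn = nn.MultiheadAttention(feat_dim, num_heads, batch_first=True)
        self.out_mlp = nn.Sequential(
            nn.Linear(feat_dim, feat_dim), nn.GELU(), nn.Linear(feat_dim, feat_dim)
        )
        self.norm_out = nn.LayerNorm(feat_dim)

    def forward(self, feats, text_emb):
        # feats: (B, T, 384) bottleneck features; text_emb: (B, 1, 512) CLAP text embedding
        q = self.q_proj(self.norm_q(feats))
        k, v = self.k_proj(text_emb), self.v_proj(text_emb)
        attended, _ = self.attn(q, k, v)
        return self.norm_out(feats + self.out_mlp(attended))

# usage: TextCrossAttentionSketch()(torch.randn(2, 336, 384), torch.randn(2, 1, 512))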
src/models/stem_separation/AudioTextHTDemucs_Text_Only.txt ADDED
@@ -0,0 +1,745 @@
1
+ Loading pretrained HTDemucs...
2
+ Loading CLAP model...
3
+ Model Summary:
4
+ AudioTextHTDemucs(
5
+ (htdemucs): HTDemucs(
6
+ (encoder): ModuleList(
7
+ (0): HEncLayer(
8
+ (conv): Conv2d(4, 48, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
9
+ (norm1): Identity()
10
+ (rewrite): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1))
11
+ (norm2): Identity()
12
+ (dconv): DConv(
13
+ (layers): ModuleList(
14
+ (0): Sequential(
15
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(1,))
16
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
17
+ (2): GELU(approximate='none')
18
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
19
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
20
+ (5): GLU(dim=1)
21
+ (6): LayerScale()
22
+ )
23
+ (1): Sequential(
24
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
25
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
26
+ (2): GELU(approximate='none')
27
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
28
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
29
+ (5): GLU(dim=1)
30
+ (6): LayerScale()
31
+ )
32
+ )
33
+ )
34
+ )
35
+ (1): HEncLayer(
36
+ (conv): Conv2d(48, 96, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
37
+ (norm1): Identity()
38
+ (rewrite): Conv2d(96, 192, kernel_size=(1, 1), stride=(1, 1))
39
+ (norm2): Identity()
40
+ (dconv): DConv(
41
+ (layers): ModuleList(
42
+ (0): Sequential(
43
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(1,))
44
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
45
+ (2): GELU(approximate='none')
46
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
47
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
48
+ (5): GLU(dim=1)
49
+ (6): LayerScale()
50
+ )
51
+ (1): Sequential(
52
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
53
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
54
+ (2): GELU(approximate='none')
55
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
56
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
57
+ (5): GLU(dim=1)
58
+ (6): LayerScale()
59
+ )
60
+ )
61
+ )
62
+ )
63
+ (2): HEncLayer(
64
+ (conv): Conv2d(96, 192, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
65
+ (norm1): Identity()
66
+ (rewrite): Conv2d(192, 384, kernel_size=(1, 1), stride=(1, 1))
67
+ (norm2): Identity()
68
+ (dconv): DConv(
69
+ (layers): ModuleList(
70
+ (0): Sequential(
71
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(1,))
72
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
73
+ (2): GELU(approximate='none')
74
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
75
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
76
+ (5): GLU(dim=1)
77
+ (6): LayerScale()
78
+ )
79
+ (1): Sequential(
80
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
81
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
82
+ (2): GELU(approximate='none')
83
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
84
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
85
+ (5): GLU(dim=1)
86
+ (6): LayerScale()
87
+ )
88
+ )
89
+ )
90
+ )
91
+ (3): HEncLayer(
92
+ (conv): Conv2d(192, 384, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
93
+ (norm1): Identity()
94
+ (rewrite): Conv2d(384, 768, kernel_size=(1, 1), stride=(1, 1))
95
+ (norm2): Identity()
96
+ (dconv): DConv(
97
+ (layers): ModuleList(
98
+ (0): Sequential(
99
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(1,))
100
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
101
+ (2): GELU(approximate='none')
102
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
103
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
104
+ (5): GLU(dim=1)
105
+ (6): LayerScale()
106
+ )
107
+ (1): Sequential(
108
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
109
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
110
+ (2): GELU(approximate='none')
111
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
112
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
113
+ (5): GLU(dim=1)
114
+ (6): LayerScale()
115
+ )
116
+ )
117
+ )
118
+ )
119
+ )
120
+ (decoder): ModuleList(
121
+ (0): HDecLayer(
122
+ (conv_tr): ConvTranspose2d(384, 192, kernel_size=(8, 1), stride=(4, 1))
123
+ (norm2): Identity()
124
+ (rewrite): Conv2d(384, 768, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
125
+ (norm1): Identity()
126
+ (dconv): DConv(
127
+ (layers): ModuleList(
128
+ (0): Sequential(
129
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(1,))
130
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
131
+ (2): GELU(approximate='none')
132
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
133
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
134
+ (5): GLU(dim=1)
135
+ (6): LayerScale()
136
+ )
137
+ (1): Sequential(
138
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
139
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
140
+ (2): GELU(approximate='none')
141
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
142
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
143
+ (5): GLU(dim=1)
144
+ (6): LayerScale()
145
+ )
146
+ )
147
+ )
148
+ )
149
+ (1): HDecLayer(
150
+ (conv_tr): ConvTranspose2d(192, 96, kernel_size=(8, 1), stride=(4, 1))
151
+ (norm2): Identity()
152
+ (rewrite): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
153
+ (norm1): Identity()
154
+ (dconv): DConv(
155
+ (layers): ModuleList(
156
+ (0): Sequential(
157
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(1,))
158
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
159
+ (2): GELU(approximate='none')
160
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
161
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
162
+ (5): GLU(dim=1)
163
+ (6): LayerScale()
164
+ )
165
+ (1): Sequential(
166
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
167
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
168
+ (2): GELU(approximate='none')
169
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
170
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
171
+ (5): GLU(dim=1)
172
+ (6): LayerScale()
173
+ )
174
+ )
175
+ )
176
+ )
177
+ (2): HDecLayer(
178
+ (conv_tr): ConvTranspose2d(96, 48, kernel_size=(8, 1), stride=(4, 1))
179
+ (norm2): Identity()
180
+ (rewrite): Conv2d(96, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
181
+ (norm1): Identity()
182
+ (dconv): DConv(
183
+ (layers): ModuleList(
184
+ (0): Sequential(
185
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(1,))
186
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
187
+ (2): GELU(approximate='none')
188
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
189
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
190
+ (5): GLU(dim=1)
191
+ (6): LayerScale()
192
+ )
193
+ (1): Sequential(
194
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
195
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
196
+ (2): GELU(approximate='none')
197
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
198
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
199
+ (5): GLU(dim=1)
200
+ (6): LayerScale()
201
+ )
202
+ )
203
+ )
204
+ )
205
+ (3): HDecLayer(
206
+ (conv_tr): ConvTranspose2d(48, 16, kernel_size=(8, 1), stride=(4, 1))
207
+ (norm2): Identity()
208
+ (rewrite): Conv2d(48, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
209
+ (norm1): Identity()
210
+ (dconv): DConv(
211
+ (layers): ModuleList(
212
+ (0): Sequential(
213
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(1,))
214
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
215
+ (2): GELU(approximate='none')
216
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
217
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
218
+ (5): GLU(dim=1)
219
+ (6): LayerScale()
220
+ )
221
+ (1): Sequential(
222
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
223
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
224
+ (2): GELU(approximate='none')
225
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
226
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
227
+ (5): GLU(dim=1)
228
+ (6): LayerScale()
229
+ )
230
+ )
231
+ )
232
+ )
233
+ )
234
+ (tencoder): ModuleList(
235
+ (0): HEncLayer(
236
+ (conv): Conv1d(2, 48, kernel_size=(8,), stride=(4,), padding=(2,))
237
+ (norm1): Identity()
238
+ (rewrite): Conv1d(48, 96, kernel_size=(1,), stride=(1,))
239
+ (norm2): Identity()
240
+ (dconv): DConv(
241
+ (layers): ModuleList(
242
+ (0): Sequential(
243
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(1,))
244
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
245
+ (2): GELU(approximate='none')
246
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
247
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
248
+ (5): GLU(dim=1)
249
+ (6): LayerScale()
250
+ )
251
+ (1): Sequential(
252
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
253
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
254
+ (2): GELU(approximate='none')
255
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
256
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
257
+ (5): GLU(dim=1)
258
+ (6): LayerScale()
259
+ )
260
+ )
261
+ )
262
+ )
263
+ (1): HEncLayer(
264
+ (conv): Conv1d(48, 96, kernel_size=(8,), stride=(4,), padding=(2,))
265
+ (norm1): Identity()
266
+ (rewrite): Conv1d(96, 192, kernel_size=(1,), stride=(1,))
267
+ (norm2): Identity()
268
+ (dconv): DConv(
269
+ (layers): ModuleList(
270
+ (0): Sequential(
271
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(1,))
272
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
273
+ (2): GELU(approximate='none')
274
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
275
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
276
+ (5): GLU(dim=1)
277
+ (6): LayerScale()
278
+ )
279
+ (1): Sequential(
280
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
281
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
282
+ (2): GELU(approximate='none')
283
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
284
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
285
+ (5): GLU(dim=1)
286
+ (6): LayerScale()
287
+ )
288
+ )
289
+ )
290
+ )
291
+ (2): HEncLayer(
292
+ (conv): Conv1d(96, 192, kernel_size=(8,), stride=(4,), padding=(2,))
293
+ (norm1): Identity()
294
+ (rewrite): Conv1d(192, 384, kernel_size=(1,), stride=(1,))
295
+ (norm2): Identity()
296
+ (dconv): DConv(
297
+ (layers): ModuleList(
298
+ (0): Sequential(
299
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(1,))
300
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
301
+ (2): GELU(approximate='none')
302
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
303
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
304
+ (5): GLU(dim=1)
305
+ (6): LayerScale()
306
+ )
307
+ (1): Sequential(
308
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
309
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
310
+ (2): GELU(approximate='none')
311
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
312
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
313
+ (5): GLU(dim=1)
314
+ (6): LayerScale()
315
+ )
316
+ )
317
+ )
318
+ )
319
+ (3): HEncLayer(
320
+ (conv): Conv1d(192, 384, kernel_size=(8,), stride=(4,), padding=(2,))
321
+ (norm1): Identity()
322
+ (rewrite): Conv1d(384, 768, kernel_size=(1,), stride=(1,))
323
+ (norm2): Identity()
324
+ (dconv): DConv(
325
+ (layers): ModuleList(
326
+ (0): Sequential(
327
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(1,))
328
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
329
+ (2): GELU(approximate='none')
330
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
331
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
332
+ (5): GLU(dim=1)
333
+ (6): LayerScale()
334
+ )
335
+ (1): Sequential(
336
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
337
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
338
+ (2): GELU(approximate='none')
339
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
340
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
341
+ (5): GLU(dim=1)
342
+ (6): LayerScale()
343
+ )
344
+ )
345
+ )
346
+ )
347
+ )
348
+ (tdecoder): ModuleList(
349
+ (0): HDecLayer(
350
+ (conv_tr): ConvTranspose1d(384, 192, kernel_size=(8,), stride=(4,))
351
+ (norm2): Identity()
352
+ (rewrite): Conv1d(384, 768, kernel_size=(3,), stride=(1,), padding=(1,))
353
+ (norm1): Identity()
354
+ (dconv): DConv(
355
+ (layers): ModuleList(
356
+ (0): Sequential(
357
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(1,))
358
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
359
+ (2): GELU(approximate='none')
360
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
361
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
362
+ (5): GLU(dim=1)
363
+ (6): LayerScale()
364
+ )
365
+ (1): Sequential(
366
+ (0): Conv1d(384, 48, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
367
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
368
+ (2): GELU(approximate='none')
369
+ (3): Conv1d(48, 768, kernel_size=(1,), stride=(1,))
370
+ (4): GroupNorm(1, 768, eps=1e-05, affine=True)
371
+ (5): GLU(dim=1)
372
+ (6): LayerScale()
373
+ )
374
+ )
375
+ )
376
+ )
377
+ (1): HDecLayer(
378
+ (conv_tr): ConvTranspose1d(192, 96, kernel_size=(8,), stride=(4,))
379
+ (norm2): Identity()
380
+ (rewrite): Conv1d(192, 384, kernel_size=(3,), stride=(1,), padding=(1,))
381
+ (norm1): Identity()
382
+ (dconv): DConv(
383
+ (layers): ModuleList(
384
+ (0): Sequential(
385
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(1,))
386
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
387
+ (2): GELU(approximate='none')
388
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
389
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
390
+ (5): GLU(dim=1)
391
+ (6): LayerScale()
392
+ )
393
+ (1): Sequential(
394
+ (0): Conv1d(192, 24, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
395
+ (1): GroupNorm(1, 24, eps=1e-05, affine=True)
396
+ (2): GELU(approximate='none')
397
+ (3): Conv1d(24, 384, kernel_size=(1,), stride=(1,))
398
+ (4): GroupNorm(1, 384, eps=1e-05, affine=True)
399
+ (5): GLU(dim=1)
400
+ (6): LayerScale()
401
+ )
402
+ )
403
+ )
404
+ )
405
+ (2): HDecLayer(
406
+ (conv_tr): ConvTranspose1d(96, 48, kernel_size=(8,), stride=(4,))
407
+ (norm2): Identity()
408
+ (rewrite): Conv1d(96, 192, kernel_size=(3,), stride=(1,), padding=(1,))
409
+ (norm1): Identity()
410
+ (dconv): DConv(
411
+ (layers): ModuleList(
412
+ (0): Sequential(
413
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(1,))
414
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
415
+ (2): GELU(approximate='none')
416
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
417
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
418
+ (5): GLU(dim=1)
419
+ (6): LayerScale()
420
+ )
421
+ (1): Sequential(
422
+ (0): Conv1d(96, 12, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
423
+ (1): GroupNorm(1, 12, eps=1e-05, affine=True)
424
+ (2): GELU(approximate='none')
425
+ (3): Conv1d(12, 192, kernel_size=(1,), stride=(1,))
426
+ (4): GroupNorm(1, 192, eps=1e-05, affine=True)
427
+ (5): GLU(dim=1)
428
+ (6): LayerScale()
429
+ )
430
+ )
431
+ )
432
+ )
433
+ (3): HDecLayer(
434
+ (conv_tr): ConvTranspose1d(48, 8, kernel_size=(8,), stride=(4,))
435
+ (norm2): Identity()
436
+ (rewrite): Conv1d(48, 96, kernel_size=(3,), stride=(1,), padding=(1,))
437
+ (norm1): Identity()
438
+ (dconv): DConv(
439
+ (layers): ModuleList(
440
+ (0): Sequential(
441
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(1,))
442
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
443
+ (2): GELU(approximate='none')
444
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
445
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
446
+ (5): GLU(dim=1)
447
+ (6): LayerScale()
448
+ )
449
+ (1): Sequential(
450
+ (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
451
+ (1): GroupNorm(1, 6, eps=1e-05, affine=True)
452
+ (2): GELU(approximate='none')
453
+ (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
454
+ (4): GroupNorm(1, 96, eps=1e-05, affine=True)
455
+ (5): GLU(dim=1)
456
+ (6): LayerScale()
457
+ )
458
+ )
459
+ )
460
+ )
461
+ )
462
+ (freq_emb): ScaledEmbedding(
463
+ (embedding): Embedding(512, 48)
464
+ )
465
+ (channel_upsampler): Conv1d(384, 512, kernel_size=(1,), stride=(1,))
466
+ (channel_downsampler): Conv1d(512, 384, kernel_size=(1,), stride=(1,))
467
+ (channel_upsampler_t): Conv1d(384, 512, kernel_size=(1,), stride=(1,))
468
+ (channel_downsampler_t): Conv1d(512, 384, kernel_size=(1,), stride=(1,))
469
+ (crosstransformer): CrossTransformerEncoder(
470
+ (norm_in): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
471
+ (norm_in_t): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
472
+ (layers): ModuleList(
473
+ (0): MyTransformerEncoderLayer(
474
+ (self_attn): MultiheadAttention(
475
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
476
+ )
477
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
478
+ (dropout): Dropout(p=0.02, inplace=False)
479
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
480
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
481
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
482
+ (dropout1): Dropout(p=0.02, inplace=False)
483
+ (dropout2): Dropout(p=0.02, inplace=False)
484
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
485
+ (gamma_1): LayerScale()
486
+ (gamma_2): LayerScale()
487
+ )
488
+ (1): CrossTransformerEncoderLayer(
489
+ (cross_attn): MultiheadAttention(
490
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
491
+ )
492
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
493
+ (dropout): Dropout(p=0.02, inplace=False)
494
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
495
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
496
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
497
+ (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
498
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
499
+ (gamma_1): LayerScale()
500
+ (gamma_2): LayerScale()
501
+ (dropout1): Dropout(p=0.02, inplace=False)
502
+ (dropout2): Dropout(p=0.02, inplace=False)
503
+ )
504
+ (2): MyTransformerEncoderLayer(
505
+ (self_attn): MultiheadAttention(
506
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
507
+ )
508
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
509
+ (dropout): Dropout(p=0.02, inplace=False)
510
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
511
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
512
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
513
+ (dropout1): Dropout(p=0.02, inplace=False)
514
+ (dropout2): Dropout(p=0.02, inplace=False)
515
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
516
+ (gamma_1): LayerScale()
517
+ (gamma_2): LayerScale()
518
+ )
519
+ (3): CrossTransformerEncoderLayer(
520
+ (cross_attn): MultiheadAttention(
521
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
522
+ )
523
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
524
+ (dropout): Dropout(p=0.02, inplace=False)
525
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
526
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
527
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
528
+ (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
529
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
530
+ (gamma_1): LayerScale()
531
+ (gamma_2): LayerScale()
532
+ (dropout1): Dropout(p=0.02, inplace=False)
533
+ (dropout2): Dropout(p=0.02, inplace=False)
534
+ )
535
+ (4): MyTransformerEncoderLayer(
536
+ (self_attn): MultiheadAttention(
537
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
538
+ )
539
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
540
+ (dropout): Dropout(p=0.02, inplace=False)
541
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
542
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
543
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
544
+ (dropout1): Dropout(p=0.02, inplace=False)
545
+ (dropout2): Dropout(p=0.02, inplace=False)
546
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
547
+ (gamma_1): LayerScale()
548
+ (gamma_2): LayerScale()
549
+ )
550
+ )
551
+ (layers_t): ModuleList(
552
+ (0): MyTransformerEncoderLayer(
553
+ (self_attn): MultiheadAttention(
554
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
555
+ )
556
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
557
+ (dropout): Dropout(p=0.02, inplace=False)
558
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
559
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
560
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
561
+ (dropout1): Dropout(p=0.02, inplace=False)
562
+ (dropout2): Dropout(p=0.02, inplace=False)
563
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
564
+ (gamma_1): LayerScale()
565
+ (gamma_2): LayerScale()
566
+ )
567
+ (1): CrossTransformerEncoderLayer(
568
+ (cross_attn): MultiheadAttention(
569
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
570
+ )
571
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
572
+ (dropout): Dropout(p=0.02, inplace=False)
573
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
574
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
575
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
576
+ (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
577
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
578
+ (gamma_1): LayerScale()
579
+ (gamma_2): LayerScale()
580
+ (dropout1): Dropout(p=0.02, inplace=False)
581
+ (dropout2): Dropout(p=0.02, inplace=False)
582
+ )
583
+ (2): MyTransformerEncoderLayer(
584
+ (self_attn): MultiheadAttention(
585
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
586
+ )
587
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
588
+ (dropout): Dropout(p=0.02, inplace=False)
589
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
590
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
591
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
592
+ (dropout1): Dropout(p=0.02, inplace=False)
593
+ (dropout2): Dropout(p=0.02, inplace=False)
594
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
595
+ (gamma_1): LayerScale()
596
+ (gamma_2): LayerScale()
597
+ )
598
+ (3): CrossTransformerEncoderLayer(
599
+ (cross_attn): MultiheadAttention(
600
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
601
+ )
602
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
603
+ (dropout): Dropout(p=0.02, inplace=False)
604
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
605
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
606
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
607
+ (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
608
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
609
+ (gamma_1): LayerScale()
610
+ (gamma_2): LayerScale()
611
+ (dropout1): Dropout(p=0.02, inplace=False)
612
+ (dropout2): Dropout(p=0.02, inplace=False)
613
+ )
614
+ (4): MyTransformerEncoderLayer(
615
+ (self_attn): MultiheadAttention(
616
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
617
+ )
618
+ (linear1): Linear(in_features=512, out_features=2048, bias=True)
619
+ (dropout): Dropout(p=0.02, inplace=False)
620
+ (linear2): Linear(in_features=2048, out_features=512, bias=True)
621
+ (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
622
+ (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
623
+ (dropout1): Dropout(p=0.02, inplace=False)
624
+ (dropout2): Dropout(p=0.02, inplace=False)
625
+ (norm_out): MyGroupNorm(1, 512, eps=1e-05, affine=True)
626
+ (gamma_1): LayerScale()
627
+ (gamma_2): LayerScale()
628
+ )
629
+ )
630
+ )
631
+ )
632
+ (clap): ClapTextModelWithProjection(
633
+ (text_model): ClapTextModel(
634
+ (embeddings): ClapTextEmbeddings(
635
+ (word_embeddings): Embedding(50265, 768, padding_idx=1)
636
+ (position_embeddings): Embedding(514, 768, padding_idx=1)
637
+ (token_type_embeddings): Embedding(1, 768)
638
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
639
+ (dropout): Dropout(p=0.1, inplace=False)
640
+ )
641
+ (encoder): ClapTextEncoder(
642
+ (layer): ModuleList(
643
+ (0-11): 12 x ClapTextLayer(
644
+ (attention): ClapTextAttention(
645
+ (self): ClapTextSelfAttention(
646
+ (query): Linear(in_features=768, out_features=768, bias=True)
647
+ (key): Linear(in_features=768, out_features=768, bias=True)
648
+ (value): Linear(in_features=768, out_features=768, bias=True)
649
+ (dropout): Dropout(p=0.1, inplace=False)
650
+ )
651
+ (output): ClapTextSelfOutput(
652
+ (dense): Linear(in_features=768, out_features=768, bias=True)
653
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
654
+ (dropout): Dropout(p=0.1, inplace=False)
655
+ )
656
+ )
657
+ (intermediate): ClapTextIntermediate(
658
+ (dense): Linear(in_features=768, out_features=3072, bias=True)
659
+ (intermediate_act_fn): GELUActivation()
660
+ )
661
+ (output): ClapTextOutput(
662
+ (dense): Linear(in_features=3072, out_features=768, bias=True)
663
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
664
+ (dropout): Dropout(p=0.1, inplace=False)
665
+ )
666
+ )
667
+ )
668
+ )
669
+ (pooler): ClapTextPooler(
670
+ (dense): Linear(in_features=768, out_features=768, bias=True)
671
+ (activation): Tanh()
672
+ )
673
+ )
674
+ (text_projection): ClapProjectionLayer(
675
+ (linear1): Linear(in_features=768, out_features=512, bias=True)
676
+ (activation): ReLU()
677
+ (linear2): Linear(in_features=512, out_features=512, bias=True)
678
+ )
679
+ )
680
+ (text_attn): TextCrossAttention(
681
+ (q_proj): Linear(in_features=384, out_features=384, bias=True)
682
+ (k_proj): Linear(in_features=512, out_features=384, bias=True)
683
+ (v_proj): Linear(in_features=512, out_features=384, bias=True)
684
+ (attn): MultiheadAttention(
685
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True)
686
+ )
687
+ (out_mlp): Sequential(
688
+ (0): Linear(in_features=384, out_features=384, bias=True)
689
+ (1): GELU(approximate='none')
690
+ (2): Linear(in_features=384, out_features=384, bias=True)
691
+ )
692
+ (norm_q): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
693
+ (norm_out): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
694
+ )
695
+ (freq_decoder): FreqDecoder(
696
+ (layers): ModuleList(
697
+ (0): Sequential(
698
+ (0): ConvTranspose2d(384, 192, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
699
+ (1): GroupNorm(1, 192, eps=1e-05, affine=True)
700
+ (2): GELU(approximate='none')
701
+ )
702
+ (1): Sequential(
703
+ (0): ConvTranspose2d(192, 96, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
704
+ (1): GroupNorm(1, 96, eps=1e-05, affine=True)
705
+ (2): GELU(approximate='none')
706
+ )
707
+ (2): Sequential(
708
+ (0): ConvTranspose2d(96, 48, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
709
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
710
+ (2): GELU(approximate='none')
711
+ )
712
+ (3): Sequential(
713
+ (0): ConvTranspose2d(48, 4, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
714
+ (1): Identity()
715
+ (2): Identity()
716
+ )
717
+ )
718
+ )
719
+ (time_decoder): TimeDecoder(
720
+ (layers): ModuleList(
721
+ (0): Sequential(
722
+ (0): ConvTranspose1d(384, 192, kernel_size=(8,), stride=(4,), padding=(2,))
723
+ (1): GroupNorm(1, 192, eps=1e-05, affine=True)
724
+ (2): GELU(approximate='none')
725
+ )
726
+ (1): Sequential(
727
+ (0): ConvTranspose1d(192, 96, kernel_size=(8,), stride=(4,), padding=(2,))
728
+ (1): GroupNorm(1, 96, eps=1e-05, affine=True)
729
+ (2): GELU(approximate='none')
730
+ )
731
+ (2): Sequential(
732
+ (0): ConvTranspose1d(96, 48, kernel_size=(8,), stride=(4,), padding=(2,))
733
+ (1): GroupNorm(1, 48, eps=1e-05, affine=True)
734
+ (2): GELU(approximate='none')
735
+ )
736
+ (3): Sequential(
737
+ (0): ConvTranspose1d(48, 4, kernel_size=(8,), stride=(4,), padding=(2,))
738
+ (1): Identity()
739
+ (2): Identity()
740
+ )
741
+ )
742
+ )
743
+ (freq_out): Conv2d(4, 2, kernel_size=(1, 1), stride=(1, 1))
744
+ (time_out): Conv1d(4, 2, kernel_size=(1,), stride=(1,))
745
+ )
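The text-conditioning path in the dump above is the TextCrossAttention block (q_proj 384->384, k_proj/v_proj 512->384, 384-d MultiheadAttention, output MLP and norms). A minimal sketch consistent with those printed shapes follows; it is an illustration, not the actual ATHTDemucs_v2 code, and the residual wiring and head count (8, the config default) are assumptions.

import torch
import torch.nn as nn

class TextCrossAttentionSketch(nn.Module):
    """Audio bottleneck tokens (384-d) attend to CLAP text embeddings (512-d)."""
    def __init__(self, model_dim: int = 384, text_dim: int = 512, num_heads: int = 8):
        super().__init__()
        self.q_proj = nn.Linear(model_dim, model_dim)
        self.k_proj = nn.Linear(text_dim, model_dim)
        self.v_proj = nn.Linear(text_dim, model_dim)
        self.attn = nn.MultiheadAttention(model_dim, num_heads, batch_first=True)
        self.out_mlp = nn.Sequential(
            nn.Linear(model_dim, model_dim), nn.GELU(), nn.Linear(model_dim, model_dim)
        )
        self.norm_q = nn.LayerNorm(model_dim)
        self.norm_out = nn.LayerNorm(model_dim)

    def forward(self, audio_tokens: torch.Tensor, text_emb: torch.Tensor) -> torch.Tensor:
        # audio_tokens: (B, N, 384) bottleneck features; text_emb: (B, T_text, 512) CLAP tokens
        q = self.q_proj(self.norm_q(audio_tokens))
        k, v = self.k_proj(text_emb), self.v_proj(text_emb)
        attended, _ = self.attn(q, k, v)  # cross-attention over the text tokens
        return self.norm_out(audio_tokens + self.out_mlp(attended))  # residual + output MLP + norm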
src/models/stem_separation/CLAP_Text_Model_Fwd_Pass.txt ADDED
@@ -0,0 +1,40 @@
1
+ ** NOTE: get_text_features() method does a final projection to 512 dims and normalization after this forward pass
2
+
3
+ =========================================================================================================
4
+ Layer (type:depth-idx) Output Shape Param #
5
+ =========================================================================================================
6
+ ClapTextModel [1, 768] --
7
+ ├─ClapTextEmbeddings: 1-1 [1, 5, 768] --
8
+ │ └─Embedding: 2-1 [1, 5, 768] (38,603,520)
9
+ │ └─Embedding: 2-2 [1, 5, 768] (768)
10
+ │ └─Embedding: 2-3 [1, 5, 768] (394,752)
11
+ │ └─LayerNorm: 2-4 [1, 5, 768] (1,536)
12
+ │ └─Dropout: 2-5 [1, 5, 768] --
13
+ ├─ClapTextEncoder: 1-2 [1, 5, 768] --
14
+ │ └─ModuleList: 2-6 -- --
15
+ │ │ └─ClapTextLayer: 3-1 [1, 5, 768] (7,087,872)
16
+ │ │ └─ClapTextLayer: 3-2 [1, 5, 768] (7,087,872)
17
+ │ │ └─ClapTextLayer: 3-3 [1, 5, 768] (7,087,872)
18
+ │ │ └─ClapTextLayer: 3-4 [1, 5, 768] (7,087,872)
19
+ │ │ └─ClapTextLayer: 3-5 [1, 5, 768] (7,087,872)
20
+ │ │ └─ClapTextLayer: 3-6 [1, 5, 768] (7,087,872)
21
+ │ │ └─ClapTextLayer: 3-7 [1, 5, 768] (7,087,872)
22
+ │ │ └─ClapTextLayer: 3-8 [1, 5, 768] (7,087,872)
23
+ │ │ └─ClapTextLayer: 3-9 [1, 5, 768] (7,087,872)
24
+ │ │ └─ClapTextLayer: 3-10 [1, 5, 768] (7,087,872)
25
+ │ │ └─ClapTextLayer: 3-11 [1, 5, 768] (7,087,872)
26
+ │ │ └─ClapTextLayer: 3-12 [1, 5, 768] (7,087,872)
27
+ ├─ClapTextPooler: 1-3 [1, 768] --
28
+ │ └─Linear: 2-7 [1, 768] (590,592)
29
+ │ └─Tanh: 2-8 [1, 768] --
30
+ =========================================================================================================
31
+ Total params: 124,645,632
32
+ Trainable params: 0
33
+ Non-trainable params: 124,645,632
34
+ Total mult-adds (Units.MEGABYTES): 124.65
35
+ =========================================================================================================
36
+ Input size (MB): 0.00
37
+ Forward/backward pass size (MB): 4.18
38
+ Params size (MB): 498.58
39
+ Estimated Total Size (MB): 502.77
40
+ =========================================================================================================
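As the NOTE at the top of this summary says, the 512-dim conditioning embedding comes from a projection and normalization applied after this forward pass. A short sketch of how that embedding can be obtained (an illustration, not code from this commit; it assumes the same laion/clap-htsat-unfused checkpoint and the ClapTextModelWithProjection class used in src/train.py):

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, ClapTextModelWithProjection

tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
text_model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")

inputs = tokenizer(["extract the drums"], return_tensors="pt")
with torch.no_grad():
    out = text_model(**inputs)  # pooled 768-d text output, projected to 512-d text_embeds
text_embed = F.normalize(out.text_embeds, dim=-1)  # (1, 512); normalization mirrors get_text_features()
print(text_embed.shape)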
src/models/stem_separation/HTDemucs_Fwd_Pass.txt ADDED
@@ -0,0 +1,156 @@
1
+ ====================================================================================================
2
+ Layer (type:depth-idx) Output Shape Param #
3
+ ====================================================================================================
4
+ HTDemucs [1, 4, 2, 264600] --
5
+ ├─ModuleList: 1-8 -- (recursive)
6
+ │ └─HEncLayer: 2-1 [1, 48, 85995] --
7
+ │ │ └─Conv1d: 3-1 [1, 48, 85995] (816)
8
+ │ │ └─Identity: 3-2 [1, 48, 85995] --
9
+ │ │ └─DConv: 3-3 [1, 48, 85995] (3,588)
10
+ │ │ └─Conv1d: 3-4 [1, 96, 85995] (4,704)
11
+ │ │ └─Identity: 3-5 [1, 96, 85995] --
12
+ ├─ModuleList: 1-9 -- (recursive)
13
+ │ └─HEncLayer: 2-2 [1, 48, 512, 336] --
14
+ │ │ └─Conv2d: 3-6 [1, 48, 512, 336] (1,584)
15
+ │ │ └─Identity: 3-7 [1, 48, 512, 336] --
16
+ │ │ └─DConv: 3-8 [512, 48, 336] (3,588)
17
+ │ │ └─Conv2d: 3-9 [1, 96, 512, 336] (4,704)
18
+ │ │ └─Identity: 3-10 [1, 96, 512, 336] --
19
+ ├─ScaledEmbedding: 1-3 [512, 48] --
20
+ │ └─Embedding: 2-3 [512, 48] (24,576)
21
+ ├─ModuleList: 1-8 -- (recursive)
22
+ │ └─HEncLayer: 2-4 [1, 96, 21499] --
23
+ │ │ └─Conv1d: 3-11 [1, 96, 21499] (36,960)
24
+ │ │ └─Identity: 3-12 [1, 96, 21499] --
25
+ │ │ └─DConv: 3-13 [1, 96, 21499] (12,936)
26
+ │ │ └─Conv1d: 3-14 [1, 192, 21499] (18,624)
27
+ │ │ └─Identity: 3-15 [1, 192, 21499] --
28
+ ├─ModuleList: 1-9 -- (recursive)
29
+ │ └─HEncLayer: 2-5 [1, 96, 128, 336] --
30
+ │ │ └─Conv2d: 3-16 [1, 96, 128, 336] (36,960)
31
+ │ │ └─Identity: 3-17 [1, 96, 128, 336] --
32
+ │ │ └─DConv: 3-18 [128, 96, 336] (12,936)
33
+ │ │ └─Conv2d: 3-19 [1, 192, 128, 336] (18,624)
34
+ │ │ └─Identity: 3-20 [1, 192, 128, 336] --
35
+ ├─ModuleList: 1-8 -- (recursive)
36
+ │ └─HEncLayer: 2-6 [1, 192, 5375] --
37
+ │ │ └─Conv1d: 3-21 [1, 192, 5375] (147,648)
38
+ │ │ └─Identity: 3-22 [1, 192, 5375] --
39
+ │ │ └─DConv: 3-23 [1, 192, 5375] (48,912)
40
+ │ │ └─Conv1d: 3-24 [1, 384, 5375] (74,112)
41
+ │ │ └─Identity: 3-25 [1, 384, 5375] --
42
+ ├─ModuleList: 1-9 -- (recursive)
43
+ │ └─HEncLayer: 2-7 [1, 192, 32, 336] --
44
+ │ │ └─Conv2d: 3-26 [1, 192, 32, 336] (147,648)
45
+ │ │ └─Identity: 3-27 [1, 192, 32, 336] --
46
+ │ │ └─DConv: 3-28 [32, 192, 336] (48,912)
47
+ │ │ └─Conv2d: 3-29 [1, 384, 32, 336] (74,112)
48
+ │ │ └─Identity: 3-30 [1, 384, 32, 336] --
49
+ ├─ModuleList: 1-8 -- (recursive)
50
+ │ └─HEncLayer: 2-8 [1, 384, 1344] --
51
+ │ │ └─Conv1d: 3-31 [1, 384, 1344] (590,208)
52
+ │ │ └─Identity: 3-32 [1, 384, 1344] --
53
+ │ │ └─DConv: 3-33 [1, 384, 1344] (189,984)
54
+ │ │ └─Conv1d: 3-34 [1, 768, 1344] (295,680)
55
+ │ │ └─Identity: 3-35 [1, 768, 1344] --
56
+ ├─ModuleList: 1-9 -- (recursive)
57
+ │ └─HEncLayer: 2-9 [1, 384, 8, 336] --
58
+ │ │ └─Conv2d: 3-36 [1, 384, 8, 336] (590,208)
59
+ │ │ └─Identity: 3-37 [1, 384, 8, 336] --
60
+ │ │ └─DConv: 3-38 [8, 384, 336] (189,984)
61
+ │ │ └─Conv2d: 3-39 [1, 768, 8, 336] (295,680)
62
+ │ │ └─Identity: 3-40 [1, 768, 8, 336] --
63
+ ├─Conv1d: 1-10 [1, 512, 2688] (197,120)
64
+ ├─Conv1d: 1-11 [1, 512, 1344] (197,120)
65
+ ├─CrossTransformerEncoder: 1-12 [1, 512, 8, 336] --
66
+ │ └─LayerNorm: 2-10 [1, 2688, 512] (1,024)
67
+ │ └─LayerNorm: 2-11 [1, 1344, 512] (1,024)
68
+ │ └─ModuleList: 2-20 -- (recursive)
69
+ │ │ └─MyTransformerEncoderLayer: 3-41 [1, 2688, 512] (3,154,432)
70
+ │ └─ModuleList: 2-21 -- (recursive)
71
+ │ │ └─MyTransformerEncoderLayer: 3-42 [1, 1344, 512] (3,154,432)
72
+ │ └─ModuleList: 2-20 -- (recursive)
73
+ │ │ └─CrossTransformerEncoderLayer: 3-43 [1, 2688, 512] (3,155,456)
74
+ │ └─ModuleList: 2-21 -- (recursive)
75
+ │ │ └─CrossTransformerEncoderLayer: 3-44 [1, 1344, 512] (3,155,456)
76
+ │ └─ModuleList: 2-20 -- (recursive)
77
+ │ │ └─MyTransformerEncoderLayer: 3-45 [1, 2688, 512] (3,154,432)
78
+ │ └─ModuleList: 2-21 -- (recursive)
79
+ │ │ └─MyTransformerEncoderLayer: 3-46 [1, 1344, 512] (3,154,432)
80
+ │ └─ModuleList: 2-20 -- (recursive)
81
+ │ │ └─CrossTransformerEncoderLayer: 3-47 [1, 2688, 512] (3,155,456)
82
+ │ └─ModuleList: 2-21 -- (recursive)
83
+ │ │ └─CrossTransformerEncoderLayer: 3-48 [1, 1344, 512] (3,155,456)
84
+ │ └─ModuleList: 2-20 -- (recursive)
85
+ │ │ └─MyTransformerEncoderLayer: 3-49 [1, 2688, 512] (3,154,432)
86
+ │ └─ModuleList: 2-21 -- (recursive)
87
+ │ │ └─MyTransformerEncoderLayer: 3-50 [1, 1344, 512] (3,154,432)
88
+ ├─Conv1d: 1-13 [1, 384, 2688] (196,992)
89
+ ├─Conv1d: 1-14 [1, 384, 1344] (196,992)
90
+ ├─ModuleList: 1-21 -- (recursive)
91
+ │ └─HDecLayer: 2-22 [1, 192, 32, 336] --
92
+ │ │ └─Conv2d: 3-51 [1, 768, 8, 336] (2,654,976)
93
+ │ │ └─Identity: 3-52 [1, 768, 8, 336] --
94
+ │ │ └─DConv: 3-53 [8, 384, 336] (189,984)
95
+ │ │ └─ConvTranspose2d: 3-54 [1, 192, 36, 336] (590,016)
96
+ │ │ └─Identity: 3-55 [1, 192, 36, 336] --
97
+ ├─ModuleList: 1-22 -- (recursive)
98
+ │ └─HDecLayer: 2-23 [1, 192, 5375] --
99
+ │ │ └─Conv1d: 3-56 [1, 768, 1344] (885,504)
100
+ │ │ └─Identity: 3-57 [1, 768, 1344] --
101
+ │ │ └─DConv: 3-58 [1, 384, 1344] (189,984)
102
+ │ │ └─ConvTranspose1d: 3-59 [1, 192, 5380] (590,016)
103
+ │ │ └─Identity: 3-60 [1, 192, 5380] --
104
+ ├─ModuleList: 1-21 -- (recursive)
105
+ │ └─HDecLayer: 2-24 [1, 96, 128, 336] --
106
+ │ │ └─Conv2d: 3-61 [1, 384, 32, 336] (663,936)
107
+ │ │ └─Identity: 3-62 [1, 384, 32, 336] --
108
+ │ │ └─DConv: 3-63 [32, 192, 336] (48,912)
109
+ │ │ └─ConvTranspose2d: 3-64 [1, 96, 132, 336] (147,552)
110
+ │ │ └─Identity: 3-65 [1, 96, 132, 336] --
111
+ ���─ModuleList: 1-22 -- (recursive)
112
+ │ └─HDecLayer: 2-25 [1, 96, 21499] --
113
+ │ │ └─Conv1d: 3-66 [1, 384, 5375] (221,568)
114
+ │ │ └─Identity: 3-67 [1, 384, 5375] --
115
+ │ │ └─DConv: 3-68 [1, 192, 5375] (48,912)
116
+ │ │ └─ConvTranspose1d: 3-69 [1, 96, 21504] (147,552)
117
+ │ │ └─Identity: 3-70 [1, 96, 21504] --
118
+ ├─ModuleList: 1-21 -- (recursive)
119
+ │ └─HDecLayer: 2-26 [1, 48, 512, 336] --
120
+ │ │ └─Conv2d: 3-71 [1, 192, 128, 336] (166,080)
121
+ │ │ └─Identity: 3-72 [1, 192, 128, 336] --
122
+ │ │ └─DConv: 3-73 [128, 96, 336] (12,936)
123
+ │ │ └─ConvTranspose2d: 3-74 [1, 48, 516, 336] (36,912)
124
+ │ │ └─Identity: 3-75 [1, 48, 516, 336] --
125
+ ├─ModuleList: 1-22 -- (recursive)
126
+ │ └─HDecLayer: 2-27 [1, 48, 85995] --
127
+ │ │ └─Conv1d: 3-76 [1, 192, 21499] (55,488)
128
+ │ │ └─Identity: 3-77 [1, 192, 21499] --
129
+ │ │ └─DConv: 3-78 [1, 96, 21499] (12,936)
130
+ │ │ └─ConvTranspose1d: 3-79 [1, 48, 86000] (36,912)
131
+ │ │ └─Identity: 3-80 [1, 48, 86000] --
132
+ ├─ModuleList: 1-21 -- (recursive)
133
+ │ └─HDecLayer: 2-28 [1, 16, 2048, 336] --
134
+ │ │ └─Conv2d: 3-81 [1, 96, 512, 336] (41,568)
135
+ │ │ └─Identity: 3-82 [1, 96, 512, 336] --
136
+ │ │ └─DConv: 3-83 [512, 48, 336] (3,588)
137
+ │ │ └─ConvTranspose2d: 3-84 [1, 16, 2052, 336] (6,160)
138
+ │ │ └─Identity: 3-85 [1, 16, 2052, 336] --
139
+ ├─ModuleList: 1-22 -- (recursive)
140
+ │ └─HDecLayer: 2-29 [1, 8, 343980] --
141
+ │ │ └─Conv1d: 3-86 [1, 96, 85995] (13,920)
142
+ │ │ └─Identity: 3-87 [1, 96, 85995] --
143
+ │ │ └─DConv: 3-88 [1, 48, 85995] (3,588)
144
+ │ │ └─ConvTranspose1d: 3-89 [1, 8, 343984] (3,080)
145
+ │ │ └─Identity: 3-90 [1, 8, 343984] --
146
+ ====================================================================================================
147
+ Total params: 41,984,456
148
+ Trainable params: 0
149
+ Non-trainable params: 41,984,456
150
+ Total mult-adds (Units.GIGABYTES): 88.31
151
+ ====================================================================================================
152
+ Input size (MB): 2.12
153
+ Forward/backward pass size (MB): 6021.99
154
+ Params size (MB): 125.91
155
+ Estimated Total Size (MB): 6150.02
156
+ ====================================================================================================
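This table appears to be a torchinfo summary; a call along the following lines reproduces a table like the one above for a ~6 s stereo clip at 44.1 kHz (2 channels x 264600 samples, matching the [1, 4, 2, 264600] output shape), though the exact command used to generate this file is not part of the commit.

import torch
from torchinfo import summary
from demucs import pretrained

htdemucs = pretrained.get_model("htdemucs").models[0]
# batch of 1, stereo, 264600 samples (6 s at 44.1 kHz)
print(summary(htdemucs, input_data=torch.randn(1, 2, 264600), depth=3))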
src/models/stem_separation/__init__.py ADDED
File without changes
src/models/stem_separation/__pycache__/ATHTDemucs_v2.cpython-313.pyc ADDED
Binary file (18.6 kB).
 
src/models/stem_separation/__pycache__/AudioTextHTDemucs.cpython-313.pyc ADDED
Binary file (14.2 kB).
 
src/models/stem_separation/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (172 Bytes).
 
src/train.py ADDED
@@ -0,0 +1,610 @@
1
+ from pathlib import Path
2
+ from typing import Dict, Optional
3
+ import torch
4
+ from torch.utils.data import DataLoader, Subset
5
+ from torch.optim import AdamW
6
+ from torch.optim.lr_scheduler import CosineAnnealingLR
7
+ from torch.cuda.amp import GradScaler, autocast
8
+ from tqdm import tqdm
9
+
10
+ from demucs import pretrained
11
+ from transformers import AutoTokenizer, ClapModel, ClapTextModelWithProjection
12
+
13
+ from src.models.stem_separation.ATHTDemucs_v2 import AudioTextHTDemucs
14
+ from src.loss import combined_loss, combined_L1_sdr_loss, sdr_loss
15
+ from src.dataloader import MusDBStemDataset, collate_fn, STEM_PROMPTS, PROMPT_TO_STEM
16
+ from utils import load_config, log_separation_spectrograms_to_wandb, log_audio_to_wandb
17
+
18
+
19
+ # ============================================================================
20
+ # Training Helper Functions
21
+ # ============================================================================
22
+
23
+ def train_epoch(
24
+ model: AudioTextHTDemucs,
25
+ dataloader: DataLoader,
26
+ optimizer: torch.optim.Optimizer,
27
+ scaler: Optional[GradScaler],
28
+ device: str,
29
+ use_amp: bool,
30
+ use_L1_cmb_loss: bool,
31
+ l1_sdr_weight: Optional[float],
32
+ l1_weight: Optional[float],
33
+ grad_clip: float,
34
+ sdr_weight: float,
35
+ sisdr_weight: float,
36
+ epoch: int,
37
+ log_every: int,
38
+ use_wandb: bool,
39
+ ) -> Dict[str, float]:
40
+ """Train for one epoch."""
41
+ model.train()
42
+
43
+ total_loss = 0.0
44
+ total_sdr = 0.0
45
+ total_sisdr = 0.0
46
+ num_batches = 0
47
+
48
+ # Set loss function
49
+ if use_L1_cmb_loss:
50
+ loss_function = combined_L1_sdr_loss
51
+ weight1 = l1_sdr_weight
52
+ if l1_weight is None:
53
+ raise ValueError("l1_weight must be provided when using L1 combination loss.")
54
+ weight2 = l1_weight
55
+ print("**Using L1 + SDR combination loss for training")
56
+ else:
57
+ loss_function = combined_loss
58
+ weight1 = sdr_weight
59
+ weight2 = sisdr_weight
60
+
61
+ pbar = tqdm(dataloader, desc=f"Epoch {epoch + 1}")
62
+
63
+ for batch_idx, batch in enumerate(pbar):
64
+ mixture = batch["mixture"].to(device)
65
+ target = batch["target"].to(device)
66
+ prompts = batch["prompt"]
67
+
68
+ optimizer.zero_grad()
69
+
70
+ # NOTE: the L1 + SDR combination loss option is selected above via use_L1_cmb_loss
71
+
72
+ if use_amp and device == "cuda":
73
+ with autocast():
74
+ estimated = model(mixture, prompts)
75
+ loss, metrics = loss_function(
76
+ estimated, target, weight1, weight2
77
+ )
78
+ scaler.scale(loss).backward()
79
+ scaler.unscale_(optimizer)
80
+ torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
81
+ scaler.step(optimizer)
82
+ scaler.update()
83
+ else:
84
+ estimated = model(mixture, prompts)
85
+ loss, metrics = loss_function(
86
+ estimated, target, weight1, weight2
87
+ )
88
+ loss.backward()
89
+ torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
90
+ optimizer.step()
91
+
92
+ total_loss += metrics["loss/total"]
93
+ total_sdr += metrics["metrics/sdr"]
94
+ total_sisdr += metrics["metrics/sisdr"]
95
+ num_batches += 1
96
+
97
+ pbar.set_postfix({
98
+ "loss": f"{metrics['loss/total']:.4f}",
99
+ "SDR": f"{metrics['metrics/sdr']:.2f}",
100
+ })
101
+
102
+ if use_wandb and batch_idx % log_every == 0:
103
+ import wandb
104
+ wandb.log({
105
+ "train/loss": metrics["loss/total"],
106
+ "train/sdr": metrics["metrics/sdr"],
107
+ "train/sisdr": metrics["metrics/sisdr"],
108
+ "train/step": epoch * len(dataloader) + batch_idx,
109
+ })
110
+ # Plot spectrograms for first sample in batch and log to wandb
111
+ # NOTE: For now, only 1 extracted stem is visualized (should be extended to all stems later)
112
+ stem_name_log = PROMPT_TO_STEM[prompts[0]]
113
+ log_separation_spectrograms_to_wandb(
114
+ mixture=mixture[0],
115
+ estimated=estimated[0],
116
+ reference=target[0],
117
+ stem_name=stem_name_log,
118
+ step=epoch * len(dataloader) + batch_idx,
119
+ )
120
+ # Log audio to wandb
121
+ log_audio_to_wandb(mixture[0], "mixture", is_gt=True)
122
+ log_audio_to_wandb(target[0], stem_name_log, is_gt=True)
123
+ log_audio_to_wandb(estimated[0], stem_name_log, is_gt=False)
124
+
125
+ return {
126
+ "loss": total_loss / num_batches,
127
+ "sdr": total_sdr / num_batches,
128
+ "sisdr": total_sisdr / num_batches,
129
+ }
130
+
131
+
132
+ @torch.no_grad()
133
+ def validate(
134
+ model: AudioTextHTDemucs,
135
+ dataloader: DataLoader,
136
+ device: str,
137
+ use_amp: bool,
138
+ use_L1_cmb_loss: bool,
139
+ l1_sdr_weight: Optional[float],
140
+ l1_weight: Optional[float],
141
+ sdr_weight: float = 0.9,
142
+ sisdr_weight: float = 0.1,
143
+ ) -> Dict[str, float]:
144
+ """Validate the model."""
145
+ model.eval()
146
+
147
+ total_loss = 0.0
148
+ total_sdr = 0.0
149
+ total_sisdr = 0.0
150
+ num_batches = 0
151
+
152
+ stem_metrics = {name: {"sdr": 0.0, "count": 0} for name in STEM_PROMPTS.keys()}
153
+
154
+ # Set loss function
155
+ if use_L1_cmb_loss:
156
+ loss_function = combined_L1_sdr_loss
157
+ weight1 = l1_sdr_weight
158
+ if l1_weight is None:
159
+ raise ValueError("l1_weight must be provided when using L1 combination loss.")
160
+ weight2 = l1_weight
161
+ else:
162
+ loss_function = combined_loss
163
+ weight1 = sdr_weight
164
+ weight2 = sisdr_weight
165
+
166
+ for batch in tqdm(dataloader, desc="Validating"):
167
+ mixture = batch["mixture"].to(device)
168
+ target = batch["target"].to(device)
169
+ prompts = batch["prompt"]
170
+ stem_names = batch["stem_name"]
171
+
172
+ if use_amp and device == "cuda":
173
+ with autocast():
174
+ estimated = model(mixture, prompts)
175
+ loss, metrics = loss_function(estimated, target, weight1, weight2)
176
+ else:
177
+ estimated = model(mixture, prompts)
178
+ loss, metrics = loss_function(estimated, target, weight1, weight2)
179
+
180
+ total_loss += metrics["loss/total"]
181
+ total_sdr += metrics["metrics/sdr"]
182
+ total_sisdr += metrics["metrics/sisdr"]
183
+ num_batches += 1
184
+
185
+ for i, stem_name in enumerate(stem_names):
186
+ est_i = estimated[i:i + 1]
187
+ tgt_i = target[i:i + 1]
188
+ sdr_i = -sdr_loss(est_i, tgt_i).item()
189
+ stem_metrics[stem_name]["sdr"] += sdr_i
190
+ stem_metrics[stem_name]["count"] += 1
191
+
192
+ avg_metrics = {
193
+ "loss": total_loss / num_batches,
194
+ "sdr": total_sdr / num_batches,
195
+ "sisdr": total_sisdr / num_batches,
196
+ }
197
+
198
+ for stem_name, data in stem_metrics.items():
199
+ if data["count"] > 0:
200
+ avg_metrics[f"sdr/{stem_name}"] = data["sdr"] / data["count"]
201
+
202
+ return avg_metrics
203
+
204
+
205
+ def save_checkpoint(
206
+ model: AudioTextHTDemucs,
207
+ optimizer: torch.optim.Optimizer,
208
+ scheduler: torch.optim.lr_scheduler._LRScheduler,
209
+ epoch: int,
210
+ metrics: Dict[str, float],
211
+ checkpoint_dir: str,
212
+ is_best: bool = False,
213
+ ):
214
+ """Save a training checkpoint."""
215
+ checkpoint_path = Path(checkpoint_dir)
216
+ checkpoint_path.mkdir(parents=True, exist_ok=True)
217
+
218
+ checkpoint = {
219
+ "epoch": epoch,
220
+ "model_state_dict": model.state_dict(),
221
+ "optimizer_state_dict": optimizer.state_dict(),
222
+ "scheduler_state_dict": scheduler.state_dict(),
223
+ "metrics": metrics,
224
+ }
225
+
226
+ path = checkpoint_path / f"checkpoint_epoch_{epoch}.pt"
227
+ torch.save(checkpoint, path)
228
+ print(f"Saved checkpoint to {path}")
229
+
230
+ if is_best:
231
+ best_path = checkpoint_path / "best_model.pt"
232
+ torch.save(checkpoint, best_path)
233
+ print(f"Saved best model to {best_path}")
234
+
235
+ latest_path = checkpoint_path / "latest.pt"
236
+ torch.save(checkpoint, latest_path)
237
+
238
+
239
+ def load_checkpoint(
240
+ model: AudioTextHTDemucs,
241
+ optimizer: Optional[torch.optim.Optimizer],
242
+ scheduler: Optional[torch.optim.lr_scheduler._LRScheduler],
243
+ checkpoint_path: str,
244
+ ) -> int:
245
+ """
246
+ Load a checkpoint and return the epoch number.
247
+
248
+ Ignores any unused weights (e.g. if ClapTextModelWithProjection is being used but checkpoint has ClapModel with audio encoder weights).
249
+ Also applies to optimizer and scheduler.
250
+ """
251
+ checkpoint = torch.load(checkpoint_path, map_location="cpu")
252
+ model.load_state_dict(checkpoint["model_state_dict"], strict=False)
253
+
254
+ # Try loading optimizer and scheduler state, but ignore mismatches (due to new CLAP model, etc)
255
+ try:
256
+ optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
257
+ except Exception as e:
258
+ print("Skipping optimizer state...")
259
+
260
+ # Same idea for scheduler
261
+ try:
262
+ scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
263
+ except Exception:
264
+ print("Skipping scheduler state...")
265
+
266
+ print(f"Loaded checkpoint from epoch {checkpoint['epoch']}")
267
+ return checkpoint["epoch"]
268
+
269
+
270
+ # ============================================================================
271
+ # Main Training Function
272
+ # ============================================================================
273
+
274
+ def train(config_path):
275
+ """
276
+ Main training function for AudioTextHTDemucs.
277
+
278
+ Args (loaded from YAML config):
279
+ train_dir: Path to training data directory
280
+ test_dir: Path to test/validation data directory
281
+ checkpoint_dir: Path to save checkpoints
282
+ sample_rate: Audio sample rate
283
+ segment_seconds: Length of audio segments in seconds
284
+ batch_size: Training batch size
285
+ num_workers: Number of dataloader workers
286
+ epochs: Number of training epochs
287
+ learning_rate: Initial learning rate
288
+ weight_decay: AdamW weight decay
289
+ grad_clip: Gradient clipping value
290
+ sdr_weight: Weight for SDR loss component
291
+ sisdr_weight: Weight for SI-SDR loss component
292
+ model_dim: Model hidden dimension
293
+ text_dim: Text embedding dimension
294
+ n_heads: Number of attention heads
295
+ use_wandb: Whether to use Weights & Biases logging
296
+ wandb_project: W&B project name
297
+ wandb_run_name: W&B run name (optional)
298
+ log_every: Log training metrics every N batches
299
+ validate_every: Run validation every N epochs
300
+ save_every: Save checkpoint every N epochs
301
+ use_amp: Use automatic mixed precision
302
+ device: Device to train on (auto-detected if None)
303
+ resume_from: Path to checkpoint to resume from (optional)
304
+
305
+ Returns:
306
+ Dict containing final metrics and best SDR achieved
307
+ """
308
+ # Load configuration
309
+ cfg = load_config(config_path)
310
+ data_cfg = cfg["data"]
311
+ model_cfg = cfg["model"]
312
+ training_cfg = cfg["training"]
313
+ wandb_cfg = cfg["wandb"]
314
+ # Paths
315
+ train_dir = data_cfg.get("train_dir", "../data/train")
316
+ test_dir = data_cfg.get("test_dir", "../data/test")
317
+ checkpoint_dir = wandb_cfg.get("checkpoint_dir", "../checkpoints")
318
+ # Data splits
319
+ pct_train = data_cfg.get("pct_train", 1.0)
320
+ pct_test = data_cfg.get("pct_test", 1.0)
321
+ # Audio parameters
322
+ sample_rate = data_cfg.get("sample_rate", 44100)
323
+ segment_seconds = data_cfg.get("segment_seconds", 6.0)
324
+ # Training parameters
325
+ batch_size = training_cfg.get("batch_size", 4)
326
+ num_workers = training_cfg.get("num_workers", 0)
327
+ epochs = training_cfg.get("num_epochs", 10)
328
+ learning_rate = float(training_cfg["optimizer"].get("lr", 1e-4))
329
+ weight_decay = float(training_cfg["optimizer"].get("weight_decay", 1e-5))
330
+ grad_clip = training_cfg["optimizer"].get("grad_clip", 1.0)
331
+ use_L1_cmb_loss = training_cfg.get("use_L1_comb_loss", False)
332
+ l1_sdr_weight = training_cfg["L1_comb_loss"].get("sdr_weight", 1.0)
333
+ l1_weight = training_cfg["L1_comb_loss"].get("l1_weight", 0.05)
334
+ # Loss weights
335
+ sdr_weight = training_cfg["loss_weights"].get("sdr", 0.9)
336
+ sisdr_weight = training_cfg["loss_weights"].get("sisdr", 0.1)
337
+ # Model parameters
338
+ model_dim = model_cfg.get("model_dim", 384)
339
+ text_dim = model_cfg.get("text_dim", 512)
340
+ n_heads = model_cfg.get("n_heads", 8)
341
+ # Logging
342
+ use_wandb = wandb_cfg.get("use_wandb", True)
343
+ wandb_project = wandb_cfg.get("project", "audio-text-htdemucs")
344
+ wandb_run_name = wandb_cfg.get("run_name", None)
345
+ log_every = wandb_cfg.get("log_every", 50)
346
+ validate_every = wandb_cfg.get("validate_every", 1)
347
+ save_every = wandb_cfg.get("save_every", 1)
348
+ # Mixed precision
349
+ use_amp = training_cfg.get("use_amp", False)
350
+ # Device
351
+ device = model_cfg.get("device", None)
352
+ # Resume training
353
+ resume_from = training_cfg.get("resume_from", None)
354
+
355
+ # Auto-detect device
356
+ if device is None:
357
+ device = "cuda" if torch.cuda.is_available() else "cpu"
358
+
359
+ segment_samples = int(sample_rate * segment_seconds)
360
+
361
+ # Initialize wandb
362
+ if use_wandb:
363
+ import wandb
364
+ wandb.init(
365
+ project=wandb_project,
366
+ name=wandb_run_name,
367
+ config={
368
+ "train_dir": train_dir,
369
+ "test_dir": test_dir,
370
+ "sample_rate": sample_rate,
371
+ "segment_seconds": segment_seconds,
372
+ "batch_size": batch_size,
373
+ "epochs": epochs,
374
+ "learning_rate": learning_rate,
375
+ "weight_decay": weight_decay,
376
+ "grad_clip": grad_clip,
377
+ "sdr_weight": sdr_weight,
378
+ "sisdr_weight": sisdr_weight,
379
+ "model_dim": model_dim,
380
+ "text_dim": text_dim,
381
+ "n_heads": n_heads,
382
+ "use_amp": use_amp,
383
+ },
384
+ )
385
+
386
+ print("=" * 60)
387
+ print("Audio-Text HTDemucs Training")
388
+ print("=" * 60)
389
+ print(f"Device: {device}")
390
+ print(f"Train directory: {train_dir}")
391
+ print(f"Test directory: {test_dir}")
392
+ print(f"Segment length: {segment_seconds}s ({segment_samples} samples)")
393
+ print(f"Batch size: {batch_size}")
394
+ print(f"Epochs: {epochs}")
395
+ print(f"Learning rate: {learning_rate}")
396
+ print("=" * 60)
397
+
398
+ # Load pretrained models
399
+ print("Loading pretrained HTDemucs...")
400
+ htdemucs = pretrained.get_model('htdemucs').models[0]
401
+
402
+ print("Loading CLAP model...")
403
+ #clap = ClapModel.from_pretrained("laion/clap-htsat-unfused")
404
+ clap = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused") # More memory efficient than loading full ClapModel (text + audio)
405
+ tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
406
+
407
+ # Create model
408
+ print("Building AudioTextHTDemucs model...")
409
+ model = AudioTextHTDemucs(
410
+ htdemucs_model=htdemucs,
411
+ clap_encoder=clap,
412
+ clap_tokenizer=tokenizer,
413
+ model_dim=model_dim,
414
+ text_dim=text_dim,
415
+ num_heads=n_heads,
416
+ )
417
+ model = model.to(device)
418
+
419
+ # Count parameters
420
+ total_params = sum(p.numel() for p in model.parameters())
421
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
422
+ print(f"Total parameters: {total_params:,}")
423
+ print(f"Trainable parameters: {trainable_params:,}")
424
+
425
+ # Create datasets
426
+ print("Creating datasets...")
427
+ train_dataset = MusDBStemDataset(
428
+ root_dir=train_dir,
429
+ segment_samples=segment_samples,
430
+ sample_rate=sample_rate,
431
+ random_segments=True,
432
+ augment=True,
433
+ )
434
+
435
+ val_dataset = MusDBStemDataset(
436
+ root_dir=test_dir,
437
+ segment_samples=segment_samples,
438
+ sample_rate=sample_rate,
439
+ random_segments=False,
440
+ augment=False,
441
+ )
442
+
443
+ # Create subsets if specified
444
+ if 0.0 < pct_train < 1.0:
445
+ num_train = int(len(train_dataset) * pct_train)
446
+ train_idxs = torch.randperm(len(train_dataset))[:num_train]
447
+ train_subset = Subset(train_dataset, train_idxs)
448
+
449
+ if 0.0 < pct_test < 1.0:
450
+ num_val = int(len(val_dataset) * pct_test)
451
+ val_idxs = torch.randperm(len(val_dataset))[:num_val]
452
+ val_subset = Subset(val_dataset, val_idxs)
453
+
454
+
455
+ # Create dataloaders
456
+ train_loader = DataLoader(
457
+ train_dataset if pct_train >= 1.0 else train_subset,
458
+ batch_size=batch_size,
459
+ shuffle=True,
460
+ num_workers=num_workers,
461
+ collate_fn=collate_fn,
462
+ pin_memory=(device == "cuda"),
463
+ drop_last=True,
464
+ )
465
+
466
+ val_loader = DataLoader(
467
+ val_dataset if pct_test >= 1.0 else val_subset,
468
+ batch_size=batch_size,
469
+ shuffle=False,
470
+ num_workers=num_workers,
471
+ collate_fn=collate_fn,
472
+ pin_memory=(device == "cuda"),
473
+ )
474
+
475
+ # Optimizer and scheduler
476
+ optimizer = AdamW(
477
+ model.parameters(),
478
+ lr=learning_rate,
479
+ weight_decay=weight_decay,
480
+ betas=(0.9, 0.999),
481
+ )
482
+
483
+ scheduler = CosineAnnealingLR(
484
+ optimizer,
485
+ T_max=epochs,
486
+ eta_min=learning_rate * 0.01,
487
+ )
488
+
489
+ # Mixed precision scaler
490
+ scaler = GradScaler() if use_amp and device == "cuda" else None
491
+
492
+ # Resume from checkpoint
493
+ start_epoch = 0
494
+ best_sdr = -float("inf")
495
+
496
+ if resume_from is not None:
497
+ resume_path = Path(resume_from)
498
+ if resume_path.exists():
499
+ print(f"Resuming from {resume_path}")
500
+ start_epoch = load_checkpoint(model, optimizer, scheduler, str(resume_path))
501
+ start_epoch += 1
502
+ else:
503
+ # Check for latest checkpoint
504
+ latest_checkpoint = Path(checkpoint_dir) / "latest.pt"
505
+ if latest_checkpoint.exists():
506
+ print(f"Found latest checkpoint at {latest_checkpoint}")
507
+ start_epoch = load_checkpoint(model, optimizer, scheduler, str(latest_checkpoint))
508
+ start_epoch += 1
509
+
510
+ # Training loop
511
+ print("\nStarting training...")
512
+ for epoch in range(start_epoch, epochs):
513
+ print(f"\n{'=' * 60}")
514
+ print(f"Epoch {epoch + 1}/{epochs}")
515
+ print(f"Learning rate: {scheduler.get_last_lr()[0]:.2e}")
516
+ print(f"{'=' * 60}")
517
+
518
+ # Train
519
+ train_metrics = train_epoch(
520
+ model=model,
521
+ dataloader=train_loader,
522
+ optimizer=optimizer,
523
+ scaler=scaler,
524
+ device=device,
525
+ use_amp=use_amp,
526
+ use_L1_cmb_loss=use_L1_cmb_loss,
527
+ l1_sdr_weight=l1_sdr_weight,
528
+ l1_weight=l1_weight,
529
+ grad_clip=grad_clip,
530
+ sdr_weight=sdr_weight,
531
+ sisdr_weight=sisdr_weight,
532
+ epoch=epoch,
533
+ log_every=log_every,
534
+ use_wandb=use_wandb,
535
+ )
536
+ print(f"Train - Loss: {train_metrics['loss']:.4f}, SDR: {train_metrics['sdr']:.2f} dB")
537
+
538
+ # Step scheduler
539
+ scheduler.step()
540
+
541
+ # Validate
542
+ if (epoch + 1) % validate_every == 0:
543
+ val_metrics = validate(
544
+ model=model,
545
+ dataloader=val_loader,
546
+ device=device,
547
+ use_amp=use_amp,
548
+ use_L1_cmb_loss=use_L1_cmb_loss,
549
+ l1_sdr_weight=l1_sdr_weight,
550
+ l1_weight=l1_weight,
551
+ sdr_weight=sdr_weight,
552
+ sisdr_weight=sisdr_weight,
553
+ )
554
+ print(f"Val - Loss: {val_metrics['loss']:.4f}, SDR: {val_metrics['sdr']:.2f} dB")
555
+
556
+ for stem_name in STEM_PROMPTS.keys():
557
+ if f"sdr/{stem_name}" in val_metrics:
558
+ print(f" {stem_name}: {val_metrics[f'sdr/{stem_name}']:.2f} dB")
559
+
560
+ if use_wandb:
561
+ import wandb
562
+ wandb.log({
563
+ "val/loss": val_metrics["loss"],
564
+ "val/sdr": val_metrics["sdr"],
565
+ "val/sisdr": val_metrics["sisdr"],
566
+ **{f"val/{k}": v for k, v in val_metrics.items() if k.startswith("sdr/")},
567
+ "epoch": epoch + 1,
568
+ })
569
+
570
+ is_best = val_metrics["sdr"] > best_sdr
571
+ if is_best:
572
+ best_sdr = val_metrics["sdr"]
573
+ print(f"New best SDR: {best_sdr:.2f} dB")
574
+ else:
575
+ val_metrics = {}
576
+ is_best = False
577
+
578
+ # Save checkpoint
579
+ if (epoch + 1) % save_every == 0 or is_best:
580
+ save_checkpoint(
581
+ model, optimizer, scheduler, epoch + 1,
582
+ {**train_metrics, **val_metrics},
583
+ checkpoint_dir, is_best
584
+ )
585
+ else:
586
+ save_checkpoint(
587
+ model, optimizer, scheduler, epoch + 1,
588
+ {**train_metrics, **val_metrics},
589
+ checkpoint_dir, is_best=False
590
+ )
591
+
592
+ print("\n" + "=" * 60)
593
+ print("Training complete!")
594
+ print(f"Best validation SDR: {best_sdr:.2f} dB")
595
+ print("=" * 60)
596
+
597
+ if use_wandb:
598
+ import wandb
599
+ wandb.finish()
600
+
601
+ return {
602
+ "final_train_metrics": train_metrics,
603
+ "final_val_metrics": val_metrics,
604
+ "best_sdr": best_sdr,
605
+ }
606
+
607
+
608
+ if __name__ == "__main__":
609
+ # Example: run training; dataset paths (e.g. /home/jacob/datasets/musdb18/{train,test}) and other
+ # hyperparameters are read from the YAML config, since train() takes only a config path
610
+ train("config.yaml")
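config.yaml itself is not included in this diff; as a rough guide, the keys that train() above actually reads (using the same fallback defaults it applies) can be written out like this before calling train("config.yaml"). The exact values here are illustrative assumptions.

import yaml

cfg = {
    "data": {"train_dir": "../data/train", "test_dir": "../data/test",
             "sample_rate": 44100, "segment_seconds": 6.0, "pct_train": 1.0, "pct_test": 1.0},
    "model": {"model_dim": 384, "text_dim": 512, "n_heads": 8, "device": None},
    "training": {"batch_size": 4, "num_workers": 0, "num_epochs": 10, "use_amp": False,
                 "resume_from": None, "use_L1_comb_loss": False,
                 "optimizer": {"lr": 1.0e-4, "weight_decay": 1.0e-5, "grad_clip": 1.0},
                 "L1_comb_loss": {"sdr_weight": 1.0, "l1_weight": 0.05},
                 "loss_weights": {"sdr": 0.9, "sisdr": 0.1}},
    "wandb": {"use_wandb": False, "project": "audio-text-htdemucs", "run_name": None,
              "checkpoint_dir": "../checkpoints", "log_every": 50,
              "validate_every": 1, "save_every": 1},
}
with open("config.yaml", "w") as f:
    yaml.safe_dump(cfg, f)  # then: from src.train import train; train("config.yaml")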
utils.py ADDED
@@ -0,0 +1,968 @@
1
+
2
+ from typing import Union, Optional, Dict, List
3
+ from pathlib import Path
4
+ import yaml
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ import matplotlib.pyplot as plt
10
+ import matplotlib
11
+ matplotlib.use('Agg') # Non-interactive backend for server/training use
12
+
13
+
14
+ # ============================================================================
15
+ # YAML Config
16
+ # ============================================================================
17
+
18
+ def load_config(file_path: Union[str, Path]) -> dict:
19
+ """Load a YAML configuration file."""
20
+ with open(file_path, 'r') as f:
21
+ config = yaml.safe_load(f)
22
+
23
+ return config
24
+
25
+
26
+ # ============================================================================
27
+ # Spectrogram Utilities
28
+ # ============================================================================
29
+
30
+ def compute_spectrogram(
31
+ waveform: torch.Tensor,
32
+ n_fft: int = 2048,
33
+ hop_length: int = 512,
34
+ power: float = 2.0,
35
+ to_db: bool = True,
36
+ top_db: float = 80.0,
37
+ ) -> torch.Tensor:
38
+ """
39
+ Compute spectrogram from waveform using STFT.
40
+
41
+ Args:
42
+ waveform: (C, T) or (T,) audio waveform
43
+ n_fft: FFT window size
44
+ hop_length: Hop length between frames
45
+ power: Exponent for magnitude (1.0 for magnitude, 2.0 for power)
46
+ to_db: Convert to decibel scale
47
+ top_db: Threshold for dynamic range in dB
48
+
49
+ Returns:
50
+ (F, T') spectrogram tensor
51
+ """
52
+ # Handle stereo by taking mean to mono
53
+ if waveform.dim() == 2:
54
+ waveform = waveform.mean(dim=0) # (T,)
55
+
56
+ # Move to CPU for STFT computation
57
+ waveform = waveform.cpu()
58
+
59
+ # Compute STFT
60
+ window = torch.hann_window(n_fft)
61
+ stft = torch.stft(
62
+ waveform,
63
+ n_fft=n_fft,
64
+ hop_length=hop_length,
65
+ win_length=n_fft,
66
+ window=window,
67
+ return_complex=True,
68
+ center=True,
69
+ pad_mode='reflect'
70
+ )
71
+
72
+ # Compute magnitude spectrogram
73
+ spec = torch.abs(stft).pow(power)
74
+
75
+ # Convert to dB
76
+ if to_db:
77
+ spec = amplitude_to_db(spec, top_db=top_db)
78
+
79
+ return spec
80
+
81
+
82
+ def amplitude_to_db(
83
+ spec: torch.Tensor,
84
+ ref: float = 1.0,
85
+ amin: float = 1e-10,
86
+ top_db: float = 80.0,
87
+ ) -> torch.Tensor:
88
+ """Convert amplitude/power spectrogram to decibel scale."""
89
+ spec_db = 10.0 * torch.log10(torch.clamp(spec, min=amin) / ref)
90
+
91
+ # Clip to top_db range
92
+ max_val = spec_db.max()
93
+ spec_db = torch.clamp(spec_db, min=max_val - top_db)
94
+
95
+ return spec_db
96
+
97
+
98
+ def plot_spectrogram(
99
+ spec: torch.Tensor,
100
+ sample_rate: int = 44100,
101
+ hop_length: int = 512,
102
+ title: str = "Spectrogram",
103
+ figsize: tuple = (10, 4),
104
+ cmap: str = "magma",
105
+ colorbar: bool = True,
106
+ ) -> plt.Figure:
107
+ """
108
+ Plot a single spectrogram.
109
+
110
+ Args:
111
+ spec: (F, T) spectrogram tensor (in dB scale)
112
+ sample_rate: Audio sample rate
113
+ hop_length: Hop length used for STFT
114
+ title: Plot title
115
+ figsize: Figure size
116
+ cmap: Colormap for spectrogram
117
+ colorbar: Whether to show colorbar
118
+
119
+ Returns:
120
+ matplotlib Figure object
121
+ """
122
+ spec_np = spec.detach().cpu().numpy() if isinstance(spec, torch.Tensor) else spec
123
+
124
+ fig, ax = plt.subplots(figsize=figsize)
125
+
126
+ # Compute time and frequency axes
127
+ n_frames = spec_np.shape[1]
128
+ n_freqs = spec_np.shape[0]
129
+ time_max = n_frames * hop_length / sample_rate
130
+ freq_max = sample_rate / 2 # Nyquist frequency
131
+
132
+ img = ax.imshow(
133
+ spec_np,
134
+ aspect='auto',
135
+ origin='lower',
136
+ cmap=cmap,
137
+ extent=[0, time_max, 0, freq_max / 1000] # freq in kHz
138
+ )
139
+
140
+ ax.set_xlabel('Time (s)')
141
+ ax.set_ylabel('Frequency (kHz)')
142
+ ax.set_title(title)
143
+
144
+ if colorbar:
145
+ cbar = fig.colorbar(img, ax=ax, format='%+2.0f dB')
146
+ cbar.set_label('Magnitude (dB)')
147
+
148
+ fig.tight_layout()
149
+ return fig
150
+
151
+
152
+ def plot_spectrogram_comparison(
153
+ spectrograms: Dict[str, torch.Tensor],
154
+ sample_rate: int = 44100,
155
+ hop_length: int = 512,
156
+ figsize: tuple = (14, 3),
157
+ cmap: str = "magma",
158
+ suptitle: Optional[str] = None,
159
+ ) -> plt.Figure:
160
+ """
161
+ Plot multiple spectrograms side by side for comparison.
162
+
163
+ Args:
164
+ spectrograms: Dict mapping names to spectrogram tensors
165
+ sample_rate: Audio sample rate
166
+ hop_length: Hop length used for STFT
167
+ figsize: Figure size (width, height per row)
168
+ cmap: Colormap for spectrograms
169
+ suptitle: Super title for the figure
170
+
171
+ Returns:
172
+ matplotlib Figure object
173
+ """
174
+ n_specs = len(spectrograms)
175
+ fig, axes = plt.subplots(
176
+ 1, n_specs,
177
+ figsize=(figsize[0], figsize[1]),
178
+ constrained_layout=True # Better layout handling with colorbars
179
+ )
180
+
181
+ if n_specs == 1:
182
+ axes = [axes]
183
+
184
+ # Find global min/max for consistent colorbar
185
+ all_specs = [s.detach().cpu().numpy() if isinstance(s, torch.Tensor) else s
186
+ for s in spectrograms.values()]
187
+ vmin = min(s.min() for s in all_specs)
188
+ vmax = max(s.max() for s in all_specs)
189
+
190
+ for ax, (name, spec) in zip(axes, spectrograms.items()):
191
+ spec_np = spec.detach().cpu().numpy() if isinstance(spec, torch.Tensor) else spec
192
+
193
+ n_frames = spec_np.shape[1]
194
+ time_max = n_frames * hop_length / sample_rate
195
+ freq_max = sample_rate / 2
196
+
197
+ img = ax.imshow(
198
+ spec_np,
199
+ aspect='auto',
200
+ origin='lower',
201
+ cmap=cmap,
202
+ extent=[0, time_max, 0, freq_max / 1000],
203
+ vmin=vmin,
204
+ vmax=vmax,
205
+ )
206
+
207
+ ax.set_xlabel('Time (s)')
208
+ ax.set_ylabel('Frequency (kHz)')
209
+ ax.set_title(name)
210
+
211
+ # Add single colorbar
212
+ fig.colorbar(img, ax=axes, format='%+2.0f dB', label='Magnitude (dB)')
213
+
214
+ if suptitle:
215
+ fig.suptitle(suptitle, fontsize=12)
216
+
217
+ return fig
218
+
219
+
220
+ def plot_separation_spectrograms(
221
+ mixture: torch.Tensor,
222
+ estimated: torch.Tensor,
223
+ reference: torch.Tensor,
224
+ stem_name: str = "stem",
225
+ sample_rate: int = 44100,
226
+ n_fft: int = 2048,
227
+ hop_length: int = 512,
228
+ ) -> plt.Figure:
229
+ """
230
+ Create a comparison spectrogram plot for stem separation.
231
+ Shows mixture, estimated, reference, and difference.
232
+
233
+ Args:
234
+ mixture: (C, T) mixture waveform
235
+ estimated: (C, T) estimated stem waveform
236
+ reference: (C, T) ground truth stem waveform
237
+ stem_name: Name of the stem for title
238
+ sample_rate: Audio sample rate
239
+ n_fft: FFT window size
240
+ hop_length: Hop length
241
+
242
+ Returns:
243
+ matplotlib Figure object
244
+ """
245
+ # Compute spectrograms
246
+ spec_mix = compute_spectrogram(mixture, n_fft=n_fft, hop_length=hop_length)
247
+ spec_est = compute_spectrogram(estimated, n_fft=n_fft, hop_length=hop_length)
248
+ spec_ref = compute_spectrogram(reference, n_fft=n_fft, hop_length=hop_length)
249
+
250
+ # Create comparison plot
251
+ spectrograms = {
252
+ "Mixture": spec_mix,
253
+ f"Estimated ({stem_name})": spec_est,
254
+ f"Ground Truth ({stem_name})": spec_ref,
255
+ }
256
+
257
+ fig = plot_spectrogram_comparison(
258
+ spectrograms,
259
+ sample_rate=sample_rate,
260
+ hop_length=hop_length,
261
+ suptitle=f"Stem Separation: {stem_name.capitalize()}"
262
+ )
263
+
264
+ return fig
265
+
266
+
267
+ def plot_all_stems_spectrograms(
268
+ mixture: torch.Tensor,
269
+ estimated_stems: Dict[str, torch.Tensor],
270
+ reference_stems: Dict[str, torch.Tensor],
271
+ sample_rate: int = 44100,
272
+ n_fft: int = 2048,
273
+ hop_length: int = 512,
274
+ figsize: tuple = (16, 12),
275
+ ) -> plt.Figure:
276
+ """
277
+ Create a grid of spectrograms for all stems.
278
+
279
+ Args:
280
+ mixture: (C, T) mixture waveform
281
+ estimated_stems: Dict mapping stem names to estimated (C, T) waveforms
282
+ reference_stems: Dict mapping stem names to reference (C, T) waveforms
283
+ sample_rate: Audio sample rate
284
+ n_fft: FFT window size
285
+ hop_length: Hop length
286
+ figsize: Figure size
287
+
288
+ Returns:
289
+ matplotlib Figure object
290
+ """
291
+ stem_names = list(estimated_stems.keys())
292
+ n_stems = len(stem_names)
293
+
294
+ # Create grid: rows = stems, cols = [Estimated, Ground Truth]
295
+ fig, axes = plt.subplots(
296
+ n_stems, 2,
297
+ figsize=figsize,
298
+ constrained_layout=True # Better layout handling with colorbars
299
+ )
300
+
301
+ if n_stems == 1:
302
+ axes = axes.reshape(1, -1)
303
+
304
+ # Compute all spectrograms and find global min/max for consistent colorbar
305
+ all_specs = []
306
+ spec_data = {}
307
+
308
+ for stem_name in stem_names:
309
+ spec_est = compute_spectrogram(
310
+ estimated_stems[stem_name], n_fft=n_fft, hop_length=hop_length
311
+ )
312
+ spec_ref = compute_spectrogram(
313
+ reference_stems[stem_name], n_fft=n_fft, hop_length=hop_length
314
+ )
315
+ spec_data[stem_name] = {'est': spec_est, 'ref': spec_ref}
316
+ all_specs.extend([spec_est.cpu().numpy(), spec_ref.cpu().numpy()])
317
+
318
+ vmin = min(s.min() for s in all_specs)
319
+ vmax = max(s.max() for s in all_specs)
320
+
321
+ for row, stem_name in enumerate(stem_names):
322
+ spec_est = spec_data[stem_name]['est']
323
+ spec_ref = spec_data[stem_name]['ref']
324
+
325
+ # Get time extent
326
+ n_frames = spec_est.shape[1]
327
+ time_max = n_frames * hop_length / sample_rate
328
+ freq_max = sample_rate / 2
329
+
330
+ # Plot estimated
331
+ spec_np = spec_est.detach().cpu().numpy()
332
+ axes[row, 0].imshow(
333
+ spec_np, aspect='auto', origin='lower', cmap='magma',
334
+ extent=[0, time_max, 0, freq_max / 1000],
335
+ vmin=vmin, vmax=vmax
336
+ )
337
+ axes[row, 0].set_title(f'{stem_name.capitalize()} - Estimated')
338
+ axes[row, 0].set_ylabel('Freq (kHz)')
339
+
340
+ # Plot reference
341
+ spec_np = spec_ref.detach().cpu().numpy()
342
+ img = axes[row, 1].imshow(
343
+ spec_np, aspect='auto', origin='lower', cmap='magma',
344
+ extent=[0, time_max, 0, freq_max / 1000],
345
+ vmin=vmin, vmax=vmax
346
+ )
347
+ axes[row, 1].set_title(f'{stem_name.capitalize()} - Ground Truth')
348
+
349
+ # Set x labels on bottom row
350
+ axes[-1, 0].set_xlabel('Time (s)')
351
+ axes[-1, 1].set_xlabel('Time (s)')
352
+
353
+ fig.colorbar(img, ax=axes, format='%+2.0f dB', label='Magnitude (dB)')
354
+ fig.suptitle('Stem Separation Results', fontsize=14)
355
+
356
+ return fig
357
+
358
+
359
+ # ============================================================================
360
+ # Weights & Biases Logging Utilities
361
+ # ============================================================================
362
+
363
+ def log_spectrogram_to_wandb(
364
+ fig: plt.Figure,
365
+ key: str = "spectrogram",
366
+ step: Optional[int] = None,
367
+ caption: Optional[str] = None,
368
+ ):
369
+ """
370
+ Log a matplotlib figure as an image to W&B.
371
+
372
+ Args:
373
+ fig: matplotlib Figure object
374
+ key: W&B log key
375
+ step: Training step (optional)
376
+ caption: Image caption
377
+ """
378
+ import wandb
379
+
380
+ # Convert figure to W&B Image
381
+ wandb_img = wandb.Image(fig, caption=caption)
382
+
383
+ log_dict = {key: wandb_img}
384
+ if step is not None:
385
+ wandb.log(log_dict, step=step)
386
+ else:
387
+ wandb.log(log_dict)
388
+
389
+ # Close the figure to free memory
390
+ plt.close(fig)
391
+
392
+ def log_audio_to_wandb(
393
+ audio: torch.Tensor,
394
+ stem_name: str,
395
+ is_gt: bool,
396
+ sample_rate: int = 44100
397
+ ):
398
+ """
399
+ Log audio waveform to W&B.
400
+
401
+ Args:
402
+ audio: (C, T) audio waveform tensor
403
+ stem_name: Name of the stem
404
+ is_gt: Whether this is ground truth audio (or extracted audio)
405
+ sample_rate: Audio sample rate
406
+ """
407
+ import wandb
408
+
409
+ # Convert to numpy
410
+ audio_np = audio.detach().cpu().numpy().T # (T, C)
411
+ title = f"true_{stem_name}" if is_gt else f"extracted_{stem_name}"
412
+ keyname = f"audio/{title}"
413
+ wandb.log({
414
+ keyname: wandb.Audio(
415
+ audio_np,
416
+ sample_rate=sample_rate,
417
+ caption=title
418
+ )
419
+ })
420
+
421
+ def log_separation_spectrograms_to_wandb(
422
+ mixture: torch.Tensor,
423
+ estimated: torch.Tensor,
424
+ reference: torch.Tensor,
425
+ stem_name: str,
426
+ step: Optional[int] = None,
427
+ sample_rate: int = 44100,
428
+ ):
429
+ """
430
+ Log stem separation spectrograms to W&B.
431
+
432
+ Args:
433
+ mixture: (C, T) mixture waveform
434
+ estimated: (C, T) estimated stem waveform
435
+ reference: (C, T) ground truth stem waveform
436
+ stem_name: Name of the stem
437
+ step: Training step (optional)
438
+ sample_rate: Audio sample rate
439
+ """
440
+ fig = plot_separation_spectrograms(
441
+ mixture=mixture,
442
+ estimated=estimated,
443
+ reference=reference,
444
+ stem_name=stem_name,
445
+ sample_rate=sample_rate,
446
+ )
447
+
448
+ log_spectrogram_to_wandb(
449
+ fig=fig,
450
+ key=f"spectrograms/{stem_name}",
451
+ step=step,
452
+ caption=f"Separation for {stem_name}"
453
+ )
454
+
455
+
456
+ def log_all_stems_to_wandb(
457
+ mixture: torch.Tensor,
458
+ estimated_stems: Dict[str, torch.Tensor],
459
+ reference_stems: Dict[str, torch.Tensor],
460
+ step: Optional[int] = None,
461
+ sample_rate: int = 44100,
462
+ log_individual: bool = True,
463
+ log_combined: bool = True,
464
+ ):
465
+ """
466
+ Log spectrograms for all stems to W&B.
467
+
468
+ Args:
469
+ mixture: (C, T) mixture waveform
470
+ estimated_stems: Dict mapping stem names to estimated (C, T) waveforms
471
+ reference_stems: Dict mapping stem names to reference (C, T) waveforms
472
+ step: Training step (optional)
473
+ sample_rate: Audio sample rate
474
+ log_individual: Log individual stem comparisons
475
+ log_combined: Log combined grid of all stems
476
+ """
477
+ if log_individual:
478
+ for stem_name in estimated_stems.keys():
479
+ log_separation_spectrograms_to_wandb(
480
+ mixture=mixture,
481
+ estimated=estimated_stems[stem_name],
482
+ reference=reference_stems[stem_name],
483
+ stem_name=stem_name,
484
+ step=step,
485
+ sample_rate=sample_rate,
486
+ )
487
+
488
+ if log_combined:
489
+ fig = plot_all_stems_spectrograms(
490
+ mixture=mixture,
491
+ estimated_stems=estimated_stems,
492
+ reference_stems=reference_stems,
493
+ sample_rate=sample_rate,
494
+ )
495
+ log_spectrogram_to_wandb(
496
+ fig=fig,
497
+ key="spectrograms/all_stems",
498
+ step=step,
499
+ caption="All stems separation comparison"
500
+ )
501
+
502
+ # --- Audio I/O ---
503
+
504
+ # def load_audio(
505
+ # file_path: Union[str, Path],
506
+ # sample_rate: int = DEFAULT_SAMPLE_RATE,
507
+ # max_len: int = 5,
508
+ # mono: bool = True
509
+ # ) -> Tuple[np.ndarray, int]:
510
+ # """
511
+ # Load an audio file into a numpy array.
512
+
513
+ # Parameters
514
+ # ----------
515
+ # file_path (str or Path): Path to the audio file
516
+ # max_len (int): Maximum length of audio in seconds
517
+ # sample_rate (int, optional): Target sample rate
518
+ # mono (bool, optional): Whether to convert audio to mono
519
+
520
+ # Returns
521
+ # -------
522
+ # tuple
523
+ # (audio_data, sample_rate)
524
+ # """
525
+ # try:
526
+ # audio_data, sr = librosa.load(file_path, sr=sample_rate, mono=mono)
527
+
528
+ # # Clip audio to max_len
529
+ # max_samples = int(sample_rate * max_len)
530
+ # if len(audio_data) > max_samples:
531
+ # audio_data = audio_data[:max_samples]
532
+ # else:
533
+ # padding = max_samples - len(audio_data)
534
+ # audio_data = np.pad(
535
+ # audio_data,
536
+ # (0, padding),
537
+ # 'constant'
538
+ # )
539
+
540
+ # return audio_data, sr
541
+ # except Exception as e:
542
+ # raise IOError(f"Error loading audio file {file_path}: {str(e)}")
543
+
544
+ # def save_audio(
545
+ # audio_data: np.ndarray,
546
+ # file_path: Union[str, Path],
547
+ # sample_rate: int = DEFAULT_SAMPLE_RATE,
548
+ # normalize: bool = True,
549
+ # file_format: str = 'flac'
550
+ # ) -> None:
551
+ # """
552
+ # Save audio data to a file.
553
+
554
+ # Parameters
555
+ # ----------
556
+ # audio_data (np.ndarray): Audio time series
557
+ # file_path (str or Path): Path to save the audio file
558
+ # sample_rate (int, optional): Sample rate of audio
559
+ # normalize (bool, optional): Whether to normalize audio before saving
560
+ # file_format (str, optional): Audio file format
561
+
562
+ # Returns
563
+ # -------
564
+ # None
565
+ # """
566
+ # output_dir = Path(file_path).parent
567
+ # if output_dir and not output_dir.exists():
568
+ # try:
569
+ # output_dir.mkdir(parents=True, exist_ok=True)
570
+ # except Exception as e:
571
+ # raise IOError(f"Error creating directory {output_dir}: {str(e)}")
572
+
573
+ # # Normalize audio before saving
574
+ # audio_data = librosa.util.normalize(audio_data) if normalize else audio_data
575
+
576
+ # try:
577
+ # sf.write(file_path, audio_data, sample_rate, format=file_format)
578
+ # except Exception as e:
579
+ # raise IOError(f"Error saving audio to {file_path}: {str(e)}")
580
+
581
+ # # --- Gap Processing ---
582
+
583
+ # def create_gap_mask(
584
+ # audio_len_samples: int,
585
+ # gap_len_s: float,
586
+ # sample_rate: int = DEFAULT_SAMPLE_RATE,
587
+ # gap_start_s: Optional[float] = None,
588
+ # ) -> Tuple[np.ndarray, Tuple[int, int]]:
589
+ # """
590
+ # Creates a binary mask with a single gap of zeros at a random location.
591
+
592
+ # Parameters
593
+ # ----------
594
+ # audio_len_samples : int
595
+ # Length of the target audio in samples.
596
+ # gap_len_s : float
597
+ # Desired gap length in seconds.
598
+ # sample_rate : int, optional
599
+ # Sample rate. Defaults to DEFAULT_SAMPLE_RATE.
600
+ # gap_start_s : float, optional
601
+ # Timestap in seconds where the gap starts. If None, a random position is chosen.
602
+
603
+ # Returns
604
+ # -------
605
+ # Tuple[np.ndarray, Tuple[int, int]]
606
+ # (mask, (gap_start_sample, gap_end_sample))
607
+ # Mask is 1.0 for signal, 0.0 for gap (float32).
608
+ # Interval is gap start/end indices in samples.
609
+ # """
610
+ # gap_len_samples = int(gap_len_s * sample_rate)
611
+
612
+ # if gap_len_samples <= 0:
613
+ # # No gap, return full mask and zero interval
614
+ # return np.ones(audio_len_samples, dtype=np.float32), (0, 0)
615
+
616
+ # if gap_len_samples >= audio_len_samples:
617
+ # # Gap covers everything
618
+ # print(f"Warning: Gap length ({gap_len_s}s) >= audio length. Returning all zeros mask.")
619
+ # return np.zeros(audio_len_samples, dtype=np.float32), (0, audio_len_samples)
620
+
621
+ # # Choose a random start position for the gap (inclusive range)
622
+ # max_start_sample = audio_len_samples - gap_len_samples
623
+ # if (gap_start_s is None):
624
+ # gap_start_sample = np.random.randint(0, max_start_sample + 1)
625
+ # else:
626
+ # gap_start_sample = int(gap_start_s * sample_rate)
627
+
628
+ # gap_end_sample = gap_start_sample + gap_len_samples
629
+
630
+ # # Create mask
631
+ # mask = np.ones(audio_len_samples, dtype=np.float32)
632
+ # mask[gap_start_sample:gap_end_sample] = 0.0
633
+
634
+ # return mask, (gap_start_sample, gap_end_sample)
635
+
636
+ # def add_random_gap(
637
+ # file_path: Union[str, Path],
638
+ # gap_len: int,
639
+ # sample_rate: int = DEFAULT_SAMPLE_RATE,
640
+ # mono: bool = True
641
+ # ) -> Tuple[np.ndarray, Tuple[float, float]]:
642
+ # """
643
+ # Insert a silent gap of gap_len seconds at a random valid position within the audio file and return the modified audio data.
644
+
645
+ # Parameters
646
+ # ----------
647
+ # file_path (str or Path): Path to the audio file
648
+ # gap_len (int): Gap length (seconds) to add at one location within the audio file
649
+ # sample_rate (int, optional): Target sample rate
650
+ # mono (bool, optional): Whether to convert audio to mono
651
+
652
+ # Returns
653
+ # -------
654
+ # tuple
655
+ # (modified_audio_data, gap_interval)
656
+ # gap_interval is a tuple of (start_time, end_time) in seconds
657
+ # """
658
+ # audio_data, sr = load_audio(file_path, sample_rate=sample_rate, mono=mono)
659
+
660
+ # # Convert gap length to samples
661
+ # gap_length = int(gap_len * sample_rate)
662
+ # audio_len = len(audio_data)
663
+
664
+ # # Handle case where gap is longer than audio
665
+ # if gap_length >= audio_len:
666
+ # raise ValueError(f"Gap length ({gap_len}s) must be shorter than the audio length ({audio_len / sample_rate:.2f}s)")
667
+
668
+ # # Get sample indices for gap placement
669
+ # gap_start_idx = np.random.randint(0, audio_len - gap_length + 1)
670
+ # silence = np.zeros(gap_length)
671
+
672
+ # # Add gap
673
+ # audio_new = np.concatenate([audio_data[:gap_start_idx], silence, audio_data[gap_start_idx + gap_length:]])
674
+
675
+ # # Return gap interval as a tuple
676
+ # gap_interval = (gap_start_idx / sample_rate, (gap_start_idx + gap_length) / sample_rate)
677
+
678
+ # return audio_new, gap_interval
679
+
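+ # # Example (illustrative sketch, not part of the original file; "mix.wav" is a hypothetical path):
+ # # gapped, (t0, t1) = add_random_gap("mix.wav", gap_len=2, sample_rate=DEFAULT_SAMPLE_RATE)
+ # # print(f"Silence inserted between {t0:.2f}s and {t1:.2f}s")
+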
680
+ # # --- STFT Processing ---
681
+
682
+ # def extract_spectrogram(
683
+ # audio_data: np.ndarray,
684
+ # n_fft: int = 2048,
685
+ # hop_length: int = 512,
686
+ # win_length: Optional[int] = None,
687
+ # window: str = 'hann',
688
+ # center: bool = True,
689
+ # power: float = 1.0
690
+ # ) -> np.ndarray:
691
+ # """
692
+ # Extract magnitude spectrogram from audio data.
693
+
694
+ # Parameters
695
+ # ----------
696
+ # audio_data (np.ndarray): Audio time series
697
+ # n_fft (int, optional): FFT window size
698
+ # hop_length (int, optional): Number of samples between successive frames
699
+ # win_length (int or None, optional): Window length. If None, defaults to n_fft
700
+ # window (str, optional): Window specification
701
+ # center (bool, optional): If True, pad signal on both sides
702
+ # power (float, optional): Exponent for the magnitude spectrogram (e.g. 1 for energy, 2 for power)
703
+
704
+ # Returns
705
+ # -------
706
+ # np.ndarray
707
+ # Magnitude spectrogram
708
+ # """
709
+ # if power < 0:
710
+ # raise ValueError("Power must be non-negative")
711
+
712
+ # if win_length is None:
713
+ # win_length = n_fft
714
+
715
+ # stft = librosa.stft(
716
+ # audio_data,
717
+ # n_fft=n_fft,
718
+ # hop_length=hop_length,
719
+ # win_length=win_length,
720
+ # window=window,
721
+ # center=center
722
+ # )
723
+
724
+ # # Return the magnitude spectrogram raised to the requested power, as documented above
+ # return np.abs(stft) ** power
725
+
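+ # # Example (illustrative sketch, not part of the original file; assumes `y` is a mono np.ndarray from load_audio):
+ # # mag = extract_spectrogram(y, n_fft=2048, hop_length=512, power=1.0)
+ # # print(mag.shape)  # (1 + n_fft // 2, n_frames) = (1025, n_frames)
+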
726
+ # def extract_mel_spectrogram(
727
+ # audio_data: np.ndarray,
728
+ # sample_rate: int = DEFAULT_SAMPLE_RATE,
729
+ # n_fft: int = 2048,
730
+ # hop_length: int = 512,
731
+ # n_mels: int = 128,
732
+ # fmin: float = 0.0,
733
+ # fmax: Optional[float] = None,
734
+ # power: float = 2.0
735
+ # ) -> np.ndarray:
736
+ # """
737
+ # Extract mel spectrogram from audio data.
738
+
739
+ # Parameters
740
+ # ----------
741
+ # audio_data (np.ndarray): Audio time series
742
+ # sample_rate (int, optional): Sample rate of audio
743
+ # n_fft (int, optional): FFT window size
744
+ # hop_length (int, optional): Number of samples between successive frames
745
+ # n_mels (int, optional): Number of mel bands
746
+ # fmin (float, optional): Minimum frequency
747
+ # fmax (float or None, optional): Maximum frequency. If None, use sample_rate/2
748
+ # power (float, optional): Exponent for the magnitude spectrogram (e.g. 1 for energy, 2 for power)
749
+
750
+ # Returns
751
+ # -------
752
+ # np.ndarray
753
+ # Mel spectrogram
754
+ # """
755
+ # if power < 0:
756
+ # raise ValueError("Power must be non-negative")
757
+
758
+ # return librosa.feature.melspectrogram(
759
+ # y=audio_data,
760
+ # sr=sample_rate,
761
+ # n_fft=n_fft,
762
+ # hop_length=hop_length,
763
+ # n_mels=n_mels,
764
+ # fmin=fmin,
765
+ # fmax=fmax,
766
+ # power=power
767
+ # )
768
+
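+ # # Example (illustrative sketch, not part of the original file; assumes `y` and `sr` come from load_audio):
+ # # mel = extract_mel_spectrogram(y, sample_rate=sr, n_mels=128, power=2.0)
+ # # mel_db = librosa.power_to_db(mel, ref=np.max)  # dB scale for plotting
+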
769
+ # def spectrogram_to_audio(
770
+ # spectrogram: np.ndarray,
771
+ # phase: Optional[np.ndarray] = None,
772
+ # phase_info: bool = False,
773
+ # n_fft=512,
774
+ # n_iter=64,
775
+ # window='hann',
776
+ # hop_length=512,
777
+ # win_length=None,
778
+ # center=True) -> np.ndarray:
779
+ # """
780
+ # Convert a spectrogram back to audio using either:
781
+ # 1. Original phase information (if provided)
782
+ # 2. Griffin-Lim algorithm to estimate phase (if no phase provided)
783
+
784
+ # Even with the original phase, the reconstruction is not truly lossless (the error is on the order of 1e-33 MSE).
785
+
786
+ # Parameters
787
+ # ----------
788
+ # spectrogram (np.ndarray): The magnitude spectrogram to convert back to audio
789
+ # phase (np.ndarray, optional): Phase information to use for reconstruction. If None, Griffin-Lim is used.
790
+ # phase_info (bool): If True, the input is assumed to be a complex spectrogram that already carries phase and is inverted directly with istft
791
+ # n_fft (int): FFT window size
792
+ # n_iter (int, optional): Number of iterations for Griffin-Lim algorithm
793
+ # window (str): Window function to use
794
+ # win_length (int or None): Window size. If None, defaults to n_fft
795
+ # hop_length (int, optional): Number of samples between successive frames
796
+ # center (bool, optional): Whether to pad the signal at the edges
797
+
798
+ # Returns
799
+ # -------
800
+ # y : np.ndarray
+ # The reconstructed audio signal
801
+ # """
802
+ # # If the input is a real-valued spectrogram in dB scale, convert back to amplitude
803
+ # if np.isrealobj(spectrogram) and np.max(spectrogram) < 0:
804
+ # spectrogram = librosa.db_to_amplitude(spectrogram)
805
+
806
+ # if phase_info:
807
+ # return librosa.istft(spectrogram, n_fft=n_fft, hop_length=hop_length,
808
+ # win_length=win_length, window=window, center=center)
809
+
810
+ # # If phase information is provided, use it for reconstruction
811
+ # if phase is not None:
812
+ # # Combine magnitude and phase to form complex spectrogram
813
+ # complex_spectrogram = spectrogram * np.exp(1j * phase)
814
+
815
+ # # Inverse STFT to get audio
816
+ # y = librosa.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length,
817
+ # win_length=win_length, window=window, center=center)
818
+ # else:
819
+ # # Use Griffin-Lim algorithm to estimate phase
820
+ # y = librosa.griffinlim(spectrogram, n_fft=n_fft, n_iter=n_iter,
821
+ # hop_length=hop_length, win_length=win_length,
822
+ # window=window, center=center)
823
+ # return y
824
+
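+ # # Example (illustrative sketch, not part of the original file; assumes `y` is a mono np.ndarray):
+ # # stft = librosa.stft(y, n_fft=2048, hop_length=512)
+ # # mag, phase = np.abs(stft), np.angle(stft)
+ # # y_exact = spectrogram_to_audio(mag, phase=phase, n_fft=2048, hop_length=512)  # near-lossless
+ # # y_glim = spectrogram_to_audio(mag, n_fft=2048, hop_length=512, n_iter=64)  # Griffin-Lim phase estimate
+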
825
+ # def mel_spectrogram_to_audio(
826
+ # mel_spectrogram: np.ndarray,
827
+ # sample_rate: int = DEFAULT_SAMPLE_RATE,
828
+ # n_fft: int = 2048,
829
+ # hop_length: int = 512,
830
+ # n_iter: int = 32,
831
+ # n_mels: int = 128,
832
+ # fmin: float = 0.0,
833
+ # fmax: Optional[float] = None,
834
+ # power: float = 2.0
835
+ # ) -> np.ndarray:
836
+ # """
837
+ # Convert a mel spectrogram to audio using inverse transformation and Griffin-Lim.
838
+
839
+ # Parameters
840
+ # ----------
841
+ # mel_spectrogram (np.ndarray): Mel spectrogram
842
+ # sample_rate (int, optional): Sample rate of audio
843
+ # n_fft (int, optional): FFT window size
844
+ # hop_length (int, optional): Number of samples between successive frames
845
+ # n_iter (int, optional): Number of iterations for Griffin-Lim
846
+ # n_mels (int, optional): Number of mel bands
847
+ # fmin (float, optional): Minimum frequency
848
+ # fmax (float or None, optional): Maximum frequency. If None, use sample_rate/2
849
+ # power (float, optional): Exponent for the magnitude spectrogram (e.g. 1 for energy, 2 for power)
850
+
851
+ # Returns
852
+ # -------
853
+ # np.ndarray
854
+ # Audio time series
855
+ # """
856
+ # # Create a mel filterbank
857
+ # mel_basis = librosa.filters.mel(
858
+ # sr=sample_rate,
859
+ # n_fft=n_fft,
860
+ # n_mels=n_mels,
861
+ # fmin=fmin,
862
+ # fmax=fmax
863
+ # )
864
+
865
+ # # Compute the pseudo-inverse of the mel filterbank
866
+ # mel_filterbank_inv = np.linalg.pinv(mel_basis)
867
+
868
+ # # Convert Mel spectrogram to linear spectrogram
869
+ # linear_spec = np.dot(mel_filterbank_inv, mel_spectrogram)
870
+
871
+ # # If the input was a power spectrogram, take the square root
872
+ # if power == 2.0:
873
+ # # Clip small negative values introduced by the pseudo-inverse before taking the square root
+ # linear_spec = np.sqrt(np.maximum(linear_spec, 0.0))
874
+
875
+ # # Perform Griffin-Lim to estimate the phase and convert to audio
876
+ # audio_data = librosa.griffinlim(
877
+ # linear_spec,
878
+ # hop_length=hop_length,
879
+ # n_fft=n_fft,
880
+ # n_iter=n_iter
881
+ # )
882
+
883
+ # return audio_data
884
+
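+ # # Example round trip (illustrative sketch, not part of the original file); the result is lossy
+ # # because the mel projection discards information and Griffin-Lim only estimates the phase:
+ # # mel = extract_mel_spectrogram(y, sample_rate=sr, n_fft=2048, hop_length=512, power=2.0)
+ # # y_hat = mel_spectrogram_to_audio(mel, sample_rate=sr, n_fft=2048, hop_length=512, power=2.0)
+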
885
+ # def visualize_spectrogram(
886
+ # spectrogram: np.ndarray,
887
+ # power: int = 1,
888
+ # sample_rate: int = DEFAULT_SAMPLE_RATE,
889
+ # n_fft: int = 512,
890
+ # hop_length: int = 192,
891
+ # win_length: int = 384,
892
+ # gap_int: Optional[Tuple[float, float]] = None,
893
+ # in_db: bool = False,
894
+ # y_axis: str = 'log',
895
+ # x_axis: str = 'time',
896
+ # title: str = 'Spectrogram',
897
+ # save_path: Optional[Union[str, Path]] = None
898
+ # ) -> Optional[plt.Figure]:
899
+ # """
900
+ # Visualize a spectrogram.
901
+
902
+ # Parameters
903
+ # ----------
904
+ # spectrogram (np.ndarray): Spectrogram to visualize
905
+ # power (int): Scale of the input spectrogram: 1 for energy (magnitude), 2 for power
906
+ # sample_rate (int, optional): Sample rate of audio
907
+ # n_fft (int, optional): FFT window size
+ # hop_length (int, optional): Number of samples between successive frames
+ # win_length (int, optional): Window length
908
+ # gap_int (float tuple, optional): Start and end time [s] of the gap (if given) to be plotted as vertical lines
909
+ # in_db (bool, optional): Whether the spectrogram is already in dB scale
910
+ # y_axis (str, optional): Scale for the y-axis ('linear', 'log', or 'mel')
911
+ # x_axis (str, optional): Scale for the x-axis ('time' or 'frames')
912
+ # title (str, optional): Title for the plot
913
+ # save_path (str or Path or None, optional): Path to save the visualization. If None, the figure is returned instead of being saved.
914
+
915
+ # Returns
916
+ # -------
917
+ # Figure or None
918
+ # The matplotlib Figure object if save_path is None, otherwise None
919
+ # """
920
+ # if power not in (1, 2):
921
+ # raise ValueError("Power must be 1 (energy) or 2 (power)")
922
+
923
+ # # Convert to dB scale if needed
924
+ # if in_db:
925
+ # spectrogram_data = np.array(spectrogram)
926
+ # elif power == 1:
927
+ # spectrogram_data = librosa.amplitude_to_db(spectrogram, ref=np.max, amin=1e-5, top_db=80)
928
+ # else: # power == 2
929
+ # spectrogram_data = librosa.power_to_db(spectrogram, ref=np.max, amin=1e-5, top_db=80)
930
+
931
+
932
+ # fig, ax = plt.subplots(figsize=(10, 4))
933
+ # img = librosa.display.specshow(
934
+ # spectrogram_data,
935
+ # sr=sample_rate,
936
+ # n_fft=n_fft,
937
+ # win_length=win_length,
938
+ # hop_length=hop_length,
939
+ # y_axis=y_axis,
940
+ # x_axis=x_axis,
941
+ # ax=ax
942
+ # )
943
+
944
+ # # Compute gap start and end indices and plot vertical lines
945
+ # if gap_int is not None:
946
+ # gap_start_s, gap_end_s = gap_int
947
+
948
+ # ax.axvline(x=gap_start_s, color='white', linestyle='--', label='Gap Start')
949
+ # ax.axvline(x=gap_end_s, color='white', linestyle='--', label='Gap End')
950
+ # ax.legend()
951
+
952
+ # # Add colorbar and title
953
+ # fig.colorbar(img, ax=ax, format='%+2.0f dB')
954
+ # ax.set_title(title)
955
+ # fig.tight_layout()
956
+
957
+ # # Save or return the figure
958
+ # if save_path is not None:
959
+ # save_path = Path(save_path)
960
+ # output_dir = save_path.parent
961
+ # if output_dir and not output_dir.exists():
962
+ # output_dir.mkdir(parents=True, exist_ok=True)
963
+
964
+ # fig.savefig(save_path)
965
+ # plt.close(fig)
966
+ # return None
967
+
968
+ # return fig
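+
+ # # Example (illustrative sketch, not part of the original file; the output path is hypothetical):
+ # # spec = extract_spectrogram(y, n_fft=512, hop_length=192, win_length=384, power=1.0)
+ # # visualize_spectrogram(spec, power=1, sample_rate=sr, n_fft=512, hop_length=192,
+ # #                       win_length=384, gap_int=(1.0, 2.0), title="Gapped mix",
+ # #                       save_path="plots/gapped_mix_spec.png")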