#!/usr/bin/env python3
"""
Cleanup utility for AbMelt inference pipeline temp directory.
Removes intermediate GROMACS files while preserving files needed for:
- Re-running descriptor computation (--skip-md)
- Re-running inference (--skip-descriptors)
- Debugging and validation
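
Example invocation (hypothetical paths; see main() below for the full flag list):
    python cleanup.py /path/to/abmelt_tmp --antibody-name mAb1 --dry-run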
"""
import logging
from pathlib import Path
from typing import List, Set, Dict
logger = logging.getLogger(__name__)

# Files that MUST be kept for the pipeline to work
REQUIRED_FILES = {
    # Structure files
    "structure": [
        "{antibody_name}.pdb",  # Input structure
        "processed.pdb",        # Processed structure (needed for re-indexing)
        "processed.gro",        # Processed GRO (needed for re-indexing)
        "topol.top",            # Topology file
        "index.ndx",            # Index file for CDR regions
    ],
    # MD simulation final outputs (needed for descriptor computation)
    "md_final": [
        "md_final_{temp}.xtc",  # Final processed trajectory per temperature
        "md_final_{temp}.gro",  # Final reference structure per temperature
        "md_{temp}.tpr",        # Run input file per temperature
    ],
    # Descriptor computation outputs
    "descriptors": [
        "descriptors.csv",      # Aggregated descriptors (CSV)
        "descriptors.pkl",      # Aggregated descriptors (pickle)
        "*.xvg",                # All GROMACS descriptor files (needed for re-aggregation)
        "res_sasa_{temp}.np",   # SASA data per temperature
        "sconf_{temp}.log",     # Conformational entropy log per temperature
    ],
    # Order parameter files (optional - can be regenerated but useful for debugging)
    "order_params": [
        "order_s2_{temp}K_{block}_{start}.csv",  # Order parameter CSVs
        "order_lambda_{block}_{start}.csv",      # Lambda CSV
    ],
    # Model inference outputs
    "predictions": [
        "{antibody_name}_predictions.csv",  # Prediction results
    ],
}
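
# The "{...}" placeholders above are expanded with str.format() where the
# values are known, e.g. "md_final_{temp}.xtc".format(temp="300") ->
# "md_final_300.xtc"; the order parameter files are matched by glob instead.
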
# File patterns that can be safely deleted (intermediate files)
INTERMEDIATE_PATTERNS = [
    # GROMACS backup files (created when overwriting existing files)
    "#*#",  # Matches "#name.ext.N#" backup files
    # Intermediate trajectory processing files
    "md_whole_*.xtc",
    "md_nopbcjump_*.xtc",
    "md_{temp}.xtc",  # Raw trajectory before processing
    "md_{temp}.gro",  # Raw structure before processing
    # Equilibration files
    "nvt_*.gro",
    "nvt_*.xtc",
    "nvt_*.tpr",
    "nvt_*.cpt",
    "nvt_*.edr",
    "nvt_*.log",
    "npt_*.gro",
    "npt_*.xtc",
    "npt_*.tpr",
    "npt_*.cpt",
    "npt_*.edr",
    "npt_*.log",
    # System setup intermediates
    "box.gro",
    "solv.gro",
    "solv_ions.gro",
    "em.gro",
    "em.tpr",
    "em.edr",
    "em.log",
    "ions.tpr",
    # Covariance analysis intermediates
    "md_final_covar_*.xtc",
    "covar_*.trr",
    "covar_*.xvg",
    "covar.log",
    "avg_covar*.pdb",
    "covar_matrix_*.dat",
    # Custom simulation time files (when simulation_time != 100)
    "md_{temp}_*.gro",  # e.g., md_300_2.gro
    "md_{temp}_*.xtc",  # e.g., md_300_2.xtc
    "md_{temp}_*.tpr",  # e.g., md_300_2.tpr
    # Other intermediate files
    "em.trr",     # Energy minimization trajectory
    "hbond.ndx",  # Temporary index file for hydrogen bonds
    # PropKa output
    "*.pka",
    # MD run logs (except sconf logs, which are kept)
    "md_*.log",
    "md_*.edr",
    # Checkpoint files (can be regenerated)
    "*.cpt",
    # Temporary MDP files created in the work dir
    "nvt_*.mdp",
    "npt_*.mdp",
    "md_*.mdp",
    "mdout.mdp",  # GROMACS output MDP file
    "ions.mdp",
    "em.mdp",
    # Topology include files (generated during pdb2gmx)
    "posre_*.itp",  # Position restraint files
    "topol_*.itp",  # Topology include files for chains
]
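
# Note: GROMACS never overwrites an existing output file; it renames the old
# one to "#name.ext.N#" (e.g. "#md_300.xtc.1#"), which the "#*#" pattern
# above is meant to catch.
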
def get_required_files(work_dir: Path, antibody_name: str, temperatures: List[str]) -> Set[str]:
    """
    Generate the set of required file paths based on antibody name and temperatures.

    Args:
        work_dir: Working directory path
        antibody_name: Name of antibody
        temperatures: List of temperature strings (e.g., ['300', '350', '400'])

    Returns:
        Set of required file paths (absolute)
    """
    required = set()
    # Structure files
    for pattern in REQUIRED_FILES["structure"]:
        file_path = work_dir / pattern.format(antibody_name=antibody_name)
        required.add(str(file_path))
    # MD final outputs (per temperature)
    for temp in temperatures:
        for pattern in REQUIRED_FILES["md_final"]:
            file_path = work_dir / pattern.format(temp=temp)
            required.add(str(file_path))
    # Descriptor outputs
    for pattern in REQUIRED_FILES["descriptors"]:
        if pattern == "*.xvg":
            # Keep all GROMACS descriptor XVG files (needed for re-aggregation)
            required.update(str(f) for f in work_dir.glob("*.xvg"))
        elif "{temp}" in pattern:
            # Per-temperature files (res_sasa_{temp}.np, sconf_{temp}.log)
            for temp in temperatures:
                required.add(str(work_dir / pattern.format(temp=temp)))
        else:
            # Aggregated outputs (descriptors.csv, descriptors.pkl)
            required.add(str(work_dir / pattern))
    # Order parameter files (optional - match any)
    required.update(str(f) for f in work_dir.glob("order_s2_*.csv"))
    required.update(str(f) for f in work_dir.glob("order_lambda_*.csv"))
    # Prediction files
    for pattern in REQUIRED_FILES["predictions"]:
        file_path = work_dir / pattern.format(antibody_name=antibody_name)
        required.add(str(file_path))
    return required
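
# For example (hypothetical antibody name and temperature),
# get_required_files(work_dir, "mAb1", ["300"]) resolves to paths such as
# mAb1.pdb, processed.pdb, processed.gro, topol.top, index.ndx,
# md_final_300.xtc, md_final_300.gro, md_300.tpr, descriptors.csv,
# descriptors.pkl, res_sasa_300.np, sconf_300.log, any existing *.xvg and
# order_*.csv files, and mAb1_predictions.csv.
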
def get_intermediate_files(work_dir: Path, temperatures: List[str]) -> Set[str]:
    """
    Find all intermediate files that can be deleted.

    Args:
        work_dir: Working directory path
        temperatures: List of temperature strings

    Returns:
        Set of intermediate file paths (absolute)
    """
    intermediate = set()
    # Convert patterns to actual file matches
    for pattern in INTERMEDIATE_PATTERNS:
        if "{temp}" in pattern:
            # Temperature-specific patterns
            for temp in temperatures:
                matches = work_dir.glob(pattern.format(temp=temp))
                intermediate.update(str(f) for f in matches)
        elif pattern == "#*#":
            # GROMACS backup files: walk recursively so backups inside
            # subdirectories are caught as well (glob() is non-recursive)
            for f in work_dir.rglob("*"):
                if f.is_file() and f.name.startswith("#") and f.name.endswith("#"):
                    intermediate.add(str(f))
        else:
            matches = work_dir.glob(pattern)
            intermediate.update(str(f) for f in matches)
    return intermediate


def cleanup_temp_directory(
    work_dir: Path,
    antibody_name: str,
    temperatures: List[str],
    dry_run: bool = True,
    keep_order_params: bool = True,
) -> Dict[str, int]:
    """
    Clean up the temporary directory, removing intermediate files.

    Args:
        work_dir: Working directory to clean
        antibody_name: Name of antibody
        temperatures: List of temperature strings
        dry_run: If True, only report what would be deleted without deleting
        keep_order_params: If True, keep order parameter CSV files

    Returns:
        Dictionary with cleanup statistics
    """
    work_dir = Path(work_dir).resolve()
    if not work_dir.exists():
        raise ValueError(f"Work directory does not exist: {work_dir}")
    # Get required files
    required = get_required_files(work_dir, antibody_name, temperatures)
    # Get intermediate files
    intermediate = get_intermediate_files(work_dir, temperatures)
    # Remove order param files from the intermediate set if keeping them
    if keep_order_params:
        order_param_files = set(str(f) for f in work_dir.glob("order_*.csv"))
        intermediate -= order_param_files
    # Find all files in the directory
    all_files = set(str(f) for f in work_dir.rglob("*") if f.is_file())
    # Files to delete = intermediate files that are not required
    to_delete = intermediate - required
    # Safety check: flag any other files not in the required set,
    # excluding hidden files and common non-GROMACS files
    other_files = all_files - required - intermediate
    suspicious = set()
    for f in other_files:
        f_path = Path(f)
        # Keep common non-GROMACS files
        if f_path.suffix in ['.py', '.yaml', '.yml', '.txt', '.md', '.json']:
            continue
        # Keep hidden files
        if f_path.name.startswith('.'):
            continue
        # Keep prediction CSV files (may have different naming conventions)
        if 'prediction' in f_path.name.lower() and f_path.suffix == '.csv':
            continue
        suspicious.add(f)
    stats = {
        "total_files": len(all_files),
        "required_files": len(required),
        "intermediate_files": len(intermediate),
        "files_to_delete": len(to_delete),
        "suspicious_files": len(suspicious),
    }
    if dry_run:
        logger.info("DRY RUN - No files will be deleted")
        logger.info(f"Total files: {stats['total_files']}")
        logger.info(f"Required files: {stats['required_files']}")
        logger.info(f"Files to delete: {stats['files_to_delete']}")
        if suspicious:
            logger.warning(f"Suspicious files (not in required or intermediate): {len(suspicious)}")
            logger.warning("These files will NOT be deleted. Review manually:")
            for f in sorted(suspicious)[:10]:  # Show first 10
                logger.warning(f"  {Path(f).name}")
    else:
        # Actually delete files
        deleted_count = 0
        failed_count = 0
        for file_path in sorted(to_delete):
            try:
                Path(file_path).unlink()
                deleted_count += 1
            except Exception as e:
                logger.error(f"Failed to delete {file_path}: {e}")
                failed_count += 1
        stats["deleted"] = deleted_count
        stats["failed"] = failed_count
        logger.info("Cleanup completed:")
        logger.info(f"  Deleted: {deleted_count} files")
        logger.info(f"  Failed: {failed_count} files")
        logger.info(f"  Remaining: {stats['total_files'] - deleted_count} files")
    return stats
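
# Minimal programmatic sketch (hypothetical paths; dry_run=True only reports):
#
#   from pathlib import Path
#   stats = cleanup_temp_directory(
#       work_dir=Path("/tmp/abmelt_run"),
#       antibody_name="mAb1",
#       temperatures=["300", "350", "400"],
#       dry_run=True,
#   )
#   print(stats["files_to_delete"])
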
def main():
    """CLI entry point for the cleanup utility."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Clean up AbMelt temp directory, removing intermediate GROMACS files"
    )
    parser.add_argument(
        "work_dir",
        type=str,
        help="Path to working directory (temp directory)"
    )
    parser.add_argument(
        "--antibody-name",
        type=str,
        required=True,
        help="Antibody name (for finding prediction files)"
    )
    parser.add_argument(
        "--temperatures",
        type=str,
        nargs="+",
        default=["300", "350", "400"],
        help="List of temperatures used in the simulation"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be deleted without actually deleting"
    )
    parser.add_argument(
        "--delete-order-params",
        action="store_true",
        help="Also delete order parameter CSV files (default: keep them)"
    )
    args = parser.parse_args()
    # Set up logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    # Run cleanup
    stats = cleanup_temp_directory(
        work_dir=Path(args.work_dir),
        antibody_name=args.antibody_name,
        temperatures=args.temperatures,
        dry_run=args.dry_run,
        keep_order_params=not args.delete_order_params,
    )
    print("\n" + "=" * 60)
    print("Cleanup Summary")
    print("=" * 60)
    for key, value in stats.items():
        print(f"  {key}: {value}")
    print("=" * 60)


if __name__ == "__main__":
    main()