#!/usr/bin/env python3
"""
Cleanup utility for AbMelt inference pipeline temp directory.
Removes intermediate GROMACS files while preserving files needed for:
- Re-running descriptor computation (--skip-md)
- Re-running inference (--skip-descriptors)
- Debugging and validation
"""
import os
import logging
from pathlib import Path
from typing import List, Set, Dict
logger = logging.getLogger(__name__)
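# Typical invocations (illustrative paths; flags match the argparse options
# defined in main() below):
#   python cleanup_temp_files.py /path/to/temp --antibody-name abX --dry-run
#   python cleanup_temp_files.py /path/to/temp --antibody-name abX \
#       --temperatures 300 350 400 --delete-order-params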
# Files that MUST be kept for pipeline to work
REQUIRED_FILES = {
# Structure files
"structure": [
"{antibody_name}.pdb", # Input structure
"processed.pdb", # Processed structure (needed for re-indexing)
"processed.gro", # Processed GRO (needed for re-indexing)
"topol.top", # Topology file
"index.ndx", # Index file for CDR regions
],
    # MD simulation final outputs (needed for descriptor computation)
    "md_final": [
        "md_final_{temp}.xtc",  # Final processed trajectory per temperature
        "md_final_{temp}.gro",  # Final reference structure per temperature
        "md_{temp}.tpr",  # Portable run input file per temperature
    ],
# Descriptor computation outputs
"descriptors": [
"descriptors.csv", # Aggregated descriptors (CSV)
"descriptors.pkl", # Aggregated descriptors (pickle)
"*.xvg", # All GROMACS descriptor files (needed for re-aggregation)
"res_sasa_{temp}.np", # SASA data per temperature
"sconf_{temp}.log", # Conformational entropy log per temperature
],
# Order parameter files (optional - can be regenerated but useful for debugging)
"order_params": [
"order_s2_{temp}K_{block}_{start}.csv", # Order parameter CSVs
"order_lambda_{block}_{start}.csv", # Lambda CSV
],
# Model inference outputs
"predictions": [
"{antibody_name}_predictions.csv", # Prediction results
],
}
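# For example, get_required_files() below expands these templates with
# antibody_name="abX" and temperatures=["300", "350", "400"] (illustrative
# values) into concrete names such as abX.pdb, md_final_300.xtc,
# res_sasa_350.np, and abX_predictions.csv.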
# File patterns that can be safely deleted (intermediate files)
INTERMEDIATE_PATTERNS = [
# GROMACS backup files (created when overwriting existing files)
"#*#", # Matches #filename.number# backup files
# Intermediate trajectory processing files
"md_whole_*.xtc",
"md_nopbcjump_*.xtc",
"md_{temp}.xtc", # Raw trajectory before processing
"md_{temp}.gro", # Raw structure before processing
# Equilibration files
"nvt_*.gro",
"nvt_*.xtc",
"nvt_*.tpr",
"nvt_*.cpt",
"nvt_*.edr",
"nvt_*.log",
"npt_*.gro",
"npt_*.xtc",
"npt_*.tpr",
"npt_*.cpt",
"npt_*.edr",
"npt_*.log",
# System setup intermediates
"box.gro",
"solv.gro",
"solv_ions.gro",
"em.gro",
"em.tpr",
"em.edr",
"em.log",
"ions.tpr",
# Covariance analysis intermediates
"md_final_covar_*.xtc",
"covar_*.trr",
"covar_*.xvg",
"covar.log",
"avg_covar*.pdb",
"covar_matrix_*.dat",
# Custom simulation time files (when simulation_time != 100)
"md_{temp}_*.gro", # e.g., md_300_2.gro
"md_{temp}_*.xtc", # e.g., md_300_2.xtc
"md_{temp}_*.tpr", # e.g., md_300_2.tpr
# Other intermediate files
"em.trr", # Energy minimization trajectory
"hbond.ndx", # Temporary index file for hydrogen bonds
# PropKa output
"*.pka",
    # MD run logs (sconf_{temp}.log files do not match this pattern and are kept)
"md_*.log",
"md_*.edr",
# Checkpoint files (can be regenerated)
"*.cpt",
# Temporary MDP files created in work dir
"nvt_*.mdp",
"npt_*.mdp",
"md_*.mdp",
"mdout.mdp", # GROMACS output MDP file
"ions.mdp",
"em.mdp",
# Topology include files (generated during pdb2gmx)
"posre_*.itp", # Position restraint files
"topol_*.itp", # Topology include files for chains
]
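# Entries containing "{temp}" are expanded per temperature at runtime in
# get_intermediate_files(); all other entries are used directly as glob
# patterns against the working directory.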
def get_required_files(work_dir: Path, antibody_name: str, temperatures: List[str]) -> Set[str]:
"""
    Generate the set of required file paths based on antibody name and temperatures.
    Args:
        work_dir: Working directory path
        antibody_name: Name of antibody
        temperatures: List of temperature strings (e.g., ['300', '350', '400'])
    Returns:
        Set of required file paths as strings (absolute if work_dir is absolute)
"""
required = set()
# Structure files
for pattern in REQUIRED_FILES["structure"]:
file_path = work_dir / pattern.format(antibody_name=antibody_name)
required.add(str(file_path))
# MD final outputs (per temperature)
for temp in temperatures:
for pattern in REQUIRED_FILES["md_final"]:
file_path = work_dir / pattern.format(temp=temp)
required.add(str(file_path))
    # Descriptor outputs (per temperature); res_sasa_{temp}.np and
    # sconf_{temp}.log are covered by the generic "{temp}" expansion
    for temp in temperatures:
        for pattern in REQUIRED_FILES["descriptors"]:
            if "{temp}" in pattern:
                file_path = work_dir / pattern.format(temp=temp)
                required.add(str(file_path))
    # All XVG descriptor files (needed for re-aggregation)
    required.update(str(f) for f in work_dir.glob("*.xvg"))
# Descriptor files (not temperature-specific)
for pattern in ["descriptors.csv", "descriptors.pkl"]:
file_path = work_dir / pattern
required.add(str(file_path))
# Order parameter files (optional - match any)
order_s2_files = list(work_dir.glob("order_s2_*.csv"))
order_lambda_files = list(work_dir.glob("order_lambda_*.csv"))
required.update(str(f) for f in order_s2_files)
required.update(str(f) for f in order_lambda_files)
# Prediction files
for pattern in REQUIRED_FILES["predictions"]:
file_path = work_dir / pattern.format(antibody_name=antibody_name)
required.add(str(file_path))
return required
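# Sketch of a get_required_files() call (hypothetical path and values):
#   required = get_required_files(Path("/tmp/abX_work"), "abX", ["300", "350"])
#   # -> includes ".../processed.pdb", ".../md_final_300.xtc", ".../topol.top"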
def get_intermediate_files(work_dir: Path, temperatures: List[str]) -> Set[str]:
"""
Find all intermediate files that can be deleted.
Args:
work_dir: Working directory path
temperatures: List of temperature strings
Returns:
Set of intermediate file paths (absolute)
"""
intermediate = set()
# Convert patterns to actual file matches
for pattern in INTERMEDIATE_PATTERNS:
# Handle temperature-specific patterns
if "{temp}" in pattern:
for temp in temperatures:
actual_pattern = pattern.format(temp=temp)
matches = list(work_dir.glob(actual_pattern))
intermediate.update(str(f) for f in matches)
        elif pattern == "#*#":
            # GROMACS backup files (#name.ext.N#) may also appear in
            # subdirectories, so match them recursively; keep regular files only
            matches = [f for f in work_dir.rglob("#*#") if f.is_file()]
            intermediate.update(str(f) for f in matches)
        else:
            matches = list(work_dir.glob(pattern))
            intermediate.update(str(f) for f in matches)
return intermediate
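# Note: these matches may overlap with required files (e.g. covar_*.xvg also
# satisfies the required "*.xvg" glob); this is safe because
# cleanup_temp_directory() below deletes only the set difference
# intermediate - required.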
def cleanup_temp_directory(
work_dir: Path,
antibody_name: str,
temperatures: List[str],
dry_run: bool = True,
keep_order_params: bool = True
) -> Dict[str, int]:
"""
Clean up temporary directory, removing intermediate files.
Args:
work_dir: Working directory to clean
antibody_name: Name of antibody
temperatures: List of temperature strings
dry_run: If True, only report what would be deleted without deleting
keep_order_params: If True, keep order parameter CSV files
Returns:
Dictionary with cleanup statistics
"""
work_dir = Path(work_dir).resolve()
if not work_dir.exists():
raise ValueError(f"Work directory does not exist: {work_dir}")
# Get required files
required = get_required_files(work_dir, antibody_name, temperatures)
# Get intermediate files
intermediate = get_intermediate_files(work_dir, temperatures)
# Remove order param files from intermediate if keeping them
if keep_order_params:
order_param_files = set(str(f) for f in work_dir.glob("order_*.csv"))
intermediate -= order_param_files
# Find all files in directory
all_files = set(str(f) for f in work_dir.rglob("*") if f.is_file())
# Files to delete = intermediate files that are not required
to_delete = intermediate - required
# Also check for any other files not in required set (safety check)
# But exclude hidden files and common non-GROMACS files
other_files = all_files - required - intermediate
suspicious = set()
for f in other_files:
f_path = Path(f)
# Keep common non-GROMACS files
if f_path.suffix in ['.py', '.yaml', '.yml', '.txt', '.md', '.json']:
continue
# Keep hidden files
if f_path.name.startswith('.'):
continue
# Keep prediction CSV files (may have different naming conventions)
if 'prediction' in f_path.name.lower() and f_path.suffix == '.csv':
continue
suspicious.add(f)
stats = {
"total_files": len(all_files),
"required_files": len(required),
"intermediate_files": len(intermediate),
"files_to_delete": len(to_delete),
"suspicious_files": len(suspicious),
}
if dry_run:
logger.info("DRY RUN - No files will be deleted")
logger.info(f"Total files: {stats['total_files']}")
logger.info(f"Required files: {stats['required_files']}")
logger.info(f"Files to delete: {stats['files_to_delete']}")
if suspicious:
logger.warning(f"Suspicious files (not in required or intermediate): {len(suspicious)}")
logger.warning("These files will NOT be deleted. Review manually:")
for f in sorted(suspicious)[:10]: # Show first 10
logger.warning(f" {Path(f).name}")
else:
# Actually delete files
deleted_count = 0
failed_count = 0
for file_path in sorted(to_delete):
try:
Path(file_path).unlink()
deleted_count += 1
except Exception as e:
logger.error(f"Failed to delete {file_path}: {e}")
failed_count += 1
stats["deleted"] = deleted_count
stats["failed"] = failed_count
        logger.info("Cleanup completed:")
logger.info(f" Deleted: {deleted_count} files")
logger.info(f" Failed: {failed_count} files")
logger.info(f" Remaining: {stats['total_files'] - deleted_count} files")
return stats
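# Programmatic use (hypothetical path and name): preview with dry_run=True,
# then delete with dry_run=False.
#   stats = cleanup_temp_directory(Path("/tmp/abX_work"), "abX",
#                                  ["300", "350", "400"], dry_run=False)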
def main():
"""CLI entry point for cleanup utility."""
import argparse
parser = argparse.ArgumentParser(
description="Clean up AbMelt temp directory, removing intermediate GROMACS files"
)
parser.add_argument(
"work_dir",
type=str,
help="Path to working directory (temp directory)"
)
parser.add_argument(
"--antibody-name",
type=str,
required=True,
help="Antibody name (for finding prediction files)"
)
parser.add_argument(
"--temperatures",
type=str,
nargs="+",
default=["300", "350", "400"],
help="List of temperatures used in simulation"
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Show what would be deleted without actually deleting"
)
parser.add_argument(
"--delete-order-params",
action="store_true",
help="Also delete order parameter CSV files (default: keep them)"
)
args = parser.parse_args()
# Setup logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
# Run cleanup
stats = cleanup_temp_directory(
work_dir=Path(args.work_dir),
antibody_name=args.antibody_name,
temperatures=args.temperatures,
dry_run=args.dry_run,
keep_order_params=not args.delete_order_params
)
print("\n" + "="*60)
print("Cleanup Summary")
print("="*60)
for key, value in stats.items():
print(f" {key}: {value}")
print("="*60)
if __name__ == "__main__":
main()