Spaces:

hugging-science
/

abmelt-benchmark

Build error

App Files Files Community

abmelt-benchmark / denormalize_all.py

ZijianGuan

Upload folder using huggingface_hub

8ef403e verified 4 days ago

raw

history blame contribute delete

11.6 kB

	#!/usr/bin/env python3
	"""
	Denormalize Tagg, Tm, and Tmon values from normalized holdout sets.

	This script:
	1. Loads reference file (tm_holdout_4.csv) to get merck_id and name mapping
	2. Loads normalized values from tagg_holdout_normalized.csv, tm_holdout_normalized.csv, and tmon_holdout_normalized.csv
	3. Filters to only include antibodies present in reference file
	4. Denormalizes the normalized values using utils.py
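	5. Saves denormalized values to separate CSV files with the _denormalized suffix
	6. Compares the denormalized Tm values with the actual Tm values from the reference file and saves the comparison to tm_comparison_results.csv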

	Note: The tmon file uses column 'tmonset' which represents T_mon_onset (tmon).
	"""

	import sys
	import pandas as pd
	import numpy as np
	from pathlib import Path
	from scipy.stats import pearsonr

	# Add src to path for imports
	sys.path.append(str(Path(__file__).parent / "src"))

	from utils import renormalize, DEFAULT_STATS

	def denormalize_temperature_type(normalized_df, reference_df, temp_type, column_name, output_file):
	"""
	Denormalize a specific temperature type.

	Args:
	normalized_df: DataFrame with normalized values
	reference_df: DataFrame with merck_id and name mapping
	temp_type: Temperature type ('tagg', 'tm', or 'tmon')
	column_name: Name of the column in normalized_df (e.g., 'tagg', 'tm', 'tmonset')
	output_file: Path to output CSV file

	Returns:
	DataFrame with denormalized values
	"""
	print(f"\n{'='*80}")
	print(f"Processing {temp_type.upper()}")
	print(f"{'='*80}")

	# Check if column exists
	if column_name not in normalized_df.columns:
	print(f"ERROR: '{column_name}' column not found!")
	print(f"Available columns: {list(normalized_df.columns)}")
	return None

	# Filter normalized_df to only include antibodies present in reference file
	# The 'name' column in normalized_df contains Merck IDs
	print(f"\nFiltering to antibodies present in reference file...")
	filtered_df = normalized_df[normalized_df['name'].isin(reference_df['merck_id'])].copy()
	print(f" Found {len(filtered_df)} matching antibodies")

	if len(filtered_df) == 0:
	print(f"ERROR: No matching antibodies found!")
	print(f"Reference antibodies (merck_id): {reference_df['merck_id'].tolist()}")
	print(f"Normalized antibodies (name): {normalized_df['name'].tolist()}")
	return None

	# Merge with reference to get merck_id and name
	merged_df = pd.merge(
	filtered_df[['name', column_name]],
	reference_df[['merck_id', 'name']],
	left_on='name',
	right_on='merck_id',
	how='inner'
	)

	# Denormalize the normalized values
	print(f"\nDenormalizing normalized {temp_type.upper()} values...")
	normalized_values = merged_df[column_name].values
	denormalized_values = renormalize(normalized_values, temp_type=temp_type)

	# Create output dataframe with merck_id, name, and denormalized value
	output_column = temp_type # Use 'tmon' instead of 'tmonset' for output
	output_df = pd.DataFrame({
	'merck_id': merged_df['merck_id'],
	'name': merged_df['name_y'],
	output_column: denormalized_values
	})

	# Display results
	print(f"\nStatistics used:")
	print(f" Mean: {DEFAULT_STATS[temp_type]['mean']:.2f}°C")
	print(f" Std: {DEFAULT_STATS[temp_type]['std']:.2f}°C")

	print(f"\n{'Merck ID':<15} {'Name':<20} {'Normalized':<15} {'Denormalized':<15}")
	print("-" * 65)

	for _, row in merged_df.iterrows():
	merck_id = row['merck_id']
	antibody_name = row['name_y']
	normalized_val = row[column_name]
	denormalized_val = output_df[output_df['merck_id'] == merck_id][output_column].values[0]
	print(f"{merck_id:<15} {antibody_name:<20} {normalized_val:<15.4f} {denormalized_val:<15.2f}")

	# Summary statistics
	print(f"\nSUMMARY STATISTICS")
	print(f"Mean Denormalized {temp_type.upper()}: {denormalized_values.mean():.2f}°C")
	print(f"Std Denormalized {temp_type.upper()}: {denormalized_values.std():.2f}°C")
	print(f"Min Denormalized {temp_type.upper()}: {denormalized_values.min():.2f}°C")
	print(f"Max Denormalized {temp_type.upper()}: {denormalized_values.max():.2f}°C")

	# Save results to CSV
	output_df.to_csv(output_file, index=False)
	print(f"\nDenormalized values saved to: {output_file}")

	return output_df

	def compare_tm_values(actual_df, denormalized_df, normalized_df, data_dir):
	"""
	Compare actual TM values with denormalized values.

	Args:
	actual_df: DataFrame with actual TM values (from tm_holdout_4.csv)
	denormalized_df: DataFrame with denormalized TM values
	normalized_df: DataFrame with normalized TM values
	data_dir: Path to data directory

	Returns:
	DataFrame with comparison results
	"""
	print(f"\n{'='*80}")
	print("COMPARING ACTUAL vs DENORMALIZED TM VALUES")
	print(f"{'='*80}")

	# Merge actual, normalized, and denormalized values
	# First merge actual with normalized (on merck_id = name in normalized_df)
	temp_df = pd.merge(
	actual_df[['merck_id', 'name', 'tm']],
	normalized_df[['name', 'tm']],
	left_on='merck_id',
	right_on='name',
	how='inner',
	suffixes=('_actual', '_normalized')
	)

	# Rename columns from first merge
	temp_df = temp_df.rename(columns={
	'tm_actual': 'actual_tm',
	'tm_normalized': 'normalized_tm',
	'name_actual': 'antibody_name'
	})

	# Drop duplicate name column if it exists
	if 'name_normalized' in temp_df.columns:
	temp_df = temp_df.drop(columns=['name_normalized'])

	# Then merge with denormalized
	merged_df = pd.merge(
	temp_df,
	denormalized_df[['merck_id', 'tm']],
	on='merck_id',
	how='inner'
	)

	if len(merged_df) == 0:
	print("ERROR: No matching antibodies found for comparison!")
	return None

	# Rename denormalized tm column
	merged_df = merged_df.rename(columns={
	'tm': 'denormalized_tm'
	})

	# Calculate errors
	merged_df['error'] = merged_df['denormalized_tm'] - merged_df['actual_tm']
	merged_df['abs_error'] = np.abs(merged_df['error'])
	merged_df['abs_error_percent'] = (merged_df['abs_error'] / merged_df['actual_tm']) * 100

	# Display results
	print(f"\nStatistics used for denormalization:")
	print(f" Mean: {DEFAULT_STATS['tm']['mean']:.2f}°C")
	print(f" Std: {DEFAULT_STATS['tm']['std']:.2f}°C")

	print(f"\n{'Antibody':<20} {'Merck ID':<12} {'Actual TM':<12} {'Normalized':<12} {'Denormalized':<15} {'Error':<12} {'Abs Error':<12} {'Error %':<10}")
	print("-" * 110)

	for _, row in merged_df.iterrows():
	antibody_name = row['antibody_name']
	print(f"{antibody_name:<20} "
	f"{row['merck_id']:<12} "
	f"{row['actual_tm']:<12.2f} "
	f"{row['normalized_tm']:<12.4f} "
	f"{row['denormalized_tm']:<15.2f} "
	f"{row['error']:<12.2f} "
	f"{row['abs_error']:<12.2f} "
	f"{row['abs_error_percent']:<10.2f}")

	# Summary statistics
	print(f"\n{'='*80}")
	print("SUMMARY STATISTICS")
	print(f"{'='*80}")
	print(f"\nMean Absolute Error (MAE): {merged_df['abs_error'].mean():.2f}°C")
	print(f"Root Mean Squared Error (RMSE): {np.sqrt((merged_df['error']**2).mean()):.2f}°C")
	print(f"Mean Absolute Percent Error: {merged_df['abs_error_percent'].mean():.2f}%")
	print(f"Max Absolute Error: {merged_df['abs_error'].max():.2f}°C")
	print(f"Min Absolute Error: {merged_df['abs_error'].min():.2f}°C")

	# Pearson correlation
	pearson_corr, pearson_pvalue = pearsonr(merged_df['actual_tm'], merged_df['denormalized_tm'])
	print(f"\nPearson Correlation (r): {pearson_corr:.4f}")
	print(f"Pearson Correlation p-value: {pearson_pvalue:.4f}")

	# Also show correlation using np.corrcoef for consistency
	correlation = np.corrcoef(merged_df['actual_tm'], merged_df['denormalized_tm'])[0, 1]
	print(f"Correlation (np.corrcoef): {correlation:.4f}")

	# R-squared
	ss_res = np.sum((merged_df['actual_tm'] - merged_df['denormalized_tm'])**2)
	ss_tot = np.sum((merged_df['actual_tm'] - merged_df['actual_tm'].mean())**2)
	r_squared = 1 - (ss_res / ss_tot)
	print(f"R-squared (R²): {r_squared:.4f}")

	# Save results to CSV
	output_file = data_dir / "tm_comparison_results.csv"
	merged_df.to_csv(output_file, index=False)
	print(f"\nComparison results saved to: {output_file}")

	return merged_df

	def main():
	# File paths
	data_dir = Path(__file__).parent / "data" / "abmelt"
	reference_file = data_dir / "tm_holdout_4.csv"

	normalized_files = {
	'tagg': data_dir / "tagg_holdout_normalized.csv",
	'tm': data_dir / "tm_holdout_normalized.csv",
	'tmon': data_dir / "tmon_holdout_normalized.csv"
	}

	output_files = {
	'tagg': data_dir / "tagg_holdout_denormalized.csv",
	'tm': data_dir / "tm_holdout_denormalized.csv",
	'tmon': data_dir / "tmon_holdout_denormalized.csv"
	}

	column_names = {
	'tagg': 'tagg',
	'tm': 'tm',
	'tmon': 'tmonset' # Note: column is named 'tmonset' not 'tmon'
	}

	# Load reference file to get merck_id and name mapping
	print("Loading reference file (tm_holdout_4.csv)...")
	reference_df = pd.read_csv(reference_file)
	print(f" Found {len(reference_df)} antibodies in reference file")
	print(f" Antibodies: {', '.join(reference_df['merck_id'].tolist())}")

	# Process each temperature type
	results = {}
	for temp_type in ['tagg', 'tm', 'tmon']:
	normalized_file = normalized_files[temp_type]
	output_file = output_files[temp_type]
	column_name = column_names[temp_type]

	# Load normalized values
	print(f"\n{'='*80}")
	print(f"Loading normalized {temp_type.upper()} values from {normalized_file.name}...")
	normalized_df = pd.read_csv(normalized_file)
	print(f" Found {len(normalized_df)} antibodies with normalized {temp_type.upper()} values")

	# Denormalize
	result_df = denormalize_temperature_type(
	normalized_df,
	reference_df,
	temp_type,
	column_name,
	output_file
	)

	if result_df is not None:
	results[temp_type] = result_df

	# Compare actual vs denormalized TM values if TM was processed
	if 'tm' in results:
	# Load normalized TM values for comparison
	normalized_tm_df = pd.read_csv(normalized_files['tm'])
	compare_tm_values(reference_df, results['tm'], normalized_tm_df, data_dir)

	# Summary
	print(f"\n{'='*80}")
	print("SUMMARY")
	print(f"{'='*80}")
	print(f"\nSuccessfully denormalized {len(results)} temperature types:")
	for temp_type in results.keys():
	print(f" - {temp_type.upper()}: {output_files[temp_type].name}")

	print(f"\n{'='*80}")

# Run the denormalization workflow only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()