Spaces:
Build error
Build error
#!/usr/bin/env python3
"""
Denormalize Tagg, Tm, and Tmon values from normalized holdout sets.

This script:
1. Loads the reference file (tm_holdout_4.csv) to get the merck_id and name mapping
2. Loads normalized values from tagg_holdout_normalized.csv, tm_holdout_normalized.csv, and tmon_holdout_normalized.csv
3. Filters to only include antibodies present in the reference file
4. Denormalizes the normalized values using utils.py
5. Saves denormalized values to separate CSV files with _denormalized postfix

Note: The tmon file uses column 'tmonset' which represents T_mon_onset (tmon).
"""
| import sys | |
| import pandas as pd | |
| import numpy as np | |
| from pathlib import Path | |
| from scipy.stats import pearsonr | |
| # Add src to path for imports | |
| sys.path.append(str(Path(__file__).parent / "src")) | |
| from utils import renormalize, DEFAULT_STATS | |
def denormalize_temperature_type(normalized_df, reference_df, temp_type, column_name, output_file):
    """
    Denormalize one temperature type and write the result to CSV.

    Rows of ``normalized_df`` are kept only when their 'name' value appears in
    ``reference_df['merck_id']``. The surviving values are passed to
    ``renormalize`` and saved together with the reference 'merck_id' and
    'name' columns.

    Args:
        normalized_df: DataFrame with a 'name' column (holding Merck IDs) and
            the normalized value column ``column_name``.
        reference_df: DataFrame with 'merck_id' and 'name' columns.
        temp_type: Temperature type ('tagg', 'tm', or 'tmon'). Forwarded to
            ``renormalize``, used as the key into ``DEFAULT_STATS`` and as the
            name of the output value column.
        column_name: Name of the value column in normalized_df
            (e.g., 'tagg', 'tm', 'tmonset').
        output_file: Path to output CSV file.

    Returns:
        DataFrame with columns 'merck_id', 'name' and ``temp_type``, or None
        when ``column_name`` is missing or no antibody matches the reference.
    """
    banner = '=' * 80
    label = temp_type.upper()
    print(f"\n{banner}")
    print(f"Processing {label}")
    print(banner)

    # Bail out early when the expected value column is absent.
    if column_name not in normalized_df.columns:
        print(f"ERROR: '{column_name}' column not found!")
        print(f"Available columns: {list(normalized_df.columns)}")
        return None

    # The 'name' column in normalized_df contains Merck IDs, so it is matched
    # against the reference 'merck_id' column.
    print("\nFiltering to antibodies present in reference file...")
    in_reference = normalized_df['name'].isin(reference_df['merck_id'])
    filtered_df = normalized_df[in_reference].copy()
    print(f" Found {len(filtered_df)} matching antibodies")
    if filtered_df.empty:
        print("ERROR: No matching antibodies found!")
        print(f"Reference antibodies (merck_id): {reference_df['merck_id'].tolist()}")
        print(f"Normalized antibodies (name): {normalized_df['name'].tolist()}")
        return None

    # Both frames have a 'name' column, so after the merge the reference
    # antibody name is available as 'name_y'.
    merged_df = pd.merge(
        filtered_df[['name', column_name]],
        reference_df[['merck_id', 'name']],
        left_on='name',
        right_on='merck_id',
        how='inner',
    )

    print(f"\nDenormalizing normalized {label} values...")
    normalized_values = merged_df[column_name].values
    denormalized_values = renormalize(normalized_values, temp_type=temp_type)

    # Use the temperature type (e.g. 'tmon' instead of 'tmonset') as the
    # output column name.
    output_column = temp_type
    output_df = pd.DataFrame({
        'merck_id': merged_df['merck_id'],
        'name': merged_df['name_y'],
        output_column: denormalized_values,
    })

    print("\nStatistics used:")
    print(f" Mean: {DEFAULT_STATS[temp_type]['mean']:.2f}°C")
    print(f" Std: {DEFAULT_STATS[temp_type]['std']:.2f}°C")
    print(f"\n{'Merck ID':<15} {'Name':<20} {'Normalized':<15} {'Denormalized':<15}")
    print("-" * 65)
    # output_df was built row-for-row from merged_df, so walk both in
    # lockstep. Looking each value up again by merck_id would be quadratic
    # and would show the first match's value for every duplicated ID.
    table_rows = zip(
        merged_df['merck_id'],
        merged_df['name_y'],
        merged_df[column_name],
        output_df[output_column],
    )
    for merck_id, antibody_name, normalized_val, denormalized_val in table_rows:
        print(f"{merck_id:<15} {antibody_name:<20} {normalized_val:<15.4f} {denormalized_val:<15.2f}")

    print("\nSUMMARY STATISTICS")
    print(f"Mean Denormalized {label}: {denormalized_values.mean():.2f}°C")
    print(f"Std Denormalized {label}: {denormalized_values.std():.2f}°C")
    print(f"Min Denormalized {label}: {denormalized_values.min():.2f}°C")
    print(f"Max Denormalized {label}: {denormalized_values.max():.2f}°C")

    output_df.to_csv(output_file, index=False)
    print(f"\nDenormalized values saved to: {output_file}")
    return output_df
def compare_tm_values(actual_df, denormalized_df, normalized_df, data_dir):
    """
    Compare actual TM values with denormalized values.

    The three inputs are inner-joined on the Merck ID, per-antibody errors
    are computed, a report is printed and the comparison table is written to
    ``data_dir / "tm_comparison_results.csv"``.

    Args:
        actual_df: DataFrame with actual TM values (from tm_holdout_4.csv);
            needs 'merck_id', 'name' and 'tm' columns.
        denormalized_df: DataFrame with denormalized TM values; needs
            'merck_id' and 'tm' columns.
        normalized_df: DataFrame with normalized TM values; needs 'name'
            (holding the Merck ID) and 'tm' columns.
        data_dir: Path to data directory where the results CSV is saved.

    Returns:
        DataFrame with comparison results, or None when no antibody is
        present in all three inputs.
    """
    banner = '=' * 80
    print(f"\n{banner}")
    print("COMPARING ACTUAL vs DENORMALIZED TM VALUES")
    print(banner)

    # First join actual with normalized values (merck_id on the actual side
    # equals 'name' in normalized_df). The suffixes disambiguate the 'name'
    # and 'tm' columns that both frames share.
    temp_df = pd.merge(
        actual_df[['merck_id', 'name', 'tm']],
        normalized_df[['name', 'tm']],
        left_on='merck_id',
        right_on='name',
        how='inner',
        suffixes=('_actual', '_normalized'),
    )
    temp_df = temp_df.rename(columns={
        'tm_actual': 'actual_tm',
        'tm_normalized': 'normalized_tm',
        'name_actual': 'antibody_name',
    })
    # The normalized 'name' duplicates merck_id, so it is dropped.
    temp_df = temp_df.drop(columns=['name_normalized'], errors='ignore')

    # Then attach the denormalized values.
    merged_df = pd.merge(
        temp_df,
        denormalized_df[['merck_id', 'tm']],
        on='merck_id',
        how='inner',
    )
    if merged_df.empty:
        print("ERROR: No matching antibodies found for comparison!")
        return None
    merged_df = merged_df.rename(columns={'tm': 'denormalized_tm'})

    # Signed, absolute and relative errors per antibody.
    merged_df['error'] = merged_df['denormalized_tm'] - merged_df['actual_tm']
    merged_df['abs_error'] = np.abs(merged_df['error'])
    merged_df['abs_error_percent'] = (merged_df['abs_error'] / merged_df['actual_tm']) * 100

    print("\nStatistics used for denormalization:")
    print(f" Mean: {DEFAULT_STATS['tm']['mean']:.2f}°C")
    print(f" Std: {DEFAULT_STATS['tm']['std']:.2f}°C")
    print(f"\n{'Antibody':<20} {'Merck ID':<12} {'Actual TM':<12} {'Normalized':<12} {'Denormalized':<15} {'Error':<12} {'Abs Error':<12} {'Error %':<10}")
    print("-" * 110)
    for _, row in merged_df.iterrows():
        antibody_name = row['antibody_name']
        print(f"{antibody_name:<20} "
              f"{row['merck_id']:<12} "
              f"{row['actual_tm']:<12.2f} "
              f"{row['normalized_tm']:<12.4f} "
              f"{row['denormalized_tm']:<15.2f} "
              f"{row['error']:<12.2f} "
              f"{row['abs_error']:<12.2f} "
              f"{row['abs_error_percent']:<10.2f}")

    print(f"\n{banner}")
    print("SUMMARY STATISTICS")
    print(banner)
    print(f"\nMean Absolute Error (MAE): {merged_df['abs_error'].mean():.2f}°C")
    print(f"Root Mean Squared Error (RMSE): {np.sqrt((merged_df['error']**2).mean()):.2f}°C")
    print(f"Mean Absolute Percent Error: {merged_df['abs_error_percent'].mean():.2f}%")
    print(f"Max Absolute Error: {merged_df['abs_error'].max():.2f}°C")
    print(f"Min Absolute Error: {merged_df['abs_error'].min():.2f}°C")

    actual = merged_df['actual_tm']
    predicted = merged_df['denormalized_tm']

    # pearsonr raises ValueError for fewer than two samples; skip the
    # correlation report in that case so the results are still saved below.
    if len(merged_df) >= 2:
        pearson_corr, pearson_pvalue = pearsonr(actual, predicted)
        print(f"\nPearson Correlation (r): {pearson_corr:.4f}")
        print(f"Pearson Correlation p-value: {pearson_pvalue:.4f}")
        # Also show correlation using np.corrcoef for consistency.
        correlation = np.corrcoef(actual, predicted)[0, 1]
        print(f"Correlation (np.corrcoef): {correlation:.4f}")
    else:
        print("\nPearson Correlation (r): n/a (need at least 2 antibodies)")

    # R-squared is undefined when the actual values have no variance.
    ss_res = np.sum((actual - predicted)**2)
    ss_tot = np.sum((actual - actual.mean())**2)
    if ss_tot > 0:
        r_squared = 1 - (ss_res / ss_tot)
        print(f"R-squared (R²): {r_squared:.4f}")
    else:
        print("R-squared (R²): n/a (actual TM values have zero variance)")

    output_file = data_dir / "tm_comparison_results.csv"
    merged_df.to_csv(output_file, index=False)
    print(f"\nComparison results saved to: {output_file}")
    return merged_df
def main():
    """
    Denormalize every holdout temperature type and compare TM to actual values.

    Reads the reference file and the three normalized CSVs from
    ``data/abmelt`` next to this script, writes one ``*_denormalized.csv``
    per temperature type, and, when TM was processed successfully, runs the
    actual-vs-denormalized TM comparison.
    """
    # File paths
    data_dir = Path(__file__).parent / "data" / "abmelt"
    reference_file = data_dir / "tm_holdout_4.csv"
    normalized_files = {
        'tagg': data_dir / "tagg_holdout_normalized.csv",
        'tm': data_dir / "tm_holdout_normalized.csv",
        'tmon': data_dir / "tmon_holdout_normalized.csv",
    }
    output_files = {
        'tagg': data_dir / "tagg_holdout_denormalized.csv",
        'tm': data_dir / "tm_holdout_denormalized.csv",
        'tmon': data_dir / "tmon_holdout_denormalized.csv",
    }
    column_names = {
        'tagg': 'tagg',
        'tm': 'tm',
        'tmon': 'tmonset',  # Note: column is named 'tmonset' not 'tmon'
    }
    banner = '=' * 80

    # Load reference file to get merck_id and name mapping
    print("Loading reference file (tm_holdout_4.csv)...")
    reference_df = pd.read_csv(reference_file)
    print(f" Found {len(reference_df)} antibodies in reference file")
    # str() each ID so numeric or missing IDs do not break the join.
    print(f" Antibodies: {', '.join(map(str, reference_df['merck_id']))}")

    # Process each temperature type, keeping the loaded frames so the TM
    # comparison below does not have to read its CSV a second time.
    results = {}
    normalized_frames = {}
    for temp_type in ('tagg', 'tm', 'tmon'):
        normalized_file = normalized_files[temp_type]

        print(f"\n{banner}")
        print(f"Loading normalized {temp_type.upper()} values from {normalized_file.name}...")
        normalized_df = pd.read_csv(normalized_file)
        normalized_frames[temp_type] = normalized_df
        print(f" Found {len(normalized_df)} antibodies with normalized {temp_type.upper()} values")

        result_df = denormalize_temperature_type(
            normalized_df,
            reference_df,
            temp_type,
            column_names[temp_type],
            output_files[temp_type],
        )
        # None signals a missing column or no matching antibodies.
        if result_df is not None:
            results[temp_type] = result_df

    # Compare actual vs denormalized TM values if TM was processed
    if 'tm' in results:
        compare_tm_values(reference_df, results['tm'], normalized_frames['tm'], data_dir)

    # Summary
    print(f"\n{banner}")
    print("SUMMARY")
    print(banner)
    print(f"\nSuccessfully denormalized {len(results)} temperature types:")
    for temp_type in results:
        print(f" - {temp_type.upper()}: {output_files[temp_type].name}")
    print(f"\n{banner}")
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()