Spaces:
Sleeping
Sleeping
| import hdbscan | |
| import umap | |
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
def load_data(path=None):
    """Load the saved embedding array from a ``.npy`` file.

    Args:
        path: Optional location of the ``.npy`` file. When omitted, the
            file ``top_cluster_embeddings.npy`` inside the ``data``
            directory (relative to the current working directory) is
            loaded.

    Returns:
        The array stored in the file, as returned by ``numpy.load``.
    """
    from pathlib import Path

    if path is None:
        # Build the default from components: a literal backslash is only a
        # path separator on Windows, so a hard-coded 'data\...' string is
        # treated as a single file name everywhere else.
        path = Path('data') / 'top_cluster_embeddings.npy'
    return np.load(path)
def get_clusters(embeddings):
    """Compute a cluster label for each embedding.

    The embeddings are first reduced to 15 components with UMAP (cosine
    metric, 15 neighbours); HDBSCAN (Euclidean metric, minimum cluster size
    30, 'eom' selection) is then fitted on the reduced data.

    Args:
        embeddings: Data passed to ``umap.UMAP.fit_transform``.

    Returns:
        The ``labels_`` attribute of the fitted HDBSCAN model.
    """
    reducer = umap.UMAP(n_neighbors=15, n_components=15, metric='cosine')
    reduced = reducer.fit_transform(embeddings)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=30,
        metric='euclidean',
        cluster_selection_method='eom',
    )
    clusterer.fit(reduced)
    return clusterer.labels_
def get_2d_data_for_plotting(embeddings):
    """Project the embeddings onto two UMAP components for plotting.

    Args:
        embeddings: Data passed to ``umap.UMAP.fit_transform``.

    Returns:
        The result of the two-component UMAP ``fit_transform`` call
        (cosine metric, 15 neighbours).
    """
    projector = umap.UMAP(n_neighbors=15, n_components=2, metric='cosine')
    return projector.fit_transform(embeddings)
def plot_clusters(embeddings, cluster_labels, output_path=None):
    """Draw a 2D scatter plot of the clusters and save it as an image.

    The embeddings are projected to 2D with ``get_2d_data_for_plotting``.
    Points labelled ``-1`` are drawn in grey as outliers; all other points
    are coloured by their label using the ``hsv_r`` colormap.

    Args:
        embeddings: Data to project to two dimensions.
        cluster_labels: One label per embedding row, in the same order.
        output_path: Optional destination of the image. When omitted, the
            plot is written to ``clusters.png`` inside the ``plots``
            directory (relative to the current working directory).
    """
    from pathlib import Path

    if output_path is None:
        # Built from components: a hard-coded 'plots\clusters.png' is a
        # directory + file only on Windows; elsewhere it is one odd file name.
        output_path = Path('plots') / 'clusters.png'
    output_path = Path(output_path)
    # savefig does not create missing directories, so make sure it exists.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    umap_data = get_2d_data_for_plotting(embeddings)
    result = pd.DataFrame(umap_data, columns=['x', 'y'])
    result['labels'] = cluster_labels

    outliers = result.loc[result.labels == -1, :]
    clustered = result.loc[result.labels != -1, :]

    # Draw on the explicit figure/axes rather than the implicit pyplot state.
    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        ax.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
        points = ax.scatter(
            clustered.x,
            clustered.y,
            c=clustered.labels,
            s=0.05,
            cmap='hsv_r',
        )
        # The colorbar describes the label-coloured points, not the outliers.
        fig.colorbar(points, ax=ax)
        fig.savefig(output_path, dpi=300)
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
def main():
    """Run the pipeline: load the embeddings, cluster them, save the plot."""
    data = load_data()
    labels = get_clusters(data)
    plot_clusters(data, labels)


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()