"""HuggingFace Datasets manager for FBMC data storage.
This utility manages uploading/downloading Parquet files to/from HuggingFace Datasets.
Following best practices: Code -> Git, Data -> HF Datasets (NOT Git LFS)
"""
import polars as pl
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi
from pathlib import Path
from dotenv import load_dotenv
import os
from typing import Optional
class FBMCDatasetManager:
    """Manage FBMC data uploads/downloads via HuggingFace Datasets.

    Credentials are read from a local ``.env`` file (``HF_TOKEN`` and
    ``HF_USERNAME``).  When no usable token is found, ``self.api`` is left
    as ``None`` and upload/listing methods fail gracefully (print + return
    a sentinel) rather than raising.
    """

    def __init__(self) -> None:
        """Initialize with HF credentials from the .env file."""
        # Load environment variables from .env
        load_dotenv()
        self.hf_token = os.getenv('HF_TOKEN')
        self.hf_username = os.getenv('HF_USERNAME')
        # Treat missing tokens and template placeholders (e.g.
        # "your_hf_token_here") as "not configured".
        if not self.hf_token or 'your_hf' in self.hf_token.lower():
            print("⚠️ HF token not configured - upload features disabled")
            self.api = None
        else:
            self.api = HfApi(token=self.hf_token)

    def upload_dataset(
        self,
        parquet_path: Path,
        dataset_name: str,
        description: str = "",
        private: bool = False
    ) -> Optional[str]:
        """Upload a Parquet file to HuggingFace Datasets.

        Args:
            parquet_path: Path to local Parquet file.
            dataset_name: Name for HF dataset (e.g., 'fbmc-cnecs-2024-2025').
            description: Optional dataset description; used as the upload
                commit message on the Hub.
            private: Whether dataset should be private (default: False for
                free storage).

        Returns:
            Full dataset name (username/dataset-name) or None if upload fails.
        """
        if not self.api:
            print("❌ Cannot upload: HF token not configured")
            return None
        print(f"📤 Uploading {parquet_path.name} to HF Datasets...")
        try:
            # HF Dataset has no native polars constructor; round-trip
            # through pandas.
            df = pl.read_parquet(parquet_path)
            dataset = Dataset.from_pandas(df.to_pandas())
            # Create full dataset name
            full_name = f"{self.hf_username}/{dataset_name}"
            # BUGFIX: `description` was previously accepted but ignored.
            # Surface it as the commit message so uploads are traceable.
            dataset.push_to_hub(
                full_name,
                token=self.hf_token,
                private=private,
                commit_message=description or None,
            )
            print(f"✅ Uploaded to: https://huggingface.co/datasets/{full_name}")
            return full_name
        except Exception as e:
            # Best-effort API: report the failure and return the sentinel
            # instead of propagating (callers test for None).
            print(f"❌ Upload failed: {e}")
            return None

    def download_dataset(
        self,
        dataset_name: str,
        output_path: Path,
        split: str = "train"
    ) -> Optional["pl.DataFrame"]:
        """Download dataset from HF to a local Parquet file.

        Args:
            dataset_name: HF dataset name (with or without username prefix).
            output_path: Local path to save Parquet file.
            split: Dataset split to download (default: 'train').

        Returns:
            Polars DataFrame or None if download fails.
        """
        # Imported lazily so merely constructing the manager does not
        # require the (heavy) `datasets` package machinery.
        from datasets import load_dataset

        # Add username prefix if not present
        if '/' not in dataset_name:
            dataset_name = f"{self.hf_username}/{dataset_name}"
        print(f"📥 Downloading {dataset_name} from HF Datasets...")
        try:
            # Download from HF
            dataset = load_dataset(dataset_name, split=split)
            # Convert to polars and persist locally (create parents as needed)
            df = pl.from_pandas(dataset.to_pandas())
            output_path.parent.mkdir(parents=True, exist_ok=True)
            df.write_parquet(output_path)
            print(f"✅ Downloaded to: {output_path}")
            print(f"   Shape: {df.shape}")
            return df
        except Exception as e:
            print(f"❌ Download failed: {e}")
            return None

    def list_datasets(self, filter_fbmc: bool = True) -> list:
        """List all datasets for this user.

        Args:
            filter_fbmc: Only show FBMC-related datasets (default: True).

        Returns:
            List of dataset info objects (empty on failure or if the API
            is not configured).
        """
        if not self.api:
            print("❌ Cannot list: HF token not configured")
            return []
        try:
            datasets = list(self.api.list_datasets(author=self.hf_username))
            if filter_fbmc:
                # Dataset ids are "<user>/<name>"; match case-insensitively.
                datasets = [d for d in datasets if 'fbmc' in d.id.lower()]
            print(f"\n📊 {'FBMC ' if filter_fbmc else ''}Datasets for {self.hf_username}:")
            for ds in datasets:
                print(f"  - {ds.id}")
            return datasets
        except Exception as e:
            print(f"❌ List failed: {e}")
            return []
# Example usage
def main() -> None:
    """Smoke-check the manager's configuration picked up from .env."""
    manager = FBMCDatasetManager()

    print("HF Datasets Manager initialized")
    print(f"Username: {manager.hf_username}")
    token_ready = manager.api is not None
    print(f"Token configured: {token_ready}")

    # Upload example (will be used in Day 1)
    # manager.upload_dataset(
    #     parquet_path=Path("data/raw/cnecs_2024_2025.parquet"),
    #     dataset_name="fbmc-cnecs-2024-2025",
    #     description="FBMC CNECs data: Oct 2024 - Sept 2025"
    # )

    # Download example (will be used when setting up new environments)
    # manager.download_dataset(
    #     dataset_name="fbmc-cnecs-2024-2025",
    #     output_path=Path("data/raw/cnecs_2024_2025.parquet")
    # )


if __name__ == "__main__":
    main()