"""HuggingFace Datasets manager for FBMC data storage.

This utility manages uploading/downloading Parquet files to/from HuggingFace Datasets.
Following best practices: Code -> Git, Data -> HF Datasets (NOT Git LFS)
"""

import polars as pl
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi
from pathlib import Path
from dotenv import load_dotenv
import os
from typing import Optional


class FBMCDatasetManager:
    """Manage FBMC data uploads/downloads via HuggingFace Datasets."""

    def __init__(self):
        """Initialize with HF credentials from .env file."""
        # Load environment variables from .env
        load_dotenv()

        self.hf_token = os.getenv('HF_TOKEN')
        self.hf_username = os.getenv('HF_USERNAME')

        if not self.hf_username:
            print("⚠️  HF_USERNAME not set - use full 'user/name' dataset names")

        if not self.hf_token or 'your_hf' in self.hf_token.lower():
            print("⚠️  HF token not configured - upload features disabled")
            self.api = None
        else:
            self.api = HfApi(token=self.hf_token)

    def upload_dataset(
        self,
        parquet_path: Path,
        dataset_name: str,
        description: str = "",
        private: bool = False
    ) -> Optional[str]:
        """Upload Parquet file to HuggingFace Datasets.

        Args:
            parquet_path: Path to local Parquet file
            dataset_name: Name for HF dataset (e.g., 'fbmc-cnecs-2024-2025')
            description: Optional description, used as the upload commit message
            private: Whether dataset should be private (default: False for free storage)

        Returns:
            Full dataset name (username/dataset-name) or None if upload fails
        """
        if not self.api:
            print("❌ Cannot upload: HF token not configured")
            return None

        print(f"📤 Uploading {parquet_path.name} to HF Datasets...")

        try:
            # Load Parquet as polars, convert to HF Dataset
            df = pl.read_parquet(parquet_path)
            dataset = Dataset.from_pandas(df.to_pandas())

            # Create full dataset name
            full_name = f"{self.hf_username}/{dataset_name}"

            # Upload to HF
            dataset.push_to_hub(
                full_name,
                token=self.hf_token,
                private=private,
                commit_message=description or None
            )

            print(f"✅ Uploaded to: https://huggingface.co/datasets/{full_name}")
            return full_name

        except Exception as e:
            print(f"❌ Upload failed: {e}")
            return None
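
    # Note: push_to_hub round-trips the data through pandas/Arrow. For very
    # large files, a hedged alternative sketch is to push the raw Parquet
    # directly with huggingface_hub (the dataset repo must exist first, e.g.
    # via self.api.create_repo(full_name, repo_type="dataset", exist_ok=True)):
    #
    #   self.api.upload_file(
    #       path_or_fileobj=str(parquet_path),
    #       path_in_repo=parquet_path.name,
    #       repo_id=full_name,
    #       repo_type="dataset",
    #   )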

    def download_dataset(
        self,
        dataset_name: str,
        output_path: Path,
        split: str = "train"
    ) -> Optional[pl.DataFrame]:
        """Download dataset from HF to local Parquet file.

        Args:
            dataset_name: HF dataset name (with or without username prefix)
            output_path: Local path to save Parquet file
            split: Dataset split to download (default: 'train')

        Returns:
            Polars DataFrame or None if download fails
        """
        # Add username prefix if not present
        if '/' not in dataset_name:
            dataset_name = f"{self.hf_username}/{dataset_name}"

        print(f"📥 Downloading {dataset_name} from HF Datasets...")

        try:
            # Download from HF
            dataset = load_dataset(dataset_name, split=split)

            # Convert to polars and save
            df = pl.from_pandas(dataset.to_pandas())
            output_path.parent.mkdir(parents=True, exist_ok=True)
            df.write_parquet(output_path)

            print(f"✅ Downloaded to: {output_path}")
            print(f"   Shape: {df.shape}")
            return df

        except Exception as e:
            print(f"❌ Download failed: {e}")
            return None
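
    # Note: newer polars releases can also read Hub-hosted Parquet directly,
    # skipping the datasets round-trip - a sketch, assuming your polars build
    # supports the hf:// scheme:
    #
    #   df = pl.read_parquet(f"hf://datasets/{dataset_name}/**/*.parquet")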

    def list_datasets(self, filter_fbmc: bool = True) -> list:
        """List all datasets for this user.

        Args:
            filter_fbmc: Only show FBMC-related datasets (default: True)

        Returns:
            List of dataset info dictionaries
        """
        if not self.api:
            print("❌ Cannot list: HF token not configured")
            return []

        try:
            datasets = list(self.api.list_datasets(author=self.hf_username))

            if filter_fbmc:
                datasets = [d for d in datasets if 'fbmc' in d.id.lower()]

            print(f"\n📊 {'FBMC ' if filter_fbmc else ''}Datasets for {self.hf_username}:")
            for ds in datasets:
                print(f"  - {ds.id}")

            return datasets

        except Exception as e:
            print(f"❌ List failed: {e}")
            return []


# Example usage
if __name__ == "__main__":
    manager = FBMCDatasetManager()

    # Test configuration
    print("HF Datasets Manager initialized")
    print(f"Username: {manager.hf_username}")
    print(f"Token configured: {manager.api is not None}")

    # Upload example (will be used in Day 1)
    # manager.upload_dataset(
    #     parquet_path=Path("data/raw/cnecs_2024_2025.parquet"),
    #     dataset_name="fbmc-cnecs-2024-2025",
    #     description="FBMC CNECs data: Oct 2024 - Sept 2025"
    # )

    # Download example (will be used when setting up new environments)
    # manager.download_dataset(
    #     dataset_name="fbmc-cnecs-2024-2025",
    #     output_path=Path("data/raw/cnecs_2024_2025.parquet")
    # )
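
    # List example (prints and returns this user's FBMC datasets)
    # manager.list_datasets()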