|
|
import os |
|
|
import json |
|
|
from typing import Dict, Any, Optional |
|
|
import logging |
|
|
import time |
|
|
from openai import OpenAI, APIError |
|
|
|
|
|
|
|
|
# Root logger setup: INFO and above, prefixed with a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# --- Input data ---------------------------------------------------------------
# Directory that main() scans for per-model JSON files.
MODEL_DATA_DIR = "model_data_json"
# JSON key under which the generated explanation is stored.
# NOTE(review): the key says "gemini" although this script calls DeepSeek --
# presumably kept so existing files/consumers keep working; confirm.
EXPLANATION_KEY = "model_explanation_gemini"
# JSON key holding the description text that is sent to the LLM.
DESCRIPTION_KEY = "description"

# --- Retry policy for API errors ----------------------------------------------
MAX_RETRIES = 3
RETRY_DELAY_SECONDS = 5

# --- DeepSeek endpoint (accessed through the OpenAI client) -------------------
# Name of the environment variable that carries the API key.
DEEPSEEK_API_KEY_ENV_VAR = "DEEPSEEK_API_KEY"
DEEPSEEK_BASE_URL = "https://api.deepseek.com"
DEEPSEEK_MODEL_NAME = "deepseek-chat"
|
|
|
|
|
|
|
|
# Shared API client; stays None until configure_llm_client() succeeds.
client: Optional[OpenAI] = None


def configure_llm_client():
    """Create the module-level DeepSeek client from the environment.

    Looks up the API key in the environment variable named by
    ``DEEPSEEK_API_KEY_ENV_VAR`` and, when it is set, binds the global
    ``client`` to an ``OpenAI`` instance that targets ``DEEPSEEK_BASE_URL``.

    Returns:
        True if the client was created. False if the key is missing/empty or
        if constructing the client raised; in the latter case ``client`` is
        reset to None.
    """
    global client

    key_from_env = os.environ.get(DEEPSEEK_API_KEY_ENV_VAR)
    if key_from_env:
        try:
            client = OpenAI(api_key=key_from_env, base_url=DEEPSEEK_BASE_URL)
            logging.info("DeepSeek API client configured successfully.")
            return True
        except Exception as exc:
            logging.error(f"Failed to configure DeepSeek API client: {exc}")
            client = None
            return False

    # No usable key: tell the operator which variable is missing and give up.
    logging.error(f"Error: {DEEPSEEK_API_KEY_ENV_VAR} environment variable not set.")
    logging.error("Please set the environment variable before running the script.")
    return False
|
|
|
|
|
|
|
|
|
|
|
def generate_explanation(model_id: str, description: str) -> Optional[str]:
    """Ask DeepSeek for a one-sentence English summary of a model description.

    The request goes through the module-level ``client`` using
    ``DEEPSEEK_MODEL_NAME``. API errors (``APIError``) are retried up to
    ``MAX_RETRIES`` times with ``RETRY_DELAY_SECONDS`` between attempts; any
    other exception aborts immediately.

    Args:
        model_id: Identifier of the model; used in the prompt and in log lines.
        description: The model description text. Only the first 4000
            characters are sent to the API.

    Returns:
        The explanation, stripped of surrounding whitespace and of one pair of
        enclosing double quotes. None when the client is not configured, the
        description is empty or not a string, the response contains no usable
        text, all retries fail, or an unexpected error occurs.
    """
    if not client:
        logging.error(f"[{model_id}] DeepSeek client not configured. Cannot generate explanation.")
        return None

    if not description or not isinstance(description, str):
        logging.warning(f"[{model_id}] Description is empty or not a string. Skipping explanation generation.")
        return None

    # Keep the prompt bounded: overly long descriptions are cut before sending.
    max_desc_length = 4000
    if len(description) > max_desc_length:
        logging.warning(f"[{model_id}] Description truncated to {max_desc_length} chars for API call.")
        description = description[:max_desc_length] + "... [truncated]"

    messages = [
        {
            "role": "system",
            "content": "You are an AI assistant tasked with summarizing Hugging Face model descriptions concisely.",
        },
        {
            "role": "user",
            "content": (
                f"Analyze the following description for the Hugging Face model '{model_id}'. "
                "Based **only** on this description, provide a concise, one-sentence explanation in English "
                "summarizing what this model does and its primary purpose or task. "
                "Focus on the core functionality mentioned. Avoid adding introductory phrases like 'This model is...' or 'The model...'."
                f"\n\n---\nModel Description:\n{description}\n---\n\nConcise Explanation:"
            ),
        },
    ]

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            logging.info(f"[{model_id}] Calling DeepSeek API (Attempt {attempt}/{MAX_RETRIES})...")
            response = client.chat.completions.create(
                model=DEEPSEEK_MODEL_NAME,
                messages=messages,
                stream=False,
                max_tokens=100,
                temperature=0.2,
            )

            # The response may carry no choices, or a message whose content is
            # None; treat both as "no explanation" instead of crashing on
            # attribute access.
            choices = response.choices
            content = choices[0].message.content if choices else None
            explanation = content.strip() if isinstance(content, str) else ""
            if not explanation:
                logging.warning(f"[{model_id}] DeepSeek returned an empty response. No explanation generated.")
                return None

            logging.info(f"[{model_id}] Explanation received from DeepSeek: '{explanation}'")

            # Drop one pair of enclosing double quotes. The length check keeps
            # a lone '"' from matching as both the opening and closing quote.
            if len(explanation) >= 2 and explanation.startswith('"') and explanation.endswith('"'):
                explanation = explanation[1:-1].strip()

            # A reply that consisted only of quotes is not a usable explanation.
            return explanation or None

        except APIError as e:
            logging.error(f"[{model_id}] DeepSeek API Error (Attempt {attempt}/{MAX_RETRIES}): {e}")
            if attempt < MAX_RETRIES:
                logging.info(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                logging.error(f"[{model_id}] Max retries reached. Failed to generate explanation via DeepSeek.")
                return None
        except Exception as e:
            # Anything that is not an API error is not retried.
            logging.error(f"[{model_id}] Unexpected error during DeepSeek API call: {e}")
            return None

    # Only reachable when MAX_RETRIES is zero or negative.
    return None
|
|
|
|
|
def _write_json_atomically(path: str, payload: Dict[str, Any]) -> None:
    """Write ``payload`` as JSON to a sibling temp file, then swap it over ``path``."""
    tmp_path = f"{path}.tmp"
    replaced = False
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=False, indent=4)
        # The original file is only touched once the new content is fully written.
        os.replace(tmp_path, path)
        replaced = True
    finally:
        if not replaced:
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # Best effort: the temp file may never have been created.


def process_json_file(filepath: str):
    """Regenerate the explanation stored in a single model JSON file.

    Loads ``filepath``, discards any value already stored under
    ``EXPLANATION_KEY``, asks ``generate_explanation`` for a new one based on
    the ``DESCRIPTION_KEY`` field and, if one is returned, writes the updated
    object back to the same path. Every failure is logged rather than raised;
    the file is rewritten only when a new explanation was obtained.

    Args:
        filepath: Path of the JSON file. Its base name without the extension
            is used as the model id in prompts and log lines.
    """
    # splitext removes only the final extension, whatever its case, so names
    # such as "x.JSON" or "a.json.bak.json" yield a sensible model id.
    model_id = os.path.splitext(os.path.basename(filepath))[0]
    logging.info(f"Processing {filepath}...")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        logging.error(f"[{model_id}] Invalid JSON format in {filepath}. Skipping.")
        return
    except FileNotFoundError:
        logging.error(f"[{model_id}] File not found: {filepath}. Skipping.")
        return
    except Exception as e:
        logging.error(f"[{model_id}] Error reading {filepath}: {e}. Skipping.")
        return

    if not isinstance(data, dict):
        logging.error(f"[{model_id}] Expected JSON object (dict) but got {type(data)} in {filepath}. Skipping.")
        return

    description = data.get(DESCRIPTION_KEY)

    # Remove the old value first so the regenerated one is appended as the
    # last key when it is re-inserted below.
    explanation_overwritten = EXPLANATION_KEY in data
    if explanation_overwritten:
        logging.info(f"[{model_id}] Existing explanation found. Deleting before regenerating.")
        del data[EXPLANATION_KEY]

    if not description:
        logging.warning(f"[{model_id}] Description field is missing or empty. Cannot generate explanation.")
        return

    explanation = generate_explanation(model_id, description)

    if not explanation:
        log_message = f"[{model_id}] Failed to generate new explanation for {filepath} via API."
        if explanation_overwritten:
            # Nothing is written on this path, so the deletion above only
            # affected the in-memory copy.
            log_message += " The file on disk was not modified, so the previous explanation is still stored there."
        logging.warning(log_message)
        return

    data[EXPLANATION_KEY] = explanation
    try:
        _write_json_atomically(filepath, data)
        if explanation_overwritten:
            logging.info(f"[{model_id}] Successfully overwrote and updated {filepath} with new explanation.")
        else:
            logging.info(f"[{model_id}] Successfully generated and updated {filepath} with new explanation.")
    except IOError as e:
        logging.error(f"[{model_id}] Error writing updated data to {filepath}: {e}")
    except Exception as e:
        logging.error(f"[{model_id}] Unexpected error writing {filepath}: {e}")
|
|
|
|
|
|
|
|
def main():
    """Regenerate explanations for every JSON file in ``MODEL_DATA_DIR``.

    Configures the API client, lists the regular files in the directory whose
    names end in ".json" (case-insensitive), and passes each one to
    ``process_json_file`` in sorted order. Returns early, after logging, if the
    client cannot be configured or the directory does not exist. A summary of
    the run is logged at the end.
    """
    if not configure_llm_client():
        return

    if not os.path.isdir(MODEL_DATA_DIR):
        logging.error(f"Directory not found: {MODEL_DATA_DIR}")
        return

    logging.info(f"Starting processing directory: {MODEL_DATA_DIR}")

    # os.listdir returns entries in arbitrary order and includes directories;
    # sort for reproducible runs and keep regular files only.
    all_files = sorted(
        name
        for name in os.listdir(MODEL_DATA_DIR)
        if name.lower().endswith(".json")
        and os.path.isfile(os.path.join(MODEL_DATA_DIR, name))
    )
    total_files = len(all_files)
    logging.info(f"Found {total_files} JSON files to process.")

    processed_files = 0
    skipped_files = 0

    for position, filename in enumerate(all_files, start=1):
        filepath = os.path.join(MODEL_DATA_DIR, filename)
        logging.info(f"--- Processing file {position}/{total_files}: {filename} ---")
        try:
            process_json_file(filepath)
        except Exception as e:
            # Top-level boundary: one bad file must not abort the whole run.
            logging.error(f"Unexpected error processing file {filename}: {e}")
            skipped_files += 1
        else:
            processed_files += 1

        # Short pause between files (presumably to stay under API rate
        # limits); there is nothing to wait for after the last one.
        if position < total_files:
            time.sleep(0.5)

    logging.info("--- Processing complete ---")
    logging.info(f"Total JSON files found: {total_files}")
    logging.info(f"Files processed (attempted): {processed_files}")
    logging.info(f"Files skipped due to unexpected errors: {skipped_files}")


if __name__ == "__main__":
    main()