Spaces:
Sleeping
Sleeping
Junchen Fu committed on
Commit ·
baf02c1
1
Parent(s): 8a7f9d9
Perf: background preload RAG, thread-safe lazy loading, declare models in README for HF caching
Browse files
README.md
CHANGED
|
@@ -1,12 +1,19 @@
|
|
| 1 |
---
|
| 2 |
title: Llmpopcorn Demo
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 6.9.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
title: Llmpopcorn Demo
|
| 3 |
+
emoji: 🍿
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 6.9.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
models:
|
| 11 |
+
- ByteDance/AnimateDiff-Lightning
|
| 12 |
+
- emilianJR/epiCRealism
|
| 13 |
+
- sentence-transformers/all-MiniLM-L12-v2
|
| 14 |
+
- meta-llama/Llama-3.3-70B-Instruct
|
| 15 |
+
datasets:
|
| 16 |
+
- junchenfu/microlens_rag
|
| 17 |
---
|
| 18 |
|
| 19 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import gradio as gr
|
|
| 2 |
import torch
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
-
import
|
| 6 |
import numpy as np
|
| 7 |
import pandas as pd
|
| 8 |
import faiss
|
|
@@ -14,64 +14,68 @@ from sentence_transformers import SentenceTransformer
|
|
| 14 |
from datasets import load_dataset
|
| 15 |
import spaces
|
| 16 |
|
| 17 |
-
LOG_PATH = "debug-44b0fa.log"
|
| 18 |
-
|
| 19 |
-
def _log(msg, data=None, hypothesis=None):
|
| 20 |
-
import json as _json
|
| 21 |
-
entry = {"sessionId": "44b0fa", "timestamp": int(time.time() * 1000),
|
| 22 |
-
"location": "app.py", "message": msg,
|
| 23 |
-
"data": data or {}, "hypothesisId": hypothesis or ""}
|
| 24 |
-
with open(LOG_PATH, "a") as f:
|
| 25 |
-
f.write(_json.dumps(entry) + "\n")
|
| 26 |
-
|
| 27 |
# --- 1. LLM client ---
|
| 28 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 29 |
-
_log("startup: HF_TOKEN present", {"token_present": HF_TOKEN is not None}, "H-A")
|
| 30 |
client = InferenceClient("meta-llama/Llama-3.3-70B-Instruct", token=HF_TOKEN)
|
| 31 |
|
| 32 |
-
# --- 2. Lazy globals ---
|
| 33 |
_pipe = None
|
| 34 |
_rag_df = None
|
| 35 |
_embed_model = None
|
| 36 |
_unique_partitions = None
|
| 37 |
_partition_embeddings = None
|
|
|
|
|
|
|
| 38 |
|
| 39 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 40 |
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
| 41 |
-
_log("startup: device detected", {"device": device}, "H-B")
|
| 42 |
|
| 43 |
def get_pipe():
|
| 44 |
global _pipe
|
| 45 |
if _pipe is None:
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
return _pipe
|
| 59 |
|
| 60 |
def get_rag():
|
| 61 |
global _rag_df, _embed_model, _unique_partitions, _partition_embeddings
|
| 62 |
if _rag_df is None:
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
| 70 |
return _rag_df, _embed_model, _unique_partitions, _partition_embeddings
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
# --- 3. Basic LLMPopcorn ---
|
| 73 |
def generate_basic(query):
|
| 74 |
-
_log("generate_basic: called", {"query": query[:50]}, "H-A")
|
| 75 |
system_prompt = (
|
| 76 |
"You are a talented video creator. "
|
| 77 |
"Generate a response in JSON format with 'title', 'cover_prompt', and 'video_prompt' (3s)."
|
|
@@ -91,9 +95,7 @@ def generate_basic(query):
|
|
| 91 |
max_tokens=500,
|
| 92 |
response_format={"type": "json_object"},
|
| 93 |
)
|
| 94 |
-
|
| 95 |
-
_log("generate_basic: success", {"title": result.get("title", "")[:50]}, "H-A")
|
| 96 |
-
return result
|
| 97 |
|
| 98 |
# --- 4. PE: RAG + CoT ---
|
| 99 |
def build_rag_context(user_prompt, selected_videos_num=10, num_tags=1, ratio=0.1):
|
|
@@ -140,7 +142,6 @@ def build_rag_context(user_prompt, selected_videos_num=10, num_tags=1, ratio=0.1
|
|
| 140 |
return pos_ctx + "\n" + neg_ctx, top_partitions[0]
|
| 141 |
|
| 142 |
def generate_pe(query, vid_num=10):
|
| 143 |
-
_log("generate_pe: called", {"query": query[:50]}, "H-C")
|
| 144 |
rag_context, matched_tag = build_rag_context(query, selected_videos_num=vid_num)
|
| 145 |
cot_prompt = f"""You are a talented video creator. Think step-by-step using the reference videos below, then generate the most popular title, cover prompt, and 3-second video prompt.
|
| 146 |
|
|
@@ -167,18 +168,15 @@ Return JSON ONLY with keys: title (max 50 chars), cover_prompt, video_prompt (3s
|
|
| 167 |
)
|
| 168 |
result = json.loads(response.choices[0].message.content)
|
| 169 |
result["_matched_tag"] = matched_tag
|
| 170 |
-
_log("generate_pe: success", {"title": result.get("title", "")[:50], "tag": matched_tag}, "H-C")
|
| 171 |
return result
|
| 172 |
|
| 173 |
-
# --- 5. Video generation (lazy loaded
|
| 174 |
@spaces.GPU(duration=60)
|
| 175 |
def run_video_generation(video_prompt):
|
| 176 |
-
_log("run_video_generation: called", {"prompt": video_prompt[:80]}, "H-B")
|
| 177 |
pipe = get_pipe()
|
| 178 |
output = pipe(prompt=video_prompt, guidance_scale=1.0, num_inference_steps=4, num_frames=16)
|
| 179 |
gif_path = "output_video.gif"
|
| 180 |
export_to_gif(output.frames[0], gif_path)
|
| 181 |
-
_log("run_video_generation: success", {}, "H-B")
|
| 182 |
return gif_path
|
| 183 |
|
| 184 |
# --- 6. Gradio entrypoints ---
|
|
|
|
| 2 |
import torch
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
+
import threading
|
| 6 |
import numpy as np
|
| 7 |
import pandas as pd
|
| 8 |
import faiss
|
|
|
|
| 14 |
from datasets import load_dataset
|
| 15 |
import spaces
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# --- 1. LLM client ---
|
| 18 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
|
| 19 |
client = InferenceClient("meta-llama/Llama-3.3-70B-Instruct", token=HF_TOKEN)
|
| 20 |
|
| 21 |
+
# --- 2. Lazy globals with threading lock ---
|
| 22 |
_pipe = None
|
| 23 |
_rag_df = None
|
| 24 |
_embed_model = None
|
| 25 |
_unique_partitions = None
|
| 26 |
_partition_embeddings = None
|
| 27 |
+
_pipe_lock = threading.Lock()
|
| 28 |
+
_rag_lock = threading.Lock()
|
| 29 |
|
| 30 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 31 |
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
|
|
|
| 32 |
|
| 33 |
def get_pipe():
    """Return the shared AnimateDiff-Lightning pipeline, building it on first use.

    The pipeline is cached in the module-level ``_pipe``. Construction is
    serialized by ``_pipe_lock``; the unlocked ``_pipe is None`` test is only a
    fast path for callers arriving after the pipeline is ready.

    Returns:
        The fully configured ``AnimateDiffPipeline`` (motion adapter loaded,
        scheduler replaced), placed on ``device`` with ``dtype``.
    """
    global _pipe
    if _pipe is None:
        with _pipe_lock:
            if _pipe is None:
                print("Loading video pipeline (first use)...")
                step = 4
                repo = "ByteDance/AnimateDiff-Lightning"
                ckpt = f"animatediff_lightning_{step}step_diffusers.safetensors"
                base = "emilianJR/epiCRealism"
                adapter = MotionAdapter().to(device, dtype)
                adapter.load_state_dict(load_file(hf_hub_download(repo, ckpt), device=device))
                # Build into a local first. The fast-path check above runs
                # without the lock, so assigning the global before the
                # scheduler swap would let a concurrent caller grab a
                # pipeline that still carries the default scheduler.
                pipe = AnimateDiffPipeline.from_pretrained(
                    base, motion_adapter=adapter, torch_dtype=dtype
                ).to(device)
                pipe.scheduler = EulerDiscreteScheduler.from_config(
                    pipe.scheduler.config, timestep_spacing="trailing", beta_schedule="linear"
                )
                # Publish only once the pipeline is completely configured.
                _pipe = pipe
                print("Video pipeline ready.")
    return _pipe
|
| 53 |
|
| 54 |
def get_rag():
    """Return the cached RAG resources, loading them on first use.

    Loading is serialized by ``_rag_lock``. ``_rag_df`` doubles as the
    "everything is ready" flag for the unlocked fast-path check, so it is
    assigned last: a caller that sees it set is guaranteed to also see the
    embedding model, the partition list and the partition embeddings.

    If any loading step raises, none of the globals are modified, so the next
    call retries from scratch instead of returning a half-built cache.

    Returns:
        A 4-tuple ``(rag_df, embed_model, unique_partitions,
        partition_embeddings)``.
    """
    global _rag_df, _embed_model, _unique_partitions, _partition_embeddings
    if _rag_df is None:
        with _rag_lock:
            if _rag_df is None:
                print("Loading MicroLens RAG dataset (first use)...")
                # Build everything in locals; nothing is visible to other
                # threads until the whole set has been created successfully.
                rag_df = load_dataset("junchenfu/microlens_rag", split="train").to_pandas()
                rag_df["comment_count"] = rag_df["comment_count"].fillna(0)
                embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
                unique_partitions = rag_df["partition"].unique().tolist()
                partition_embeddings = embed_model.encode(unique_partitions)

                # Publish dependents first and the sentinel (_rag_df) last.
                _embed_model = embed_model
                _unique_partitions = unique_partitions
                _partition_embeddings = partition_embeddings
                _rag_df = rag_df
                print(f"RAG ready: {len(rag_df)} videos, {len(unique_partitions)} categories.")
    return _rag_df, _embed_model, _unique_partitions, _partition_embeddings
|
| 67 |
|
| 68 |
+
# Pre-warm in background so the first user request is faster
|
| 69 |
+
def _preload():
    """Warm the RAG cache via ``get_rag()``; report failures without raising.

    Any exception is printed as a warning and otherwise ignored, so a failed
    warm-up never propagates out of this function.
    """
    try:
        get_rag()
    except Exception as exc:
        print(f"Background preload warning: {exc}")
|
| 74 |
+
|
| 75 |
+
threading.Thread(target=_preload, daemon=True).start()
|
| 76 |
+
|
| 77 |
# --- 3. Basic LLMPopcorn ---
|
| 78 |
def generate_basic(query):
|
|
|
|
| 79 |
system_prompt = (
|
| 80 |
"You are a talented video creator. "
|
| 81 |
"Generate a response in JSON format with 'title', 'cover_prompt', and 'video_prompt' (3s)."
|
|
|
|
| 95 |
max_tokens=500,
|
| 96 |
response_format={"type": "json_object"},
|
| 97 |
)
|
| 98 |
+
return json.loads(response.choices[0].message.content)
|
|
|
|
|
|
|
| 99 |
|
| 100 |
# --- 4. PE: RAG + CoT ---
|
| 101 |
def build_rag_context(user_prompt, selected_videos_num=10, num_tags=1, ratio=0.1):
|
|
|
|
| 142 |
return pos_ctx + "\n" + neg_ctx, top_partitions[0]
|
| 143 |
|
| 144 |
def generate_pe(query, vid_num=10):
|
|
|
|
| 145 |
rag_context, matched_tag = build_rag_context(query, selected_videos_num=vid_num)
|
| 146 |
cot_prompt = f"""You are a talented video creator. Think step-by-step using the reference videos below, then generate the most popular title, cover prompt, and 3-second video prompt.
|
| 147 |
|
|
|
|
| 168 |
)
|
| 169 |
result = json.loads(response.choices[0].message.content)
|
| 170 |
result["_matched_tag"] = matched_tag
|
|
|
|
| 171 |
return result
|
| 172 |
|
| 173 |
+
# --- 5. Video generation (lazy loaded inside GPU context) ---
|
| 174 |
@spaces.GPU(duration=60)
def run_video_generation(video_prompt):
    """Generate a short clip for ``video_prompt`` and return the GIF's path.

    The pipeline is fetched through ``get_pipe()`` inside this
    ``spaces.GPU``-decorated function, then run for 4 inference steps and
    16 frames with guidance scale 1.0.

    Args:
        video_prompt: Text prompt passed to the pipeline as ``prompt``.

    Returns:
        The path of the written GIF (``"output_video.gif"``).
    """
    generation = get_pipe()(
        prompt=video_prompt,
        guidance_scale=1.0,
        num_inference_steps=4,
        num_frames=16,
    )
    # Only the first generated clip is exported.
    first_clip = generation.frames[0]
    # NOTE(review): the output name is a fixed relative path, so overlapping
    # calls would write to the same file -- confirm requests are serialized.
    destination = "output_video.gif"
    export_to_gif(first_clip, destination)
    return destination
|
| 181 |
|
| 182 |
# --- 6. Gradio entrypoints ---
|