Junchen Fu committed on
Commit
baf02c1
·
1 Parent(s): 8a7f9d9

Perf: background preload RAG, thread-safe lazy loading, declare models in README for HF caching

Browse files
Files changed (2) hide show
  1. README.md +10 -3
  2. app.py +40 -42
README.md CHANGED
@@ -1,12 +1,19 @@
1
  ---
2
  title: Llmpopcorn Demo
3
- emoji: 📈
4
- colorFrom: blue
5
- colorTo: gray
6
  sdk: gradio
7
  sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
 
 
 
 
 
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Llmpopcorn Demo
3
+ emoji: 🍿
4
+ colorFrom: purple
5
+ colorTo: blue
6
  sdk: gradio
7
  sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
10
+ models:
11
+ - ByteDance/AnimateDiff-Lightning
12
+ - emilianJR/epiCRealism
13
+ - sentence-transformers/all-MiniLM-L12-v2
14
+ - meta-llama/Llama-3.3-70B-Instruct
15
+ datasets:
16
+ - junchenfu/microlens_rag
17
  ---
18
 
19
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import torch
3
  import json
4
  import os
5
- import time
6
  import numpy as np
7
  import pandas as pd
8
  import faiss
@@ -14,64 +14,68 @@ from sentence_transformers import SentenceTransformer
14
  from datasets import load_dataset
15
  import spaces
16
 
17
- LOG_PATH = "debug-44b0fa.log"
18
-
19
- def _log(msg, data=None, hypothesis=None):
20
- import json as _json
21
- entry = {"sessionId": "44b0fa", "timestamp": int(time.time() * 1000),
22
- "location": "app.py", "message": msg,
23
- "data": data or {}, "hypothesisId": hypothesis or ""}
24
- with open(LOG_PATH, "a") as f:
25
- f.write(_json.dumps(entry) + "\n")
26
-
27
  # --- 1. LLM client ---
28
  HF_TOKEN = os.environ.get("HF_TOKEN")
29
- _log("startup: HF_TOKEN present", {"token_present": HF_TOKEN is not None}, "H-A")
30
  client = InferenceClient("meta-llama/Llama-3.3-70B-Instruct", token=HF_TOKEN)
31
 
32
- # --- 2. Lazy globals ---
33
  _pipe = None
34
  _rag_df = None
35
  _embed_model = None
36
  _unique_partitions = None
37
  _partition_embeddings = None
 
 
38
 
39
  device = "cuda" if torch.cuda.is_available() else "cpu"
40
  dtype = torch.float16 if torch.cuda.is_available() else torch.float32
41
- _log("startup: device detected", {"device": device}, "H-B")
42
 
43
  def get_pipe():
44
  global _pipe
45
  if _pipe is None:
46
- _log("lazy_load: starting video pipeline download", {}, "H-B")
47
- step = 4
48
- repo = "ByteDance/AnimateDiff-Lightning"
49
- ckpt = f"animatediff_lightning_{step}step_diffusers.safetensors"
50
- base = "emilianJR/epiCRealism"
51
- adapter = MotionAdapter().to(device, dtype)
52
- adapter.load_state_dict(load_file(hf_hub_download(repo, ckpt), device=device))
53
- _pipe = AnimateDiffPipeline.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)
54
- _pipe.scheduler = EulerDiscreteScheduler.from_config(
55
- _pipe.scheduler.config, timestep_spacing="trailing", beta_schedule="linear"
56
- )
57
- _log("lazy_load: video pipeline ready", {}, "H-B")
 
 
 
 
58
  return _pipe
59
 
60
  def get_rag():
61
  global _rag_df, _embed_model, _unique_partitions, _partition_embeddings
62
  if _rag_df is None:
63
- _log("lazy_load: starting RAG dataset download", {}, "H-C")
64
- _rag_df = load_dataset("junchenfu/microlens_rag", split="train").to_pandas()
65
- _rag_df["comment_count"] = _rag_df["comment_count"].fillna(0)
66
- _embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
67
- _unique_partitions = _rag_df["partition"].unique().tolist()
68
- _partition_embeddings = _embed_model.encode(_unique_partitions)
69
- _log("lazy_load: RAG ready", {"rows": len(_rag_df), "partitions": len(_unique_partitions)}, "H-C")
 
 
70
  return _rag_df, _embed_model, _unique_partitions, _partition_embeddings
71
 
 
 
 
 
 
 
 
 
 
72
  # --- 3. Basic LLMPopcorn ---
73
  def generate_basic(query):
74
- _log("generate_basic: called", {"query": query[:50]}, "H-A")
75
  system_prompt = (
76
  "You are a talented video creator. "
77
  "Generate a response in JSON format with 'title', 'cover_prompt', and 'video_prompt' (3s)."
@@ -91,9 +95,7 @@ def generate_basic(query):
91
  max_tokens=500,
92
  response_format={"type": "json_object"},
93
  )
94
- result = json.loads(response.choices[0].message.content)
95
- _log("generate_basic: success", {"title": result.get("title", "")[:50]}, "H-A")
96
- return result
97
 
98
  # --- 4. PE: RAG + CoT ---
99
  def build_rag_context(user_prompt, selected_videos_num=10, num_tags=1, ratio=0.1):
@@ -140,7 +142,6 @@ def build_rag_context(user_prompt, selected_videos_num=10, num_tags=1, ratio=0.1
140
  return pos_ctx + "\n" + neg_ctx, top_partitions[0]
141
 
142
  def generate_pe(query, vid_num=10):
143
- _log("generate_pe: called", {"query": query[:50]}, "H-C")
144
  rag_context, matched_tag = build_rag_context(query, selected_videos_num=vid_num)
145
  cot_prompt = f"""You are a talented video creator. Think step-by-step using the reference videos below, then generate the most popular title, cover prompt, and 3-second video prompt.
146
 
@@ -167,18 +168,15 @@ Return JSON ONLY with keys: title (max 50 chars), cover_prompt, video_prompt (3s
167
  )
168
  result = json.loads(response.choices[0].message.content)
169
  result["_matched_tag"] = matched_tag
170
- _log("generate_pe: success", {"title": result.get("title", "")[:50], "tag": matched_tag}, "H-C")
171
  return result
172
 
173
- # --- 5. Video generation (lazy loaded, GPU) ---
174
  @spaces.GPU(duration=60)
175
  def run_video_generation(video_prompt):
176
- _log("run_video_generation: called", {"prompt": video_prompt[:80]}, "H-B")
177
  pipe = get_pipe()
178
  output = pipe(prompt=video_prompt, guidance_scale=1.0, num_inference_steps=4, num_frames=16)
179
  gif_path = "output_video.gif"
180
  export_to_gif(output.frames[0], gif_path)
181
- _log("run_video_generation: success", {}, "H-B")
182
  return gif_path
183
 
184
  # --- 6. Gradio entrypoints ---
 
2
  import torch
3
  import json
4
  import os
5
+ import threading
6
  import numpy as np
7
  import pandas as pd
8
  import faiss
 
14
  from datasets import load_dataset
15
  import spaces
16
 
 
 
 
 
 
 
 
 
 
 
17
  # --- 1. LLM client ---
18
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
19
  client = InferenceClient("meta-llama/Llama-3.3-70B-Instruct", token=HF_TOKEN)
20
 
21
+ # --- 2. Lazy globals with threading lock ---
22
  _pipe = None
23
  _rag_df = None
24
  _embed_model = None
25
  _unique_partitions = None
26
  _partition_embeddings = None
27
+ _pipe_lock = threading.Lock()
28
+ _rag_lock = threading.Lock()
29
 
30
  device = "cuda" if torch.cuda.is_available() else "cpu"
31
  dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
32
 
33
  def get_pipe():
34
  global _pipe
35
  if _pipe is None:
36
+ with _pipe_lock:
37
+ if _pipe is None:
38
+ print("Loading video pipeline (first use)...")
39
+ step = 4
40
+ repo = "ByteDance/AnimateDiff-Lightning"
41
+ ckpt = f"animatediff_lightning_{step}step_diffusers.safetensors"
42
+ base = "emilianJR/epiCRealism"
43
+ adapter = MotionAdapter().to(device, dtype)
44
+ adapter.load_state_dict(load_file(hf_hub_download(repo, ckpt), device=device))
45
+ _pipe = AnimateDiffPipeline.from_pretrained(
46
+ base, motion_adapter=adapter, torch_dtype=dtype
47
+ ).to(device)
48
+ _pipe.scheduler = EulerDiscreteScheduler.from_config(
49
+ _pipe.scheduler.config, timestep_spacing="trailing", beta_schedule="linear"
50
+ )
51
+ print("Video pipeline ready.")
52
  return _pipe
53
 
54
  def get_rag():
55
  global _rag_df, _embed_model, _unique_partitions, _partition_embeddings
56
  if _rag_df is None:
57
+ with _rag_lock:
58
+ if _rag_df is None:
59
+ print("Loading MicroLens RAG dataset (first use)...")
60
+ _rag_df = load_dataset("junchenfu/microlens_rag", split="train").to_pandas()
61
+ _rag_df["comment_count"] = _rag_df["comment_count"].fillna(0)
62
+ _embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
63
+ _unique_partitions = _rag_df["partition"].unique().tolist()
64
+ _partition_embeddings = _embed_model.encode(_unique_partitions)
65
+ print(f"RAG ready: {len(_rag_df)} videos, {len(_unique_partitions)} categories.")
66
  return _rag_df, _embed_model, _unique_partitions, _partition_embeddings
67
 
68
# Pre-warm in background so the first user request is faster
def _preload():
    """Load the RAG resources ahead of the first request (best effort)."""
    try:
        get_rag()
    except Exception as exc:
        # Deliberately non-fatal: the failure is only reported here.
        print(f"Background preload warning: {exc}")


# Daemon thread, so a still-running warm-up never keeps the process alive.
_warmup_thread = threading.Thread(target=_preload, daemon=True)
_warmup_thread.start()
76
+
77
  # --- 3. Basic LLMPopcorn ---
78
  def generate_basic(query):
 
79
  system_prompt = (
80
  "You are a talented video creator. "
81
  "Generate a response in JSON format with 'title', 'cover_prompt', and 'video_prompt' (3s)."
 
95
  max_tokens=500,
96
  response_format={"type": "json_object"},
97
  )
98
+ return json.loads(response.choices[0].message.content)
 
 
99
 
100
  # --- 4. PE: RAG + CoT ---
101
  def build_rag_context(user_prompt, selected_videos_num=10, num_tags=1, ratio=0.1):
 
142
  return pos_ctx + "\n" + neg_ctx, top_partitions[0]
143
 
144
  def generate_pe(query, vid_num=10):
 
145
  rag_context, matched_tag = build_rag_context(query, selected_videos_num=vid_num)
146
  cot_prompt = f"""You are a talented video creator. Think step-by-step using the reference videos below, then generate the most popular title, cover prompt, and 3-second video prompt.
147
 
 
168
  )
169
  result = json.loads(response.choices[0].message.content)
170
  result["_matched_tag"] = matched_tag
 
171
  return result
172
 
173
+ # --- 5. Video generation (lazy loaded inside GPU context) ---
174
  @spaces.GPU(duration=60)
175
  def run_video_generation(video_prompt):
 
176
  pipe = get_pipe()
177
  output = pipe(prompt=video_prompt, guidance_scale=1.0, num_inference_steps=4, num_frames=16)
178
  gif_path = "output_video.gif"
179
  export_to_gif(output.frames[0], gif_path)
 
180
  return gif_path
181
 
182
  # --- 6. Gradio entrypoints ---