Haldi247 committed on
Commit
ccd953f
·
verified ·
1 Parent(s): dd2ac70

Upload 3 files

Browse files
Files changed (3) hide show
  1. app_urdu.py +209 -0
  2. chunks_urdu_recursive.json +0 -0
  3. requirements.txt +6 -0
app_urdu.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json, re, time
2
+ import numpy as np
3
+ import gradio as gr
4
+ from sentence_transformers import SentenceTransformer, CrossEncoder
5
+ from rank_bm25 import BM25Okapi
6
+ from pinecone import Pinecone
7
+ from huggingface_hub import InferenceClient
8
+
9
# CONFIG
# Credentials are read from the environment; either value is None when the
# variable is unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
HF_TOKEN = os.environ.get("HF_TOKEN")
PINECONE_INDEX = "rag-nlp-project"

# LOAD RESOURCES
print("Loading resources...")

# Chunk records for the keyword (BM25) side of retrieval. Each record is
# indexed below by its "text" key.
with open("chunks_urdu_recursive.json", encoding="utf-8") as chunk_file:
    ALL_CHUNKS = json.loads(chunk_file.read())

# Whitespace tokenisation of the lower-cased chunk text feeds the BM25 index.
tokenized = [record["text"].lower().split() for record in ALL_CHUNKS]
bm25 = BM25Okapi(tokenized)

# Bi-encoder for query embeddings and cross-encoder for reranking.
# NOTE(review): the reranker checkpoint name suggests MS MARCO training data;
# confirm it ranks Urdu passages acceptably.
embedder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Handle to the Pinecone index used for semantic search.
pc = Pinecone(api_key=PINECONE_API_KEY)
pine_index = pc.Index(PINECONE_INDEX)

# Hugging Face inference client used for every LLM call.
llm = InferenceClient(token=HF_TOKEN)
print("All ready!")
31
+
32
+ # RETRIEVAL
33
def semantic_search(query, top_k=20):
    """Return the top vector-search matches for *query*.

    The query is embedded with the module-level ``embedder`` and sent to
    ``pine_index`` in the ``"urdu_recursive"`` namespace.

    Args:
        query: Query text to embed.
        top_k: Maximum number of matches requested from the index.

    Returns:
        A list of dicts with keys ``id``, ``text``, ``title`` and ``score``,
        in the order the index returned the matches.
    """
    query_vector = embedder.encode(query).tolist()
    response = pine_index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
        namespace="urdu_recursive",
    )

    hits = []
    for match in response["matches"]:
        hits.append({
            "id": match["id"],
            "text": match["metadata"]["text"],
            "title": match["metadata"]["title"],
            "score": match["score"],
        })
    return hits
40
+
41
def bm25_search(query, top_k=20):
    """Return the best keyword matches for *query* from the BM25 index.

    The query is lower-cased and split on whitespace, mirroring how the
    chunks were tokenised when the index was built.

    Args:
        query: Query text.
        top_k: Maximum number of chunks to consider.

    Returns:
        A list of dicts with keys ``id``, ``text``, ``title`` and ``score``,
        best score first. Chunks whose score is not positive are omitted,
        so the list may be shorter than ``top_k``.
    """
    query_terms = query.lower().split()
    all_scores = bm25.get_scores(query_terms)
    # Indices of the highest-scoring chunks, best first.
    best_first = np.argsort(all_scores)[::-1][:top_k]

    hits = []
    for idx in best_first:
        if all_scores[idx] > 0:
            hits.append({
                "id": ALL_CHUNKS[idx]["id"],
                "text": ALL_CHUNKS[idx]["text"],
                "title": ALL_CHUNKS[idx]["title"],
                "score": float(all_scores[idx]),
            })
    return hits
48
+
49
def rrf_fuse(lists_of_results, k=60):
    """Merge several ranked result lists with Reciprocal Rank Fusion.

    Each list contributes ``1 / (k + rank + 1)`` to a document's fused
    score, where ``rank`` is the document's zero-based position in that
    list.

    Args:
        lists_of_results: Iterable of ranked lists. Every item is a dict
            with at least the keys ``id``, ``text`` and ``title``.
        k: Smoothing constant added to the rank.

    Returns:
        A list of dicts with keys ``id``, ``rrf_score``, ``text`` and
        ``title``, sorted by fused score, highest first. Ties keep
        first-seen order.
    """
    scores, data = {}, {}
    for results in lists_of_results:
        # A single ranker must contribute at most once per document;
        # a repeated id would otherwise inflate its fused score.
        seen_in_list = set()
        for rank, item in enumerate(results):
            did = item["id"]
            if did in seen_in_list:
                continue
            seen_in_list.add(did)
            scores[did] = scores.get(did, 0.0) + 1.0 / (k + rank + 1)
            data[did] = {"text": item["text"], "title": item["title"]}
    ranked = sorted(scores, key=scores.get, reverse=True)
    return [{"id": d, "rrf_score": scores[d], **data[d]} for d in ranked]
58
+
59
def cross_encoder_rerank(query, candidates, top_k=5, pool_size=30):
    """Rerank fused candidates with the module-level cross-encoder.

    Args:
        query: Query text paired with every candidate.
        candidates: Ranked list of dicts, each with a ``text`` key.
        top_k: Number of reranked results to return.
        pool_size: How many leading candidates are scored. Defaults to 30.

    Returns:
        Up to ``top_k`` candidate dicts, each extended with a float
        ``ce_score``, sorted by that score, highest first. The returned
        dicts are shallow copies, so the caller's candidates are left
        untouched. An empty list is returned when there are no candidates.
    """
    if not candidates:
        return []
    # Copy each dict so that attaching ce_score does not mutate the input.
    pool = [dict(candidate) for candidate in candidates[:pool_size]]
    ce_scores = reranker.predict([(query, c["text"]) for c in pool])
    for candidate, score in zip(pool, ce_scores):
        candidate["ce_score"] = float(score)
    pool.sort(key=lambda c: c["ce_score"], reverse=True)
    return pool[:top_k]
69
+
70
+ # LLM
71
def call_llm(prompt, max_tokens=512, temperature=0.3, models=None):
    """Send *prompt* to the first chat model that answers.

    Models are tried in order. A model that raises or returns no content
    is skipped and the next one is tried.

    Args:
        prompt: User message content.
        max_tokens: Generation length limit passed to the client.
        temperature: Sampling temperature passed to the client.
        models: Optional iterable of model ids to try in order. Defaults
            to Mistral-7B-Instruct followed by Llama-3-8B-Instruct.

    Returns:
        The stripped reply text, or the sentinel ``"[LLM Error]"`` when
        no model produced a reply.
    """
    if models is None:
        models = (
            "mistralai/Mistral-7B-Instruct-v0.2",
            "meta-llama/Meta-Llama-3-8B-Instruct",
        )
    for model in models:
        try:
            resp = llm.chat_completion(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            content = resp.choices[0].message.content
        except Exception as exc:
            # Best-effort fallback: report why this model failed, then try
            # the next one. KeyboardInterrupt/SystemExit are not swallowed.
            print(f"[call_llm] {model} failed: {exc!r}")
            continue
        if content is None:
            print(f"[call_llm] {model} returned no content")
            continue
        return content.strip()
    return "[LLM Error]"
83
+
84
def generate_answer(query, contexts):
    """Ask the LLM to answer *query* from the retrieved *contexts*.

    Args:
        query: The user's question.
        contexts: List of dicts with ``title`` and ``text`` keys. They are
            numbered from 1 in the prompt.

    Returns:
        Whatever ``call_llm`` returns for the assembled Urdu prompt.
    """
    numbered = []
    for position, chunk in enumerate(contexts, start=1):
        numbered.append(f"{position}. [{chunk['title']}] {chunk['text']}")
    ctx = "\n\n".join(numbered)

    # The prompt asks for a detailed answer grounded in the passages and
    # tells the model to say so when the passages are not relevant.
    prompt = f"""درج ذیل معلومات کی بنیاد پر:

{ctx}

سوال کا تفصیلی جواب دیں: {query}
اگر معلومات سوال سے متعلق نہیں ہیں تو بتائیں کہ ڈیٹابیس میں مناسب جواب نہیں ملا۔"""
    return call_llm(prompt)
93
+
94
+ # EVALUATION
95
def eval_faithfulness(answer, contexts):
    """Estimate how much of *answer* is supported by *contexts*.

    The LLM first extracts factual claims from the answer, then judges each
    of the first eight claims against the concatenated context text, which
    is truncated to 3000 characters.

    Args:
        answer: Generated answer text.
        contexts: List of dicts with a ``text`` key.

    Returns:
        ``(score, details)``. ``score`` is the fraction of judged claims
        marked supported. It is ``1.0`` when no claims could be extracted
        and ``0.0`` when claim extraction itself failed. ``details`` has
        one ``[Y]``/``[N]`` line per judged claim, or an explanatory
        message.
    """
    context_str = "\n".join(c["text"] for c in contexts)[:3000]
    claims_raw = call_llm(
        f"Extract all factual claims as a numbered list.\n\nAnswer: {answer}\n\nClaims:",
        max_tokens=400, temperature=0.1
    )
    # A failed extraction must not be reported as a perfectly faithful answer.
    if claims_raw.strip() == "[LLM Error]":
        return 0.0, "Claim extraction failed (LLM error)."

    # Strip list numbering such as "1." or "2)" once per line.
    numbering = re.compile(r"^[\d]+[\.\)]\s*")
    cleaned = (numbering.sub("", line.strip()) for line in claims_raw.split("\n"))
    claims = [claim for claim in cleaned if len(claim) > 15][:8]
    if not claims:
        return 1.0, "No claims extracted."

    supported = 0
    details = []
    for claim in claims:
        verdict = call_llm(
            f"Is this claim supported by the context? Reply ONLY 'SUPPORTED' or 'NOT SUPPORTED'.\n\n"
            f"Context: {context_str}\n\nClaim: {claim}\n\nVerdict:",
            max_tokens=10, temperature=0.1
        ).upper()
        # "UNSUPPORTED" contains "SUPPORTED" but no "NOT", so reject it
        # explicitly alongside "NOT SUPPORTED".
        negated = "NOT" in verdict or "UNSUPPORTED" in verdict
        ok = "SUPPORTED" in verdict and not negated
        if ok:
            supported += 1
        details.append(f"{'[Y]' if ok else '[N]'} {claim}")
    return supported / len(claims), "\n".join(details)
119
+
120
def eval_relevancy(query, answer):
    """Estimate how well *answer* addresses *query*.

    The LLM proposes up to three questions that the answer addresses. The
    score is the mean cosine similarity between the query embedding and
    each proposed question's embedding.

    Args:
        query: The user's original question.
        answer: Generated answer text.

    Returns:
        ``(score, details)``. ``score`` is the mean similarity, or ``0.0``
        when no questions could be generated. ``details`` has one line per
        generated question with its similarity.
    """
    qs_raw = call_llm(
        f"Generate exactly 3 questions that this answer directly addresses. "
        f"One per line, no numbering.\n\nAnswer: {answer}\n\nQuestions:",
        max_tokens=200, temperature=0.3
    )
    # The error sentinel is 11 characters long and would pass the length
    # filter below, so it has to be rejected before it is scored.
    if qs_raw.strip() == "[LLM Error]":
        return 0.0, "Could not generate questions."

    # Models sometimes number the lines anyway; strip "1." / "2)" prefixes.
    numbering = re.compile(r"^[\d]+[\.\)]\s*")
    cleaned = (numbering.sub("", line.strip()) for line in qs_raw.split("\n"))
    questions = [q for q in cleaned if len(q) > 10][:3]
    if not questions:
        return 0.0, "Could not generate questions."

    embs = embedder.encode([query] + questions)
    q_emb = embs[0]
    q_norm = np.linalg.norm(q_emb)
    sims, detail_lines = [], []
    for i, (q, emb) in enumerate(zip(questions, embs[1:]), start=1):
        denom = q_norm * np.linalg.norm(emb)
        # A zero-length vector has no direction; score it 0 instead of NaN.
        sim = float(np.dot(q_emb, emb) / denom) if denom else 0.0
        sims.append(sim)
        detail_lines.append(f"  Q{i}: {q}  (sim={sim:.3f})")
    return float(np.mean(sims)), "\n".join(detail_lines)
140
+
141
+ # MAIN PIPELINE
142
def run_query(query, run_eval):
    """Run the full retrieve, rerank, generate and evaluate pipeline.

    Args:
        query: The user's question. ``None`` or blank input is rejected
            with a prompt to enter a question.
        run_eval: When truthy, also compute faithfulness and relevancy.

    Returns:
        A 4-tuple ``(answer, context_markdown, scores_markdown, timing)``
        matching the four Gradio outputs wired to this function.
    """
    if query is None or not query.strip():
        return "براہ کرم سوال درج کریں۔", "", "", ""

    # perf_counter is monotonic, so durations are immune to clock changes.
    t0 = time.perf_counter()
    sem = semantic_search(query)
    kw = bm25_search(query)
    fused = rrf_fuse([sem, kw])
    reranked = cross_encoder_rerank(query, fused)
    t_retrieve = time.perf_counter() - t0

    t1 = time.perf_counter()
    answer = generate_answer(query, reranked)
    t_generate = time.perf_counter() - t1

    ctx_display = "".join(
        f"**[{i}] {c['title']}** (score: {c.get('ce_score', 0):.3f})\n"
        f"{c['text']}\n\n---\n\n"
        for i, c in enumerate(reranked, start=1)
    )

    t_eval = 0
    if run_eval:
        t2 = time.perf_counter()
        faith_score, faith_detail = eval_faithfulness(answer, reranked)
        rel_score, rel_detail = eval_relevancy(query, answer)
        t_eval = time.perf_counter() - t2
        scores_display = (
            f"### Faithfulness: {faith_score:.0%}\n{faith_detail}\n\n"
            f"### Relevancy: {rel_score:.0%}\n{rel_detail}"
        )
    else:
        scores_display = "*(Check the box to run evaluation)*"

    timing = (f"Retrieval: {t_retrieve:.2f}s | Generation: {t_generate:.2f}s | "
              f"Evaluation: {t_eval:.2f}s | Total: {t_retrieve + t_generate + t_eval:.2f}s")
    return answer, ctx_display, scores_display, timing
179
+
180
+ # GRADIO UI
181
with gr.Blocks(title="RAG Q&A — Urdu") as demo:
    # Page header: Urdu title plus a one-line summary of the pipeline.
    gr.Markdown(
        "# اردو سوال و جواب کا نظام\n"
        "*Hybrid Search (BM25 + Semantic + RRF) - Cross-Encoder Reranking - LLM-as-a-Judge*"
    )

    # Input row: question box, evaluation toggle and submit button.
    with gr.Row():
        query_box = gr.Textbox(
            label="اپنا سوال لکھیں",
            placeholder="مثال: مصنوعی ذہانت کیا ہے؟",
            scale=4,
            rtl=True,
        )
        eval_check = gr.Checkbox(label="Run Evaluation", value=True)
        btn = gr.Button("پوچھیں", variant="primary", scale=1)

    # One tab per pipeline output.
    with gr.Tabs():
        with gr.TabItem("جواب"):
            answer_out = gr.Markdown(rtl=True)
        with gr.TabItem("Retrieved Context"):
            context_out = gr.Markdown(rtl=True)
        with gr.TabItem("Evaluation Scores"):
            scores_out = gr.Markdown()

    timing_out = gr.Textbox(label="Timing", interactive=False)

    # The four outputs line up with run_query's 4-tuple return value.
    btn.click(
        fn=run_query,
        inputs=[query_box, eval_check],
        outputs=[answer_out, context_out, scores_out, timing_out],
    )

    gr.Markdown("---\n*Embedding: paraphrase-multilingual-MiniLM-L12-v2 | Reranker: ms-marco-MiniLM | "
                "LLM: Mistral-7B / Llama-3-8B | Vector DB: Pinecone*")

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
chunks_urdu_recursive.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ sentence-transformers
3
+ rank-bm25
4
+ pinecone
5
+ huggingface-hub
6
+ numpy