georgtawadrous committed on
Commit
fb888b3
·
verified ·
1 Parent(s): a2a7144

v2: BGE-M3 embeddings, hybrid BM25+Dense retrieval, HyDE, cross-encoder reranking

Browse files
Files changed (1) hide show
  1. rag/chain.py +257 -98
rag/chain.py CHANGED
@@ -1,5 +1,12 @@
1
  """
2
  Core RAG chain — wraps ChromaDB retrieval + LLM (Ollama / Gemini / HF / OpenRouter).
 
 
 
 
 
 
 
3
  """
4
  from __future__ import annotations
5
  import re
@@ -22,53 +29,38 @@ load_dotenv(override=True)
22
 
23
  # Greek Unicode ranges (excluding characters shared with Coptic)
24
  _GREEK_ONLY_RANGES = set()
25
- # Greek and Coptic block: U+0370–U+03FF
26
- # Greek Extended block: U+1F00–U+1FFF
27
- # These contain polytonic Greek, accented forms, archaic letters that are NOT Coptic
28
  for cp in range(0x0370, 0x0400):
29
  _GREEK_ONLY_RANGES.add(cp)
30
  for cp in range(0x1F00, 0x2000):
31
  _GREEK_ONLY_RANGES.add(cp)
32
 
33
- # Coptic Unicode block: U+2C80–U+2CFF (dedicated Coptic characters)
34
  _COPTIC_BLOCK = set(range(0x2C80, 0x2D00))
35
 
36
- # Characters shared between Greek and Coptic scripts (visually identical but
37
- # Coptic reuses Greek codepoints for these). We should NOT flag these as "Greek".
38
- # Common shared: Α-Ω / α-ω base letters that Coptic uses (U+0391-U+03C9 subset)
39
- # Coptic uses Greek codepoints for: α β γ δ ε ζ η θ ι κ λ μ ν ξ ο π ρ σ/ς τ υ φ χ ψ ω
40
  _SHARED_GREEK_COPTIC = set()
41
  for ch in "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρστυφχψως":
42
  _SHARED_GREEK_COPTIC.add(ord(ch))
43
 
44
- # Coptic-specific letters (Demotic-derived) that confirm Coptic, not Greek
45
  _COPTIC_SPECIFIC = set()
46
  for ch in "ϣϩϫϭϯϥⲁⲃⲅⲇⲉⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱϣϩϫϭϯϥⲋⲍⲹⳉⳋⳍⳏⳑⳓⳕⳗⳙⳛ":
47
  _COPTIC_SPECIFIC.add(ord(ch))
48
 
49
- # Common Greek words that should NOT appear in Coptic text
50
  _GREEK_WORD_PATTERNS = re.compile(
51
  r'\b(τοῦ|τῆς|τῶν|τόν|τήν|καί|ἐν|εἰς|ἐκ|ἀπό|πρός|μετά|κατά|περί|ὑπό|παρά|διά|ἐπί'
52
- r'|ὁ|ἡ|τό|οἱ|αἱ|τά' # Greek articles
53
- r'|ἐστί[ν]?|εἶναι|λέγει|λέγων|ἔχει|ἔχων' # Common Greek verbs
54
- r'|αὐτός|αὐτή|αὐτό|αὐτοῦ|αὐτῆς' # Greek pronouns
55
- r'|θεός|θεοῦ|κύριος|κυρίου|λόγος|λόγου' # Common Greek nouns
56
  r'|ἄνθρωπος|ἀνθρώπου|κόσμος|κόσμου'
57
- r'|οὐ|οὐκ|μή|γάρ|δέ|ἀλλά|ὅτι|ἵνα|ὡς' # Greek particles
58
  r')\b',
59
  re.UNICODE
60
  )
61
 
62
 
63
  def _count_greek_indicators(text: str) -> dict:
64
- """
65
- Analyze text for Greek vs Coptic script usage.
66
- Returns counts of greek-only chars, coptic-specific chars, and greek word matches.
67
- """
68
  greek_only_count = 0
69
  coptic_specific_count = 0
70
  shared_count = 0
71
-
72
  for ch in text:
73
  cp = ord(ch)
74
  if cp in _COPTIC_SPECIFIC or cp in _COPTIC_BLOCK:
@@ -77,9 +69,7 @@ def _count_greek_indicators(text: str) -> dict:
77
  greek_only_count += 1
78
  elif cp in _SHARED_GREEK_COPTIC:
79
  shared_count += 1
80
-
81
  greek_words = _GREEK_WORD_PATTERNS.findall(text)
82
-
83
  return {
84
  "greek_only_chars": greek_only_count,
85
  "coptic_specific_chars": coptic_specific_count,
@@ -90,15 +80,9 @@ def _count_greek_indicators(text: str) -> dict:
90
 
91
 
92
  def _add_greek_warning(answer: str) -> str:
93
- """
94
- If the answer contains significant Greek text, append a warning.
95
- """
96
  analysis = _count_greek_indicators(answer)
97
-
98
- # If there are Greek-only characters or Greek words detected
99
  has_greek_words = analysis["greek_word_count"] > 0
100
  has_greek_chars = analysis["greek_only_chars"] > 3
101
-
102
  if has_greek_words or has_greek_chars:
103
  warning_parts = []
104
  if has_greek_words:
@@ -106,7 +90,6 @@ def _add_greek_warning(answer: str) -> str:
106
  warning_parts.append(f"Greek words detected: {sample}")
107
  if has_greek_chars:
108
  warning_parts.append(f"{analysis['greek_only_chars']} Greek-only characters found")
109
-
110
  warning = (
111
  "\n\n---\n"
112
  "⚠️ **Greek Content Warning**: This response may contain Greek text "
@@ -116,7 +99,6 @@ def _add_greek_warning(answer: str) -> str:
116
  "[CDO](https://coptic-dictionary.org) or [Coptic SCRIPTORIUM](https://copticscriptorium.org)."
117
  )
118
  return answer + warning
119
-
120
  return answer
121
 
122
 
@@ -171,6 +153,17 @@ INSTRUCTIONS:
171
  Unicode characters (ⲁⲃⲅⲇⲉⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱ) and NOT Greek Unicode
172
  characters (αβγδεζηθικλμνξοπρστυφχψω).
173
 
 
 
 
 
 
 
 
 
 
 
 
174
  RETRIEVED KNOWLEDGE BASE CONTEXT:
175
  {context}
176
 
@@ -184,84 +177,220 @@ _prompt = PromptTemplate(
184
  template=Shenute_SYSTEM,
185
  )
186
 
187
- # ── Embedder helper (shared across chain + ingest when using OpenRouter) ─────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  def get_embedder_for_provider(provider: str):
189
  """Return a LangChain embedder for the given provider.
190
- OpenRouter does not offer an embedding endpoint, so we fall back to
191
- HuggingFace embeddings (free, no key needed for public models) when
192
- OpenRouter is selected."""
 
 
193
  if provider == "Gemini API":
194
  from langchain_google_genai import GoogleGenerativeAIEmbeddings
195
  return GoogleGenerativeAIEmbeddings(
196
  model="models/gemini-embedding-2-preview",
197
  google_api_key=os.environ.get("GEMINI_API_KEY"),
198
  )
199
- elif provider == "Hugging Face":
200
  from langchain_huggingface import HuggingFaceEndpointEmbeddings
201
  return HuggingFaceEndpointEmbeddings(
202
  huggingfacehub_api_token=os.environ.get("HF_TOKEN"),
203
- model="sentence-transformers/all-MiniLM-L6-v2",
204
- )
205
- elif provider == "OpenRouter":
206
- # OpenRouter has no embedding API — use HF embeddings as fallback
207
- from langchain_huggingface import HuggingFaceEndpointEmbeddings
208
- return HuggingFaceEndpointEmbeddings(
209
- huggingfacehub_api_token=os.environ.get("HF_TOKEN"),
210
- model="sentence-transformers/all-MiniLM-L6-v2",
211
  )
212
  else:
213
- # Local AI (Ollama)
214
  base_url = os.environ.get("LOCAL_AI_BASE_URL", "http://127.0.0.1:11434")
215
  return OllamaEmbeddings(model="nomic-embed-text", base_url=base_url)
216
 
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  # ── Chain builder ──────────────────────────────────────────────────────────────
219
  def build_chain(model: str = "qwen3:14b",
220
  top_k: int = 6,
221
  temperature: float = 0.1,
222
- provider: str = "Local AI") -> RetrievalQA:
 
 
 
223
 
224
- # 1. Setup Embedder & VectorDB
225
  embedder = get_embedder_for_provider(provider)
226
-
227
- vectordb = Chroma(
228
- persist_directory="./chroma_db",
229
- embedding_function=embedder,
230
- )
231
- retriever = vectordb.as_retriever(search_kwargs={"k": top_k})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
- # 2. Setup chosen LLM
234
  if provider == "Gemini API":
235
  from langchain_google_genai import ChatGoogleGenerativeAI
236
  gemini_api_key = os.environ.get("GEMINI_API_KEY")
237
  if not gemini_api_key:
238
  raise ValueError("GEMINI_API_KEY is not set in the .env file.")
239
- llm = ChatGoogleGenerativeAI(
240
- model=model,
241
- temperature=temperature,
242
- google_api_key=gemini_api_key,
243
- )
244
  elif provider == "Hugging Face":
245
  from langchain_openai import ChatOpenAI
246
  hf_token = os.environ.get("HF_TOKEN")
247
  if not hf_token:
248
  raise ValueError("HF_TOKEN is not set in the .env file.")
249
- llm = ChatOpenAI(
250
- model=model,
251
- temperature=temperature,
252
- api_key=hf_token,
253
- base_url="https://router.huggingface.co/v1",
254
- )
255
  elif provider == "OpenRouter":
256
- # OpenRouter exposes an OpenAI-compatible API at https://openrouter.ai/api/v1
257
  from langchain_openai import ChatOpenAI
258
  openrouter_key = os.environ.get("OPENROUTER_API_KEY")
259
  if not openrouter_key:
260
  raise ValueError("OPENROUTER_API_KEY is not set. Add it as a Space secret or in your .env file.")
261
  llm = ChatOpenAI(
262
- model=model,
263
- temperature=temperature,
264
- api_key=openrouter_key,
265
  base_url="https://openrouter.ai/api/v1",
266
  default_headers={
267
  "HTTP-Referer": "https://huggingface.co/spaces/georgtawadrous/thoth_app",
@@ -269,50 +398,37 @@ def build_chain(model: str = "qwen3:14b",
269
  },
270
  )
271
  else:
272
- # Default to Local AI (Ollama)
273
  base_url = os.environ.get("LOCAL_AI_BASE_URL", "http://127.0.0.1:11434")
274
- llm = OllamaLLM(
275
- model=model,
276
- temperature=temperature,
277
- num_ctx=4096,
278
- base_url=base_url,
279
- )
280
-
281
  chain = RetrievalQA.from_chain_type(
282
- llm=llm,
283
- retriever=retriever,
284
- chain_type="stuff",
285
- chain_type_kwargs={"prompt": _prompt},
286
- return_source_documents=True,
287
  )
288
 
289
- # Optional Feedback Retriever
290
  try:
291
  feedback_store = Chroma(
292
- persist_directory="./chroma_db",
293
- embedding_function=embedder,
294
  collection_name="Shenute_feedback",
295
  )
296
  feedback_retriever = feedback_store.as_retriever(search_kwargs={"k": 2})
297
  except Exception:
298
  feedback_retriever = None
299
 
300
- return chain, feedback_retriever
301
 
302
 
303
  def query_Shenute(question: str,
304
  model: str = "qwen3:14b",
305
  top_k: int = 6,
306
  temperature: float = 0.1,
307
- provider: str = "Local AI") -> dict:
308
- """
309
- Returns:
310
- {
311
- "answer": str,
312
- "sources": [{"text": str, "source": str, "page": int}]
313
- }
314
- """
315
- chain, feedback_retriever = build_chain(model, top_k, temperature, provider)
316
 
317
  feedback_context = ""
318
  if feedback_retriever:
@@ -327,15 +443,53 @@ def query_Shenute(question: str,
327
  if feedback_context:
328
  augmented_question = f"{question}\n\n[SYSTEM NOTE - PAST USER CORRECTIONS TO APPLY IF RELEVANT:\n{feedback_context}]"
329
 
330
- result = chain.invoke({"query": augmented_question})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
- # Post-process: clean up think tags and check for Greek hallucination
333
- answer = result["result"]
334
  answer = _strip_think_tags(answer)
335
  answer = _add_greek_warning(answer)
336
 
337
  sources = []
338
- for doc in result.get("source_documents", []):
339
  sources.append({
340
  "text": doc.page_content[:300],
341
  "source": doc.metadata.get("source", "Unknown"),
@@ -345,4 +499,9 @@ def query_Shenute(question: str,
345
  return {
346
  "answer": answer,
347
  "sources": sources,
 
 
 
 
 
348
  }
 
1
  """
2
  Core RAG chain — wraps ChromaDB retrieval + LLM (Ollama / Gemini / HF / OpenRouter).
3
+
4
+ v2 improvements:
5
+ - BGE-M3 multilingual embeddings (replaces all-MiniLM-L6-v2)
6
+ - Hybrid BM25 + Dense retrieval with Reciprocal Rank Fusion
7
+ - Cross-encoder reranking with BGE-reranker-v2-m3
8
+ - HyDE (Hypothetical Document Embeddings) for query expansion
9
+ - Reduced context window (top-4 after reranking instead of top-6)
10
  """
11
  from __future__ import annotations
12
  import re
 
29
 
30
# Greek code points: the whole "Greek and Coptic" block (U+0370–U+03FF) plus
# the "Greek Extended" block (U+1F00–U+1FFF). NOTE: despite the name, letters
# that Coptic text also uses are NOT removed from this set; they are listed
# separately in _SHARED_GREEK_COPTIC, so callers must check that set too.
_GREEK_ONLY_RANGES = set(range(0x0370, 0x0400)) | set(range(0x1F00, 0x2000))

# Dedicated Coptic block: U+2C80–U+2CFF.
_COPTIC_BLOCK = set(range(0x2C80, 0x2D00))

# Base Greek letters (upper case, lower case and final sigma) that may
# legitimately appear in Coptic text and should not, on their own, be treated
# as evidence of Greek.
_SHARED_GREEK_COPTIC = {
    ord(ch) for ch in "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρστυφχψως"
}

# Letters that confirm Coptic rather than Greek. The literal repeats a few
# characters; building a set makes the repetition harmless.
_COPTIC_SPECIFIC = {
    ord(ch) for ch in "ϣϩϫϭϯϥⲁⲃⲅⲇⲉⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱϣϩϫϭϯϥⲋⲍⲹⳉⳋⳍⳏⳑⳓⳕⳗⳙⳛ"
}

# Common polytonic Greek words that should not appear in Coptic output.
# The single capturing group means findall() returns the matched words.
_GREEK_WORD_PATTERNS = re.compile(
    r'\b(τοῦ|τῆς|τῶν|τόν|τήν|καί|ἐν|εἰς|ἐκ|ἀπό|πρός|μετά|κατά|περί|ὑπό|παρά|διά|ἐπί'
    r'|ὁ|ἡ|τό|οἱ|αἱ|τά'  # articles
    r'|ἐστί[ν]?|εἶναι|λέγει|λέγων|ἔχει|ἔχων'  # common verbs
    r'|αὐτός|αὐτή|αὐτό|αὐτοῦ|αὐτῆς'  # pronouns
    r'|θεός|θεοῦ|κύριος|κυρίου|λόγος|λόγου'  # common nouns
    r'|ἄνθρωπος|ἀνθρώπου|κόσμος|κόσμου'
    r'|οὐ|οὐκ|μή|γάρ|δέ|ἀλλά|ὅτι|ἵνα|ὡς'  # particles
    r')\b',
    re.UNICODE
)
58
 
59
 
60
  def _count_greek_indicators(text: str) -> dict:
 
 
 
 
61
  greek_only_count = 0
62
  coptic_specific_count = 0
63
  shared_count = 0
 
64
  for ch in text:
65
  cp = ord(ch)
66
  if cp in _COPTIC_SPECIFIC or cp in _COPTIC_BLOCK:
 
69
  greek_only_count += 1
70
  elif cp in _SHARED_GREEK_COPTIC:
71
  shared_count += 1
 
72
  greek_words = _GREEK_WORD_PATTERNS.findall(text)
 
73
  return {
74
  "greek_only_chars": greek_only_count,
75
  "coptic_specific_chars": coptic_specific_count,
 
80
 
81
 
82
  def _add_greek_warning(answer: str) -> str:
 
 
 
83
  analysis = _count_greek_indicators(answer)
 
 
84
  has_greek_words = analysis["greek_word_count"] > 0
85
  has_greek_chars = analysis["greek_only_chars"] > 3
 
86
  if has_greek_words or has_greek_chars:
87
  warning_parts = []
88
  if has_greek_words:
 
90
  warning_parts.append(f"Greek words detected: {sample}")
91
  if has_greek_chars:
92
  warning_parts.append(f"{analysis['greek_only_chars']} Greek-only characters found")
 
93
  warning = (
94
  "\n\n---\n"
95
  "⚠️ **Greek Content Warning**: This response may contain Greek text "
 
99
  "[CDO](https://coptic-dictionary.org) or [Coptic SCRIPTORIUM](https://copticscriptorium.org)."
100
  )
101
  return answer + warning
 
102
  return answer
103
 
104
 
 
153
  Unicode characters (ⲁⲃⲅⲇⲉⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱ) and NOT Greek Unicode
154
  characters (αβγδεζηθικλμνξοπρστυφχψω).
155
 
156
+ ANSWER QUALITY RULES:
157
+ - Ground your answer ONLY in the retrieved context below. Do NOT fabricate
158
+ dictionary entries, paradigm tables, or grammatical forms from memory.
159
+ - If the retrieved context does not contain information to answer the question,
160
+ say so clearly. Do NOT invent plausible-sounding answers.
161
+ - When multiple retrieved chunks contain relevant information, SYNTHESIZE them
162
+ into a coherent answer rather than repeating each chunk separately.
163
+ - Prefer information from lexicon entries (CCL, Faulkner) over grammar PDFs
164
+ for vocabulary questions, and grammar sources (Allen, Layton, Lambdin) for
165
+ structural/syntactic questions.
166
+
167
  RETRIEVED KNOWLEDGE BASE CONTEXT:
168
  {context}
169
 
 
177
  template=Shenute_SYSTEM,
178
  )
179
 
180
# ── HyDE prompt for Coptic/Egyptian query expansion ─────────────────────────
# Prompt asking the LLM to write a *hypothetical* dictionary/grammar entry that
# would answer the question. The only placeholder is ``{question}``.
# _generate_hyde_query() fills it with a plain str.replace() on this raw
# template rather than going through the PromptTemplate below.
_HYDE_TEMPLATE = """You are an expert in Coptic linguistics and Ancient Egyptian.
Given the following question, write a short hypothetical dictionary entry or grammar
explanation that would answer it. Write as if it were an entry in Crum's Coptic Dictionary,
Faulkner's Middle Egyptian Dictionary, or Layton's Coptic Grammar. Include the relevant
Coptic or Egyptian terms in proper Unicode script.

Question: {question}

Hypothetical entry:"""

# PromptTemplate wrapper around the same text.
# NOTE(review): no use of _hyde_prompt is visible in this part of the file —
# confirm whether it is still needed.
_hyde_prompt = PromptTemplate(
    input_variables=["question"],
    template=_HYDE_TEMPLATE,
)
195
+
196
+
197
+ # ── Embedder helper ──────────────────────────────────────────────────────────
198
def get_embedder_for_provider(provider: str):
    """Pick the LangChain embedding backend that matches ``provider``.

    * ``"Gemini API"`` -> ``GoogleGenerativeAIEmbeddings`` using the
      ``models/gemini-embedding-2-preview`` model and ``GEMINI_API_KEY``.
    * ``"Hugging Face"`` / ``"OpenRouter"`` -> ``HuggingFaceEndpointEmbeddings``
      with ``BAAI/bge-m3`` and ``HF_TOKEN`` (v2 switched these providers from
      all-MiniLM-L6-v2 to BGE-M3 for multilingual coverage).
    * anything else -> local Ollama ``nomic-embed-text`` served at
      ``LOCAL_AI_BASE_URL`` (default ``http://127.0.0.1:11434``).

    Provider SDKs are imported lazily so only the selected backend has to be
    installed. Missing API keys are passed through as ``None`` unchanged.
    """
    if provider == "Gemini API":
        from langchain_google_genai import GoogleGenerativeAIEmbeddings

        gemini_key = os.environ.get("GEMINI_API_KEY")
        return GoogleGenerativeAIEmbeddings(
            model="models/gemini-embedding-2-preview",
            google_api_key=gemini_key,
        )

    if provider in ("Hugging Face", "OpenRouter"):
        from langchain_huggingface import HuggingFaceEndpointEmbeddings

        endpoint_settings = {
            "huggingfacehub_api_token": os.environ.get("HF_TOKEN"),
            "model": "BAAI/bge-m3",
        }
        return HuggingFaceEndpointEmbeddings(**endpoint_settings)

    # Fallback: local Ollama server.
    ollama_url = os.environ.get("LOCAL_AI_BASE_URL", "http://127.0.0.1:11434")
    return OllamaEmbeddings(base_url=ollama_url, model="nomic-embed-text")
220
 
221
 
222
+ # ── Reranker ─────────────────────────────────────────────────────────────────
223
def _rerank_documents(query: str, docs: list, top_k: int = 4) -> list:
    """Reorder ``docs`` by relevance to ``query`` and keep the best ``top_k``.

    Scores come from BAAI/bge-reranker-v2-m3 through the Hugging Face
    inference router. This is best-effort: when ``HF_TOKEN`` is unset, the
    request does not return HTTP 200, the payload has an unexpected shape, or
    anything raises, the first ``top_k`` documents are returned in their
    incoming order.

    Args:
        query: Text the documents are scored against.
        docs: Candidate documents; each must expose ``page_content``.
        top_k: Maximum number of documents to return.

    Returns:
        ``docs`` unchanged when it is empty or already holds at most ``top_k``
        items; otherwise a list of at most ``top_k`` documents.
    """
    # Nothing to trim or reorder when the pool already fits.
    if not docs or len(docs) <= top_k:
        return docs

    try:
        import requests

        hf_token = os.environ.get("HF_TOKEN")
        if not hf_token:
            return docs[:top_k]

        response = requests.post(
            "https://router.huggingface.co/hf-inference/models/BAAI/bge-reranker-v2-m3",
            headers={"Authorization": f"Bearer {hf_token}"},
            json={
                "inputs": query,
                "parameters": {
                    "texts": [doc.page_content for doc in docs],
                    "truncate": True,
                },
            },
            timeout=30,
        )
        if response.status_code != 200:
            return docs[:top_k]

        scores = response.json()
        if not (isinstance(scores, list) and len(scores) > 0):
            return docs[:top_k]

        # Two accepted shapes: [{"index": i, "score": s}, ...] or a flat list
        # of scores aligned with ``docs``.
        if isinstance(scores[0], dict):
            pairs = [(entry["score"], docs[entry["index"]]) for entry in scores]
        else:
            pairs = list(zip(scores, docs))
        best_first = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        return [doc for _, doc in best_first[:top_k]]
    except Exception as e:
        print(f"Reranking failed (falling back to top-k): {e}")
        return docs[:top_k]
250
+
251
+
252
+ # ── HyDE helper ──────────────────────────────────────────────────────────────
253
def _generate_hyde_query(question: str, llm) -> str:
    """Ask ``llm`` for a hypothetical answer document (HyDE) for ``question``.

    The ``{question}`` placeholder of ``_HYDE_TEMPLATE`` is filled by plain
    string replacement and the prompt is sent through ``llm.invoke``. Reply
    objects exposing ``.content`` contribute that attribute; any other reply
    is converted with ``str()``. Surrounding whitespace is stripped.

    Returns:
        The generated text, or ``question`` unchanged when ``llm`` has no
        ``invoke`` attribute or when any step raises.
    """
    try:
        prompt_text = _HYDE_TEMPLATE.replace("{question}", question)
        if not hasattr(llm, 'invoke'):
            return question
        reply = llm.invoke(prompt_text)
        generated = reply.content if hasattr(reply, 'content') else str(reply)
        return generated.strip()
    except Exception as e:
        # Best-effort expansion: retrieval proceeds with the raw question.
        print(f"HyDE generation failed (using original query): {e}")
        return question
266
+
267
+
268
+ # ── BM25 retriever builder ──────────────────────────────────────────────────
269
def _build_bm25_retriever(vectordb: Chroma, k: int = 6):
    """Build an in-memory BM25 retriever over every document stored in ``vectordb``.

    All texts and metadatas are read from the underlying Chroma collection and
    wrapped in LangChain ``Document`` objects. Entries whose metadata is
    ``None`` are given an empty dict, and entries with no text are skipped, so
    a single incomplete record no longer makes the whole build fail.

    Args:
        vectordb: Chroma vector store whose collection is indexed.
        k: Number of documents the BM25 retriever returns per query.

    Returns:
        A ``BM25Retriever``, or ``None`` when the collection holds no usable
        text or when anything raises (the error is printed, not propagated).
    """
    try:
        from langchain_classic.retrievers.bm25 import BM25Retriever
        from langchain_core.documents import Document

        # NOTE: reaches into the store's private collection handle, exactly as
        # before, to fetch everything in one call.
        stored = vectordb._collection.get(include=["documents", "metadatas"])
        texts = stored["documents"]
        if not texts:
            return None

        # Metadata may be missing for the whole result or for single rows.
        metadatas = stored["metadatas"] or [None] * len(texts)
        docs = [
            Document(page_content=text, metadata=meta or {})
            for text, meta in zip(texts, metadatas)
            if text
        ]
        if not docs:
            return None
        return BM25Retriever.from_documents(docs, k=k)
    except Exception as e:
        print(f"BM25 retriever build failed: {e}")
        return None
287
+
288
+
289
+ # ── Hybrid retriever (BM25 + Dense with Reciprocal Rank Fusion) ─────────────
290
class HybridRetriever:
    """
    Fuses BM25 (exact keyword match) and dense (semantic embedding) retrieval
    using weighted Reciprocal Rank Fusion (RRF).

    BM25 catches exact Coptic word-form matches (ⲥⲱⲧⲙ, ⲛⲟⲩⲧⲉ) that
    dense embeddings might miss. Dense catches semantic similarity
    (e.g., "God" → ⲛⲟⲩⲧⲉ) that BM25 misses for cross-lingual queries.

    This is a plain class exposing only ``invoke(query)``; it does not
    subclass any LangChain retriever base class.
    """

    def __init__(self, bm25_retriever, dense_retriever, bm25_weight=0.4, dense_weight=0.6, k=6):
        """Store the two retrievers, their fusion weights and the result size ``k``."""
        self.bm25 = bm25_retriever
        self.dense = dense_retriever
        self.bm25_weight = bm25_weight
        self.dense_weight = dense_weight
        self.k = k

    @staticmethod
    def _safe_invoke(retriever, query: str, label: str) -> list:
        """Run one retriever; on any failure print it and return an empty list."""
        try:
            found = retriever.invoke(query)
        except Exception as e:
            print(f"{label} retrieval failed: {e}")
            return []
        return list(found) if found else []

    def invoke(self, query: str) -> list:
        """Return up to ``k`` documents for ``query``, best fused score first.

        If only one retriever yields results, its first ``k`` documents are
        returned as-is; if neither does, the result is an empty list.
        Otherwise each document scores ``weight / (60 + rank)`` per list it
        appears in (rank starting at 1), and scores for identical
        ``page_content`` are summed.
        """
        bm25_docs = self._safe_invoke(self.bm25, query, "BM25")
        dense_docs = self._safe_invoke(self.dense, query, "Dense")
        if not bm25_docs or not dense_docs:
            return (bm25_docs or dense_docs)[:self.k]

        # Reciprocal Rank Fusion. Documents are keyed by their text itself,
        # not by hash(text), so a hash collision cannot merge distinct chunks.
        rrf_constant = 60
        fused_scores = {}
        doc_by_text = {}
        weighted_lists = (
            (self.bm25_weight, bm25_docs),
            (self.dense_weight, dense_docs),
        )
        for weight, ranked_docs in weighted_lists:
            for rank, doc in enumerate(ranked_docs, start=1):
                text = doc.page_content
                fused_scores[text] = fused_scores.get(text, 0.0) + weight / (rrf_constant + rank)
                # Keep the most recently seen object for this text.
                doc_by_text[text] = doc

        # sorted() is stable, so ties keep first-seen order.
        ordered = sorted(fused_scores, key=fused_scores.get, reverse=True)
        return [doc_by_text[text] for text in ordered[:self.k]]
342
+
343
+
344
  # ── Chain builder ──────────────────────────────────────────────────────────────
345
  def build_chain(model: str = "qwen3:14b",
346
  top_k: int = 6,
347
  temperature: float = 0.1,
348
+ provider: str = "Local AI",
349
+ use_hyde: bool = True,
350
+ use_reranking: bool = True,
351
+ use_hybrid: bool = True) -> tuple:
352
 
 
353
  embedder = get_embedder_for_provider(provider)
354
+ vectordb = Chroma(persist_directory="./chroma_db", embedding_function=embedder)
355
+
356
+ dense_retriever = vectordb.as_retriever(search_kwargs={"k": top_k})
357
+
358
+ if use_hybrid:
359
+ try:
360
+ bm25_retriever = _build_bm25_retriever(vectordb, k=top_k)
361
+ if bm25_retriever:
362
+ retriever = HybridRetriever(
363
+ bm25_retriever=bm25_retriever,
364
+ dense_retriever=dense_retriever,
365
+ bm25_weight=0.4, dense_weight=0.6, k=top_k,
366
+ )
367
+ else:
368
+ retriever = dense_retriever
369
+ except Exception as e:
370
+ print(f"Hybrid retrieval setup failed, falling back to dense: {e}")
371
+ retriever = dense_retriever
372
+ else:
373
+ retriever = dense_retriever
374
 
 
375
  if provider == "Gemini API":
376
  from langchain_google_genai import ChatGoogleGenerativeAI
377
  gemini_api_key = os.environ.get("GEMINI_API_KEY")
378
  if not gemini_api_key:
379
  raise ValueError("GEMINI_API_KEY is not set in the .env file.")
380
+ llm = ChatGoogleGenerativeAI(model=model, temperature=temperature, google_api_key=gemini_api_key)
 
 
 
 
381
  elif provider == "Hugging Face":
382
  from langchain_openai import ChatOpenAI
383
  hf_token = os.environ.get("HF_TOKEN")
384
  if not hf_token:
385
  raise ValueError("HF_TOKEN is not set in the .env file.")
386
+ llm = ChatOpenAI(model=model, temperature=temperature, api_key=hf_token, base_url="https://router.huggingface.co/v1")
 
 
 
 
 
387
  elif provider == "OpenRouter":
 
388
  from langchain_openai import ChatOpenAI
389
  openrouter_key = os.environ.get("OPENROUTER_API_KEY")
390
  if not openrouter_key:
391
  raise ValueError("OPENROUTER_API_KEY is not set. Add it as a Space secret or in your .env file.")
392
  llm = ChatOpenAI(
393
+ model=model, temperature=temperature, api_key=openrouter_key,
 
 
394
  base_url="https://openrouter.ai/api/v1",
395
  default_headers={
396
  "HTTP-Referer": "https://huggingface.co/spaces/georgtawadrous/thoth_app",
 
398
  },
399
  )
400
  else:
 
401
  base_url = os.environ.get("LOCAL_AI_BASE_URL", "http://127.0.0.1:11434")
402
+ llm = OllamaLLM(model=model, temperature=temperature, num_ctx=4096, base_url=base_url)
403
+
 
 
 
 
 
404
  chain = RetrievalQA.from_chain_type(
405
+ llm=llm, retriever=dense_retriever, chain_type="stuff",
406
+ chain_type_kwargs={"prompt": _prompt}, return_source_documents=True,
 
 
 
407
  )
408
 
 
409
  try:
410
  feedback_store = Chroma(
411
+ persist_directory="./chroma_db", embedding_function=embedder,
 
412
  collection_name="Shenute_feedback",
413
  )
414
  feedback_retriever = feedback_store.as_retriever(search_kwargs={"k": 2})
415
  except Exception:
416
  feedback_retriever = None
417
 
418
+ return chain, feedback_retriever, llm, retriever
419
 
420
 
421
  def query_Shenute(question: str,
422
  model: str = "qwen3:14b",
423
  top_k: int = 6,
424
  temperature: float = 0.1,
425
+ provider: str = "Local AI",
426
+ use_hyde: bool = True,
427
+ use_reranking: bool = True,
428
+ use_hybrid: bool = True) -> dict:
429
+ chain, feedback_retriever, llm, retriever = build_chain(
430
+ model, top_k, temperature, provider, use_hyde, use_reranking, use_hybrid
431
+ )
 
 
432
 
433
  feedback_context = ""
434
  if feedback_retriever:
 
443
  if feedback_context:
444
  augmented_question = f"{question}\n\n[SYSTEM NOTE - PAST USER CORRECTIONS TO APPLY IF RELEVANT:\n{feedback_context}]"
445
 
446
+ # HyDE: Generate hypothetical document for better retrieval
447
+ retrieval_query = augmented_question
448
+ hyde_used = False
449
+ if use_hyde and provider != "Local AI":
450
+ try:
451
+ hyde_doc = _generate_hyde_query(question, llm)
452
+ if hyde_doc and len(hyde_doc) > 20:
453
+ retrieval_query = f"{augmented_question}\n\n{hyde_doc}"
454
+ hyde_used = True
455
+ except Exception as e:
456
+ print(f"HyDE failed: {e}")
457
+
458
+ # Retrieve with optional reranking
459
+ reranked = False
460
+ if use_reranking and provider != "Local AI":
461
+ try:
462
+ initial_docs = retriever.invoke(retrieval_query if hyde_used else augmented_question)
463
+ rerank_k = min(top_k, 4)
464
+ reranked_docs = _rerank_documents(question, initial_docs, top_k=rerank_k)
465
+ if reranked_docs:
466
+ reranked = True
467
+ context = "\n\n---\n\n".join([doc.page_content for doc in reranked_docs])
468
+ filled_prompt = Shenute_SYSTEM.replace("{context}", context).replace("{question}", augmented_question)
469
+ result_text = llm.invoke(filled_prompt)
470
+ if hasattr(result_text, 'content'):
471
+ result_text = result_text.content
472
+ answer = str(result_text)
473
+ source_documents = reranked_docs
474
+ else:
475
+ result = chain.invoke({"query": augmented_question})
476
+ answer = result["result"]
477
+ source_documents = result.get("source_documents", [])
478
+ except Exception as e:
479
+ print(f"Reranking pipeline failed, falling back to standard chain: {e}")
480
+ result = chain.invoke({"query": augmented_question})
481
+ answer = result["result"]
482
+ source_documents = result.get("source_documents", [])
483
+ else:
484
+ result = chain.invoke({"query": augmented_question})
485
+ answer = result["result"]
486
+ source_documents = result.get("source_documents", [])
487
 
 
 
488
  answer = _strip_think_tags(answer)
489
  answer = _add_greek_warning(answer)
490
 
491
  sources = []
492
+ for doc in source_documents:
493
  sources.append({
494
  "text": doc.page_content[:300],
495
  "source": doc.metadata.get("source", "Unknown"),
 
499
  return {
500
  "answer": answer,
501
  "sources": sources,
502
+ "retrieval_info": {
503
+ "hyde_used": hyde_used,
504
+ "reranked": reranked,
505
+ "hybrid": use_hybrid,
506
+ },
507
  }