Spaces:
Running
Running
v2: BGE-M3 embeddings, hybrid BM25+Dense retrieval, HyDE, cross-encoder reranking
Browse files- rag/chain.py +257 -98
rag/chain.py
CHANGED
|
@@ -1,5 +1,12 @@
|
|
| 1 |
"""
|
| 2 |
Core RAG chain — wraps ChromaDB retrieval + LLM (Ollama / Gemini / HF / OpenRouter).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
"""
|
| 4 |
from __future__ import annotations
|
| 5 |
import re
|
|
@@ -22,53 +29,38 @@ load_dotenv(override=True)
|
|
| 22 |
|
| 23 |
# Greek Unicode ranges (excluding characters shared with Coptic)
|
| 24 |
_GREEK_ONLY_RANGES = set()
|
| 25 |
-
# Greek and Coptic block: U+0370–U+03FF
|
| 26 |
-
# Greek Extended block: U+1F00–U+1FFF
|
| 27 |
-
# These contain polytonic Greek, accented forms, archaic letters that are NOT Coptic
|
| 28 |
for cp in range(0x0370, 0x0400):
|
| 29 |
_GREEK_ONLY_RANGES.add(cp)
|
| 30 |
for cp in range(0x1F00, 0x2000):
|
| 31 |
_GREEK_ONLY_RANGES.add(cp)
|
| 32 |
|
| 33 |
-
# Coptic Unicode block: U+2C80–U+2CFF (dedicated Coptic characters)
|
| 34 |
_COPTIC_BLOCK = set(range(0x2C80, 0x2D00))
|
| 35 |
|
| 36 |
-
# Characters shared between Greek and Coptic scripts (visually identical but
|
| 37 |
-
# Coptic reuses Greek codepoints for these). We should NOT flag these as "Greek".
|
| 38 |
-
# Common shared: Α-Ω / α-ω base letters that Coptic uses (U+0391-U+03C9 subset)
|
| 39 |
-
# Coptic uses Greek codepoints for: α β γ δ ε ζ η θ ι κ λ μ ν ξ ο π ρ σ/ς τ υ φ χ ψ ω
|
| 40 |
_SHARED_GREEK_COPTIC = set()
|
| 41 |
for ch in "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρστυφχψως":
|
| 42 |
_SHARED_GREEK_COPTIC.add(ord(ch))
|
| 43 |
|
| 44 |
-
# Coptic-specific letters (Demotic-derived) that confirm Coptic, not Greek
|
| 45 |
_COPTIC_SPECIFIC = set()
|
| 46 |
for ch in "ϣϩϫϭϯϥⲁⲃⲅⲇⲉⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱϣϩϫϭϯϥⲋⲍⲹⳉⳋⳍⳏⳑⳓⳕⳗⳙⳛ":
|
| 47 |
_COPTIC_SPECIFIC.add(ord(ch))
|
| 48 |
|
| 49 |
-
# Common Greek words that should NOT appear in Coptic text
|
| 50 |
_GREEK_WORD_PATTERNS = re.compile(
|
| 51 |
r'\b(τοῦ|τῆς|τῶν|τόν|τήν|καί|ἐν|εἰς|ἐκ|ἀπό|πρός|μετά|κατά|περί|ὑπό|παρά|διά|ἐπί'
|
| 52 |
-
r'|ὁ|ἡ|τό|οἱ|αἱ|τά'
|
| 53 |
-
r'|ἐστί[ν]?|εἶναι|λέγει|λέγων|ἔχει|ἔχων'
|
| 54 |
-
r'|αὐτός|αὐτή|αὐτό|αὐτοῦ|αὐτῆς'
|
| 55 |
-
r'|θεός|θεοῦ|κύριος|κυρίου|λόγος|λόγου'
|
| 56 |
r'|ἄνθρωπος|ἀνθρώπου|κόσμος|κόσμου'
|
| 57 |
-
r'|οὐ|οὐκ|μή|γάρ|δέ|ἀλλά|ὅτι|ἵνα|ὡς'
|
| 58 |
r')\b',
|
| 59 |
re.UNICODE
|
| 60 |
)
|
| 61 |
|
| 62 |
|
| 63 |
def _count_greek_indicators(text: str) -> dict:
|
| 64 |
-
"""
|
| 65 |
-
Analyze text for Greek vs Coptic script usage.
|
| 66 |
-
Returns counts of greek-only chars, coptic-specific chars, and greek word matches.
|
| 67 |
-
"""
|
| 68 |
greek_only_count = 0
|
| 69 |
coptic_specific_count = 0
|
| 70 |
shared_count = 0
|
| 71 |
-
|
| 72 |
for ch in text:
|
| 73 |
cp = ord(ch)
|
| 74 |
if cp in _COPTIC_SPECIFIC or cp in _COPTIC_BLOCK:
|
|
@@ -77,9 +69,7 @@ def _count_greek_indicators(text: str) -> dict:
|
|
| 77 |
greek_only_count += 1
|
| 78 |
elif cp in _SHARED_GREEK_COPTIC:
|
| 79 |
shared_count += 1
|
| 80 |
-
|
| 81 |
greek_words = _GREEK_WORD_PATTERNS.findall(text)
|
| 82 |
-
|
| 83 |
return {
|
| 84 |
"greek_only_chars": greek_only_count,
|
| 85 |
"coptic_specific_chars": coptic_specific_count,
|
|
@@ -90,15 +80,9 @@ def _count_greek_indicators(text: str) -> dict:
|
|
| 90 |
|
| 91 |
|
| 92 |
def _add_greek_warning(answer: str) -> str:
|
| 93 |
-
"""
|
| 94 |
-
If the answer contains significant Greek text, append a warning.
|
| 95 |
-
"""
|
| 96 |
analysis = _count_greek_indicators(answer)
|
| 97 |
-
|
| 98 |
-
# If there are Greek-only characters or Greek words detected
|
| 99 |
has_greek_words = analysis["greek_word_count"] > 0
|
| 100 |
has_greek_chars = analysis["greek_only_chars"] > 3
|
| 101 |
-
|
| 102 |
if has_greek_words or has_greek_chars:
|
| 103 |
warning_parts = []
|
| 104 |
if has_greek_words:
|
|
@@ -106,7 +90,6 @@ def _add_greek_warning(answer: str) -> str:
|
|
| 106 |
warning_parts.append(f"Greek words detected: {sample}")
|
| 107 |
if has_greek_chars:
|
| 108 |
warning_parts.append(f"{analysis['greek_only_chars']} Greek-only characters found")
|
| 109 |
-
|
| 110 |
warning = (
|
| 111 |
"\n\n---\n"
|
| 112 |
"⚠️ **Greek Content Warning**: This response may contain Greek text "
|
|
@@ -116,7 +99,6 @@ def _add_greek_warning(answer: str) -> str:
|
|
| 116 |
"[CDO](https://coptic-dictionary.org) or [Coptic SCRIPTORIUM](https://copticscriptorium.org)."
|
| 117 |
)
|
| 118 |
return answer + warning
|
| 119 |
-
|
| 120 |
return answer
|
| 121 |
|
| 122 |
|
|
@@ -171,6 +153,17 @@ INSTRUCTIONS:
|
|
| 171 |
Unicode characters (ⲁⲃⲅⲇⲉⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱ) and NOT Greek Unicode
|
| 172 |
characters (αβγδεζηθικλμνξοπρστυφχψω).
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
RETRIEVED KNOWLEDGE BASE CONTEXT:
|
| 175 |
{context}
|
| 176 |
|
|
@@ -184,84 +177,220 @@ _prompt = PromptTemplate(
|
|
| 184 |
template=Shenute_SYSTEM,
|
| 185 |
)
|
| 186 |
|
| 187 |
-
# ──
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
def get_embedder_for_provider(provider: str):
|
| 189 |
"""Return a LangChain embedder for the given provider.
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
|
|
|
|
|
|
| 193 |
if provider == "Gemini API":
|
| 194 |
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 195 |
return GoogleGenerativeAIEmbeddings(
|
| 196 |
model="models/gemini-embedding-2-preview",
|
| 197 |
google_api_key=os.environ.get("GEMINI_API_KEY"),
|
| 198 |
)
|
| 199 |
-
elif provider
|
| 200 |
from langchain_huggingface import HuggingFaceEndpointEmbeddings
|
| 201 |
return HuggingFaceEndpointEmbeddings(
|
| 202 |
huggingfacehub_api_token=os.environ.get("HF_TOKEN"),
|
| 203 |
-
model="
|
| 204 |
-
)
|
| 205 |
-
elif provider == "OpenRouter":
|
| 206 |
-
# OpenRouter has no embedding API — use HF embeddings as fallback
|
| 207 |
-
from langchain_huggingface import HuggingFaceEndpointEmbeddings
|
| 208 |
-
return HuggingFaceEndpointEmbeddings(
|
| 209 |
-
huggingfacehub_api_token=os.environ.get("HF_TOKEN"),
|
| 210 |
-
model="sentence-transformers/all-MiniLM-L6-v2",
|
| 211 |
)
|
| 212 |
else:
|
| 213 |
-
# Local AI (Ollama)
|
| 214 |
base_url = os.environ.get("LOCAL_AI_BASE_URL", "http://127.0.0.1:11434")
|
| 215 |
return OllamaEmbeddings(model="nomic-embed-text", base_url=base_url)
|
| 216 |
|
| 217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
# ── Chain builder ──────────────────────────────────────────────────────────────
|
| 219 |
def build_chain(model: str = "qwen3:14b",
|
| 220 |
top_k: int = 6,
|
| 221 |
temperature: float = 0.1,
|
| 222 |
-
provider: str = "Local AI"
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
-
# 1. Setup Embedder & VectorDB
|
| 225 |
embedder = get_embedder_for_provider(provider)
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
-
# 2. Setup chosen LLM
|
| 234 |
if provider == "Gemini API":
|
| 235 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 236 |
gemini_api_key = os.environ.get("GEMINI_API_KEY")
|
| 237 |
if not gemini_api_key:
|
| 238 |
raise ValueError("GEMINI_API_KEY is not set in the .env file.")
|
| 239 |
-
llm = ChatGoogleGenerativeAI(
|
| 240 |
-
model=model,
|
| 241 |
-
temperature=temperature,
|
| 242 |
-
google_api_key=gemini_api_key,
|
| 243 |
-
)
|
| 244 |
elif provider == "Hugging Face":
|
| 245 |
from langchain_openai import ChatOpenAI
|
| 246 |
hf_token = os.environ.get("HF_TOKEN")
|
| 247 |
if not hf_token:
|
| 248 |
raise ValueError("HF_TOKEN is not set in the .env file.")
|
| 249 |
-
llm = ChatOpenAI(
|
| 250 |
-
model=model,
|
| 251 |
-
temperature=temperature,
|
| 252 |
-
api_key=hf_token,
|
| 253 |
-
base_url="https://router.huggingface.co/v1",
|
| 254 |
-
)
|
| 255 |
elif provider == "OpenRouter":
|
| 256 |
-
# OpenRouter exposes an OpenAI-compatible API at https://openrouter.ai/api/v1
|
| 257 |
from langchain_openai import ChatOpenAI
|
| 258 |
openrouter_key = os.environ.get("OPENROUTER_API_KEY")
|
| 259 |
if not openrouter_key:
|
| 260 |
raise ValueError("OPENROUTER_API_KEY is not set. Add it as a Space secret or in your .env file.")
|
| 261 |
llm = ChatOpenAI(
|
| 262 |
-
model=model,
|
| 263 |
-
temperature=temperature,
|
| 264 |
-
api_key=openrouter_key,
|
| 265 |
base_url="https://openrouter.ai/api/v1",
|
| 266 |
default_headers={
|
| 267 |
"HTTP-Referer": "https://huggingface.co/spaces/georgtawadrous/thoth_app",
|
|
@@ -269,50 +398,37 @@ def build_chain(model: str = "qwen3:14b",
|
|
| 269 |
},
|
| 270 |
)
|
| 271 |
else:
|
| 272 |
-
# Default to Local AI (Ollama)
|
| 273 |
base_url = os.environ.get("LOCAL_AI_BASE_URL", "http://127.0.0.1:11434")
|
| 274 |
-
llm = OllamaLLM(
|
| 275 |
-
|
| 276 |
-
temperature=temperature,
|
| 277 |
-
num_ctx=4096,
|
| 278 |
-
base_url=base_url,
|
| 279 |
-
)
|
| 280 |
-
|
| 281 |
chain = RetrievalQA.from_chain_type(
|
| 282 |
-
llm=llm,
|
| 283 |
-
|
| 284 |
-
chain_type="stuff",
|
| 285 |
-
chain_type_kwargs={"prompt": _prompt},
|
| 286 |
-
return_source_documents=True,
|
| 287 |
)
|
| 288 |
|
| 289 |
-
# Optional Feedback Retriever
|
| 290 |
try:
|
| 291 |
feedback_store = Chroma(
|
| 292 |
-
persist_directory="./chroma_db",
|
| 293 |
-
embedding_function=embedder,
|
| 294 |
collection_name="Shenute_feedback",
|
| 295 |
)
|
| 296 |
feedback_retriever = feedback_store.as_retriever(search_kwargs={"k": 2})
|
| 297 |
except Exception:
|
| 298 |
feedback_retriever = None
|
| 299 |
|
| 300 |
-
return chain, feedback_retriever
|
| 301 |
|
| 302 |
|
| 303 |
def query_Shenute(question: str,
|
| 304 |
model: str = "qwen3:14b",
|
| 305 |
top_k: int = 6,
|
| 306 |
temperature: float = 0.1,
|
| 307 |
-
provider: str = "Local AI"
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
"""
|
| 315 |
-
chain, feedback_retriever = build_chain(model, top_k, temperature, provider)
|
| 316 |
|
| 317 |
feedback_context = ""
|
| 318 |
if feedback_retriever:
|
|
@@ -327,15 +443,53 @@ def query_Shenute(question: str,
|
|
| 327 |
if feedback_context:
|
| 328 |
augmented_question = f"{question}\n\n[SYSTEM NOTE - PAST USER CORRECTIONS TO APPLY IF RELEVANT:\n{feedback_context}]"
|
| 329 |
|
| 330 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
-
# Post-process: clean up think tags and check for Greek hallucination
|
| 333 |
-
answer = result["result"]
|
| 334 |
answer = _strip_think_tags(answer)
|
| 335 |
answer = _add_greek_warning(answer)
|
| 336 |
|
| 337 |
sources = []
|
| 338 |
-
for doc in
|
| 339 |
sources.append({
|
| 340 |
"text": doc.page_content[:300],
|
| 341 |
"source": doc.metadata.get("source", "Unknown"),
|
|
@@ -345,4 +499,9 @@ def query_Shenute(question: str,
|
|
| 345 |
return {
|
| 346 |
"answer": answer,
|
| 347 |
"sources": sources,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
}
|
|
|
|
| 1 |
"""
|
| 2 |
Core RAG chain — wraps ChromaDB retrieval + LLM (Ollama / Gemini / HF / OpenRouter).
|
| 3 |
+
|
| 4 |
+
v2 improvements:
|
| 5 |
+
- BGE-M3 multilingual embeddings (replaces all-MiniLM-L6-v2)
|
| 6 |
+
- Hybrid BM25 + Dense retrieval with Reciprocal Rank Fusion
|
| 7 |
+
- Cross-encoder reranking with BGE-reranker-v2-m3
|
| 8 |
+
- HyDE (Hypothetical Document Embeddings) for query expansion
|
| 9 |
+
- Reduced context window (top-4 after reranking instead of top-6)
|
| 10 |
"""
|
| 11 |
from __future__ import annotations
|
| 12 |
import re
|
|
|
|
| 29 |
|
| 30 |
# ── Script tables used to tell Greek output apart from Coptic output ─────────
#
# Every code point of the "Greek and Coptic" block (U+0370–U+03FF) and the
# "Greek Extended" block (U+1F00–U+1FFF).
# NOTE(review): despite the name, this set still CONTAINS the base letters
# listed in _SHARED_GREEK_COPTIC and the Coptic letters ϣ ϩ ϫ ϭ ϯ ϥ (they all
# live in U+0370–U+03FF). _count_greek_indicators tests the Coptic tables
# first; whether it also excludes the shared letters before counting a
# character as "Greek-only" is not visible here — confirm against that function.
_GREEK_ONLY_RANGES = set(range(0x0370, 0x0400)) | set(range(0x1F00, 0x2000))

# Dedicated Coptic Unicode block: U+2C80–U+2CFF.
_COPTIC_BLOCK = set(range(0x2C80, 0x2D00))

# Base Greek letters (Α-Ω, α-ω and final sigma) that Coptic text may also be
# written with; these must not, on their own, be reported as Greek.
_SHARED_GREEK_COPTIC = {
    ord(ch) for ch in "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρστυφχψως"
}

# Letters that positively identify Coptic: the Demotic-derived letters encoded
# in the Greek block plus letters from the dedicated Coptic block. The literal
# repeats a few characters; building a set makes the repetition harmless.
_COPTIC_SPECIFIC = {
    ord(ch) for ch in "ϣϩϫϭϯϥⲁⲃⲅⲇⲉⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱϣϩϫϭϯϥⲋⲍⲹⳉⳋⳍⳏⳑⳓⳕⳗⳙⳛ"
}

# Whole-word matches for common polytonic Greek words (articles, prepositions,
# particles, frequent verbs/nouns) that should not appear in Coptic text.
# Group 1 is the matched word, so findall() returns the words themselves.
_GREEK_WORD_PATTERNS = re.compile(
    r'\b(τοῦ|τῆς|τῶν|τόν|τήν|καί|ἐν|εἰς|ἐκ|ἀπό|πρός|μετά|κατά|περί|ὑπό|παρά|διά|ἐπί'
    r'|ὁ|ἡ|τό|οἱ|αἱ|τά'
    r'|ἐστί[ν]?|εἶναι|λέγει|λέγων|ἔχει|ἔχων'
    r'|αὐτός|αὐτή|αὐτό|αὐτοῦ|αὐτῆς'
    r'|θεός|θεοῦ|κύριος|κυρίου|λόγος|λόγου'
    r'|ἄνθρωπος|ἀνθρώπου|κόσμος|κόσμου'
    r'|οὐ|οὐκ|μή|γάρ|δέ|ἀλλά|ὅτι|ἵνα|ὡς'
    r')\b',
    re.UNICODE
)
|
| 58 |
|
| 59 |
|
| 60 |
def _count_greek_indicators(text: str) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
greek_only_count = 0
|
| 62 |
coptic_specific_count = 0
|
| 63 |
shared_count = 0
|
|
|
|
| 64 |
for ch in text:
|
| 65 |
cp = ord(ch)
|
| 66 |
if cp in _COPTIC_SPECIFIC or cp in _COPTIC_BLOCK:
|
|
|
|
| 69 |
greek_only_count += 1
|
| 70 |
elif cp in _SHARED_GREEK_COPTIC:
|
| 71 |
shared_count += 1
|
|
|
|
| 72 |
greek_words = _GREEK_WORD_PATTERNS.findall(text)
|
|
|
|
| 73 |
return {
|
| 74 |
"greek_only_chars": greek_only_count,
|
| 75 |
"coptic_specific_chars": coptic_specific_count,
|
|
|
|
| 80 |
|
| 81 |
|
| 82 |
def _add_greek_warning(answer: str) -> str:
|
|
|
|
|
|
|
|
|
|
| 83 |
analysis = _count_greek_indicators(answer)
|
|
|
|
|
|
|
| 84 |
has_greek_words = analysis["greek_word_count"] > 0
|
| 85 |
has_greek_chars = analysis["greek_only_chars"] > 3
|
|
|
|
| 86 |
if has_greek_words or has_greek_chars:
|
| 87 |
warning_parts = []
|
| 88 |
if has_greek_words:
|
|
|
|
| 90 |
warning_parts.append(f"Greek words detected: {sample}")
|
| 91 |
if has_greek_chars:
|
| 92 |
warning_parts.append(f"{analysis['greek_only_chars']} Greek-only characters found")
|
|
|
|
| 93 |
warning = (
|
| 94 |
"\n\n---\n"
|
| 95 |
"⚠️ **Greek Content Warning**: This response may contain Greek text "
|
|
|
|
| 99 |
"[CDO](https://coptic-dictionary.org) or [Coptic SCRIPTORIUM](https://copticscriptorium.org)."
|
| 100 |
)
|
| 101 |
return answer + warning
|
|
|
|
| 102 |
return answer
|
| 103 |
|
| 104 |
|
|
|
|
| 153 |
Unicode characters (ⲁⲃⲅⲇⲉⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱ) and NOT Greek Unicode
|
| 154 |
characters (αβγδεζηθικλμνξοπρστυφχψω).
|
| 155 |
|
| 156 |
+
ANSWER QUALITY RULES:
|
| 157 |
+
- Ground your answer ONLY in the retrieved context below. Do NOT fabricate
|
| 158 |
+
dictionary entries, paradigm tables, or grammatical forms from memory.
|
| 159 |
+
- If the retrieved context does not contain information to answer the question,
|
| 160 |
+
say so clearly. Do NOT invent plausible-sounding answers.
|
| 161 |
+
- When multiple retrieved chunks contain relevant information, SYNTHESIZE them
|
| 162 |
+
into a coherent answer rather than repeating each chunk separately.
|
| 163 |
+
- Prefer information from lexicon entries (CCL, Faulkner) over grammar PDFs
|
| 164 |
+
for vocabulary questions, and grammar sources (Allen, Layton, Lambdin) for
|
| 165 |
+
structural/syntactic questions.
|
| 166 |
+
|
| 167 |
RETRIEVED KNOWLEDGE BASE CONTEXT:
|
| 168 |
{context}
|
| 169 |
|
|
|
|
| 177 |
template=Shenute_SYSTEM,
|
| 178 |
)
|
| 179 |
|
| 180 |
+
# ── HyDE prompt for Coptic/Egyptian query expansion ─────────────────────────
# HyDE (Hypothetical Document Embeddings): the LLM is asked to draft the kind
# of reference entry that would answer the question, and that draft is used as
# extra retrieval text. The only placeholder is {question}.
_HYDE_TEMPLATE = """You are an expert in Coptic linguistics and Ancient Egyptian.
Given the following question, write a short hypothetical dictionary entry or grammar
explanation that would answer it. Write as if it were an entry in Crum's Coptic Dictionary,
Faulkner's Middle Egyptian Dictionary, or Layton's Coptic Grammar. Include the relevant
Coptic or Egyptian terms in proper Unicode script.

Question: {question}

Hypothetical entry:"""

# The same text wrapped as a LangChain prompt with a single input variable.
# NOTE(review): _generate_hyde_query substitutes into _HYDE_TEMPLATE directly
# and does not use this object; it is kept for callers outside this view.
_hyde_prompt = PromptTemplate(
    input_variables=["question"],
    template=_HYDE_TEMPLATE,
)
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
# ── Embedder helper ──────────────────────────────────────────────────────────
|
| 198 |
def get_embedder_for_provider(provider: str):
    """Return a LangChain embedder for the given provider.

    v2: Uses BGE-M3 multilingual embeddings for HF and OpenRouter providers
    instead of all-MiniLM-L6-v2 (which is English-only and blind to Coptic script).
    BGE-M3 supports 100+ languages, 1024-dim, 8192-token context.

    Args:
        provider: UI provider label. "Gemini API" selects Google embeddings;
            "Hugging Face" and "OpenRouter" both select the Hugging Face
            endpoint (so HF_TOKEN is read for OpenRouter as well); any other
            value selects the local Ollama embedder.

    Returns:
        An embeddings object for the selected backend.

    NOTE(review): each branch uses a different embedding model, and the chain
    opens the same ./chroma_db for every provider. Presumably the persisted
    index must have been built with the matching embedder — confirm before
    switching providers or models against an existing index.
    """
    if provider == "Gemini API":
        from langchain_google_genai import GoogleGenerativeAIEmbeddings

        return GoogleGenerativeAIEmbeddings(
            model="models/gemini-embedding-2-preview",
            google_api_key=os.environ.get("GEMINI_API_KEY"),
        )

    if provider in ("Hugging Face", "OpenRouter"):
        from langchain_huggingface import HuggingFaceEndpointEmbeddings

        return HuggingFaceEndpointEmbeddings(
            huggingfacehub_api_token=os.environ.get("HF_TOKEN"),
            model="BAAI/bge-m3",
        )

    # Anything else is treated as the local Ollama server.
    base_url = os.environ.get("LOCAL_AI_BASE_URL", "http://127.0.0.1:11434")
    return OllamaEmbeddings(model="nomic-embed-text", base_url=base_url)
|
| 220 |
|
| 221 |
|
| 222 |
+
# ── Reranker ─────────────────────────────────────────────────────────────────
|
| 223 |
+
def _rerank_documents(query: str, docs: list, top_k: int = 4) -> list:
    """Rerank using BAAI/bge-reranker-v2-m3 via HF Inference API.

    Best-effort: any failure (no token, HTTP error, unexpected payload,
    network exception) falls back to the first ``top_k`` documents in their
    incoming order, so the caller always gets a usable list.

    Args:
        query: Text the documents are scored against.
        docs: Candidate documents; each must expose ``page_content``.
        top_k: Maximum number of documents to return.

    Returns:
        ``docs`` itself when it is empty or already has at most ``top_k``
        items; otherwise up to ``top_k`` documents, ordered by descending
        reranker score when scoring succeeded.
    """
    if not docs or len(docs) <= top_k:
        return docs

    fallback = docs[:top_k]

    # Check the token before importing requests: without it no call is made.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        return fallback

    try:
        import requests

        api_url = "https://router.huggingface.co/hf-inference/models/BAAI/bge-reranker-v2-m3"
        headers = {"Authorization": f"Bearer {hf_token}"}
        texts = [doc.page_content for doc in docs]
        payload = {"inputs": query, "parameters": {"texts": texts, "truncate": True}}
        response = requests.post(api_url, headers=headers, json=payload, timeout=30)

        if response.status_code != 200:
            # Previously this path was silent, which hid auth/quota problems.
            print(f"Reranking returned HTTP {response.status_code} (falling back to top-k)")
            return fallback

        scores = response.json()
        if not isinstance(scores, list) or not scores:
            return fallback

        if isinstance(scores[0], dict):
            # Shape handled: [{"index": i, "score": s}, ...]
            scored_docs = [(s["score"], docs[s["index"]]) for s in scores]
        elif len(scores) == len(docs):
            # Shape handled: flat list of scores aligned with docs.
            scored_docs = list(zip(scores, docs))
        else:
            # A flat list of a different length cannot be aligned with docs;
            # zipping it would silently pair scores with the wrong documents.
            print("Reranking returned a score list of unexpected length (falling back to top-k)")
            return fallback

        scored_docs.sort(key=lambda pair: pair[0], reverse=True)
        return [doc for _, doc in scored_docs[:top_k]]
    except Exception as e:
        # Deliberate catch-all: reranking is optional and must never break a query.
        print(f"Reranking failed (falling back to top-k): {e}")
        return fallback
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
# ── HyDE helper ──────────────────────────────────────────────────────────────
|
| 253 |
+
def _generate_hyde_query(question: str, llm) -> str:
    """Generate a hypothetical document using HyDE for better retrieval.

    The question is substituted into ``_HYDE_TEMPLATE`` with ``str.replace``
    (not ``str.format``), so braces inside the question are left alone.

    Args:
        question: The user's question.
        llm: Model object. If it has an ``invoke`` method it is called with
            the filled template; the reply's ``content`` attribute is used
            when present, otherwise ``str(reply)``.

    Returns:
        The stripped model output, or ``question`` unchanged when ``llm`` has
        no ``invoke`` method or anything raises. Callers cannot tell that
        fallback apart from a real result except by comparing with
        ``question``.
    """
    try:
        hyde_text = _HYDE_TEMPLATE.replace("{question}", question)
        if not hasattr(llm, "invoke"):
            return question

        result = llm.invoke(hyde_text)
        # Chat models return a message object; completion models return text.
        if hasattr(result, "content"):
            return result.content.strip()
        return str(result).strip()
    except Exception as e:
        # Best-effort: HyDE is an optional enhancement, never a hard failure.
        print(f"HyDE generation failed (using original query): {e}")
        return question
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
# ── BM25 retriever builder ──────────────────────────────────────────────────
|
| 269 |
+
def _build_bm25_retriever(vectordb: Chroma, k: int = 6):
    """Build a BM25 retriever from existing ChromaDB documents.

    Reads every stored document and its metadata from the collection behind
    ``vectordb`` and indexes them with BM25.

    Args:
        vectordb: Vector store whose underlying collection is read.
        k: Number of documents the BM25 retriever should return per query.

    Returns:
        A ``BM25Retriever``, or ``None`` when the collection holds no usable
        documents or anything fails (the failure is printed, not raised).
    """
    try:
        # NOTE(review): _collection is a private attribute of the wrapper.
        result = vectordb._collection.get(include=["documents", "metadatas"])
        texts = result["documents"]
        if not texts:
            return None
        metadatas = result["metadatas"] or []

        from langchain_classic.retrievers.bm25 import BM25Retriever
        from langchain_core.documents import Document

        docs = []
        for i, text in enumerate(texts):
            if text is None:
                # Entry stored without text: nothing for BM25 to index.
                continue
            meta = metadatas[i] if i < len(metadatas) else None
            # A missing (None) metadata entry would make Document() reject the
            # record and abort the whole build; substitute an empty dict.
            docs.append(Document(page_content=text, metadata=meta or {}))

        if not docs:
            return None
        return BM25Retriever.from_documents(docs, k=k)
    except Exception as e:
        # Best-effort: the caller falls back to dense-only retrieval on None.
        print(f"BM25 retriever build failed: {e}")
        return None
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
# ── Hybrid retriever (BM25 + Dense with Reciprocal Rank Fusion) ─────────────
|
| 290 |
+
class HybridRetriever:
    """
    Fuses BM25 (exact keyword match) and dense (semantic embedding) retrieval
    using Reciprocal Rank Fusion (RRF).

    BM25 catches exact Coptic word-form matches (ⲥⲱⲧⲙ, ⲛⲟⲩⲧⲉ) that
    dense embeddings might miss. Dense catches semantic similarity
    (e.g., "God" → ⲛⲟⲩⲧⲉ) that BM25 misses for cross-lingual queries.

    Each retriever contributes ``weight / (60 + rank)`` (rank starting at 1)
    for every document it returns; contributions for the same text are summed.
    """

    def __init__(self, bm25_retriever, dense_retriever, bm25_weight=0.4, dense_weight=0.6, k=6):
        """Store the two retrievers, their fusion weights and the result size ``k``."""
        self.bm25 = bm25_retriever
        self.dense = dense_retriever
        self.bm25_weight = bm25_weight
        self.dense_weight = dense_weight
        self.k = k

    @staticmethod
    def _safe_invoke(retriever, query: str, label: str) -> list:
        """Run one retriever; on any error print it and return an empty list."""
        try:
            return retriever.invoke(query)
        except Exception as e:
            print(f"{label} retrieval failed: {e}")
            return []

    def invoke(self, query: str) -> list:
        """Return up to ``k`` documents for ``query``, best fused score first.

        If only one retriever yields results, its first ``k`` documents are
        returned unchanged; if neither does, the result is an empty list.
        """
        bm25_docs = self._safe_invoke(self.bm25, query, "BM25")
        dense_docs = self._safe_invoke(self.dense, query, "Dense")

        if not bm25_docs and not dense_docs:
            return []
        if not bm25_docs:
            return dense_docs[:self.k]
        if not dense_docs:
            return bm25_docs[:self.k]

        # Reciprocal Rank Fusion
        rrf_constant = 60
        fused = {}
        rankings = ((self.bm25_weight, bm25_docs), (self.dense_weight, dense_docs))
        for weight, ranking in rankings:
            for rank, doc in enumerate(ranking):
                # Key on the text itself, not hash(text): two different texts
                # with the same hash must stay separate documents.
                key = doc.page_content
                previous = fused[key][0] if key in fused else 0.0
                # The most recently seen document object is the one returned.
                fused[key] = (previous + weight / (rrf_constant + rank + 1), doc)

        ranked = sorted(fused.values(), key=lambda pair: pair[0], reverse=True)
        return [doc for _, doc in ranked[:self.k]]
|
| 342 |
+
|
| 343 |
+
|
| 344 |
# ── Chain builder ──────────────────────────────────────────────────────────────
|
| 345 |
def build_chain(model: str = "qwen3:14b",
|
| 346 |
top_k: int = 6,
|
| 347 |
temperature: float = 0.1,
|
| 348 |
+
provider: str = "Local AI",
|
| 349 |
+
use_hyde: bool = True,
|
| 350 |
+
use_reranking: bool = True,
|
| 351 |
+
use_hybrid: bool = True) -> tuple:
|
| 352 |
|
|
|
|
| 353 |
embedder = get_embedder_for_provider(provider)
|
| 354 |
+
vectordb = Chroma(persist_directory="./chroma_db", embedding_function=embedder)
|
| 355 |
+
|
| 356 |
+
dense_retriever = vectordb.as_retriever(search_kwargs={"k": top_k})
|
| 357 |
+
|
| 358 |
+
if use_hybrid:
|
| 359 |
+
try:
|
| 360 |
+
bm25_retriever = _build_bm25_retriever(vectordb, k=top_k)
|
| 361 |
+
if bm25_retriever:
|
| 362 |
+
retriever = HybridRetriever(
|
| 363 |
+
bm25_retriever=bm25_retriever,
|
| 364 |
+
dense_retriever=dense_retriever,
|
| 365 |
+
bm25_weight=0.4, dense_weight=0.6, k=top_k,
|
| 366 |
+
)
|
| 367 |
+
else:
|
| 368 |
+
retriever = dense_retriever
|
| 369 |
+
except Exception as e:
|
| 370 |
+
print(f"Hybrid retrieval setup failed, falling back to dense: {e}")
|
| 371 |
+
retriever = dense_retriever
|
| 372 |
+
else:
|
| 373 |
+
retriever = dense_retriever
|
| 374 |
|
|
|
|
| 375 |
if provider == "Gemini API":
|
| 376 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 377 |
gemini_api_key = os.environ.get("GEMINI_API_KEY")
|
| 378 |
if not gemini_api_key:
|
| 379 |
raise ValueError("GEMINI_API_KEY is not set in the .env file.")
|
| 380 |
+
llm = ChatGoogleGenerativeAI(model=model, temperature=temperature, google_api_key=gemini_api_key)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
elif provider == "Hugging Face":
|
| 382 |
from langchain_openai import ChatOpenAI
|
| 383 |
hf_token = os.environ.get("HF_TOKEN")
|
| 384 |
if not hf_token:
|
| 385 |
raise ValueError("HF_TOKEN is not set in the .env file.")
|
| 386 |
+
llm = ChatOpenAI(model=model, temperature=temperature, api_key=hf_token, base_url="https://router.huggingface.co/v1")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
elif provider == "OpenRouter":
|
|
|
|
| 388 |
from langchain_openai import ChatOpenAI
|
| 389 |
openrouter_key = os.environ.get("OPENROUTER_API_KEY")
|
| 390 |
if not openrouter_key:
|
| 391 |
raise ValueError("OPENROUTER_API_KEY is not set. Add it as a Space secret or in your .env file.")
|
| 392 |
llm = ChatOpenAI(
|
| 393 |
+
model=model, temperature=temperature, api_key=openrouter_key,
|
|
|
|
|
|
|
| 394 |
base_url="https://openrouter.ai/api/v1",
|
| 395 |
default_headers={
|
| 396 |
"HTTP-Referer": "https://huggingface.co/spaces/georgtawadrous/thoth_app",
|
|
|
|
| 398 |
},
|
| 399 |
)
|
| 400 |
else:
|
|
|
|
| 401 |
base_url = os.environ.get("LOCAL_AI_BASE_URL", "http://127.0.0.1:11434")
|
| 402 |
+
llm = OllamaLLM(model=model, temperature=temperature, num_ctx=4096, base_url=base_url)
|
| 403 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
chain = RetrievalQA.from_chain_type(
|
| 405 |
+
llm=llm, retriever=dense_retriever, chain_type="stuff",
|
| 406 |
+
chain_type_kwargs={"prompt": _prompt}, return_source_documents=True,
|
|
|
|
|
|
|
|
|
|
| 407 |
)
|
| 408 |
|
|
|
|
| 409 |
try:
|
| 410 |
feedback_store = Chroma(
|
| 411 |
+
persist_directory="./chroma_db", embedding_function=embedder,
|
|
|
|
| 412 |
collection_name="Shenute_feedback",
|
| 413 |
)
|
| 414 |
feedback_retriever = feedback_store.as_retriever(search_kwargs={"k": 2})
|
| 415 |
except Exception:
|
| 416 |
feedback_retriever = None
|
| 417 |
|
| 418 |
+
return chain, feedback_retriever, llm, retriever
|
| 419 |
|
| 420 |
|
| 421 |
def query_Shenute(question: str,
|
| 422 |
model: str = "qwen3:14b",
|
| 423 |
top_k: int = 6,
|
| 424 |
temperature: float = 0.1,
|
| 425 |
+
provider: str = "Local AI",
|
| 426 |
+
use_hyde: bool = True,
|
| 427 |
+
use_reranking: bool = True,
|
| 428 |
+
use_hybrid: bool = True) -> dict:
|
| 429 |
+
chain, feedback_retriever, llm, retriever = build_chain(
|
| 430 |
+
model, top_k, temperature, provider, use_hyde, use_reranking, use_hybrid
|
| 431 |
+
)
|
|
|
|
|
|
|
| 432 |
|
| 433 |
feedback_context = ""
|
| 434 |
if feedback_retriever:
|
|
|
|
| 443 |
if feedback_context:
|
| 444 |
augmented_question = f"{question}\n\n[SYSTEM NOTE - PAST USER CORRECTIONS TO APPLY IF RELEVANT:\n{feedback_context}]"
|
| 445 |
|
| 446 |
+
# HyDE: Generate hypothetical document for better retrieval
|
| 447 |
+
retrieval_query = augmented_question
|
| 448 |
+
hyde_used = False
|
| 449 |
+
if use_hyde and provider != "Local AI":
|
| 450 |
+
try:
|
| 451 |
+
hyde_doc = _generate_hyde_query(question, llm)
|
| 452 |
+
if hyde_doc and len(hyde_doc) > 20:
|
| 453 |
+
retrieval_query = f"{augmented_question}\n\n{hyde_doc}"
|
| 454 |
+
hyde_used = True
|
| 455 |
+
except Exception as e:
|
| 456 |
+
print(f"HyDE failed: {e}")
|
| 457 |
+
|
| 458 |
+
# Retrieve with optional reranking
|
| 459 |
+
reranked = False
|
| 460 |
+
if use_reranking and provider != "Local AI":
|
| 461 |
+
try:
|
| 462 |
+
initial_docs = retriever.invoke(retrieval_query if hyde_used else augmented_question)
|
| 463 |
+
rerank_k = min(top_k, 4)
|
| 464 |
+
reranked_docs = _rerank_documents(question, initial_docs, top_k=rerank_k)
|
| 465 |
+
if reranked_docs:
|
| 466 |
+
reranked = True
|
| 467 |
+
context = "\n\n---\n\n".join([doc.page_content for doc in reranked_docs])
|
| 468 |
+
filled_prompt = Shenute_SYSTEM.replace("{context}", context).replace("{question}", augmented_question)
|
| 469 |
+
result_text = llm.invoke(filled_prompt)
|
| 470 |
+
if hasattr(result_text, 'content'):
|
| 471 |
+
result_text = result_text.content
|
| 472 |
+
answer = str(result_text)
|
| 473 |
+
source_documents = reranked_docs
|
| 474 |
+
else:
|
| 475 |
+
result = chain.invoke({"query": augmented_question})
|
| 476 |
+
answer = result["result"]
|
| 477 |
+
source_documents = result.get("source_documents", [])
|
| 478 |
+
except Exception as e:
|
| 479 |
+
print(f"Reranking pipeline failed, falling back to standard chain: {e}")
|
| 480 |
+
result = chain.invoke({"query": augmented_question})
|
| 481 |
+
answer = result["result"]
|
| 482 |
+
source_documents = result.get("source_documents", [])
|
| 483 |
+
else:
|
| 484 |
+
result = chain.invoke({"query": augmented_question})
|
| 485 |
+
answer = result["result"]
|
| 486 |
+
source_documents = result.get("source_documents", [])
|
| 487 |
|
|
|
|
|
|
|
| 488 |
answer = _strip_think_tags(answer)
|
| 489 |
answer = _add_greek_warning(answer)
|
| 490 |
|
| 491 |
sources = []
|
| 492 |
+
for doc in source_documents:
|
| 493 |
sources.append({
|
| 494 |
"text": doc.page_content[:300],
|
| 495 |
"source": doc.metadata.get("source", "Unknown"),
|
|
|
|
| 499 |
return {
|
| 500 |
"answer": answer,
|
| 501 |
"sources": sources,
|
| 502 |
+
"retrieval_info": {
|
| 503 |
+
"hyde_used": hyde_used,
|
| 504 |
+
"reranked": reranked,
|
| 505 |
+
"hybrid": use_hybrid,
|
| 506 |
+
},
|
| 507 |
}
|