Spaces:

danulr05
/

budget-proposals-search-api

Running

App Files Files Community

danulr05 committed on Aug 27, 2025

Commit

cf69ab1

verified ·

1 Parent(s): 6d72b86

Update app.py

Browse files

Files changed (1) hide show

app.py +395 -387

app.py CHANGED Viewed

@@ -1,387 +1,395 @@
-from flask import Flask, request, jsonify
-from flask_cors import CORS
-from sentence_transformers import SentenceTransformer
-from pinecone import Pinecone
-import os
-import logging
-import json
-app = Flask(__name__)
-CORS(app)  # Enable CORS for all routes
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Initialize Pinecone
-PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
-if not PINECONE_API_KEY:
-    raise ValueError("PINECONE_API_KEY environment variable is required")
-pc = Pinecone(api_key=PINECONE_API_KEY)
-# Configuration
-INDEX_NAME = "budget-proposals-optimized"  # Use the new optimized index
-# Load embedding model
-embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-# Load dynamic metadata
-def load_dynamic_metadata():
-    """Load metadata from dynamic_metadata.json"""
-    try:
-        if os.path.exists("dynamic_metadata.json"):
-            with open("dynamic_metadata.json", 'r', encoding='utf-8') as f:
-                return json.load(f)
-    except Exception as e:
-        logger.error(f"Error loading dynamic metadata: {e}")
-    return {}
-# Load dynamic metadata
-DYNAMIC_METADATA = load_dynamic_metadata()
-def get_pinecone_index():
-    """Get the budget proposals Pinecone index"""
-    try:
-        return pc.Index(INDEX_NAME)
-    except Exception as e:
-        logger.error(f"Error accessing Pinecone index: {e}")
-        return None
-def semantic_search(query: str, top_k=1, category_filter=None):
-    """Perform semantic search on budget proposals - return relevant documents based on query specificity"""
-    try:
-        pc_index = get_pinecone_index()
-        if not pc_index:
-            return []
-        query_emb = embed_model.encode(query).tolist()
-        # Build filter if category is specified
-        filter_dict = {"source": "budget_proposals"}
-        if category_filter and category_filter != "All categories":
-            filter_dict["category"] = category_filter
-        # Get more results to find relevant documents
-        res = pc_index.query(
-            vector=query_emb,
-            top_k=50,  # Get more results to find relevant documents
-            include_metadata=True,
-            filter=filter_dict
-        )
-        # Track the best score for each unique document
-        best_scores = {}  # file_path -> best_score
-        for match in res["matches"]:
-            metadata = match["metadata"]
-            score = match["score"]
-            file_path = metadata.get("file_path", "")
-            # Keep track of the best score for each document
-            if file_path not in best_scores or score > best_scores[file_path]:
-                best_scores[file_path] = score
-        if not best_scores:
-            return []
-        # Sort documents by their best scores
-        sorted_docs = sorted(best_scores.items(), key=lambda x: x[1], reverse=True)
-        # Determine how many documents to return based on query specificity
-        max_score = sorted_docs[0][1]  # Best score
-        # If the best score is very high (>0.6), it's a specific query - show fewer results
-        # If the best score is moderate (0.3-0.6), it's a medium query - show some results
-        # If the best score is low (<0.3), it's a broad query - show more results
-        if max_score > 0.6:
-            # Specific query - show 1-2 documents
-            threshold = max_score * 0.8  # Show documents within 80% of best score
-            max_docs = 2
-        elif max_score > 0.3:
-            # Medium query - show 2-3 documents
-            threshold = max_score * 0.7  # Show documents within 70% of best score
-            max_docs = 3
-        else:
-            # Broad query - show 3-5 documents
-            threshold = max_score * 0.5  # Show documents within 50% of best score
-            max_docs = 5
-        results = []
-        doc_count = 0
-        for file_path, score in sorted_docs:
-            if doc_count >= max_docs or score < threshold:
-                break
-            # Get the metadata for this document
-            for match in res["matches"]:
-                metadata = match["metadata"]
-                if metadata.get("file_path", "") == file_path:
-                    # Use the DYNAMIC_METADATA mapping if available, otherwise use metadata
-                    proposal_data = DYNAMIC_METADATA.get(file_path, {
-                        "title": metadata.get("title", "Unknown Title"),
-                        "summary": metadata.get("summary", ""),
-                        "category": metadata.get("category", "Budget Proposal"),
-                        "costLKR": metadata.get("costLKR", "No Costing Available")
-                    })
-                    title = proposal_data["title"]
-                    summary = proposal_data["summary"]
-                    costLKR = proposal_data["costLKR"]
-                    category = proposal_data["category"]
-                    thumb_url = metadata.get("thumbUrl", "")
-                    result = {
-                        "title": title,
-                        "summary": summary,
-                        "costLKR": costLKR,
-                        "category": category,
-                        "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
-                        "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
-                        "score": score,
-                        "relevance_percentage": int(score * 100),
-                        "file_path": file_path,
-                        "id": match["id"],
-                        "content": metadata.get("content", "")  # Add the actual content
-                    }
-                    results.append(result)
-                    doc_count += 1
-                    break
-        return results
-    except Exception as e:
-        logger.error(f"Search error: {e}")
-        return []
-def get_all_proposals(category_filter=None):
-    """Get all budget proposals (for initial load or when no search query)"""
-    try:
-        pc_index = get_pinecone_index()
-        if not pc_index:
-            logger.warning("Pinecone index not available, returning empty list")
-            return []
-        # Build filter if category is specified
-        filter_dict = {"source": "budget_proposals"}
-        if category_filter and category_filter != "All categories":
-            filter_dict["category"] = category_filter
-        # Query with a dummy vector to get all documents
-        # Use a more realistic dummy vector (all 0.1 instead of 0.0)
-        dummy_vector = [0.1] * 384  # 384 is the dimension of all-MiniLM-L6-v2
-        res = pc_index.query(
-            vector=dummy_vector,
-            top_k=100,  # Get all proposals
-            include_metadata=True,
-            filter=filter_dict
-        )
-        logger.info(f"Query returned {len(res['matches'])} matches")
-        results = []
-        seen_files = set()  # Track unique files to avoid duplicates
-        for match in res["matches"]:
-            metadata = match["metadata"]
-            file_path = metadata.get("file_path", "")
-            # Skip if we've already included this file (avoid duplicates from chunks)
-            if file_path in seen_files:
-                continue
-            seen_files.add(file_path)
-            # Use the DYNAMIC_METADATA mapping if available, otherwise use metadata
-            proposal_data = DYNAMIC_METADATA.get(file_path, {
-                "title": metadata.get("title", "Unknown Title"),
-                "summary": metadata.get("summary", ""),
-                "category": metadata.get("category", "Budget Proposal"),
-                "costLKR": metadata.get("costLKR", "No Costing Available")
-            })
-            title = proposal_data["title"]
-            summary = proposal_data["summary"]
-            costLKR = proposal_data["costLKR"]
-            category = proposal_data["category"]
-            thumb_url = metadata.get("thumbUrl", "")
-            result = {
-                "title": title,
-                "summary": summary,
-                "costLKR": costLKR,
-                "category": category,
-                "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
-                "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
-                "score": 1.0,  # Default score for all proposals
-                "relevance_percentage": 100,
-                "file_path": file_path,
-                "id": match["id"]
-            }
-            results.append(result)
-        return results
-    except Exception as e:
-        logger.error(f"Error getting all proposals: {e}")
-        return []
-@app.route('/api/search', methods=['POST'])
-def search_proposals():
-    """API endpoint for searching budget proposals"""
-    try:
-        data = request.get_json()
-        query = data.get('query', '').strip()
-        top_k = data.get('top_k', 10)
-        category_filter = data.get('category_filter')
-        if not query:
-            # If no query, return all proposals
-            results = get_all_proposals(category_filter)
-        else:
-            results = semantic_search(query, top_k, category_filter)
-        return jsonify({
-            "query": query,
-            "results": results,
-            "total_results": len(results),
-            "category_filter": category_filter
-        })
-    except Exception as e:
-        logger.error(f"API error: {e}")
-        return jsonify({"error": str(e)}), 500
-@app.route('/api/search', methods=['GET'])
-def search_proposals_get():
-    """API endpoint for searching proposals (GET method)"""
-    try:
-        query = request.args.get('query', '').strip()
-        top_k = int(request.args.get('top_k', 10))
-        category_filter = request.args.get('category_filter')
-        if not query:
-            # If no query, return all proposals
-            results = get_all_proposals(category_filter)
-        else:
-            results = semantic_search(query, top_k, category_filter)
-        return jsonify({
-            "query": query,
-            "results": results,
-            "total_results": len(results),
-            "category_filter": category_filter
-        })
-    except Exception as e:
-        logger.error(f"API error: {e}")
-        return jsonify({"error": str(e)}), 500
-@app.route('/api/proposals', methods=['GET'])
-def get_proposals():
-    """Get all budget proposals"""
-    try:
-        category_filter = request.args.get('category_filter')
-        results = get_all_proposals(category_filter)
-        return jsonify({
-            "results": results,
-            "total_results": len(results),
-            "category_filter": category_filter
-        })
-    except Exception as e:
-        logger.error(f"API error: {e}")
-        return jsonify({"error": str(e)}), 500
-@app.route('/api/categories', methods=['GET'])
-def get_categories():
-    """Get all available categories"""
-    try:
-        # Get categories directly from dynamic metadata for reliability
-        categories = set()
-        for file_path, metadata in DYNAMIC_METADATA.items():
-            category = metadata.get("category")
-            if category:
-                categories.add(category)
-        # If no categories from metadata, fallback to Pinecone
-        if not categories:
-            all_proposals = get_all_proposals()
-            for proposal in all_proposals:
-                category = proposal.get("category")
-                if category:
-                    categories.add(category)
-        return jsonify({
-            "categories": sorted(list(categories))
-        })
-    except Exception as e:
-        logger.error(f"API error: {e}")
-        return jsonify({"error": str(e)}), 500
-@app.route('/api/health', methods=['GET'])
-def health_check():
-    """Health check endpoint"""
-    try:
-        pc_index = get_pinecone_index()
-        if pc_index:
-            stats = pc_index.describe_index_stats()
-            return jsonify({
-                "status": "healthy",
-                "message": "Budget proposals semantic search API is running",
-                "index_stats": {
-                    "total_vector_count": stats.total_vector_count,
-                    "dimension": stats.dimension,
-                    "index_fullness": stats.index_fullness
-                }
-            })
-        else:
-            return jsonify({
-                "status": "unhealthy",
-                "message": "Cannot connect to Pinecone index"
-            }), 500
-    except Exception as e:
-        return jsonify({
-            "status": "unhealthy",
-            "message": f"Error: {str(e)}"
-        }), 500
-@app.route('/api/stats', methods=['GET'])
-def get_stats():
-    """Get index statistics"""
-    try:
-        pc_index = get_pinecone_index()
-        if not pc_index:
-            return jsonify({"error": "Cannot connect to Pinecone index"}), 500
-        stats = pc_index.describe_index_stats()
-        return jsonify({
-            "total_vector_count": stats.total_vector_count,
-            "dimension": stats.dimension,
-            "index_fullness": stats.index_fullness
-        })
-    except Exception as e:
-        return jsonify({"error": str(e)}), 500
-@app.route('/', methods=['GET'])
-def home():
-    """Home endpoint with API documentation"""
-    return jsonify({
-        "message": "Budget Proposals Semantic Search API",
-        "version": "1.0.0",
-        "endpoints": {
-            "POST /api/search": "Search proposals with JSON body",
-            "GET /api/search?query=<search_term>": "Search proposals with query parameter",
-            "GET /api/proposals": "Get all proposals",
-            "GET /api/categories": "Get all categories",
-            "GET /api/health": "Health check",
-            "GET /api/stats": "Index statistics"
-        },
-        "status": "running"
-    })
-if __name__ == '__main__':
-    app.run(debug=False, host='0.0.0.0', port=7860)

+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from sentence_transformers import SentenceTransformer
+from pinecone import Pinecone
+import os
+import logging
+import json
+# Flask application object; every route in this module is registered on it.
+app = Flask(__name__)
+CORS(app)  # Enable CORS for all routes
+# Configure logging: root logger at INFO level plus a module-level logger.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Initialize Pinecone. The API key is read from the environment and the
+# module raises at import time when it is missing or empty.
+PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
+if not PINECONE_API_KEY:
+    raise ValueError("PINECONE_API_KEY environment variable is required")
+pc = Pinecone(api_key=PINECONE_API_KEY)
+# Configuration: name of the Pinecone index opened by get_pinecone_index().
+INDEX_NAME = "budget-proposals-optimized"  # Use the new optimized index
+# Load the embedding model once at import time; semantic_search() uses it
+# to encode incoming queries.
+embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+# Load dynamic metadata
+def load_dynamic_metadata():
+    """Read ``dynamic_metadata.json`` (relative path) and return its parsed content.
+
+    Returns an empty dict when the file does not exist, or when opening or
+    parsing it fails (the failure is logged).
+    """
+    metadata_path = "dynamic_metadata.json"
+    try:
+        if not os.path.exists(metadata_path):
+            return {}
+        with open(metadata_path, 'r', encoding='utf-8') as handle:
+            return json.load(handle)
+    except Exception as e:
+        logger.error(f"Error loading dynamic metadata: {e}")
+        return {}
+# Metadata is loaded once, when the module is imported.
+DYNAMIC_METADATA = load_dynamic_metadata()
+def get_pinecone_index():
+    """Return a handle to the Pinecone index named by ``INDEX_NAME``.
+
+    If creating the handle raises, the error is logged and ``None`` is
+    returned instead.
+    """
+    try:
+        index_handle = pc.Index(INDEX_NAME)
+    except Exception as e:
+        logger.error(f"Error accessing Pinecone index: {e}")
+        index_handle = None
+    return index_handle
+def _proposal_fields(file_path, metadata):
+    """Resolve title/summary/category/costLKR for one document.
+
+    Values from ``DYNAMIC_METADATA[file_path]`` take precedence. Any field
+    that entry does not provide falls back to the Pinecone ``metadata`` and
+    then to a fixed placeholder, so an incomplete entry cannot raise.
+    """
+    fields = {
+        "title": metadata.get("title", "Unknown Title"),
+        "summary": metadata.get("summary", ""),
+        "category": metadata.get("category", "Budget Proposal"),
+        "costLKR": metadata.get("costLKR", "No Costing Available"),
+    }
+    override = DYNAMIC_METADATA.get(file_path)
+    if isinstance(override, dict):
+        fields = {key: override.get(key, default) for key, default in fields.items()}
+    return fields
+def semantic_search(query: str, top_k=1, category_filter=None):
+    """Search budget proposals semantically and return the most relevant documents.
+
+    The query is embedded with ``embed_model`` and the 50 nearest chunks are
+    requested from Pinecone, filtered to ``source == "budget_proposals"``
+    (and to ``category_filter`` when it is set and not "All categories").
+    Chunks are grouped by ``file_path`` and each document is represented by
+    its best-scoring chunk.
+
+    The number of documents returned depends on the best score:
+      * below 0.2       -> no results
+      * above 0.6       -> at most 2 documents scoring >= 80% of the best
+      * above 0.3       -> at most 3 documents scoring >= 70% of the best
+      * otherwise       -> at most 5 documents scoring >= 50% of the best
+
+    Args:
+        query: Free-text search string.
+        top_k: Accepted for interface compatibility; this function does not
+            read it (the result count is decided by the score tiers above).
+        category_filter: Optional category name to restrict the search to.
+
+    Returns:
+        A list of result dicts ordered by descending score. The list is
+        empty when the index is unavailable, nothing matches, or any error
+        occurs (errors are logged, not raised).
+    """
+    try:
+        pc_index = get_pinecone_index()
+        if not pc_index:
+            return []
+        query_emb = embed_model.encode(query).tolist()
+        # Build filter if category is specified
+        filter_dict = {"source": "budget_proposals"}
+        if category_filter and category_filter != "All categories":
+            filter_dict["category"] = category_filter
+        res = pc_index.query(
+            vector=query_emb,
+            top_k=50,  # Over-fetch chunks so several documents can surface
+            include_metadata=True,
+            filter=filter_dict
+        )
+        # Keep the best-scoring chunk per document in a single pass, so the
+        # reported id/content always belong to the chunk that set the score.
+        best_matches = {}  # file_path -> best match
+        for match in res["matches"]:
+            file_path = match["metadata"].get("file_path", "")
+            current = best_matches.get(file_path)
+            if current is None or match["score"] > current["score"]:
+                best_matches[file_path] = match
+        if not best_matches:
+            return []
+        ranked = sorted(best_matches.values(), key=lambda m: m["score"], reverse=True)
+        max_score = ranked[0]["score"]
+        # Minimum threshold - if best score is too low, return no results
+        MIN_SCORE_THRESHOLD = 0.2  # Adjust this value as needed
+        if max_score < MIN_SCORE_THRESHOLD:
+            return []
+        # A high best score is treated as a specific query (fewer, tighter
+        # results); a lower one as a broader query (more, looser results).
+        if max_score > 0.6:
+            ratio, max_docs = 0.8, 2
+        elif max_score > 0.3:
+            ratio, max_docs = 0.7, 3
+        else:
+            ratio, max_docs = 0.5, 5
+        threshold = max_score * ratio
+        results = []
+        for match in ranked[:max_docs]:
+            score = match["score"]
+            if score < threshold:
+                break  # ranked is sorted, so every later document is lower
+            metadata = match["metadata"]
+            file_path = metadata.get("file_path", "")
+            fields = _proposal_fields(file_path, metadata)
+            thumb_url = metadata.get("thumbUrl", "")
+            results.append({
+                "title": fields["title"],
+                "summary": fields["summary"],
+                "costLKR": fields["costLKR"],
+                "category": fields["category"],
+                "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
+                "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
+                "score": score,
+                "relevance_percentage": int(score * 100),
+                "file_path": file_path,
+                "id": match["id"],
+                "content": metadata.get("content", "")  # Add the actual content
+            })
+        return results
+    except Exception as e:
+        logger.error(f"Search error: {e}")
+        return []
+def get_all_proposals(category_filter=None):
+    """Return one entry per budget proposal (initial load / empty query).
+
+    Pinecone is queried with a constant dummy vector and ``top_k=100``,
+    filtered to ``source == "budget_proposals"`` (and to ``category_filter``
+    when it is set and not "All categories"). Chunks sharing a ``file_path``
+    are collapsed to the first one returned.
+
+    Display fields come from ``DYNAMIC_METADATA[file_path]`` when present.
+    Any field that entry lacks falls back to the Pinecone metadata and then
+    to a placeholder, so a single incomplete entry no longer empties the
+    whole listing.
+
+    Args:
+        category_filter: Optional category name to restrict the listing to.
+
+    Returns:
+        A list of result dicts, each with ``score`` 1.0 and
+        ``relevance_percentage`` 100. The list is empty when the index is
+        unavailable or any error occurs (errors are logged, not raised).
+    """
+    try:
+        pc_index = get_pinecone_index()
+        if not pc_index:
+            logger.warning("Pinecone index not available, returning empty list")
+            return []
+        # Build filter if category is specified
+        filter_dict = {"source": "budget_proposals"}
+        if category_filter and category_filter != "All categories":
+            filter_dict["category"] = category_filter
+        # Query with a dummy vector to get all documents
+        # Use a more realistic dummy vector (all 0.1 instead of 0.0)
+        dummy_vector = [0.1] * 384  # 384 is the dimension of all-MiniLM-L6-v2
+        res = pc_index.query(
+            vector=dummy_vector,
+            top_k=100,  # Upper bound on chunks fetched, not on proposals
+            include_metadata=True,
+            filter=filter_dict
+        )
+        logger.info(f"Query returned {len(res['matches'])} matches")
+        results = []
+        seen_files = set()  # Track unique files to avoid duplicates
+        for match in res["matches"]:
+            metadata = match["metadata"]
+            file_path = metadata.get("file_path", "")
+            # Skip if we've already included this file (avoid duplicates from chunks)
+            if file_path in seen_files:
+                continue
+            seen_files.add(file_path)
+            # Start from the Pinecone metadata, then overlay whatever the
+            # DYNAMIC_METADATA entry actually provides.
+            fields = {
+                "title": metadata.get("title", "Unknown Title"),
+                "summary": metadata.get("summary", ""),
+                "category": metadata.get("category", "Budget Proposal"),
+                "costLKR": metadata.get("costLKR", "No Costing Available"),
+            }
+            override = DYNAMIC_METADATA.get(file_path)
+            if isinstance(override, dict):
+                fields = {key: override.get(key, default) for key, default in fields.items()}
+            thumb_url = metadata.get("thumbUrl", "")
+            results.append({
+                "title": fields["title"],
+                "summary": fields["summary"],
+                "costLKR": fields["costLKR"],
+                "category": fields["category"],
+                "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
+                "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
+                "score": 1.0,  # Default score for all proposals
+                "relevance_percentage": 100,
+                "file_path": file_path,
+                "id": match["id"]
+            })
+        return results
+    except Exception as e:
+        logger.error(f"Error getting all proposals: {e}")
+        return []
+@app.route('/api/search', methods=['POST'])
+def search_proposals():
+    """Search budget proposals from a JSON request body (POST /api/search).
+
+    Body fields (all optional): ``query`` (search text), ``top_k`` (integer,
+    default 10) and ``category_filter``. An empty or missing ``query``
+    returns every proposal via ``get_all_proposals``.
+
+    Responses:
+        200 with ``query``, ``results``, ``total_results`` and
+        ``category_filter``.
+        400 when the body is missing, is not valid JSON, or is not a JSON
+        object.
+        500 on any unexpected error.
+    """
+    try:
+        # silent=True yields None for a missing or malformed body instead of
+        # raising, so client mistakes are reported as 400 rather than 500.
+        data = request.get_json(silent=True)
+        if not isinstance(data, dict):
+            return jsonify({"error": "Request body must be a JSON object"}), 400
+        # Tolerate null or non-string query values instead of failing on .strip()
+        query = str(data.get('query') or '').strip()
+        # Coerce like the GET handler does; fall back to the default on bad input
+        try:
+            top_k = int(data.get('top_k', 10))
+        except (TypeError, ValueError):
+            top_k = 10
+        category_filter = data.get('category_filter')
+        if not query:
+            # If no query, return all proposals
+            results = get_all_proposals(category_filter)
+        else:
+            results = semantic_search(query, top_k, category_filter)
+        return jsonify({
+            "query": query,
+            "results": results,
+            "total_results": len(results),
+            "category_filter": category_filter
+        })
+    except Exception as e:
+        logger.error(f"API error: {e}")
+        return jsonify({"error": str(e)}), 500
+@app.route('/api/search', methods=['GET'])
+def search_proposals_get():
+    """Search budget proposals from query-string parameters (GET /api/search).
+
+    Parameters (all optional): ``query`` (search text), ``top_k`` (integer,
+    default 10) and ``category_filter``. An empty or missing ``query``
+    returns every proposal via ``get_all_proposals``.
+
+    Responses:
+        200 with ``query``, ``results``, ``total_results`` and
+        ``category_filter``.
+        500 on any unexpected error.
+    """
+    try:
+        query = request.args.get('query', '').strip()
+        # type=int makes a non-numeric top_k fall back to the default rather
+        # than raising ValueError and producing a 500.
+        top_k = request.args.get('top_k', 10, type=int)
+        category_filter = request.args.get('category_filter')
+        if not query:
+            # If no query, return all proposals
+            results = get_all_proposals(category_filter)
+        else:
+            results = semantic_search(query, top_k, category_filter)
+        return jsonify({
+            "query": query,
+            "results": results,
+            "total_results": len(results),
+            "category_filter": category_filter
+        })
+    except Exception as e:
+        logger.error(f"API error: {e}")
+        return jsonify({"error": str(e)}), 500
+@app.route('/api/proposals', methods=['GET'])
+def get_proposals():
+    """List every budget proposal, optionally restricted by ``category_filter``.
+
+    Responds with ``results``, ``total_results`` and the ``category_filter``
+    that was applied; any exception is logged and reported as HTTP 500.
+    """
+    try:
+        selected_category = request.args.get('category_filter')
+        proposals = get_all_proposals(selected_category)
+        payload = {
+            "results": proposals,
+            "total_results": len(proposals),
+            "category_filter": selected_category
+        }
+        return jsonify(payload)
+    except Exception as e:
+        logger.error(f"API error: {e}")
+        return jsonify({"error": str(e)}), 500
+@app.route('/api/categories', methods=['GET'])
+def get_categories():
+    """Return the sorted list of distinct proposal categories.
+
+    Categories are taken from ``DYNAMIC_METADATA`` first; only when that
+    yields none are they collected from ``get_all_proposals()`` instead.
+    Any exception is logged and reported as HTTP 500.
+    """
+    try:
+        # Primary source: categories declared in the dynamic metadata
+        categories = {
+            entry.get("category")
+            for entry in DYNAMIC_METADATA.values()
+            if entry.get("category")
+        }
+        # Fallback: derive categories from the proposals stored in Pinecone
+        if not categories:
+            categories = {
+                proposal.get("category")
+                for proposal in get_all_proposals()
+                if proposal.get("category")
+            }
+        return jsonify({"categories": sorted(categories)})
+    except Exception as e:
+        logger.error(f"API error: {e}")
+        return jsonify({"error": str(e)}), 500
+@app.route('/api/health', methods=['GET'])
+def health_check():
+    """Report whether the Pinecone index is reachable.
+
+    Responds "healthy" with the index statistics on success, and
+    "unhealthy" with HTTP 500 when the index handle is unavailable or
+    fetching the statistics raises.
+    """
+    try:
+        pc_index = get_pinecone_index()
+        if not pc_index:
+            return jsonify({
+                "status": "unhealthy",
+                "message": "Cannot connect to Pinecone index"
+            }), 500
+        stats = pc_index.describe_index_stats()
+        index_stats = {
+            "total_vector_count": stats.total_vector_count,
+            "dimension": stats.dimension,
+            "index_fullness": stats.index_fullness
+        }
+        return jsonify({
+            "status": "healthy",
+            "message": "Budget proposals semantic search API is running",
+            "index_stats": index_stats
+        })
+    except Exception as e:
+        return jsonify({
+            "status": "unhealthy",
+            "message": f"Error: {str(e)}"
+        }), 500
+@app.route('/api/stats', methods=['GET'])
+def get_stats():
+    """Return the Pinecone index statistics.
+
+    Responds with ``total_vector_count``, ``dimension`` and
+    ``index_fullness``; an unavailable index or any exception is reported
+    as HTTP 500 with an ``error`` message.
+    """
+    try:
+        pc_index = get_pinecone_index()
+        if pc_index:
+            stats = pc_index.describe_index_stats()
+            summary = {
+                "total_vector_count": stats.total_vector_count,
+                "dimension": stats.dimension,
+                "index_fullness": stats.index_fullness
+            }
+            return jsonify(summary)
+        return jsonify({"error": "Cannot connect to Pinecone index"}), 500
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+@app.route('/', methods=['GET'])
+def home():
+    """Serve a small JSON overview of the API and its endpoints."""
+    # Route -> one-line description, as shown to API consumers
+    endpoints = {
+        "POST /api/search": "Search proposals with JSON body",
+        "GET /api/search?query=<search_term>": "Search proposals with query parameter",
+        "GET /api/proposals": "Get all proposals",
+        "GET /api/categories": "Get all categories",
+        "GET /api/health": "Health check",
+        "GET /api/stats": "Index statistics"
+    }
+    overview = {
+        "message": "Budget Proposals Semantic Search API",
+        "version": "1.0.0",
+        "endpoints": endpoints,
+        "status": "running"
+    }
+    return jsonify(overview)
+# Start Flask's built-in server only when this file is executed directly;
+# it listens on all interfaces, port 7860, with debug mode off.
+if __name__ == '__main__':
+    app.run(debug=False, host='0.0.0.0', port=7860)