danulr05 committed on
Commit
cf69ab1
·
verified ·
1 Parent(s): 6d72b86

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +395 -387
app.py CHANGED
@@ -1,387 +1,395 @@
1
- from flask import Flask, request, jsonify
2
- from flask_cors import CORS
3
- from sentence_transformers import SentenceTransformer
4
- from pinecone import Pinecone
5
- import os
6
- import logging
7
- import json
8
-
9
- app = Flask(__name__)
10
- CORS(app) # Enable CORS for all routes
11
-
12
- # Configure logging
13
- logging.basicConfig(level=logging.INFO)
14
- logger = logging.getLogger(__name__)
15
-
16
- # Initialize Pinecone
17
- PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
18
- if not PINECONE_API_KEY:
19
- raise ValueError("PINECONE_API_KEY environment variable is required")
20
-
21
- pc = Pinecone(api_key=PINECONE_API_KEY)
22
- # Configuration
23
- INDEX_NAME = "budget-proposals-optimized" # Use the new optimized index
24
-
25
- # Load embedding model
26
- embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
27
-
28
- # Load dynamic metadata
29
- def load_dynamic_metadata():
30
- """Load metadata from dynamic_metadata.json"""
31
- try:
32
- if os.path.exists("dynamic_metadata.json"):
33
- with open("dynamic_metadata.json", 'r', encoding='utf-8') as f:
34
- return json.load(f)
35
- except Exception as e:
36
- logger.error(f"Error loading dynamic metadata: {e}")
37
- return {}
38
-
39
- # Load dynamic metadata
40
- DYNAMIC_METADATA = load_dynamic_metadata()
41
-
42
- def get_pinecone_index():
43
- """Get the budget proposals Pinecone index"""
44
- try:
45
- return pc.Index(INDEX_NAME)
46
- except Exception as e:
47
- logger.error(f"Error accessing Pinecone index: {e}")
48
- return None
49
-
50
- def semantic_search(query: str, top_k=1, category_filter=None):
51
- """Perform semantic search on budget proposals - return relevant documents based on query specificity"""
52
- try:
53
- pc_index = get_pinecone_index()
54
- if not pc_index:
55
- return []
56
-
57
- query_emb = embed_model.encode(query).tolist()
58
-
59
- # Build filter if category is specified
60
- filter_dict = {"source": "budget_proposals"}
61
- if category_filter and category_filter != "All categories":
62
- filter_dict["category"] = category_filter
63
-
64
- # Get more results to find relevant documents
65
- res = pc_index.query(
66
- vector=query_emb,
67
- top_k=50, # Get more results to find relevant documents
68
- include_metadata=True,
69
- filter=filter_dict
70
- )
71
-
72
- # Track the best score for each unique document
73
- best_scores = {} # file_path -> best_score
74
-
75
- for match in res["matches"]:
76
- metadata = match["metadata"]
77
- score = match["score"]
78
- file_path = metadata.get("file_path", "")
79
-
80
- # Keep track of the best score for each document
81
- if file_path not in best_scores or score > best_scores[file_path]:
82
- best_scores[file_path] = score
83
-
84
- if not best_scores:
85
- return []
86
-
87
- # Sort documents by their best scores
88
- sorted_docs = sorted(best_scores.items(), key=lambda x: x[1], reverse=True)
89
-
90
- # Determine how many documents to return based on query specificity
91
- max_score = sorted_docs[0][1] # Best score
92
-
93
- # If the best score is very high (>0.6), it's a specific query - show fewer results
94
- # If the best score is moderate (0.3-0.6), it's a medium query - show some results
95
- # If the best score is low (<0.3), it's a broad query - show more results
96
- if max_score > 0.6:
97
- # Specific query - show 1-2 documents
98
- threshold = max_score * 0.8 # Show documents within 80% of best score
99
- max_docs = 2
100
- elif max_score > 0.3:
101
- # Medium query - show 2-3 documents
102
- threshold = max_score * 0.7 # Show documents within 70% of best score
103
- max_docs = 3
104
- else:
105
- # Broad query - show 3-5 documents
106
- threshold = max_score * 0.5 # Show documents within 50% of best score
107
- max_docs = 5
108
-
109
- results = []
110
- doc_count = 0
111
-
112
- for file_path, score in sorted_docs:
113
- if doc_count >= max_docs or score < threshold:
114
- break
115
-
116
- # Get the metadata for this document
117
- for match in res["matches"]:
118
- metadata = match["metadata"]
119
- if metadata.get("file_path", "") == file_path:
120
- # Use the DYNAMIC_METADATA mapping if available, otherwise use metadata
121
- proposal_data = DYNAMIC_METADATA.get(file_path, {
122
- "title": metadata.get("title", "Unknown Title"),
123
- "summary": metadata.get("summary", ""),
124
- "category": metadata.get("category", "Budget Proposal"),
125
- "costLKR": metadata.get("costLKR", "No Costing Available")
126
- })
127
-
128
- title = proposal_data["title"]
129
- summary = proposal_data["summary"]
130
- costLKR = proposal_data["costLKR"]
131
- category = proposal_data["category"]
132
- thumb_url = metadata.get("thumbUrl", "")
133
-
134
- result = {
135
- "title": title,
136
- "summary": summary,
137
- "costLKR": costLKR,
138
- "category": category,
139
- "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
140
- "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
141
- "score": score,
142
- "relevance_percentage": int(score * 100),
143
- "file_path": file_path,
144
- "id": match["id"],
145
- "content": metadata.get("content", "") # Add the actual content
146
- }
147
-
148
- results.append(result)
149
- doc_count += 1
150
- break
151
-
152
- return results
153
- except Exception as e:
154
- logger.error(f"Search error: {e}")
155
- return []
156
-
157
- def get_all_proposals(category_filter=None):
158
- """Get all budget proposals (for initial load or when no search query)"""
159
- try:
160
- pc_index = get_pinecone_index()
161
- if not pc_index:
162
- logger.warning("Pinecone index not available, returning empty list")
163
- return []
164
-
165
- # Build filter if category is specified
166
- filter_dict = {"source": "budget_proposals"}
167
- if category_filter and category_filter != "All categories":
168
- filter_dict["category"] = category_filter
169
-
170
- # Query with a dummy vector to get all documents
171
- # Use a more realistic dummy vector (all 0.1 instead of 0.0)
172
- dummy_vector = [0.1] * 384 # 384 is the dimension of all-MiniLM-L6-v2
173
- res = pc_index.query(
174
- vector=dummy_vector,
175
- top_k=100, # Get all proposals
176
- include_metadata=True,
177
- filter=filter_dict
178
- )
179
-
180
- logger.info(f"Query returned {len(res['matches'])} matches")
181
-
182
- results = []
183
- seen_files = set() # Track unique files to avoid duplicates
184
-
185
- for match in res["matches"]:
186
- metadata = match["metadata"]
187
- file_path = metadata.get("file_path", "")
188
-
189
- # Skip if we've already included this file (avoid duplicates from chunks)
190
- if file_path in seen_files:
191
- continue
192
-
193
- seen_files.add(file_path)
194
-
195
- # Use the DYNAMIC_METADATA mapping if available, otherwise use metadata
196
- proposal_data = DYNAMIC_METADATA.get(file_path, {
197
- "title": metadata.get("title", "Unknown Title"),
198
- "summary": metadata.get("summary", ""),
199
- "category": metadata.get("category", "Budget Proposal"),
200
- "costLKR": metadata.get("costLKR", "No Costing Available")
201
- })
202
-
203
- title = proposal_data["title"]
204
- summary = proposal_data["summary"]
205
- costLKR = proposal_data["costLKR"]
206
- category = proposal_data["category"]
207
- thumb_url = metadata.get("thumbUrl", "")
208
-
209
- result = {
210
- "title": title,
211
- "summary": summary,
212
- "costLKR": costLKR,
213
- "category": category,
214
- "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
215
- "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
216
- "score": 1.0, # Default score for all proposals
217
- "relevance_percentage": 100,
218
- "file_path": file_path,
219
- "id": match["id"]
220
- }
221
-
222
- results.append(result)
223
-
224
- return results
225
-
226
- except Exception as e:
227
- logger.error(f"Error getting all proposals: {e}")
228
- return []
229
-
230
- @app.route('/api/search', methods=['POST'])
231
- def search_proposals():
232
- """API endpoint for searching budget proposals"""
233
- try:
234
- data = request.get_json()
235
- query = data.get('query', '').strip()
236
- top_k = data.get('top_k', 10)
237
- category_filter = data.get('category_filter')
238
-
239
- if not query:
240
- # If no query, return all proposals
241
- results = get_all_proposals(category_filter)
242
- else:
243
- results = semantic_search(query, top_k, category_filter)
244
-
245
- return jsonify({
246
- "query": query,
247
- "results": results,
248
- "total_results": len(results),
249
- "category_filter": category_filter
250
- })
251
-
252
- except Exception as e:
253
- logger.error(f"API error: {e}")
254
- return jsonify({"error": str(e)}), 500
255
-
256
- @app.route('/api/search', methods=['GET'])
257
- def search_proposals_get():
258
- """API endpoint for searching proposals (GET method)"""
259
- try:
260
- query = request.args.get('query', '').strip()
261
- top_k = int(request.args.get('top_k', 10))
262
- category_filter = request.args.get('category_filter')
263
-
264
- if not query:
265
- # If no query, return all proposals
266
- results = get_all_proposals(category_filter)
267
- else:
268
- results = semantic_search(query, top_k, category_filter)
269
-
270
- return jsonify({
271
- "query": query,
272
- "results": results,
273
- "total_results": len(results),
274
- "category_filter": category_filter
275
- })
276
-
277
- except Exception as e:
278
- logger.error(f"API error: {e}")
279
- return jsonify({"error": str(e)}), 500
280
-
281
- @app.route('/api/proposals', methods=['GET'])
282
- def get_proposals():
283
- """Get all budget proposals"""
284
- try:
285
- category_filter = request.args.get('category_filter')
286
- results = get_all_proposals(category_filter)
287
-
288
- return jsonify({
289
- "results": results,
290
- "total_results": len(results),
291
- "category_filter": category_filter
292
- })
293
-
294
- except Exception as e:
295
- logger.error(f"API error: {e}")
296
- return jsonify({"error": str(e)}), 500
297
-
298
- @app.route('/api/categories', methods=['GET'])
299
- def get_categories():
300
- """Get all available categories"""
301
- try:
302
- # Get categories directly from dynamic metadata for reliability
303
- categories = set()
304
- for file_path, metadata in DYNAMIC_METADATA.items():
305
- category = metadata.get("category")
306
- if category:
307
- categories.add(category)
308
-
309
- # If no categories from metadata, fallback to Pinecone
310
- if not categories:
311
- all_proposals = get_all_proposals()
312
- for proposal in all_proposals:
313
- category = proposal.get("category")
314
- if category:
315
- categories.add(category)
316
-
317
- return jsonify({
318
- "categories": sorted(list(categories))
319
- })
320
-
321
- except Exception as e:
322
- logger.error(f"API error: {e}")
323
- return jsonify({"error": str(e)}), 500
324
-
325
- @app.route('/api/health', methods=['GET'])
326
- def health_check():
327
- """Health check endpoint"""
328
- try:
329
- pc_index = get_pinecone_index()
330
- if pc_index:
331
- stats = pc_index.describe_index_stats()
332
- return jsonify({
333
- "status": "healthy",
334
- "message": "Budget proposals semantic search API is running",
335
- "index_stats": {
336
- "total_vector_count": stats.total_vector_count,
337
- "dimension": stats.dimension,
338
- "index_fullness": stats.index_fullness
339
- }
340
- })
341
- else:
342
- return jsonify({
343
- "status": "unhealthy",
344
- "message": "Cannot connect to Pinecone index"
345
- }), 500
346
- except Exception as e:
347
- return jsonify({
348
- "status": "unhealthy",
349
- "message": f"Error: {str(e)}"
350
- }), 500
351
-
352
- @app.route('/api/stats', methods=['GET'])
353
- def get_stats():
354
- """Get index statistics"""
355
- try:
356
- pc_index = get_pinecone_index()
357
- if not pc_index:
358
- return jsonify({"error": "Cannot connect to Pinecone index"}), 500
359
-
360
- stats = pc_index.describe_index_stats()
361
- return jsonify({
362
- "total_vector_count": stats.total_vector_count,
363
- "dimension": stats.dimension,
364
- "index_fullness": stats.index_fullness
365
- })
366
- except Exception as e:
367
- return jsonify({"error": str(e)}), 500
368
-
369
- @app.route('/', methods=['GET'])
370
- def home():
371
- """Home endpoint with API documentation"""
372
- return jsonify({
373
- "message": "Budget Proposals Semantic Search API",
374
- "version": "1.0.0",
375
- "endpoints": {
376
- "POST /api/search": "Search proposals with JSON body",
377
- "GET /api/search?query=<search_term>": "Search proposals with query parameter",
378
- "GET /api/proposals": "Get all proposals",
379
- "GET /api/categories": "Get all categories",
380
- "GET /api/health": "Health check",
381
- "GET /api/stats": "Index statistics"
382
- },
383
- "status": "running"
384
- })
385
-
386
- if __name__ == '__main__':
387
- app.run(debug=False, host='0.0.0.0', port=7860)
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify
2
+ from flask_cors import CORS
3
+ from sentence_transformers import SentenceTransformer
4
+ from pinecone import Pinecone
5
+ import os
6
+ import logging
7
+ import json
8
+
9
# Flask application object; every route in this module is registered on it.
app = Flask(__name__)
CORS(app)  # Enable CORS for all routes

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Pinecone
# The API key comes from the environment; importing this module raises
# ValueError immediately when the variable is unset or empty.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY environment variable is required")

pc = Pinecone(api_key=PINECONE_API_KEY)
# Configuration
INDEX_NAME = "budget-proposals-optimized"  # Use the new optimized index

# Load embedding model
# Instantiated once at import time; semantic_search() encodes queries with it.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
27
+
28
+ # Load dynamic metadata
29
def load_dynamic_metadata():
    """Load the per-proposal metadata overrides from ``dynamic_metadata.json``.

    The file is looked up relative to the current working directory and is
    optional.

    Returns:
        dict: The parsed top-level JSON object. An empty dict is returned when
        the file is missing, cannot be read, is not valid JSON, or does not
        hold a JSON object at the top level, so callers can always rely on
        ``.get`` / ``.items``.
    """
    try:
        with open("dynamic_metadata.json", 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # The metadata file is optional; its absence is not an error.
        return {}
    except (OSError, ValueError) as e:
        # OSError: unreadable path; ValueError: invalid JSON or bad encoding
        # (json.JSONDecodeError and UnicodeDecodeError both derive from it).
        logger.error(f"Error loading dynamic metadata: {e}")
        return {}

    if not isinstance(data, dict):
        # Callers index this mapping by file path; any other shape is unusable.
        logger.error(
            "Error loading dynamic metadata: expected a JSON object, "
            f"got {type(data).__name__}"
        )
        return {}
    return data


# Load dynamic metadata
DYNAMIC_METADATA = load_dynamic_metadata()
41
+
42
def get_pinecone_index():
    """Return a handle to the Pinecone index named by ``INDEX_NAME``.

    Returns:
        The object produced by ``pc.Index(INDEX_NAME)``, or ``None`` when that
        call raises (the error is logged).
    """
    try:
        index = pc.Index(INDEX_NAME)
    except Exception as err:
        logger.error(f"Error accessing Pinecone index: {err}")
        index = None
    return index
49
+
50
def semantic_search(query: str, top_k=1, category_filter=None, min_score=0.2):
    """Search budget proposals semantically and return the most relevant documents.

    The query is embedded with ``embed_model`` and the 50 nearest vectors are
    fetched from Pinecone. Matches are grouped by ``file_path`` (one entry per
    document, represented by its best-scoring match), ranked by score, and
    trimmed according to how confident the best match is:

    * best score > 0.6  -> at most 2 documents scoring >= 80% of the best
    * best score > 0.3  -> at most 3 documents scoring >= 70% of the best
    * otherwise         -> at most 5 documents scoring >= 50% of the best

    Args:
        query: Free-text search query.
        top_k: Accepted for interface compatibility; the number of results is
            governed by the score tiers above, not by this value.
        category_filter: Optional category name. ``None``, an empty value or
            ``"All categories"`` disables category filtering.
        min_score: If the best score is below this value no results are
            returned at all.

    Returns:
        list[dict]: One result dict per document, best first. An empty list is
        returned when the index is unavailable, nothing matches, the best
        score is below ``min_score``, or any error occurs (errors are logged).
    """
    try:
        pc_index = get_pinecone_index()
        if not pc_index:
            return []

        query_emb = embed_model.encode(query).tolist()

        # Build filter if category is specified
        filter_dict = {"source": "budget_proposals"}
        if category_filter and category_filter != "All categories":
            filter_dict["category"] = category_filter

        # Fetch more matches than we will return so that several matches of
        # the same document can be collapsed into one entry.
        res = pc_index.query(
            vector=query_emb,
            top_k=50,
            include_metadata=True,
            filter=filter_dict,
        )

        # Keep the best-scoring match per document in a single pass, so the
        # reported score, id and content always come from the same match.
        best_matches = {}  # file_path -> best match
        for match in res["matches"]:
            file_path = match["metadata"].get("file_path", "")
            current = best_matches.get(file_path)
            if current is None or match["score"] > current["score"]:
                best_matches[file_path] = match

        if not best_matches:
            return []

        # Sort documents by their best scores (stable: ties keep match order).
        ranked = sorted(
            best_matches.items(),
            key=lambda item: item[1]["score"],
            reverse=True,
        )
        max_score = ranked[0][1]["score"]

        # Best score too low - treat the query as having no relevant results.
        if max_score < min_score:
            return []

        # A high best score indicates a specific query (show few, close
        # results); a low one indicates a broad query (show more).
        if max_score > 0.6:
            ratio, max_docs = 0.8, 2
        elif max_score > 0.3:
            ratio, max_docs = 0.7, 3
        else:
            ratio, max_docs = 0.5, 5
        threshold = max_score * ratio

        results = []
        for file_path, match in ranked:
            score = match["score"]
            if len(results) >= max_docs or score < threshold:
                break

            metadata = match["metadata"]
            fallback = {
                "title": metadata.get("title", "Unknown Title"),
                "summary": metadata.get("summary", ""),
                "category": metadata.get("category", "Budget Proposal"),
                "costLKR": metadata.get("costLKR", "No Costing Available"),
            }
            # DYNAMIC_METADATA takes precedence; index metadata fills any key
            # the override omits, so a partial entry cannot raise KeyError
            # and wipe out the whole result list.
            proposal_data = {**fallback, **DYNAMIC_METADATA.get(file_path, {})}
            thumb_url = metadata.get("thumbUrl", "")

            results.append({
                "title": proposal_data["title"],
                "summary": proposal_data["summary"],
                "costLKR": proposal_data["costLKR"],
                "category": proposal_data["category"],
                "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
                "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
                "score": score,
                "relevance_percentage": int(score * 100),
                "file_path": file_path,
                "id": match["id"],
                "content": metadata.get("content", ""),  # Add the actual content
            })

        return results
    except Exception as e:
        # Boundary handler: log with traceback and degrade to "no results".
        logger.exception(f"Search error: {e}")
        return []
164
+
165
def get_all_proposals(category_filter=None):
    """List budget proposals (for initial load or when there is no search query).

    Pinecone is queried with a constant dummy vector and ``top_k=100``; the
    matches are then de-duplicated by ``file_path`` so each document appears
    once. Because at most 100 matches are fetched, documents not represented
    among those matches are not listed.

    Args:
        category_filter: Optional category name. ``None``, an empty value or
            ``"All categories"`` disables category filtering.

    Returns:
        list[dict]: One result dict per unique ``file_path`` with a fixed
        ``score`` of 1.0. An empty list is returned when the index is
        unavailable or any error occurs (errors are logged).
    """
    try:
        pc_index = get_pinecone_index()
        if not pc_index:
            logger.warning("Pinecone index not available, returning empty list")
            return []

        # Build filter if category is specified
        filter_dict = {"source": "budget_proposals"}
        if category_filter and category_filter != "All categories":
            filter_dict["category"] = category_filter

        # Query with a dummy vector to get all documents
        # Use a more realistic dummy vector (all 0.1 instead of 0.0)
        dummy_vector = [0.1] * 384  # 384 is the dimension of all-MiniLM-L6-v2
        res = pc_index.query(
            vector=dummy_vector,
            top_k=100,
            include_metadata=True,
            filter=filter_dict,
        )

        logger.info(f"Query returned {len(res['matches'])} matches")

        results = []
        seen_files = set()  # Track unique files to avoid duplicates

        for match in res["matches"]:
            metadata = match["metadata"]
            file_path = metadata.get("file_path", "")

            # Skip if we've already included this file (avoid duplicates from chunks)
            if file_path in seen_files:
                continue
            seen_files.add(file_path)

            fallback = {
                "title": metadata.get("title", "Unknown Title"),
                "summary": metadata.get("summary", ""),
                "category": metadata.get("category", "Budget Proposal"),
                "costLKR": metadata.get("costLKR", "No Costing Available"),
            }
            # DYNAMIC_METADATA takes precedence; index metadata fills any key
            # the override omits, so a partial entry cannot raise KeyError
            # and empty the whole listing.
            proposal_data = {**fallback, **DYNAMIC_METADATA.get(file_path, {})}
            thumb_url = metadata.get("thumbUrl", "")

            results.append({
                "title": proposal_data["title"],
                "summary": proposal_data["summary"],
                "costLKR": proposal_data["costLKR"],
                "category": proposal_data["category"],
                "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
                "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
                "score": 1.0,  # Default score for all proposals
                "relevance_percentage": 100,
                "file_path": file_path,
                "id": match["id"],
            })

        return results

    except Exception as e:
        # Boundary handler: log with traceback and degrade to an empty list.
        logger.exception(f"Error getting all proposals: {e}")
        return []
237
+
238
@app.route('/api/search', methods=['POST'])
def search_proposals():
    """API endpoint for searching budget proposals (POST, JSON body).

    Expected body: a JSON object with optional keys ``query`` (string),
    ``top_k`` (integer, default 10) and ``category_filter``.

    Returns:
        200 with ``query``, ``results``, ``total_results`` and
        ``category_filter``; an empty ``query`` lists all proposals.
        400 when the body is not a JSON object, ``query`` is not a string, or
        ``top_k`` is not an integer.
        500 on unexpected errors.
    """
    try:
        # silent=True returns None instead of raising on a missing or
        # malformed JSON body, so client mistakes become 400s, not 500s.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object"}), 400

        query = data.get('query', '')
        if not isinstance(query, str):
            return jsonify({"error": "'query' must be a string"}), 400
        query = query.strip()

        # Coerce top_k the same way the GET variant of this endpoint does.
        raw_top_k = data.get('top_k')
        try:
            top_k = 10 if raw_top_k is None else int(raw_top_k)
        except (TypeError, ValueError):
            return jsonify({"error": "'top_k' must be an integer"}), 400

        category_filter = data.get('category_filter')

        if not query:
            # If no query, return all proposals
            results = get_all_proposals(category_filter)
        else:
            results = semantic_search(query, top_k, category_filter)

        return jsonify({
            "query": query,
            "results": results,
            "total_results": len(results),
            "category_filter": category_filter
        })

    except Exception as e:
        logger.error(f"API error: {e}")
        return jsonify({"error": str(e)}), 500
263
+
264
@app.route('/api/search', methods=['GET'])
def search_proposals_get():
    """API endpoint for searching proposals (GET method).

    Query-string parameters: ``query`` (optional), ``top_k`` (integer,
    default 10) and ``category_filter`` (optional).

    Returns:
        200 with ``query``, ``results``, ``total_results`` and
        ``category_filter``; an empty ``query`` lists all proposals.
        400 when ``top_k`` is not an integer.
        500 on unexpected errors.
    """
    try:
        query = request.args.get('query', '').strip()

        # A malformed top_k is a client error, not a server failure.
        try:
            top_k = int(request.args.get('top_k', 10))
        except ValueError:
            return jsonify({"error": "'top_k' must be an integer"}), 400

        category_filter = request.args.get('category_filter')

        if not query:
            # If no query, return all proposals
            results = get_all_proposals(category_filter)
        else:
            results = semantic_search(query, top_k, category_filter)

        return jsonify({
            "query": query,
            "results": results,
            "total_results": len(results),
            "category_filter": category_filter
        })

    except Exception as e:
        logger.error(f"API error: {e}")
        return jsonify({"error": str(e)}), 500
288
+
289
@app.route('/api/proposals', methods=['GET'])
def get_proposals():
    """Return every budget proposal, optionally restricted to one category.

    The optional ``category_filter`` query-string parameter is forwarded to
    ``get_all_proposals``. Responds with ``results``, ``total_results`` and
    the ``category_filter`` that was applied, or a 500 with an ``error``
    message if anything raises.
    """
    try:
        selected_category = request.args.get('category_filter')
        proposals = get_all_proposals(selected_category)
        payload = {
            "results": proposals,
            "total_results": len(proposals),
            "category_filter": selected_category,
        }
        return jsonify(payload)
    except Exception as err:
        logger.error(f"API error: {err}")
        return jsonify({"error": str(err)}), 500
305
+
306
@app.route('/api/categories', methods=['GET'])
def get_categories():
    """Return the sorted list of distinct proposal categories.

    Categories are taken from ``DYNAMIC_METADATA`` first; only when that
    yields none are they collected from ``get_all_proposals()`` instead.
    Responds with ``{"categories": [...]}``, or a 500 with an ``error``
    message if anything raises.
    """
    try:
        # Primary source: the locally loaded metadata mapping.
        found = {
            entry.get("category")
            for entry in DYNAMIC_METADATA.values()
            if entry.get("category")
        }

        # Fallback source: whatever the Pinecone listing reports.
        if not found:
            found = {
                proposal.get("category")
                for proposal in get_all_proposals()
                if proposal.get("category")
            }

        return jsonify({"categories": sorted(found)})
    except Exception as err:
        logger.error(f"API error: {err}")
        return jsonify({"error": str(err)}), 500
332
+
333
@app.route('/api/health', methods=['GET'])
def health_check():
    """Report whether the service can reach its Pinecone index.

    Responds 200 with ``status: healthy`` and basic index statistics when the
    index handle is available and ``describe_index_stats()`` succeeds;
    otherwise responds 500 with ``status: unhealthy`` and a message.
    """
    try:
        pc_index = get_pinecone_index()
        if not pc_index:
            unreachable = {
                "status": "unhealthy",
                "message": "Cannot connect to Pinecone index",
            }
            return jsonify(unreachable), 500

        stats = pc_index.describe_index_stats()
        index_stats = {
            "total_vector_count": stats.total_vector_count,
            "dimension": stats.dimension,
            "index_fullness": stats.index_fullness,
        }
        healthy = {
            "status": "healthy",
            "message": "Budget proposals semantic search API is running",
            "index_stats": index_stats,
        }
        return jsonify(healthy)
    except Exception as err:
        failure = {
            "status": "unhealthy",
            "message": f"Error: {str(err)}",
        }
        return jsonify(failure), 500
359
+
360
@app.route('/api/stats', methods=['GET'])
def get_stats():
    """Return basic statistics of the Pinecone index.

    Responds with ``total_vector_count``, ``dimension`` and
    ``index_fullness`` as reported by ``describe_index_stats()``, or a 500
    with an ``error`` message when the index is unavailable or a call raises.
    """
    try:
        pc_index = get_pinecone_index()
        if pc_index:
            stats = pc_index.describe_index_stats()
            summary = {
                "total_vector_count": stats.total_vector_count,
                "dimension": stats.dimension,
                "index_fullness": stats.index_fullness,
            }
            return jsonify(summary)
        return jsonify({"error": "Cannot connect to Pinecone index"}), 500
    except Exception as err:
        return jsonify({"error": str(err)}), 500
376
+
377
@app.route('/', methods=['GET'])
def home():
    """Describe the API: name, version, available endpoints and status."""
    # Human-readable summary of each route exposed by this service.
    endpoints = {
        "POST /api/search": "Search proposals with JSON body",
        "GET /api/search?query=<search_term>": "Search proposals with query parameter",
        "GET /api/proposals": "Get all proposals",
        "GET /api/categories": "Get all categories",
        "GET /api/health": "Health check",
        "GET /api/stats": "Index statistics",
    }
    description = {
        "message": "Budget Proposals Semantic Search API",
        "version": "1.0.0",
        "endpoints": endpoints,
        "status": "running",
    }
    return jsonify(description)
393
+
394
# Start Flask's built-in server only when this file is executed directly.
if __name__ == '__main__':
    # host='0.0.0.0' listens on all network interfaces; debug mode is off.
    # NOTE(review): port 7860 is presumably the port the hosting platform
    # expects - confirm against the deployment configuration.
    app.run(debug=False, host='0.0.0.0', port=7860)