mtyrrell committed on
Commit
5e7d579
·
1 Parent(s): 8f0a9cd

sequential citation numbering fix

Browse files
Files changed (2) hide show
  1. .gitignore +3 -1
  2. utils/generator.py +39 -1
.gitignore CHANGED
@@ -1,2 +1,4 @@
1
  .DS_Store
2
- .env
 
 
 
1
  .DS_Store
2
+ .envapp.log
3
+ __pycache__/
4
+ .env
utils/generator.py CHANGED
@@ -90,7 +90,7 @@ def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: Lis
90
  logger.info(f"Processing citation [{citation_num}] -> source_index: {source_index}")
91
 
92
  if 0 <= source_index < len(processed_results):
93
- source = processed_results[source_index]
94
  cited_sources.append(source)
95
  logger.info(f"✓ Added source {citation_num}: filename='{source.get('filename', 'Unknown')}', page='{source.get('page', 'Unknown')}'")
96
  else:
@@ -249,6 +249,32 @@ async def _call_llm_streaming(messages: list) -> AsyncGenerator[str, None]:
249
  # ---------------------------------------------------------------------
250
  # Main Generation Functions
251
  # ---------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui_format: bool = False) -> Union[str, Dict[str, Any]]:
253
  """Generate an answer to a query using provided context through RAG"""
254
  if not query.strip():
@@ -270,6 +296,11 @@ async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui
270
  if processed_results:
271
  cited_numbers = _parse_citations(answer)
272
  cited_sources = _extract_sources(processed_results, cited_numbers)
 
 
 
 
 
273
  result["sources"] = _create_sources_list(cited_sources)
274
  return result
275
  else:
@@ -312,6 +343,13 @@ async def generate_streaming(query: str, context: Union[str, List[Dict[str, Any]
312
  if chatui_format and processed_results:
313
  cited_numbers = _parse_citations(cleaned_response)
314
  cited_sources = _extract_sources(processed_results, cited_numbers)
 
 
 
 
 
 
 
315
  sources = _create_sources_list(cited_sources)
316
  yield {"event": "sources", "data": {"sources": sources}}
317
 
 
90
  logger.info(f"Processing citation [{citation_num}] -> source_index: {source_index}")
91
 
92
  if 0 <= source_index < len(processed_results):
93
+ source = processed_results[source_index].copy() # Make a copy to avoid modifying original
94
  cited_sources.append(source)
95
  logger.info(f"✓ Added source {citation_num}: filename='{source.get('filename', 'Unknown')}', page='{source.get('page', 'Unknown')}'")
96
  else:
 
249
  # ---------------------------------------------------------------------
250
  # Main Generation Functions
251
  # ---------------------------------------------------------------------
252
+ def _renumber_citations(response: str, cited_numbers: List[int]) -> str:
253
+ """
254
+ Renumber citations in response to be sequential to match ChatUI display
255
+ We do this because otherwise when not all sources are cited, we would get not sequential source listing which is weird for the user
256
+ """
257
+ if not cited_numbers:
258
+ return response
259
+
260
+ # Create mapping from original citation numbers to sequential numbers
261
+ citation_mapping = {str(original): str(i+1) for i, original in enumerate(cited_numbers)}
262
+
263
+ logger.info(f"=== CITATION RENUMBERING DEBUG ===")
264
+ logger.info(f"Original citation numbers: {cited_numbers}")
265
+ logger.info(f"Citation mapping: {citation_mapping}")
266
+
267
+ # Replace citations in response text
268
+ updated_response = response
269
+ for original, sequential in citation_mapping.items():
270
+ # Replace [original] with [sequential]
271
+ pattern = rf'\[{re.escape(original)}\]'
272
+ replacement = f'[{sequential}]'
273
+ updated_response = re.sub(pattern, replacement, updated_response)
274
+ logger.info(f"Replacing [{original}] with [{sequential}]")
275
+
276
+ return updated_response
277
+
278
  async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui_format: bool = False) -> Union[str, Dict[str, Any]]:
279
  """Generate an answer to a query using provided context through RAG"""
280
  if not query.strip():
 
296
  if processed_results:
297
  cited_numbers = _parse_citations(answer)
298
  cited_sources = _extract_sources(processed_results, cited_numbers)
299
+
300
+ # Renumber citations to match sequential source numbering
301
+ answer = _renumber_citations(answer, cited_numbers)
302
+ result["answer"] = answer
303
+
304
  result["sources"] = _create_sources_list(cited_sources)
305
  return result
306
  else:
 
343
  if chatui_format and processed_results:
344
  cited_numbers = _parse_citations(cleaned_response)
345
  cited_sources = _extract_sources(processed_results, cited_numbers)
346
+
347
+ # Renumber citations to match sequential source numbering
348
+ final_response = _renumber_citations(cleaned_response, cited_numbers)
349
+
350
+ # Send the renumbered response as a correction
351
+ yield {"event": "correction", "data": final_response}
352
+
353
  sources = _create_sources_list(cited_sources)
354
  yield {"event": "sources", "data": {"sources": sources}}
355