Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
sequential citation numbering fix
Browse files- .gitignore +3 -1
- utils/generator.py +39 -1
.gitignore
CHANGED
|
@@ -1,2 +1,4 @@
|
|
| 1 |
.DS_Store
|
| 2 |
-
.
|
|
|
|
|
|
|
|
|
| 1 |
.DS_Store
|
| 2 |
+
.envapp.log
|
| 3 |
+
__pycache__/
|
| 4 |
+
.env
|
utils/generator.py
CHANGED
|
@@ -90,7 +90,7 @@ def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: Lis
|
|
| 90 |
logger.info(f"Processing citation [{citation_num}] -> source_index: {source_index}")
|
| 91 |
|
| 92 |
if 0 <= source_index < len(processed_results):
|
| 93 |
-
source = processed_results[source_index]
|
| 94 |
cited_sources.append(source)
|
| 95 |
logger.info(f"✓ Added source {citation_num}: filename='{source.get('filename', 'Unknown')}', page='{source.get('page', 'Unknown')}'")
|
| 96 |
else:
|
|
@@ -249,6 +249,32 @@ async def _call_llm_streaming(messages: list) -> AsyncGenerator[str, None]:
|
|
| 249 |
# ---------------------------------------------------------------------
|
| 250 |
# Main Generation Functions
|
| 251 |
# ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui_format: bool = False) -> Union[str, Dict[str, Any]]:
|
| 253 |
"""Generate an answer to a query using provided context through RAG"""
|
| 254 |
if not query.strip():
|
|
@@ -270,6 +296,11 @@ async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui
|
|
| 270 |
if processed_results:
|
| 271 |
cited_numbers = _parse_citations(answer)
|
| 272 |
cited_sources = _extract_sources(processed_results, cited_numbers)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
result["sources"] = _create_sources_list(cited_sources)
|
| 274 |
return result
|
| 275 |
else:
|
|
@@ -312,6 +343,13 @@ async def generate_streaming(query: str, context: Union[str, List[Dict[str, Any]
|
|
| 312 |
if chatui_format and processed_results:
|
| 313 |
cited_numbers = _parse_citations(cleaned_response)
|
| 314 |
cited_sources = _extract_sources(processed_results, cited_numbers)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
sources = _create_sources_list(cited_sources)
|
| 316 |
yield {"event": "sources", "data": {"sources": sources}}
|
| 317 |
|
|
|
|
| 90 |
logger.info(f"Processing citation [{citation_num}] -> source_index: {source_index}")
|
| 91 |
|
| 92 |
if 0 <= source_index < len(processed_results):
|
| 93 |
+
source = processed_results[source_index].copy() # Make a copy to avoid modifying original
|
| 94 |
cited_sources.append(source)
|
| 95 |
logger.info(f"✓ Added source {citation_num}: filename='{source.get('filename', 'Unknown')}', page='{source.get('page', 'Unknown')}'")
|
| 96 |
else:
|
|
|
|
| 249 |
# ---------------------------------------------------------------------
|
| 250 |
# Main Generation Functions
|
| 251 |
# ---------------------------------------------------------------------
|
| 252 |
+
def _renumber_citations(response: str, cited_numbers: List[int]) -> str:
|
| 253 |
+
"""
|
| 254 |
+
Renumber citations in response to be sequential to match ChatUI display
|
| 255 |
+
We do this because otherwise when not all sources are cited, we would get not sequential source listing which is weird for the user
|
| 256 |
+
"""
|
| 257 |
+
if not cited_numbers:
|
| 258 |
+
return response
|
| 259 |
+
|
| 260 |
+
# Create mapping from original citation numbers to sequential numbers
|
| 261 |
+
citation_mapping = {str(original): str(i+1) for i, original in enumerate(cited_numbers)}
|
| 262 |
+
|
| 263 |
+
logger.info(f"=== CITATION RENUMBERING DEBUG ===")
|
| 264 |
+
logger.info(f"Original citation numbers: {cited_numbers}")
|
| 265 |
+
logger.info(f"Citation mapping: {citation_mapping}")
|
| 266 |
+
|
| 267 |
+
# Replace citations in response text
|
| 268 |
+
updated_response = response
|
| 269 |
+
for original, sequential in citation_mapping.items():
|
| 270 |
+
# Replace [original] with [sequential]
|
| 271 |
+
pattern = rf'\[{re.escape(original)}\]'
|
| 272 |
+
replacement = f'[{sequential}]'
|
| 273 |
+
updated_response = re.sub(pattern, replacement, updated_response)
|
| 274 |
+
logger.info(f"Replacing [{original}] with [{sequential}]")
|
| 275 |
+
|
| 276 |
+
return updated_response
|
| 277 |
+
|
| 278 |
async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui_format: bool = False) -> Union[str, Dict[str, Any]]:
|
| 279 |
"""Generate an answer to a query using provided context through RAG"""
|
| 280 |
if not query.strip():
|
|
|
|
| 296 |
if processed_results:
|
| 297 |
cited_numbers = _parse_citations(answer)
|
| 298 |
cited_sources = _extract_sources(processed_results, cited_numbers)
|
| 299 |
+
|
| 300 |
+
# Renumber citations to match sequential source numbering
|
| 301 |
+
answer = _renumber_citations(answer, cited_numbers)
|
| 302 |
+
result["answer"] = answer
|
| 303 |
+
|
| 304 |
result["sources"] = _create_sources_list(cited_sources)
|
| 305 |
return result
|
| 306 |
else:
|
|
|
|
| 343 |
if chatui_format and processed_results:
|
| 344 |
cited_numbers = _parse_citations(cleaned_response)
|
| 345 |
cited_sources = _extract_sources(processed_results, cited_numbers)
|
| 346 |
+
|
| 347 |
+
# Renumber citations to match sequential source numbering
|
| 348 |
+
final_response = _renumber_citations(cleaned_response, cited_numbers)
|
| 349 |
+
|
| 350 |
+
# Send the renumbered response as a correction
|
| 351 |
+
yield {"event": "correction", "data": final_response}
|
| 352 |
+
|
| 353 |
sources = _create_sources_list(cited_sources)
|
| 354 |
yield {"event": "sources", "data": {"sources": sources}}
|
| 355 |
|