baba521 committed on
Commit b13226f · 1 Parent(s): c12304a

test chatbot

app.py CHANGED
@@ -1617,51 +1617,55 @@ def create_tab_content(tab_name, company_name):
1617
  gr.Markdown("Report Preview", elem_classes=["font-medium", "mb-3"])
1618
  # The report preview will be displayed here
1619
 
 
 
1620
  def create_chat_panel():
1621
  """创建聊天面板组件"""
1622
  with gr.Column(elem_classes=["chat-panel"]):
 
 
1623
  # Chat header
1624
- with gr.Row(elem_classes=["p-4", "border-b", "border-gray-200", "items-center", "gap-2"]):
1625
- gr.Markdown("🤖", elem_classes=["text-xl", "text-blue-600"])
1626
- gr.Markdown("Financial Assistant", elem_classes=["font-medium"])
1627
 
1628
  # Chat area
1629
- chatbot = gr.Chatbot(
1630
- value=[
1631
- {"role": "assistant", "content": "I'm your financial assistant, how can I help you today?"},
1632
-
1633
- # {"role": "assistant", "content": "Hello! I can help you analyze financial data. Ask questions like \"Show revenue trends\" or \"Compare profitability ratios\""},
1634
- # {"role": "user", "content": "Show revenue trends for last 4 quarters"},
1635
- # {"role": "assistant", "content": "Revenue trend for GlobalTech Inc.:\n\nQ4 2024: $2.53B (+8.2%)\nQ1 2025: $2.61B (+9.8%)\nQ2 2025: $2.71B (+11.6%)\nQ3 2025: $2.84B (+12.4%)"},
1636
- # {"role": "assistant", "content": "Revenue trend for GlobalTech Inc.:\n\nQ4 2024: $2.53B (+8.2%)\nQ1 2025: $2.61B (+9.8%)\nQ2 2025: $2.71B (+11.6%)\nQ3 2025: $2.84B (+12.4%)"},
1637
- # {"role": "assistant", "content": "Revenue trend for GlobalTech Inc.:\n\nQ4 2024: $2.53B (+8.2%)\nQ1 2025: $2.61B (+9.8%)\nQ2 2025: $2.71B (+11.6%)\nQ3 2025: $2.84B (+12.4%)"},
1638
- # {"role": "assistant", "content": "Revenue trend for GlobalTech Inc.:\n\nQ4 2024: $2.53B (+8.2%)\nQ1 2025: $2.61B (+9.8%)\nQ2 2025: $2.71B (+11.6%)\nQ3 2025: $2.84B (+12.4%)"}
1639
- ],
1640
- type="messages",
1641
- # elem_classes=["min-h-0", "overflow-y-auto", "space-y-4", "chat-content-box"],
1642
- show_label=False,
1643
- autoscroll=True,
1644
- show_copy_button=True,
1645
- height=400,
1646
- container=False,
1647
- )
1648
 
1649
  # Input area
1650
- with gr.Row(elem_classes=["border-t", "border-gray-200", "gap-2"]):
1651
- msg = gr.Textbox(
1652
- placeholder="Ask a financial question...",
1653
- elem_classes=["flex-1", "border", "border-gray-300", "rounded-lg", "px-4", "py-2", "focus:border-blue-500"],
1654
- show_label=False,
1655
- lines=1,
1656
- submit_btn=True,
1657
- container=False,
1658
- )
1659
- msg.submit(
1660
- chat_bot,
1661
- [msg, chatbot],
1662
- [msg, chatbot],
1663
- queue=True,
1664
- )
1665
 
1666
  # def load_css_files(css_dir, filenames):
1667
  # css_content = ""
 
1617
  gr.Markdown("Report Preview", elem_classes=["font-medium", "mb-3"])
1618
  # The report preview will be displayed here
1619
 
1620
+ from chatbot.chat_main import create_financial_chatbot
1621
+
1622
  def create_chat_panel():
1623
  """创建聊天面板组件"""
1624
  with gr.Column(elem_classes=["chat-panel"]):
1625
+ chat_component = create_financial_chatbot()
1626
+ chat_component.render()
1627
  # Chat header
1628
+ # with gr.Row(elem_classes=["p-4", "border-b", "border-gray-200", "items-center", "gap-2"]):
1629
+ # gr.Markdown("🤖", elem_classes=["text-xl", "text-blue-600"])
1630
+ # gr.Markdown("Financial Assistant", elem_classes=["font-medium"])
1631
 
1632
  # Chat area
1633
+ # chatbot = gr.Chatbot(
1634
+ # value=[
1635
+ # {"role": "assistant", "content": "I'm your financial assistant, how can I help you today?"},
1636
+
1637
+ # # {"role": "assistant", "content": "Hello! I can help you analyze financial data. Ask questions like \"Show revenue trends\" or \"Compare profitability ratios\""},
1638
+ # # {"role": "user", "content": "Show revenue trends for last 4 quarters"},
1639
+ # # {"role": "assistant", "content": "Revenue trend for GlobalTech Inc.:\n\nQ4 2024: $2.53B (+8.2%)\nQ1 2025: $2.61B (+9.8%)\nQ2 2025: $2.71B (+11.6%)\nQ3 2025: $2.84B (+12.4%)"},
1640
+ # # {"role": "assistant", "content": "Revenue trend for GlobalTech Inc.:\n\nQ4 2024: $2.53B (+8.2%)\nQ1 2025: $2.61B (+9.8%)\nQ2 2025: $2.71B (+11.6%)\nQ3 2025: $2.84B (+12.4%)"},
1641
+ # # {"role": "assistant", "content": "Revenue trend for GlobalTech Inc.:\n\nQ4 2024: $2.53B (+8.2%)\nQ1 2025: $2.61B (+9.8%)\nQ2 2025: $2.71B (+11.6%)\nQ3 2025: $2.84B (+12.4%)"},
1642
+ # # {"role": "assistant", "content": "Revenue trend for GlobalTech Inc.:\n\nQ4 2024: $2.53B (+8.2%)\nQ1 2025: $2.61B (+9.8%)\nQ2 2025: $2.71B (+11.6%)\nQ3 2025: $2.84B (+12.4%)"}
1643
+ # ],
1644
+ # type="messages",
1645
+ # # elem_classes=["min-h-0", "overflow-y-auto", "space-y-4", "chat-content-box"],
1646
+ # show_label=False,
1647
+ # autoscroll=True,
1648
+ # show_copy_button=True,
1649
+ # height=400,
1650
+ # container=False,
1651
+ # )
1652
 
1653
  # Input area
1654
+ # with gr.Row(elem_classes=["border-t", "border-gray-200", "gap-2"]):
1655
+ # msg = gr.Textbox(
1656
+ # placeholder="Ask a financial question...",
1657
+ # elem_classes=["flex-1", "border", "border-gray-300", "rounded-lg", "px-4", "py-2", "focus:border-blue-500"],
1658
+ # show_label=False,
1659
+ # lines=1,
1660
+ # submit_btn=True,
1661
+ # container=False,
1662
+ # )
1663
+ # msg.submit(
1664
+ # chat_bot,
1665
+ # [msg, chatbot],
1666
+ # [msg, chatbot],
1667
+ # queue=True,
1668
+ # )
1669
 
1670
  # def load_css_files(css_dir, filenames):
1671
  # css_content = ""
chatbot/MCP_Financial_Report/financial_mcp_server.py ADDED
@@ -0,0 +1,1151 @@
1
+ """
2
+ Financial Report MCP Server using the official MCP Python SDK
3
+
4
+ This server provides tools for downloading and processing financial reports.
5
+ """
6
+
7
+ import asyncio
8
+ import logging
9
+ import os
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import Optional, Dict, Any, List
13
+ from datetime import datetime
14
+ import aiohttp
15
+ import ssl
16
+ import pdfplumber
17
+ from bs4 import BeautifulSoup
18
+ import httpx
19
+ import json
20
+ import re
21
+ from huggingface_hub import InferenceClient
22
+
23
+ # Configure logging - write to stderr instead of stdout to avoid interfering with stdio communication
24
+ logging.basicConfig(level=logging.INFO, stream=sys.stderr)
25
+ logger = logging.getLogger(__name__)
26
+
27
+ # Import the official MCP SDK
28
+ try:
29
+ from mcp.server.fastmcp import FastMCP, Context
30
+ from mcp.server.session import ServerSession
31
+ logger.info("MCP SDK imported successfully")
32
+ except ImportError as e:
33
+ logger.error(f"Failed to import MCP SDK: {e}")
34
+ raise
35
+
36
+ # Create the MCP server
37
+ mcp = FastMCP("Financial Report MCP Server", "1.0.0")
38
+
39
+ # Ensure the financial_reports directory exists
40
+ reports_dir = Path("financial_reports")
41
+ reports_dir.mkdir(exist_ok=True)
42
+ logger.info(f"Financial reports directory: {reports_dir.absolute()}")
43
+
44
+ @mcp.tool()
45
+ async def download_financial_report(url: str) -> Dict[str, Any]:
46
+ """
47
+ Download a financial report from a URL
48
+
49
+ Args:
50
+ url: The URL of the financial report to download
51
+
52
+ Returns:
53
+ Dictionary with download information
54
+ """
55
+ logger.info(f"Downloading financial report from {url}")
56
+
57
+ try:
58
+ # Decode URL if it contains encoded characters
59
+ import urllib.parse
60
+ decoded_url = urllib.parse.unquote(url)
61
+ logger.info(f"Decoded URL: {decoded_url}")
62
+
63
+ # Re-encode the URL properly to handle spaces and other special characters
64
+ encoded_url = urllib.parse.quote(decoded_url, safe=':/?#[]@!$&\'()*+,;=%')
65
+ logger.info(f"Re-encoded URL: {encoded_url}")
66
+
67
+ # Create SSL context that doesn't verify certificates (for testing)
68
+ ssl_context = ssl.create_default_context()
69
+ ssl_context.check_hostname = False
70
+ ssl_context.verify_mode = ssl.CERT_NONE
71
+
72
+ # Add timeout and headers for better reliability
73
+ timeout = aiohttp.ClientTimeout(total=30) # 30 second timeout
74
+ headers = {
75
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
76
+ }
77
+
78
+ async with aiohttp.ClientSession(timeout=timeout) as session:
79
+ async with session.get(encoded_url, ssl=ssl_context, headers=headers) as response:
80
+ if response.status != 200:
81
+ raise Exception(f"HTTP {response.status} when downloading {encoded_url}")
82
+
83
+ # CRITICAL: Check if this is an HTML investor relations page
84
+ # If so, try to extract PDF links instead of downloading the HTML
85
+ content_type = response.headers.get('content-type', '').lower()
86
+ is_html = 'html' in content_type
87
+ is_investor_page = any(pattern in url.lower() for pattern in ['investor', 'ir.', 'press-release', 'earnings', 'financial'])
88
+
89
+ if is_html and is_investor_page:
90
+ logger.info(f"[DOWNLOAD] Detected HTML investor relations page, attempting to extract PDF links")
91
+ # Try to extract PDF links from this page
92
+ pdf_links = await extract_pdf_links_from_page(url, "")
93
+ if pdf_links:
94
+ # Found PDF link(s), download the first PDF instead
95
+ pdf_url = pdf_links[0]["url"]
96
+ logger.info(f"[DOWNLOAD] Found PDF link, redirecting download to: {pdf_url}")
97
+ # Recursively call ourselves with the PDF URL
98
+ return await download_financial_report(pdf_url)
99
+ else:
100
+ logger.warning(f"[DOWNLOAD] No PDF links found on investor page, downloading HTML anyway")
101
+
102
+ # Determine filename from decoded URL to preserve original filename
103
+ filename = decoded_url.split("/")[-1]
104
+ if not filename or "." not in filename:
105
+ if 'pdf' in content_type:
106
+ filename = f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf"
107
+ elif 'html' in content_type:
108
+ filename = f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
109
+ else:
110
+ filename = f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.dat"
111
+
112
+ # Save file
113
+ file_path = Path("financial_reports") / filename
114
+ content = await response.read()
115
+
116
+ logger.info(f"Saving report to {file_path.absolute()}")
117
+ with open(file_path, "wb") as f:
118
+ f.write(content)
119
+
120
+ logger.info(f"Successfully downloaded report to {file_path}")
121
+
122
+ return {
123
+ "filename": filename,
124
+ "filepath": str(file_path),
125
+ "size": len(content),
126
+ "download_time": datetime.now().isoformat(),
127
+ "source_url": url # CRITICAL: Include original URL for analysis context
128
+ }
129
+ except aiohttp.ClientError as e:
130
+ logger.error(f"Network error downloading financial report: {str(e)}")
131
+ raise Exception(f"Network error downloading financial report: {str(e)}. This may be due to network restrictions in the execution environment.")
132
+ except Exception as e:
133
+ logger.error(f"Error downloading financial report: {str(e)}")
134
+ raise Exception(f"Error downloading financial report: {str(e)}")
135
+
136
+ @mcp.tool()
137
+ async def list_downloaded_reports() -> Dict[str, Any]:
138
+ """
139
+ List all downloaded financial reports
140
+
141
+ Returns:
142
+ Dictionary with list of reports
143
+ """
144
+ try:
145
+ reports = []
146
+ download_dir = Path("financial_reports")
147
+ if download_dir.exists():
148
+ for file_path in download_dir.iterdir():
149
+ if file_path.is_file():
150
+ stat = file_path.stat()
151
+ # Import urllib.parse here to avoid undefined name error
152
+ import urllib.parse
153
+ reports.append({
154
+ "filename": file_path.name,
155
+ "filepath": str(file_path),
156
+ "size": stat.st_size,
157
+ "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
158
+ "encoded_filename": urllib.parse.quote(file_path.name, safe=':/?#[]@!$&\'()*+,;=%')
159
+ })
160
+
161
+ return {
162
+ "reports": reports
163
+ }
164
+ except Exception as e:
165
+ logger.error(f"Error listing downloaded reports: {str(e)}")
166
+ raise Exception(f"Error listing downloaded reports: {str(e)}")
167
+
168
+
169
+ @mcp.tool()
170
+ async def analyze_financial_report_file(filename: str, source_url: str = "") -> Dict[str, Any]:
171
+ """
172
+ Analyze a downloaded financial report file and provide investment insights
173
+
174
+ Args:
175
+ filename: Name of the financial report file to analyze
176
+ source_url: Optional original URL where the report was downloaded from
177
+
178
+ Returns:
179
+ Dictionary with analysis results and investment insights
180
+ """
181
+ logger.info(f"Analyzing financial report file: {filename}")
182
+ if source_url:
183
+ logger.info(f"Source URL: {source_url}")
184
+
185
+ try:
186
+ # CRITICAL: If filename is empty, auto-detect the most recently downloaded file
187
+ if not filename or filename.strip() == "":
188
+ logger.info("[AUTO-DETECT] No filename provided, looking for most recent downloaded file")
189
+ reports_dir = Path("financial_reports")
190
+ if reports_dir.exists():
191
+ # Get all files in the directory
192
+ files = [(f, f.stat().st_mtime) for f in reports_dir.iterdir() if f.is_file()]
193
+ if files:
194
+ # Sort by modification time (most recent first)
195
+ files.sort(key=lambda x: x[1], reverse=True)
196
+ filename = files[0][0].name
197
+ logger.info(f"[AUTO-DETECT] Found most recent file: {filename}")
198
+ else:
199
+ raise Exception("No filename provided and no downloaded files found in financial_reports directory")
200
+ else:
201
+ raise Exception("No filename provided and financial_reports directory does not exist")
202
+
203
+ # Use absolute path to ensure correct file access in different environments
204
+ reports_dir = Path("financial_reports").absolute()
205
+ file_path = reports_dir / filename
206
+
207
+ if not file_path.exists():
208
+ # Also check with relative path as fallback
209
+ relative_path = Path("financial_reports") / filename
210
+ if relative_path.exists():
211
+ file_path = relative_path
212
+ else:
213
+ raise Exception(f"File not found: {filename}. Searched in {reports_dir} and relative path {relative_path}")
214
+
215
+ # Handle PDF files properly
216
+ file_content = ""
217
+ if filename.lower().endswith('.pdf'):
218
+ try:
219
+ import pdfplumber
220
+ with pdfplumber.open(file_path) as pdf:
221
+ text = ""
222
+ # Extract text from first few pages to avoid overwhelming the model
223
+ pages_to_extract = min(10, len(pdf.pages)) # Limit to first 10 pages
224
+ for i in range(pages_to_extract):
225
+ page = pdf.pages[i]
226
+ text += page.extract_text() or ""
227
+ file_content = text
228
+ except Exception as e:
229
+ # If PDF extraction fails, return error message
230
+ logger.error(f"Error extracting text from PDF {filename}: {str(e)}")
231
+ file_content = f"Error extracting text from PDF {filename}: {str(e)}"
232
+ else:
233
+ # For text-based files, read normally
234
+ with open(file_path, "r", encoding="utf-8") as f:
235
+ file_content = f.read()
236
+
237
+ # CRITICAL: If this is HTML content and we have source_url, extract clean text instead
238
+ is_html = (
239
+ filename.lower().endswith('.html') or
240
+ '<html' in file_content.lower()[:500] or
241
+ '<!doctype html' in file_content.lower()[:500] or
242
+ '<meta' in file_content.lower()[:500]
243
+ )
244
+
245
+ if is_html and source_url:
246
+ logger.info(f"[HTML EXTRACTION] Detected HTML content, extracting text from source URL: {source_url}")
247
+ try:
248
+ from bs4 import BeautifulSoup
249
+
250
+ # Re-fetch the page to get full content (not truncated)
251
+ async with httpx.AsyncClient(timeout=30.0) as client:
252
+ response = await client.get(source_url, headers={
253
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
254
+ })
255
+ response.raise_for_status()
256
+
257
+ # Parse HTML and extract text
258
+ soup = BeautifulSoup(response.text, 'html.parser')
259
+
260
+ # Remove script, style, nav, header, footer
261
+ for element in soup(["script", "style", "nav", "header", "footer", "noscript"]):
262
+ element.decompose()
263
+
264
+ # Get text
265
+ text = soup.get_text(separator='\n', strip=True)
266
+
267
+ # Clean up whitespace
268
+ lines = [line.strip() for line in text.splitlines() if line.strip()]
269
+ clean_text = '\n'.join(lines)
270
+
271
+ if clean_text:
272
+ file_content = clean_text
273
+ logger.info(f"[HTML EXTRACTION] Successfully extracted {len(file_content)} characters of clean text")
274
+ else:
275
+ logger.warning(f"[HTML EXTRACTION] No text extracted, using original HTML")
276
+
277
+ except Exception as e:
278
+ logger.error(f"[HTML EXTRACTION] Failed to extract text: {str(e)}")
279
+ logger.info(f"[HTML EXTRACTION] Falling back to original HTML content")
280
+ # Keep using the original HTML file_content
281
+
282
+ # Truncate content if too long for the model
283
+ if len(file_content) > 15000:
284
+ file_content = file_content[:15000] + "... (truncated)"
285
+
286
+ # Return file analysis trigger with content for the main app to process
287
+ # This allows app.py to do streaming analysis which is better for UX
288
+ result = {
289
+ "type": "file_analysis_trigger",
290
+ "file_path": str(file_path),
291
+ "filename": filename,
292
+ "content": file_content, # Include full content for analysis
293
+ "content_preview": file_content[:500] + "... (preview truncated)" if len(file_content) > 500 else file_content
294
+ }
295
+
296
+ # CRITICAL: Include source URL if available for analysis context
297
+ if source_url:
298
+ result["source_url"] = source_url
299
+ logger.info(f"Including source URL in analysis result: {source_url}")
300
+
301
+ return result
302
+ except Exception as e:
303
+ logger.error(f"Error analyzing financial report file {filename}: {str(e)}")
304
+ raise Exception(f"Error analyzing financial report file {filename}: {str(e)}")
305
+
306
+
307
+ # New tool for searching financial reports online
308
+ @mcp.tool()
309
+ async def search_and_extract_financial_report(user_query: str) -> Dict[str, Any]:
310
+ """
311
+ Search for financial reports online based on user's query and return raw search results for Agent analysis
312
+
313
+ Args:
314
+ user_query: The user's complete search query
315
+
316
+ Returns:
317
+ Dictionary with raw search results for Agent analysis
318
+ """
319
+
320
+ search_base_url = 'https://www.googleapis.com/customsearch/v1'
321
+
322
+ params = {
323
+ "key": "AIzaSyARhFllOKRdpHjij5idJZ-vXa-0fdIQqGI",
324
+ "cx": "51d2770bb9e304626",
325
+ "q": user_query
326
+ }
327
+
328
+ logger.info(f"Searching for financial reports with query: {user_query}")
329
+
330
+ try:
331
+ async with httpx.AsyncClient() as client:
332
+ response = await client.get(search_base_url, params=params)
333
+ response.raise_for_status()
334
+ search_results = response.json()
335
+
336
+ # Check if we have search results
337
+ if "items" in search_results and search_results["items"]:
338
+ # Return search results with proper structure
339
+ return {
340
+ "type": "search_results",
341
+ "results": search_results["items"],
342
+ "message": f"Successfully found {len(search_results['items'])} search results for query: {user_query}"
343
+ }
344
+ else:
345
+ # No results found
346
+ return {
347
+ "type": "search_no_results",
348
+ "message": f"No financial reports found for query: {user_query}",
349
+ "suggestion": "Please provide a direct URL (or PDF format URL) for the financial report you're looking for."
350
+ }
351
+ except httpx.RequestError as e:
352
+ logger.error(f"Error performing web search: {str(e)}")
353
+ return {
354
+ "type": "search_error",
355
+ "error": str(e),
356
+ "message": f"Exception while searching for financial reports with query '{user_query}': {str(e)}",
357
+ "suggestion": "Please ask user to provide a direct URL (or PDF format URL) for the financial report due to search error."
358
+ }
359
+
360
+
361
+ @mcp.tool()
362
+ def rank_pdf_links_by_relevance(pdf_links: List[Dict[str, str]], user_request: str) -> List[Dict[str, str]]:
363
+ """
364
+ Rank PDF links by relevance to user request
365
+
366
+ Args:
367
+ pdf_links: List of PDF links to rank
368
+ user_request: User's specific request
369
+
370
+ Returns:
371
+ Ranked list of PDF links
372
+ """
373
+ # Convert user request to lowercase for case-insensitive matching
374
+ user_request_lower = user_request.lower()
375
+
376
+ # Score each PDF link based on relevance using dynamic token matching
377
+ scored_links = []
378
+ for link in pdf_links:
379
+ title = link.get("title", "").lower()
380
+ snippet = link.get("snippet", "").lower()
381
+
382
+ score = 0
383
+
384
+ # Dynamic keyword matching - extract tokens from user request and compare
385
+ request_tokens = set(user_request_lower.split())
386
+ title_tokens = set(title.split())
387
+ snippet_tokens = set(snippet.split())
388
+
389
+ # Calculate token overlap
390
+ title_overlap = len(request_tokens & title_tokens)
391
+ snippet_overlap = len(request_tokens & snippet_tokens)
392
+
393
+ if title_overlap > 0:
394
+ score += title_overlap * 2 # Each matching word in title = +2 points
395
+ if snippet_overlap > 0:
396
+ score += snippet_overlap # Each matching word in snippet = +1 point
397
+
398
+ # Prefer more recent reports - dynamically check for year patterns
399
+ import re
400
+ year_matches = re.findall(r'\b(?:19|20)\d{2}\b', user_request_lower)  # non-capturing group so full four-digit years are returned
401
+ for year in year_matches:
402
+ if year in title or year in snippet:
403
+ score += 1
404
+
405
+ # Check for "recent" indicators dynamically
406
+ recent_indicators = ['最近', 'recent', 'latest', 'newest']
407
+ if any(indicator in user_request_lower for indicator in recent_indicators):
408
+ # Prefer links with recent years in title
409
+ current_year = datetime.now().year
410
+ for i in range(3): # Check for current year and 2 previous years
411
+ year_str = str(current_year - i)
412
+ if year_str in title or year_str in snippet:
413
+ score += (3 - i) # Higher score for more recent years
414
+
415
+ scored_links.append((score, link))
416
+
417
+ # Sort by score (descending)
418
+ scored_links.sort(key=lambda x: x[0], reverse=True)
419
+
420
+ # Return links without scores
421
+ return [link for score, link in scored_links]
422
+
423
+
424
+ async def extract_pdf_links_from_page(url: str, user_request: str = "") -> List[Dict[str, str]]:
425
+ """
426
+ Extract PDF links from a financial report index page and rank them based on user request
427
+
428
+ Args:
429
+ url: URL of the index page to parse
430
+ user_request: User's specific request for filtering relevant PDFs
431
+
432
+ Returns:
433
+ List of dictionaries containing PDF link information, sorted by relevance
434
+ """
435
+ logger.info(f"Extracting PDF links from page: {url}")
436
+
437
+ try:
438
+ # Create SSL context that doesn't verify certificates (for testing)
439
+ ssl_context = ssl.create_default_context()
440
+ ssl_context.check_hostname = False
441
+ ssl_context.verify_mode = ssl.CERT_NONE
442
+
443
+ # Add timeout and headers for better reliability
444
+ timeout = aiohttp.ClientTimeout(total=30) # 30 second timeout
445
+ headers = {
446
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
447
+ }
448
+
449
+ async with aiohttp.ClientSession(timeout=timeout) as session:
450
+ async with session.get(url, ssl=ssl_context, headers=headers) as response:
451
+ if response.status != 200:
452
+ logger.warning(f"HTTP {response.status} when fetching {url}")
453
+ return []
454
+
455
+ content = await response.text()
456
+ soup = BeautifulSoup(content, 'html.parser')
457
+
458
+ pdf_links = []
459
+
460
+ # Look for PDF links in the page
461
+ for link_elem in soup.find_all('a', href=True):
462
+ href = link_elem['href']
463
+ title = link_elem.get_text(strip=True)
464
+
465
+ # Check if this is a PDF link
466
+ if href.lower().endswith('.pdf'):
467
+ # Make absolute URL if needed
468
+ if href.startswith('//'):
469
+ href = 'https:' + href
470
+ elif href.startswith('/'):
471
+ # Construct absolute URL from base URL
472
+ from urllib.parse import urljoin
473
+ href = urljoin(url, href)
474
+ elif not href.startswith('http'):
475
+ # Relative URL, construct absolute URL
476
+ from urllib.parse import urljoin
477
+ href = urljoin(url, href)
478
+
479
+ pdf_links.append({
480
+ "url": href,
481
+ "title": title or "PDF Report",
482
+ "snippet": f"PDF document: {title}"
483
+ })
484
+
485
+ # Also look for links with potential PDF indicators in text
486
+ # Use dynamic matching instead of hardcoded keywords
487
+ for link_elem in soup.find_all('a', href=True):
488
+ href = link_elem['href']
489
+ title = link_elem.get_text(strip=True)
490
+ title_lower = title.lower()
491
+
492
+ # Dynamic check: if link text contains PDF-related terms from user request
493
+ # or common report indicators, consider it
494
+ request_tokens = set(user_request.lower().split()) if user_request else set()
495
+ title_tokens = set(title_lower.split())
496
+
497
+ # Check for overlap with user request OR common PDF indicators
498
+ has_request_match = len(request_tokens & title_tokens) > 0 if request_tokens else False
499
+ has_pdf_indicator = 'pdf' in title_lower or '.pdf' in href.lower()
500
+
501
+ if has_request_match or has_pdf_indicator:
502
+ # Make absolute URL if needed
503
+ if href.startswith('//'):
504
+ href = 'https:' + href
505
+ elif href.startswith('/'):
506
+ # Construct absolute URL from base URL
507
+ from urllib.parse import urljoin
508
+ href = urljoin(url, href)
509
+ elif not href.startswith('http'):
510
+ # Relative URL, construct absolute URL
511
+ from urllib.parse import urljoin
512
+ href = urljoin(url, href)
513
+
514
+ # If it's a PDF link, add it
515
+ if href.lower().endswith('.pdf'):
516
+ pdf_links.append({
517
+ "url": href,
518
+ "title": title or "PDF Report",
519
+ "snippet": f"PDF document: {title}"
520
+ })
521
+
522
+ # Rank PDF links based on user request
523
+ if user_request:
524
+ ranked_links = rank_pdf_links_by_relevance(pdf_links, user_request)
525
+ else:
526
+ ranked_links = pdf_links
527
+
528
+ logger.info(f"Found {len(ranked_links)} PDF links on page {url}")
529
+ return ranked_links
530
+ except Exception as e:
531
+ logger.error(f"Error extracting PDF links from {url}: {str(e)}")
532
+ return []
533
+
534
+
535
+ @mcp.tool()
536
+ async def deep_analyze_and_extract_download_link(search_results: List[Dict[str, Any]], user_request: str) -> Dict[str, Any]:
537
+ """
538
+ Deep analyze search results using LLM and extract the most relevant download link based on user request
539
+
540
+ Args:
541
+ search_results: List of search results from search_and_extract_financial_report
542
+ user_request: The user's specific request
543
+
544
+ Returns:
545
+ Dictionary with the most relevant download link and related information
546
+ """
547
+ logger.info(f"Deep analyzing search results for user request: {user_request}")
548
+
549
+ # CRITICAL: Detect if user is requesting MULTIPLE quarters/reports
550
+ # Use dynamic regex pattern matching instead of hardcoding quarter names
551
+ user_request_lower = user_request.lower()
552
+
553
+ # Detect quarter requests dynamically using regex
554
+ quarters_requested = []
555
+
556
+ # Pattern 1: Q1, Q2, Q3, Q4 (case insensitive)
557
+ import re
558
+ q_pattern = re.findall(r'\bq([1-4])\b', user_request_lower)
559
+ for q_num in q_pattern:
560
+ quarter_key = f'q{q_num}'
561
+ if quarter_key not in quarters_requested:
562
+ quarters_requested.append(quarter_key)
563
+
564
+ # Pattern 2: "first", "second", "third", "fourth" + "quarter"
565
+ quarter_words = {
566
+ 'first': 'q1',
567
+ 'second': 'q2',
568
+ 'third': 'q3',
569
+ 'fourth': 'q4',
570
+ '1st': 'q1',
571
+ '2nd': 'q2',
572
+ '3rd': 'q3',
573
+ '4th': 'q4'
574
+ }
575
+
576
+ for word, q_key in quarter_words.items():
577
+ if word in user_request_lower and 'quarter' in user_request_lower:
578
+ if q_key not in quarters_requested:
579
+ quarters_requested.append(q_key)
580
+
581
+ is_multiple_quarter_request = len(quarters_requested) > 1
582
+ logger.info(f"[MULTI-QUARTER DETECTION] Quarters requested: {quarters_requested}, is_multiple: {is_multiple_quarter_request}")
583
+
584
+ try:
585
+ # Convert search results to a more readable format for LLM analysis
586
+ formatted_results = []
587
+ for i, result in enumerate(search_results[:10]): # Limit to top 10 results
588
+ formatted_results.append({
589
+ "index": i,
590
+ "title": result.get("title", ""),
591
+ "link": result.get("link", ""),
592
+ "snippet": result.get("snippet", "")
593
+ })
594
+
595
+ # Create prompt for LLM to analyze search results
596
+ prompt = f"""
597
+ You are a financial report analysis expert. Your task is to analyze search results and identify the most relevant download link for a user's specific request.
598
+
599
+ User Request: {user_request}
600
+
601
+ Search Results:
602
+ {json.dumps(formatted_results, indent=2)}
603
+
604
+ Please analyze these search results and identify the most relevant financial report that matches the user's request. Consider factors such as:
605
+ 1. **CRITICAL: Prefer direct PDF download links (.pdf URLs) over web pages** - Users want downloadable files, not landing pages
606
+ 2. Relevance to the user's specific request (company name, report type, quarter/year, etc.)
607
+ 3. Source credibility (official company websites, SEC.gov, etc.)
608
+ 4. Match the exact period requested (e.g., if user asks for Q1 2025, prioritize Q1 2025 reports over annual reports)
609
+ 5. Avoid generic index pages or landing pages - look for specific report PDFs
610
+
611
+ Priority Rules:
612
+ - Direct PDF link for the exact period requested = HIGHEST PRIORITY
613
+ - Direct PDF link for a related period = HIGH PRIORITY
614
+ - Web page or landing page = LOW PRIORITY (only if no PDF available)
615
+
616
+ Respond with a JSON object in the following format:
617
+ {{
618
+ "selected_index": 0,
619
+ "reasoning": "Explanation of why this result was selected",
620
+ "confidence": "high|medium|low"
621
+ }}
622
+
623
+ If none of the results are relevant, respond with:
624
+ {{
625
+ "selected_index": -1,
626
+ "reasoning": "Explanation of why no results are relevant",
627
+ "confidence": "low"
628
+ }}
629
+ """
630
+
631
+ # Call LLM for analysis
632
+ try:
633
+ import sys
634
+ import os
635
+ print(f"[LLM-DEBUG] About to initialize InferenceClient...", file=sys.stderr)
636
+
637
+ # Get token from environment
638
+ hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
639
+ if hf_token:
640
+ print(f"[LLM-DEBUG] Found HUGGING_FACE_HUB_TOKEN (length: {len(hf_token)})", file=sys.stderr)
641
+ else:
642
+ print(f"[LLM-DEBUG] WARNING: No token found", file=sys.stderr)
643
+
644
+ # Initialize the Hugging Face Inference Client with explicit endpoint
645
+ from huggingface_hub import InferenceClient
646
+ client = InferenceClient(
647
+ token=hf_token,
648
+ base_url="https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct"
649
+ )
650
+ print(f"[LLM-DEBUG] InferenceClient initialized successfully", file=sys.stderr)
651
+
652
+
653
+ messages = [
654
+ {"role": "system", "content": "You are a precise JSON generator that helps analyze financial report search results. You are also helpful in guiding users to find the most relevant financial reports. You should ONLY generate valid JSON responses in the specified format."},
655
+ {"role": "user", "content": prompt}
656
+ ]
657
+
658
+ # Get response from LLM
659
+ response = client.chat.completions.create(
660
+ model="Qwen/Qwen2.5-72B-Instruct",
661
+ messages=messages,
662
+ max_tokens=500,
663
+ temperature=0.3,
664
+ )
665
+
666
+ # Extract the JSON response
667
+ if hasattr(response, 'choices') and len(response.choices) > 0:
668
+ content = response.choices[0].message.content if hasattr(response.choices[0].message, 'content') else str(response.choices[0].message)
669
+ else:
670
+ content = str(response)
671
+
672
+ # Try to parse as JSON
673
+ try:
674
+ # Extract JSON from the response if it's wrapped in other text
675
+ json_match = re.search(r'\{.*\}', content, re.DOTALL)
676
+ if json_match:
677
+ json_str = json_match.group(0)
678
+ llm_result = json.loads(json_str)
679
+
680
+ # Extract the selected index
681
+ selected_index = llm_result.get("selected_index", -1)
682
+ reasoning = llm_result.get("reasoning", "No reasoning provided")
683
+ confidence = llm_result.get("confidence", "low")
684
+
685
+ # If a valid index was selected, return that result
686
+ if 0 <= selected_index < len(formatted_results):
687
+ selected_result = formatted_results[selected_index]
688
+ original_result = search_results[selected_index]
689
+
690
+ # CRITICAL: If LLM selected a non-PDF link, try to extract PDF from the page first
691
+ link = selected_result["link"]
692
+ if not link.lower().endswith(".pdf"):
693
+ # Check if it looks like an investor relations page
694
+ if "investor" in link or "ir." in link or "press-release" in link or "earnings" in link:
695
+ logger.info(f"[LLM-SELECTED] Non-PDF link detected, attempting to extract PDF from page: {link}")
696
+ pdf_links = await extract_pdf_links_from_page(link, user_request)
697
+ if pdf_links:
698
+ # Return the first PDF link found
699
+ pdf_link = pdf_links[0]
700
+ logger.info(f"[LLM-SELECTED] Successfully extracted PDF: {pdf_link.get('title', 'PDF Report')}")
701
+ return {
702
+ "type": "download_link_extracted",
703
+ "title": f"{selected_result['title']} - {pdf_link.get('title', 'PDF Report')}",
704
+ "link": pdf_link["url"],
705
+ "snippet": pdf_link.get("snippet", selected_result["snippet"]),
706
+ "message": f"Found the most relevant financial report for your request: {pdf_link.get('title', 'PDF Report')}",
707
+ "confidence": confidence,
708
+ "reasoning": f"{reasoning}. Extracted PDF link from the selected page."
709
+ }
710
+ else:
711
+ logger.warning(f"[LLM-SELECTED] No PDF links found on page: {link}")
712
+
713
+ return {
714
+ "type": "download_link_extracted",
715
+ "title": selected_result["title"],
716
+ "link": selected_result["link"],
717
+ "snippet": selected_result["snippet"],
718
+ "message": f"Found the most relevant financial report for your request: {selected_result['title']}",
719
+ "confidence": confidence,
720
+ "reasoning": reasoning
721
+ }
722
+ elif selected_index == -1:
723
+ # No relevant results found
724
+ if search_results:
725
+ first_result = search_results[0]
726
+ return {
727
+ "type": "download_link_extracted",
728
+ "title": first_result.get("title", ""),
729
+ "link": first_result.get("link", ""),
730
+ "snippet": first_result.get("snippet", ""),
731
+ "message": "Found a potential financial report, but it may not exactly match your request.",
732
+ "confidence": "low",
733
+ "reasoning": reasoning
734
+ }
735
+ else:
736
+ return {
737
+ "type": "no_results",
738
+ "message": "No search results available to analyze.",
739
+ "suggestion": "Please try a different search or provide a direct URL.",
740
+ "reasoning": "No search results were provided for analysis."
741
+ }
742
+ else:
743
+ # Invalid index, fall back to heuristic-based selection
744
+ raise ValueError("Invalid selected_index from LLM response")
745
+ else:
746
+ # If no JSON found, fall back to heuristic-based selection
747
+ raise ValueError("No valid JSON found in LLM response")
748
+ except (json.JSONDecodeError, ValueError) as e:
749
+ # If JSON parsing fails, fall back to heuristic-based selection
750
+ logger.warning(f"LLM response parsing failed, falling back to heuristic analysis: {str(e)}")
751
+ pass
752
+ except Exception as llm_error:
753
+ # If LLM call fails, fall back to heuristic-based selection
754
+ logger.warning(f"LLM call failed, falling back to heuristic analysis: {str(llm_error)}")
755
+ pass
756
+
757
+ # Fallback: Simple heuristic-based selection
758
+ logger.info("Using heuristic-based selection as fallback")
759
+ best_match_index = -1
760
+ best_score = -1
761
+
762
+ user_request_lower = user_request.lower()
763
+
764
+ # CRITICAL: Dynamically extract company names from search results
765
+ # Strategy: Identify unique domains/companies that appear in results
766
+ # The company mentioned in MOST results is likely the requested company
767
+ company_mentions = {} # {company_identifier: count}
768
+ domain_to_company = {} # {domain: company_name}
769
+
770
+ # First pass: Learn which companies appear in the search results
771
+ for result in formatted_results:
772
+ title = result.get("title", "").lower()
773
+ link = result.get("link", "").lower()
774
+
775
+ # Extract domain
776
+ domain_match = re.search(r'https?://(?:www\.)?([^/]+)', link)
777
+ if domain_match:
778
+ domain = domain_match.group(1)
779
+
780
+ # Extract company identifier from domain dynamically
781
+ # Strategy: Use the main part of domain as company key
782
+ # e.g., "intc.com" -> "intc", "aboutamazon.com" -> "aboutamazon", "ir.tesla.com" -> "tesla"
783
+
784
+ # Remove common prefixes/suffixes
785
+ domain_parts = domain.replace('www.', '').replace('ir.', '').replace('investor.', '').replace('investors.', '')
786
+
787
+ # Get the core domain name (before .com/.net/etc)
788
+ core_domain = domain_parts.split('.')[0]
789
+
790
+ # Use core domain as company identifier
791
+ company_key = core_domain
792
+
793
+ # Track company mentions
794
+ company_mentions[company_key] = company_mentions.get(company_key, 0) + 1
795
+ domain_to_company[domain] = company_key
796
+
797
+ # Determine the PRIMARY requested company (most mentioned in results)
798
+ primary_company = None
799
+ if company_mentions:
800
+ primary_company = max(company_mentions.items(), key=lambda x: x[1])[0]
801
+ logger.info(f"[COMPANY DETECTION] Detected primary company: '{primary_company}' (mentioned in {company_mentions[primary_company]} results)")
802
+ logger.info(f"[COMPANY DETECTION] All companies found: {company_mentions}")
803
+
804
+ for i, result in enumerate(formatted_results):
805
+ title = result.get("title", "").lower()
806
+ snippet = result.get("snippet", "").lower()
807
+ link = result.get("link", "")
808
+
809
+ # Get original result for metadata access
810
+ original_result = search_results[i] if i < len(search_results) else {}
811
+
812
+ # Calculate relevance score
813
+ score = 0
814
+
815
+ # CRITICAL #1: Company matching (HIGHEST PRIORITY)
816
+ # If we detected a primary company from search results, prioritize results from that company
817
+ if primary_company:
818
+ # Extract domain from this result
819
+ domain_match = re.search(r'https?://(?:www\.)?([^/]+)', link)
820
+ if domain_match:
821
+ result_domain = domain_match.group(1)
822
+ result_company = domain_to_company.get(result_domain, None)
823
+
824
+ if result_company == primary_company:
825
+ # This result is from the primary company!
826
+ score += 30 # HUGE bonus for matching primary company
827
+ logger.info(f"[SCORE] Result {i} from primary company '{primary_company}' (domain: {result_domain}) - score +30")
828
+ elif result_company and result_company != primary_company:
829
+ # This result is from a DIFFERENT company
830
+ score -= 100 # MASSIVE penalty for wrong company
831
+ logger.info(f"[SCORE] Result {i} from WRONG company '{result_company}' (expected '{primary_company}') - score -100")
832
+
833
+ # CRITICAL #2: Heavily prefer direct PDF files
834
+ # Check both URL extension AND mime type metadata
835
+ is_pdf = False
836
+ if link.lower().endswith(".pdf"):
837
+ is_pdf = True
838
+ score += 10 # Base PDF score
839
+
840
+ # BONUS: Check for explicit PDF metadata (mime type and fileFormat)
841
+ if original_result.get("mime") == "application/pdf" or original_result.get("fileFormat") == "PDF/Adobe Acrobat":
842
+ is_pdf = True
843
+ score += 12 # Even higher score for confirmed PDFs with metadata!
844
+ logger.info(f"[SCORE] Result {i} has PDF metadata (mime/fileFormat) - score +12")
845
+
846
+ # Check for keywords/patterns matching between user request and result
847
+ # Extract key terms from user request dynamically
848
+ request_tokens = set(user_request_lower.split())
849
+ title_tokens = set(title.split())
850
+ snippet_tokens = set(snippet.split())
851
+
852
+ # Calculate token overlap (how many words match)
853
+ title_overlap = len(request_tokens & title_tokens)
854
+ snippet_overlap = len(request_tokens & snippet_tokens)
855
+
856
+ # Bonus for word matches
857
+ if title_overlap > 0:
858
+ score += title_overlap * 2 # Each matching word in title = +2 points
859
+ logger.info(f"[SCORE] Result {i} has {title_overlap} matching words in title - score +{title_overlap * 2}")
860
+
861
+ if snippet_overlap > 0:
862
+ score += snippet_overlap # Each matching word in snippet = +1 point
863
+ logger.info(f"[SCORE] Result {i} has {snippet_overlap} matching words in snippet - score +{snippet_overlap}")
864
+
865
+ # Check for year patterns in user request and result
866
+ year_patterns = re.findall(r'\b(?:19|20)\d{2}\b', user_request_lower)  # non-capturing group so full four-digit years are returned
867
+ for year in year_patterns:
868
+ if year in title or year in snippet or year in link:
869
+ score += 2
870
+ logger.info(f"[SCORE] Result {i} matches year '{year}' - score +2")
871
+
872
+ # Penalize landing/index pages if they're NOT PDFs
873
+ # Dynamic check: look for common index page patterns in URL
874
+ if not is_pdf:
875
+ # Check if URL looks like an index/landing page (contains common patterns)
876
+ index_patterns = ['results', 'default', 'index', 'overview', 'main', 'performance']
877
+ if any(pattern in link for pattern in index_patterns):
878
+ score -= 5 # Heavy penalty for index pages
879
+ logger.info(f"[SCORE] Result {i} is an index/landing page - score -5")
880
+
881
+ # Prefer press-release pages over performance/overview pages
882
+ if 'press-release' in link or 'press_release' in link or 'webcast' in link:
883
+ score += 8 # Bonus for press release pages (likely to have PDFs)
884
+ logger.info(f"[SCORE] Result {i} is a press-release page - score +8")
885
+
886
+ # Prefer official sources (but only if it's a PDF)
887
+ # Dynamic check: look for credible domain indicators
888
+ if is_pdf:
889
+ credible_indicators = ['.gov', 'investor', 'ir.', 'cdn']
890
+ if any(indicator in link for indicator in credible_indicators):
891
+ score += 2
892
+
893
+ # Update best match if this score is higher
894
+ if score > best_score:
895
+ best_score = score
896
+ best_match_index = i
897
+
898
+ # SPECIAL HANDLING: If user requested multiple quarters, return multiple links
899
+ if is_multiple_quarter_request and len(quarters_requested) > 1:
900
+ logger.info(f"[MULTI-QUARTER] User requested {len(quarters_requested)} quarters, returning multiple links")
901
+
902
+ # Group results by quarter using dynamic scoring
903
+ quarter_results = {q: [] for q in quarters_requested}
904
+
905
+ for i, result in enumerate(formatted_results):
906
+ title = result.get("title", "").lower()
907
+ snippet = result.get("snippet", "").lower()
908
+ link = result.get("link", "")
909
+
910
+ # Get original result for metadata access
911
+ original_result = search_results[i] if i < len(search_results) else {}
912
+
913
+ # CRITICAL: Check if this is a PDF link
914
+ is_pdf = link.lower().endswith('.pdf')
915
+
916
+ # Also check PDF metadata
917
+ if original_result.get("mime") == "application/pdf" or original_result.get("fileFormat") == "PDF/Adobe Acrobat":
918
+ is_pdf = True
919
+
920
+ # Calculate relevance score for each quarter dynamically
921
+ # This avoids hardcoding patterns
922
+ quarter_scores = {}
923
+ for quarter in quarters_requested:
924
+ score = 0
925
+
926
+ # PRIORITY #1: Company matching (if we detected primary company)
927
+ if primary_company:
928
+ domain_match = re.search(r'https?://(?:www\.)?([^/]+)', link)
929
+ if domain_match:
930
+ result_domain = domain_match.group(1)
931
+ result_company = domain_to_company.get(result_domain, None)
932
+
933
+ if result_company == primary_company:
934
+ score += 30 # HUGE bonus for matching primary company
935
+ elif result_company and result_company != primary_company:
936
+ score -= 100 # MASSIVE penalty for wrong company
937
+
938
+ # PRIORITY #2: HUGE bonus for PDF files - we want direct download links!
939
+ if is_pdf:
940
+ score += 20 # PDF links get massive priority
941
+
942
+ # PRIORITY #3: Check if quarter appears in title/snippet/link
943
+ if quarter in title or quarter in snippet or quarter in link.lower():
944
+ score += 10
945
+
946
+ # Also check for numeric representation (e.g., "1" for q1)
947
+ quarter_num = quarter[1] # Extract '1' from 'q1'
948
+ if f"q{quarter_num}" in title or f"q{quarter_num}" in snippet or f"q{quarter_num}" in link.lower():
949
+ score += 5
950
+
951
+ # Penalize index/landing pages
952
+ if not is_pdf:
953
+ index_indicators = ['default.aspx', 'investor-relations', '/overview/', 'index']
954
+ if any(indicator in link.lower() for indicator in index_indicators):
955
+ score -= 15 # Heavy penalty for index pages
956
+
957
+ quarter_scores[quarter] = score
958
+
959
+ # Assign to the quarter with highest score (if score > 0)
960
+ if quarter_scores:
961
+ best_quarter = max(quarter_scores.items(), key=lambda x: x[1])
962
+ if best_quarter[1] > 0: # Only assign if score > 0
963
+ quarter_results[best_quarter[0]].append({
964
+ "index": i,
965
+ "title": result.get("title", ""),
966
+ "link": link,
967
+ "snippet": result.get("snippet", ""),
968
+ "score": best_quarter[1],
969
+ "is_pdf": is_pdf
970
+ })
971
+
972
+ # Select best result for each requested quarter
973
+ selected_links = []
974
+ for quarter in quarters_requested:
975
+ if quarter_results[quarter]:
976
+ # Sort by score and get the best result (PDF links will rank highest)
977
+ sorted_results = sorted(quarter_results[quarter], key=lambda x: x.get("score", 0), reverse=True)
978
+ best_for_quarter = sorted_results[0]
979
+ selected_links.append({
980
+ "quarter": quarter.upper(),
981
+ "title": best_for_quarter["title"],
982
+ "link": best_for_quarter["link"],
983
+ "snippet": best_for_quarter["snippet"]
984
+ })
985
+ is_pdf_marker = "[PDF]" if best_for_quarter.get("is_pdf", False) else "[Web Page]"
986
+ logger.info(f"[MULTI-QUARTER] Found result for {quarter.upper()}: {is_pdf_marker} {best_for_quarter['title'][:50]} (score: {best_for_quarter['score']})")
987
+ else:
988
+ logger.warning(f"[MULTI-QUARTER] No result found for {quarter.upper()}")
989
+
990
+ if selected_links:
991
+ return {
992
+ "type": "multiple_download_links",
993
+ "links": selected_links,
994
+ "message": f"Found {len(selected_links)} financial reports for the requested quarters: {', '.join([q.upper() for q in quarters_requested])}",
995
+ "confidence": "high" if len(selected_links) == len(quarters_requested) else "medium",
996
+ "reasoning": f"Selected best result for each requested quarter. Found {len(selected_links)} out of {len(quarters_requested)} quarters."
997
+ }
998
+
999
+ # If we found a reasonable match (score > 0), return it
1000
+ if best_match_index >= 0 and best_score > 0:
1001
+ selected_result = formatted_results[best_match_index]
1002
+ original_result = search_results[best_match_index]
1003
+
1004
+ # Check if the link is an index page that needs further parsing
1005
+ link = selected_result["link"]
1006
+ if not link.lower().endswith(".pdf") and ("investor" in link or "ir." in link or "financial-report" in link):
1007
+ # Try to extract PDF links from the index page
1008
+ pdf_links = await extract_pdf_links_from_page(link, user_request)
1009
+ if pdf_links:
1010
+ # For requests asking for multiple reports (like "2份" or "two"), return multiple links
1011
+ if "2份" in user_request.lower() or "two" in user_request.lower() or "2" in user_request.lower():
1012
+ # Return up to 2 most relevant PDF links
1013
+ relevant_links = pdf_links[:2]
1014
+ return {
1015
+ "type": "download_links_extracted",
1016
+ "links": relevant_links,
1017
+ "message": f"Found {len(relevant_links)} most relevant financial reports for your request",
1018
+ "confidence": "high" if best_score >= 5 else ("medium" if best_score >= 2 else "low"),
1019
+ "reasoning": f"Selected based on relevance scoring (score: {best_score}) and extracted {len(relevant_links)} PDF links from index page."
1020
+ }
1021
+ else:
1022
+ # Return the first PDF link found
1023
+ pdf_link = pdf_links[0]
1024
+ return {
1025
+ "type": "download_link_extracted",
1026
+ "title": f"{selected_result['title']} - {pdf_link.get('title', 'PDF Report')}",
1027
+ "link": pdf_link["url"],
1028
+ "snippet": pdf_link.get("snippet", selected_result["snippet"]),
1029
+ "message": f"Found the most relevant financial report for your request: {pdf_link.get('title', 'PDF Report')}",
1030
+ "confidence": "high" if best_score >= 5 else ("medium" if best_score >= 2 else "low"),
1031
+ "reasoning": f"Selected based on relevance scoring (score: {best_score}) and extracted PDF link from index page."
1032
+ }
1033
+
1034
+ return {
1035
+ "type": "download_link_extracted",
1036
+ "title": selected_result["title"],
1037
+ "link": selected_result["link"],
1038
+ "snippet": selected_result["snippet"],
1039
+ "message": f"Found the most relevant financial report for your request: {selected_result['title']}",
1040
+ "confidence": "high" if best_score >= 5 else ("medium" if best_score >= 2 else "low"),
1041
+ "reasoning": f"Selected based on relevance scoring (score: {best_score}). This result matches key terms in your request."
1042
+ }
1043
+ else:
1044
+ # If no clearly relevant results, return the first result with low confidence
1045
+ if search_results:
1046
+ first_result = search_results[0]
1047
+ link = first_result.get("link", "")
1048
+
1049
+ # Check if the link is an index page that needs further parsing
1050
+ if not link.lower().endswith(".pdf") and ("investor" in link or "ir." in link or "financial-report" in link):
1051
+ # Try to extract PDF links from the index page
1052
+ pdf_links = await extract_pdf_links_from_page(link, user_request)
1053
+ if pdf_links:
1054
+ # For requests asking for multiple reports (like "2份" or "two"), return multiple links
1055
+ if "2份" in user_request.lower() or "two" in user_request.lower() or "2" in user_request.lower():
1056
+ # Return up to 2 most relevant PDF links
1057
+ relevant_links = pdf_links[:2]
1058
+ return {
1059
+ "type": "download_links_extracted",
1060
+ "links": relevant_links,
1061
+ "message": f"Found {len(relevant_links)} most relevant financial reports for your request",
1062
+ "confidence": "low",
1063
+ "reasoning": f"Extracted {len(relevant_links)} PDF links from index page. No highly relevant results found using keyword matching."
1064
+ }
1065
+ else:
1066
+ # Return the first PDF link found
1067
+ pdf_link = pdf_links[0]
1068
+ return {
1069
+ "type": "download_link_extracted",
1070
+ "title": pdf_link.get("title", f"{first_result.get('title', 'Financial Report')} - PDF"),
1071
+ "link": pdf_link["url"],
1072
+ "snippet": pdf_link.get("snippet", first_result.get("snippet", "")),
1073
+ "message": f"Found a potential financial report: {pdf_link.get('title', 'PDF Report')}",
1074
+ "confidence": "low",
1075
+ "reasoning": "Extracted PDF link from index page. No highly relevant results found using keyword matching."
1076
+ }
1077
+
1078
+ return {
1079
+ "type": "download_link_extracted",
1080
+ "title": first_result.get("title", ""),
1081
+ "link": first_result.get("link", ""),
1082
+ "snippet": first_result.get("snippet", ""),
1083
+ "message": "Found a potential financial report, but it may not exactly match your request.",
1084
+ "confidence": "low",
1085
+ "reasoning": "No highly relevant results found using keyword matching."
1086
+ }
1087
+ else:
1088
+ return {
1089
+ "type": "no_results",
1090
+ "message": "No search results available to analyze.",
1091
+ "suggestion": "Please try a different search or provide a direct URL.",
1092
+ "reasoning": "No search results were provided for analysis."
1093
+ }
1094
+ except Exception as e:
1095
+ logger.error(f"Error in deep analysis: {str(e)}")
1096
+ return {
1097
+ "type": "analysis_error",
1098
+ "error": str(e),
1099
+ "message": f"Error occurred while analyzing search results: {str(e)}",
1100
+ "suggestion": "Please try again or provide a direct URL for the financial report."
1101
+ }
1102
+
1103
+ # Resource for accessing extracted financial report content
1104
+ @mcp.resource("financial-report://{filename}")
1105
+ def get_financial_report_content(filename: str) -> str:
1106
+ """
1107
+ Get the content of an extracted financial report
1108
+
1109
+ Args:
1110
+ filename: Name of the extracted file
1111
+
1112
+ Returns:
1113
+ Content of the financial report
1114
+ """
1115
+ # Use absolute path to ensure correct file access in different environments
1116
+ reports_dir = Path("financial_reports").absolute()
1117
+ file_path = reports_dir / filename
1118
+
1119
+ if not file_path.exists():
1120
+ # Also check with relative path as fallback
1121
+ relative_path = Path("financial_reports") / filename
1122
+ if relative_path.exists():
1123
+ file_path = relative_path
1124
+ else:
1125
+ raise Exception(f"File not found: {filename}. Searched in {reports_dir} and relative path {relative_path}")
1126
+
1127
+ # Handle PDF files properly
1128
+ if filename.lower().endswith('.pdf'):
1129
+ try:
1130
+ import pdfplumber
1131
+ with pdfplumber.open(file_path) as pdf:
1132
+ text = ""
1133
+ for page in pdf.pages:
1134
+ text += page.extract_text() or ""
1135
+ return text
1136
+ except Exception as e:
1137
+ # If PDF extraction fails, return error message
1138
+ logger.error(f"Error extracting text from PDF {filename}: {str(e)}")
1139
+ return f"Error extracting text from PDF {filename}: {str(e)}"
1140
+ else:
1141
+ # For text-based files, read normally
1142
+ with open(file_path, "r", encoding="utf-8") as f:
1143
+ return f.read()
1144
+
1145
+ if __name__ == "__main__":
1146
+ # Run the server with stdio transport
1147
+ # Note: We should avoid printing to stdout here as it interferes with stdio communication
1148
+ # Log to stderr instead
1149
+ import sys
1150
+ print("MCP SDK imported successfully", file=sys.stderr)
1151
+ mcp.run(transport="stdio")
chatbot/chat_main.py ADDED
The diff for this file is too large to render. See raw diff
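
chatbot/chat_main.py itself is not rendered, so only the tools it can call are visible in this commit. Purely as an illustration of how those tools chain together (search, pick a link, download, analyze), a handler might look like the sketch below; handle_report_request and the payload helper are hypothetical names, and unpacking call_tool results as JSON text is an assumption about how FastMCP serializes the dicts returned above.

import json

async def handle_report_request(session, user_query: str):
    # Illustrative chaining of the MCP tools above; not the real chat_main.py.
    def payload(result):
        # Assumes structured tool results arrive as JSON text content.
        return json.loads(result.content[0].text)

    search = payload(await session.call_tool(
        "search_and_extract_financial_report", {"user_query": user_query}))
    if search["type"] != "search_results":
        return search.get("message", "No reports found.")

    choice = payload(await session.call_tool(
        "deep_analyze_and_extract_download_link",
        {"search_results": search["results"], "user_request": user_query}))
    if "link" not in choice:
        return choice.get("message", "Could not select a download link.")

    download = payload(await session.call_tool(
        "download_financial_report", {"url": choice["link"]}))
    return payload(await session.call_tool(
        "analyze_financial_report_file",
        {"filename": download["filename"], "source_url": download["source_url"]}))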