test chatbot
- app.py +41 -37
- chatbot/MCP_Financial_Report/financial_mcp_server.py +1151 -0
- chatbot/chat_main.py +0 -0
app.py
CHANGED
@@ -1617,51 +1617,55 @@ def create_tab_content(tab_name, company_name):
         gr.Markdown("Report Preview", elem_classes=["font-medium", "mb-3"])
         # The report preview will be shown here

+from chatbot.chat_main import create_financial_chatbot
+
 def create_chat_panel():
     """Create the chat panel component"""
     with gr.Column(elem_classes=["chat-panel"]):
+        chat_component = create_financial_chatbot()
+        chat_component.render()
         # Chat header
-        with gr.Row(elem_classes=["p-4", "border-b", "border-gray-200", "items-center", "gap-2"]):
-            ...
+        # with gr.Row(elem_classes=["p-4", "border-b", "border-gray-200", "items-center", "gap-2"]):
+        #     gr.Markdown("🤖", elem_classes=["text-xl", "text-blue-600"])
+        #     gr.Markdown("Financial Assistant", elem_classes=["font-medium"])

         # Chat area
-        chatbot = gr.Chatbot(
-            ...
-        )
+        # chatbot = gr.Chatbot(
+        #     value=[
+        #         {"role": "assistant", "content": "I'm your financial assistant, how can I help you today?"},
+
+        #         # {"role": "assistant", "content": "Hello! I can help you analyze financial data. Ask questions like \"Show revenue trends\" or \"Compare profitability ratios\""},
+        #         # {"role": "user", "content": "Show revenue trends for last 4 quarters"},
+        #         # {"role": "assistant", "content": "Revenue trend for GlobalTech Inc.:\n\nQ4 2024: $2.53B (+8.2%)\nQ1 2025: $2.61B (+9.8%)\nQ2 2025: $2.71B (+11.6%)\nQ3 2025: $2.84B (+12.4%)"},
+        #         # {"role": "assistant", "content": "Revenue trend for GlobalTech Inc.:\n\nQ4 2024: $2.53B (+8.2%)\nQ1 2025: $2.61B (+9.8%)\nQ2 2025: $2.71B (+11.6%)\nQ3 2025: $2.84B (+12.4%)"},
+        #         # {"role": "assistant", "content": "Revenue trend for GlobalTech Inc.:\n\nQ4 2024: $2.53B (+8.2%)\nQ1 2025: $2.61B (+9.8%)\nQ2 2025: $2.71B (+11.6%)\nQ3 2025: $2.84B (+12.4%)"},
+        #         # {"role": "assistant", "content": "Revenue trend for GlobalTech Inc.:\n\nQ4 2024: $2.53B (+8.2%)\nQ1 2025: $2.61B (+9.8%)\nQ2 2025: $2.71B (+11.6%)\nQ3 2025: $2.84B (+12.4%)"}
+        #     ],
+        #     type="messages",
+        #     # elem_classes=["min-h-0", "overflow-y-auto", "space-y-4", "chat-content-box"],
+        #     show_label=False,
+        #     autoscroll=True,
+        #     show_copy_button=True,
+        #     height=400,
+        #     container=False,
+        # )

         # Input area
-        with gr.Row(elem_classes=["border-t", "border-gray-200", "gap-2"]):
-            ...
+        # with gr.Row(elem_classes=["border-t", "border-gray-200", "gap-2"]):
+        #     msg = gr.Textbox(
+        #         placeholder="Ask a financial question...",
+        #         elem_classes=["flex-1", "border", "border-gray-300", "rounded-lg", "px-4", "py-2", "focus:border-blue-500"],
+        #         show_label=False,
+        #         lines=1,
+        #         submit_btn=True,
+        #         container=False,
+        #     )
+        #     msg.submit(
+        #         chat_bot,
+        #         [msg, chatbot],
+        #         [msg, chatbot],
+        #         queue=True,
+        #     )

 # def load_css_files(css_dir, filenames):
 #     css_content = ""
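create_chat_panel now delegates the whole panel to create_financial_chatbot() from chatbot/chat_main.py, which this commit touches but whose contents are not shown (+0 -0). A minimal sketch of what such a factory could look like, assuming it builds an unrendered gr.Blocks that app.py later embeds with .render(); only the function name comes from the diff, everything else is illustrative:

import gradio as gr

def create_financial_chatbot() -> gr.Blocks:
    """Hypothetical factory: build the chat UI without rendering it yet."""
    with gr.Blocks() as chat_block:
        chatbot = gr.Chatbot(
            value=[{"role": "assistant", "content": "I'm your financial assistant, how can I help you today?"}],
            type="messages",
            show_label=False,
            height=400,
        )
        msg = gr.Textbox(placeholder="Ask a financial question...", show_label=False, lines=1)

        def respond(message, history):
            # Placeholder logic; the real app would route the question to the MCP tools / LLM.
            history = history + [
                {"role": "user", "content": message},
                {"role": "assistant", "content": f"(stub) You asked: {message}"},
            ]
            return "", history

        msg.submit(respond, [msg, chatbot], [msg, chatbot])
    return chat_block

Returning an unrendered Blocks keeps the chat UI self-contained in chat_main.py while letting app.py decide where it appears in the layout.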
chatbot/MCP_Financial_Report/financial_mcp_server.py
ADDED
@@ -0,0 +1,1151 @@
| 1 |
+
"""
|
| 2 |
+
Financial Report MCP Server using the official MCP Python SDK
|
| 3 |
+
|
| 4 |
+
This server provides tools for downloading and processing financial reports.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import asyncio
|
| 8 |
+
import logging
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Optional, Dict, Any, List
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
import aiohttp
|
| 15 |
+
import ssl
|
| 16 |
+
import pdfplumber
|
| 17 |
+
from bs4 import BeautifulSoup
|
| 18 |
+
import httpx
|
| 19 |
+
import json
|
| 20 |
+
import re
|
| 21 |
+
from huggingface_hub import InferenceClient
|
| 22 |
+
|
| 23 |
+
# Configure logging - write to stderr instead of stdout to avoid interfering with stdio communication
|
| 24 |
+
logging.basicConfig(level=logging.INFO, stream=sys.stderr)
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
# Import the official MCP SDK
|
| 28 |
+
try:
|
| 29 |
+
from mcp.server.fastmcp import FastMCP, Context
|
| 30 |
+
from mcp.server.session import ServerSession
|
| 31 |
+
logger.info("MCP SDK imported successfully")
|
| 32 |
+
except ImportError as e:
|
| 33 |
+
logger.error(f"Failed to import MCP SDK: {e}")
|
| 34 |
+
raise
|
| 35 |
+
|
| 36 |
+
# Create the MCP server
|
| 37 |
+
mcp = FastMCP("Financial Report MCP Server", "1.0.0")
|
| 38 |
+
|
| 39 |
+
# Ensure the financial_reports directory exists
|
| 40 |
+
reports_dir = Path("financial_reports")
|
| 41 |
+
reports_dir.mkdir(exist_ok=True)
|
| 42 |
+
logger.info(f"Financial reports directory: {reports_dir.absolute()}")
|
| 43 |
+
|
| 44 |
+
@mcp.tool()
|
| 45 |
+
async def download_financial_report(url: str) -> Dict[str, Any]:
|
| 46 |
+
"""
|
| 47 |
+
Download a financial report from a URL
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
url: The URL of the financial report to download
|
| 51 |
+
|
| 52 |
+
Returns:
|
| 53 |
+
Dictionary with download information
|
| 54 |
+
"""
|
| 55 |
+
logger.info(f"Downloading financial report from {url}")
|
| 56 |
+
|
| 57 |
+
try:
|
| 58 |
+
# Decode URL if it contains encoded characters
|
| 59 |
+
import urllib.parse
|
| 60 |
+
decoded_url = urllib.parse.unquote(url)
|
| 61 |
+
logger.info(f"Decoded URL: {decoded_url}")
|
| 62 |
+
|
| 63 |
+
# Re-encode the URL properly to handle spaces and other special characters
|
| 64 |
+
encoded_url = urllib.parse.quote(decoded_url, safe=':/?#[]@!$&\'()*+,;=%')
|
| 65 |
+
logger.info(f"Re-encoded URL: {encoded_url}")
|
| 66 |
+
|
| 67 |
+
# Create SSL context that doesn't verify certificates (for testing)
|
| 68 |
+
ssl_context = ssl.create_default_context()
|
| 69 |
+
ssl_context.check_hostname = False
|
| 70 |
+
ssl_context.verify_mode = ssl.CERT_NONE
|
| 71 |
+
|
| 72 |
+
# Add timeout and headers for better reliability
|
| 73 |
+
timeout = aiohttp.ClientTimeout(total=30) # 30 second timeout
|
| 74 |
+
headers = {
|
| 75 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
async with aiohttp.ClientSession(timeout=timeout) as session:
|
| 79 |
+
async with session.get(encoded_url, ssl=ssl_context, headers=headers) as response:
|
| 80 |
+
if response.status != 200:
|
| 81 |
+
raise Exception(f"HTTP {response.status} when downloading {encoded_url}")
|
| 82 |
+
|
| 83 |
+
# CRITICAL: Check if this is an HTML investor relations page
|
| 84 |
+
# If so, try to extract PDF links instead of downloading the HTML
|
| 85 |
+
content_type = response.headers.get('content-type', '').lower()
|
| 86 |
+
is_html = 'html' in content_type
|
| 87 |
+
is_investor_page = any(pattern in url.lower() for pattern in ['investor', 'ir.', 'press-release', 'earnings', 'financial'])
|
| 88 |
+
|
| 89 |
+
if is_html and is_investor_page:
|
| 90 |
+
logger.info(f"[DOWNLOAD] Detected HTML investor relations page, attempting to extract PDF links")
|
| 91 |
+
# Try to extract PDF links from this page
|
| 92 |
+
pdf_links = await extract_pdf_links_from_page(url, "")
|
| 93 |
+
if pdf_links:
|
| 94 |
+
# Found PDF link(s), download the first PDF instead
|
| 95 |
+
pdf_url = pdf_links[0]["url"]
|
| 96 |
+
logger.info(f"[DOWNLOAD] Found PDF link, redirecting download to: {pdf_url}")
|
| 97 |
+
# Recursively call ourselves with the PDF URL
|
| 98 |
+
return await download_financial_report(pdf_url)
|
| 99 |
+
else:
|
| 100 |
+
logger.warning(f"[DOWNLOAD] No PDF links found on investor page, downloading HTML anyway")
|
| 101 |
+
|
| 102 |
+
# Determine filename from decoded URL to preserve original filename
|
| 103 |
+
filename = decoded_url.split("/")[-1]
|
| 104 |
+
if not filename or "." not in filename:
|
| 105 |
+
if 'pdf' in content_type:
|
| 106 |
+
filename = f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf"
|
| 107 |
+
elif 'html' in content_type:
|
| 108 |
+
filename = f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
|
| 109 |
+
else:
|
| 110 |
+
filename = f"report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.dat"
|
| 111 |
+
|
| 112 |
+
# Save file
|
| 113 |
+
file_path = Path("financial_reports") / filename
|
| 114 |
+
content = await response.read()
|
| 115 |
+
|
| 116 |
+
logger.info(f"Saving report to {file_path.absolute()}")
|
| 117 |
+
with open(file_path, "wb") as f:
|
| 118 |
+
f.write(content)
|
| 119 |
+
|
| 120 |
+
logger.info(f"Successfully downloaded report to {file_path}")
|
| 121 |
+
|
| 122 |
+
return {
|
| 123 |
+
"filename": filename,
|
| 124 |
+
"filepath": str(file_path),
|
| 125 |
+
"size": len(content),
|
| 126 |
+
"download_time": datetime.now().isoformat(),
|
| 127 |
+
"source_url": url # CRITICAL: Include original URL for analysis context
|
| 128 |
+
}
|
| 129 |
+
except aiohttp.ClientError as e:
|
| 130 |
+
logger.error(f"Network error downloading financial report: {str(e)}")
|
| 131 |
+
raise Exception(f"Network error downloading financial report: {str(e)}. This may be due to network restrictions in the execution environment.")
|
| 132 |
+
except Exception as e:
|
| 133 |
+
logger.error(f"Error downloading financial report: {str(e)}")
|
| 134 |
+
raise Exception(f"Error downloading financial report: {str(e)}")
|
| 135 |
+
|
| 136 |
+
@mcp.tool()
|
| 137 |
+
async def list_downloaded_reports() -> Dict[str, Any]:
|
| 138 |
+
"""
|
| 139 |
+
List all downloaded financial reports
|
| 140 |
+
|
| 141 |
+
Returns:
|
| 142 |
+
Dictionary with list of reports
|
| 143 |
+
"""
|
| 144 |
+
try:
|
| 145 |
+
reports = []
|
| 146 |
+
download_dir = Path("financial_reports")
|
| 147 |
+
if download_dir.exists():
|
| 148 |
+
for file_path in download_dir.iterdir():
|
| 149 |
+
if file_path.is_file():
|
| 150 |
+
stat = file_path.stat()
|
| 151 |
+
# Import urllib.parse here to avoid undefined name error
|
| 152 |
+
import urllib.parse
|
| 153 |
+
reports.append({
|
| 154 |
+
"filename": file_path.name,
|
| 155 |
+
"filepath": str(file_path),
|
| 156 |
+
"size": stat.st_size,
|
| 157 |
+
"modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
|
| 158 |
+
"encoded_filename": urllib.parse.quote(file_path.name, safe=':/?#[]@!$&\'()*+,;=%')
|
| 159 |
+
})
|
| 160 |
+
|
| 161 |
+
return {
|
| 162 |
+
"reports": reports
|
| 163 |
+
}
|
| 164 |
+
except Exception as e:
|
| 165 |
+
logger.error(f"Error listing downloaded reports: {str(e)}")
|
| 166 |
+
raise Exception(f"Error listing downloaded reports: {str(e)}")
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
@mcp.tool()
|
| 170 |
+
async def analyze_financial_report_file(filename: str, source_url: str = "") -> Dict[str, Any]:
|
| 171 |
+
"""
|
| 172 |
+
Analyze a downloaded financial report file and provide investment insights
|
| 173 |
+
|
| 174 |
+
Args:
|
| 175 |
+
filename: Name of the financial report file to analyze
|
| 176 |
+
source_url: Optional original URL where the report was downloaded from
|
| 177 |
+
|
| 178 |
+
Returns:
|
| 179 |
+
Dictionary with analysis results and investment insights
|
| 180 |
+
"""
|
| 181 |
+
logger.info(f"Analyzing financial report file: {filename}")
|
| 182 |
+
if source_url:
|
| 183 |
+
logger.info(f"Source URL: {source_url}")
|
| 184 |
+
|
| 185 |
+
try:
|
| 186 |
+
# CRITICAL: If filename is empty, auto-detect the most recently downloaded file
|
| 187 |
+
if not filename or filename.strip() == "":
|
| 188 |
+
logger.info("[AUTO-DETECT] No filename provided, looking for most recent downloaded file")
|
| 189 |
+
reports_dir = Path("financial_reports")
|
| 190 |
+
if reports_dir.exists():
|
| 191 |
+
# Get all files in the directory
|
| 192 |
+
files = [(f, f.stat().st_mtime) for f in reports_dir.iterdir() if f.is_file()]
|
| 193 |
+
if files:
|
| 194 |
+
# Sort by modification time (most recent first)
|
| 195 |
+
files.sort(key=lambda x: x[1], reverse=True)
|
| 196 |
+
filename = files[0][0].name
|
| 197 |
+
logger.info(f"[AUTO-DETECT] Found most recent file: {filename}")
|
| 198 |
+
else:
|
| 199 |
+
raise Exception("No filename provided and no downloaded files found in financial_reports directory")
|
| 200 |
+
else:
|
| 201 |
+
raise Exception("No filename provided and financial_reports directory does not exist")
|
| 202 |
+
|
| 203 |
+
# Use absolute path to ensure correct file access in different environments
|
| 204 |
+
reports_dir = Path("financial_reports").absolute()
|
| 205 |
+
file_path = reports_dir / filename
|
| 206 |
+
|
| 207 |
+
if not file_path.exists():
|
| 208 |
+
# Also check with relative path as fallback
|
| 209 |
+
relative_path = Path("financial_reports") / filename
|
| 210 |
+
if relative_path.exists():
|
| 211 |
+
file_path = relative_path
|
| 212 |
+
else:
|
| 213 |
+
raise Exception(f"File not found: {filename}. Searched in {reports_dir} and relative path {relative_path}")
|
| 214 |
+
|
| 215 |
+
# Handle PDF files properly
|
| 216 |
+
file_content = ""
|
| 217 |
+
if filename.lower().endswith('.pdf'):
|
| 218 |
+
try:
|
| 219 |
+
import pdfplumber
|
| 220 |
+
with pdfplumber.open(file_path) as pdf:
|
| 221 |
+
text = ""
|
| 222 |
+
# Extract text from first few pages to avoid overwhelming the model
|
| 223 |
+
pages_to_extract = min(10, len(pdf.pages)) # Limit to first 10 pages
|
| 224 |
+
for i in range(pages_to_extract):
|
| 225 |
+
page = pdf.pages[i]
|
| 226 |
+
text += page.extract_text() or ""
|
| 227 |
+
file_content = text
|
| 228 |
+
except Exception as e:
|
| 229 |
+
# If PDF extraction fails, return error message
|
| 230 |
+
logger.error(f"Error extracting text from PDF {filename}: {str(e)}")
|
| 231 |
+
file_content = f"Error extracting text from PDF {filename}: {str(e)}"
|
| 232 |
+
else:
|
| 233 |
+
# For text-based files, read normally
|
| 234 |
+
with open(file_path, "r", encoding="utf-8") as f:
|
| 235 |
+
file_content = f.read()
|
| 236 |
+
|
| 237 |
+
# CRITICAL: If this is HTML content and we have source_url, extract clean text instead
|
| 238 |
+
is_html = (
|
| 239 |
+
filename.lower().endswith('.html') or
|
| 240 |
+
'<html' in file_content.lower()[:500] or
|
| 241 |
+
'<!doctype html' in file_content.lower()[:500] or
|
| 242 |
+
'<meta' in file_content.lower()[:500]
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
if is_html and source_url:
|
| 246 |
+
logger.info(f"[HTML EXTRACTION] Detected HTML content, extracting text from source URL: {source_url}")
|
| 247 |
+
try:
|
| 248 |
+
from bs4 import BeautifulSoup
|
| 249 |
+
|
| 250 |
+
# Re-fetch the page to get full content (not truncated)
|
| 251 |
+
async with httpx.AsyncClient(timeout=30.0) as client:
|
| 252 |
+
response = await client.get(source_url, headers={
|
| 253 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
| 254 |
+
})
|
| 255 |
+
response.raise_for_status()
|
| 256 |
+
|
| 257 |
+
# Parse HTML and extract text
|
| 258 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 259 |
+
|
| 260 |
+
# Remove script, style, nav, header, footer
|
| 261 |
+
for element in soup(["script", "style", "nav", "header", "footer", "noscript"]):
|
| 262 |
+
element.decompose()
|
| 263 |
+
|
| 264 |
+
# Get text
|
| 265 |
+
text = soup.get_text(separator='\n', strip=True)
|
| 266 |
+
|
| 267 |
+
# Clean up whitespace
|
| 268 |
+
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
| 269 |
+
clean_text = '\n'.join(lines)
|
| 270 |
+
|
| 271 |
+
if clean_text:
|
| 272 |
+
file_content = clean_text
|
| 273 |
+
logger.info(f"[HTML EXTRACTION] Successfully extracted {len(file_content)} characters of clean text")
|
| 274 |
+
else:
|
| 275 |
+
logger.warning(f"[HTML EXTRACTION] No text extracted, using original HTML")
|
| 276 |
+
|
| 277 |
+
except Exception as e:
|
| 278 |
+
logger.error(f"[HTML EXTRACTION] Failed to extract text: {str(e)}")
|
| 279 |
+
logger.info(f"[HTML EXTRACTION] Falling back to original HTML content")
|
| 280 |
+
# Keep using the original HTML file_content
|
| 281 |
+
|
| 282 |
+
# Truncate content if too long for the model
|
| 283 |
+
if len(file_content) > 15000:
|
| 284 |
+
file_content = file_content[:15000] + "... (truncated)"
|
| 285 |
+
|
| 286 |
+
# Return file analysis trigger with content for the main app to process
|
| 287 |
+
# This allows app.py to do streaming analysis which is better for UX
|
| 288 |
+
result = {
|
| 289 |
+
"type": "file_analysis_trigger",
|
| 290 |
+
"file_path": str(file_path),
|
| 291 |
+
"filename": filename,
|
| 292 |
+
"content": file_content, # Include full content for analysis
|
| 293 |
+
"content_preview": file_content[:500] + "... (preview truncated)" if len(file_content) > 500 else file_content
|
| 294 |
+
}
|
| 295 |
+
|
| 296 |
+
# CRITICAL: Include source URL if available for analysis context
|
| 297 |
+
if source_url:
|
| 298 |
+
result["source_url"] = source_url
|
| 299 |
+
logger.info(f"Including source URL in analysis result: {source_url}")
|
| 300 |
+
|
| 301 |
+
return result
|
| 302 |
+
except Exception as e:
|
| 303 |
+
logger.error(f"Error analyzing financial report file {filename}: {str(e)}")
|
| 304 |
+
raise Exception(f"Error analyzing financial report file {filename}: {str(e)}")
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
# New tool for searching financial reports online
|
| 308 |
+
@mcp.tool()
|
| 309 |
+
async def search_and_extract_financial_report(user_query: str) -> Dict[str, Any]:
|
| 310 |
+
"""
|
| 311 |
+
Search for financial reports online based on user's query and return raw search results for Agent analysis
|
| 312 |
+
|
| 313 |
+
Args:
|
| 314 |
+
user_query: The user's complete search query
|
| 315 |
+
|
| 316 |
+
Returns:
|
| 317 |
+
Dictionary with raw search results for Agent analysis
|
| 318 |
+
"""
|
| 319 |
+
|
| 320 |
+
search_base_url = 'https://www.googleapis.com/customsearch/v1'
|
| 321 |
+
|
| 322 |
+
params = {
|
| 323 |
+
"key": "AIzaSyARhFllOKRdpHjij5idJZ-vXa-0fdIQqGI",
|
| 324 |
+
"cx": "51d2770bb9e304626",
|
| 325 |
+
"q": user_query
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
logger.info(f"Searching for financial reports with query: {user_query}")
|
| 329 |
+
|
| 330 |
+
try:
|
| 331 |
+
async with httpx.AsyncClient() as client:
|
| 332 |
+
response = await client.get(search_base_url, params=params)
|
| 333 |
+
response.raise_for_status()
|
| 334 |
+
search_results = response.json()
|
| 335 |
+
|
| 336 |
+
# Check if we have search results
|
| 337 |
+
if "items" in search_results and search_results["items"]:
|
| 338 |
+
# Return search results with proper structure
|
| 339 |
+
return {
|
| 340 |
+
"type": "search_results",
|
| 341 |
+
"results": search_results["items"],
|
| 342 |
+
"message": f"Successfully found {len(search_results['items'])} search results for query: {user_query}"
|
| 343 |
+
}
|
| 344 |
+
else:
|
| 345 |
+
# No results found
|
| 346 |
+
return {
|
| 347 |
+
"type": "search_no_results",
|
| 348 |
+
"message": f"No financial reports found for query: {user_query}",
|
| 349 |
+
"suggestion": "Please provide a direct URL (or PDF format URL) for the financial report you're looking for."
|
| 350 |
+
}
|
| 351 |
+
except httpx.RequestError as e:
|
| 352 |
+
logger.error(f"Error performing web search: {str(e)}")
|
| 353 |
+
return {
|
| 354 |
+
"type": "search_error",
|
| 355 |
+
"error": str(e),
|
| 356 |
+
"message": f"Exception while searching for financial reports with query '{user_query}': {str(e)}",
|
| 357 |
+
"suggestion": "Please ask user to provide a direct URL (or PDF format URL) for the financial report due to search error."
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
@mcp.tool()
|
| 362 |
+
def rank_pdf_links_by_relevance(pdf_links: List[Dict[str, str]], user_request: str) -> List[Dict[str, str]]:
|
| 363 |
+
"""
|
| 364 |
+
Rank PDF links by relevance to user request
|
| 365 |
+
|
| 366 |
+
Args:
|
| 367 |
+
pdf_links: List of PDF links to rank
|
| 368 |
+
user_request: User's specific request
|
| 369 |
+
|
| 370 |
+
Returns:
|
| 371 |
+
Ranked list of PDF links
|
| 372 |
+
"""
|
| 373 |
+
# Convert user request to lowercase for case-insensitive matching
|
| 374 |
+
user_request_lower = user_request.lower()
|
| 375 |
+
|
| 376 |
+
# Score each PDF link based on relevance using dynamic token matching
|
| 377 |
+
scored_links = []
|
| 378 |
+
for link in pdf_links:
|
| 379 |
+
title = link.get("title", "").lower()
|
| 380 |
+
snippet = link.get("snippet", "").lower()
|
| 381 |
+
|
| 382 |
+
score = 0
|
| 383 |
+
|
| 384 |
+
# Dynamic keyword matching - extract tokens from user request and compare
|
| 385 |
+
request_tokens = set(user_request_lower.split())
|
| 386 |
+
title_tokens = set(title.split())
|
| 387 |
+
snippet_tokens = set(snippet.split())
|
| 388 |
+
|
| 389 |
+
# Calculate token overlap
|
| 390 |
+
title_overlap = len(request_tokens & title_tokens)
|
| 391 |
+
snippet_overlap = len(request_tokens & snippet_tokens)
|
| 392 |
+
|
| 393 |
+
if title_overlap > 0:
|
| 394 |
+
score += title_overlap * 2 # Each matching word in title = +2 points
|
| 395 |
+
if snippet_overlap > 0:
|
| 396 |
+
score += snippet_overlap # Each matching word in snippet = +1 point
|
| 397 |
+
|
| 398 |
+
# Prefer more recent reports - dynamically check for year patterns
|
| 399 |
+
import re
|
| 400 |
+
year_matches = re.findall(r'\b(19|20)\d{2}\b', user_request_lower)
|
| 401 |
+
for year in year_matches:
|
| 402 |
+
if year in title or year in snippet:
|
| 403 |
+
score += 1
|
| 404 |
+
|
| 405 |
+
# Check for "recent" indicators dynamically
|
| 406 |
+
recent_indicators = ['最近', 'recent', 'latest', 'newest']
|
| 407 |
+
if any(indicator in user_request_lower for indicator in recent_indicators):
|
| 408 |
+
# Prefer links with recent years in title
|
| 409 |
+
current_year = datetime.now().year
|
| 410 |
+
for i in range(3): # Check for current year and 2 previous years
|
| 411 |
+
year_str = str(current_year - i)
|
| 412 |
+
if year_str in title or year_str in snippet:
|
| 413 |
+
score += (3 - i) # Higher score for more recent years
|
| 414 |
+
|
| 415 |
+
scored_links.append((score, link))
|
| 416 |
+
|
| 417 |
+
# Sort by score (descending)
|
| 418 |
+
scored_links.sort(key=lambda x: x[0], reverse=True)
|
| 419 |
+
|
| 420 |
+
# Return links without scores
|
| 421 |
+
return [link for score, link in scored_links]
|
| 422 |
+
|
| 423 |
+
|
| 424 |
+
async def extract_pdf_links_from_page(url: str, user_request: str = "") -> List[Dict[str, str]]:
|
| 425 |
+
"""
|
| 426 |
+
Extract PDF links from a financial report index page and rank them based on user request
|
| 427 |
+
|
| 428 |
+
Args:
|
| 429 |
+
url: URL of the index page to parse
|
| 430 |
+
user_request: User's specific request for filtering relevant PDFs
|
| 431 |
+
|
| 432 |
+
Returns:
|
| 433 |
+
List of dictionaries containing PDF link information, sorted by relevance
|
| 434 |
+
"""
|
| 435 |
+
logger.info(f"Extracting PDF links from page: {url}")
|
| 436 |
+
|
| 437 |
+
try:
|
| 438 |
+
# Create SSL context that doesn't verify certificates (for testing)
|
| 439 |
+
ssl_context = ssl.create_default_context()
|
| 440 |
+
ssl_context.check_hostname = False
|
| 441 |
+
ssl_context.verify_mode = ssl.CERT_NONE
|
| 442 |
+
|
| 443 |
+
# Add timeout and headers for better reliability
|
| 444 |
+
timeout = aiohttp.ClientTimeout(total=30) # 30 second timeout
|
| 445 |
+
headers = {
|
| 446 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
| 447 |
+
}
|
| 448 |
+
|
| 449 |
+
async with aiohttp.ClientSession(timeout=timeout) as session:
|
| 450 |
+
async with session.get(url, ssl=ssl_context, headers=headers) as response:
|
| 451 |
+
if response.status != 200:
|
| 452 |
+
logger.warning(f"HTTP {response.status} when fetching {url}")
|
| 453 |
+
return []
|
| 454 |
+
|
| 455 |
+
content = await response.text()
|
| 456 |
+
soup = BeautifulSoup(content, 'html.parser')
|
| 457 |
+
|
| 458 |
+
pdf_links = []
|
| 459 |
+
|
| 460 |
+
# Look for PDF links in the page
|
| 461 |
+
for link_elem in soup.find_all('a', href=True):
|
| 462 |
+
href = link_elem['href']
|
| 463 |
+
title = link_elem.get_text(strip=True)
|
| 464 |
+
|
| 465 |
+
# Check if this is a PDF link
|
| 466 |
+
if href.lower().endswith('.pdf'):
|
| 467 |
+
# Make absolute URL if needed
|
| 468 |
+
if href.startswith('//'):
|
| 469 |
+
href = 'https:' + href
|
| 470 |
+
elif href.startswith('/'):
|
| 471 |
+
# Construct absolute URL from base URL
|
| 472 |
+
from urllib.parse import urljoin
|
| 473 |
+
href = urljoin(url, href)
|
| 474 |
+
elif not href.startswith('http'):
|
| 475 |
+
# Relative URL, construct absolute URL
|
| 476 |
+
from urllib.parse import urljoin
|
| 477 |
+
href = urljoin(url, href)
|
| 478 |
+
|
| 479 |
+
pdf_links.append({
|
| 480 |
+
"url": href,
|
| 481 |
+
"title": title or "PDF Report",
|
| 482 |
+
"snippet": f"PDF document: {title}"
|
| 483 |
+
})
|
| 484 |
+
|
| 485 |
+
# Also look for links with potential PDF indicators in text
|
| 486 |
+
# Use dynamic matching instead of hardcoded keywords
|
| 487 |
+
for link_elem in soup.find_all('a', href=True):
|
| 488 |
+
href = link_elem['href']
|
| 489 |
+
title = link_elem.get_text(strip=True)
|
| 490 |
+
title_lower = title.lower()
|
| 491 |
+
|
| 492 |
+
# Dynamic check: if link text contains PDF-related terms from user request
|
| 493 |
+
# or common report indicators, consider it
|
| 494 |
+
request_tokens = set(user_request.lower().split()) if user_request else set()
|
| 495 |
+
title_tokens = set(title_lower.split())
|
| 496 |
+
|
| 497 |
+
# Check for overlap with user request OR common PDF indicators
|
| 498 |
+
has_request_match = len(request_tokens & title_tokens) > 0 if request_tokens else False
|
| 499 |
+
has_pdf_indicator = 'pdf' in title_lower or '.pdf' in href.lower()
|
| 500 |
+
|
| 501 |
+
if has_request_match or has_pdf_indicator:
|
| 502 |
+
# Make absolute URL if needed
|
| 503 |
+
if href.startswith('//'):
|
| 504 |
+
href = 'https:' + href
|
| 505 |
+
elif href.startswith('/'):
|
| 506 |
+
# Construct absolute URL from base URL
|
| 507 |
+
from urllib.parse import urljoin
|
| 508 |
+
href = urljoin(url, href)
|
| 509 |
+
elif not href.startswith('http'):
|
| 510 |
+
# Relative URL, construct absolute URL
|
| 511 |
+
from urllib.parse import urljoin
|
| 512 |
+
href = urljoin(url, href)
|
| 513 |
+
|
| 514 |
+
# If it's a PDF link, add it
|
| 515 |
+
if href.lower().endswith('.pdf'):
|
| 516 |
+
pdf_links.append({
|
| 517 |
+
"url": href,
|
| 518 |
+
"title": title or "PDF Report",
|
| 519 |
+
"snippet": f"PDF document: {title}"
|
| 520 |
+
})
|
| 521 |
+
|
| 522 |
+
# Rank PDF links based on user request
|
| 523 |
+
if user_request:
|
| 524 |
+
ranked_links = rank_pdf_links_by_relevance(pdf_links, user_request)
|
| 525 |
+
else:
|
| 526 |
+
ranked_links = pdf_links
|
| 527 |
+
|
| 528 |
+
logger.info(f"Found {len(ranked_links)} PDF links on page {url}")
|
| 529 |
+
return ranked_links
|
| 530 |
+
except Exception as e:
|
| 531 |
+
logger.error(f"Error extracting PDF links from {url}: {str(e)}")
|
| 532 |
+
return []
|
| 533 |
+
|
| 534 |
+
|
| 535 |
+
@mcp.tool()
|
| 536 |
+
async def deep_analyze_and_extract_download_link(search_results: List[Dict[str, Any]], user_request: str) -> Dict[str, Any]:
|
| 537 |
+
"""
|
| 538 |
+
Deep analyze search results using LLM and extract the most relevant download link based on user request
|
| 539 |
+
|
| 540 |
+
Args:
|
| 541 |
+
search_results: List of search results from search_and_extract_financial_report
|
| 542 |
+
user_request: The user's specific request
|
| 543 |
+
|
| 544 |
+
Returns:
|
| 545 |
+
Dictionary with the most relevant download link and related information
|
| 546 |
+
"""
|
| 547 |
+
logger.info(f"Deep analyzing search results for user request: {user_request}")
|
| 548 |
+
|
| 549 |
+
# CRITICAL: Detect if user is requesting MULTIPLE quarters/reports
|
| 550 |
+
# Use dynamic regex pattern matching instead of hardcoding quarter names
|
| 551 |
+
user_request_lower = user_request.lower()
|
| 552 |
+
|
| 553 |
+
# Detect quarter requests dynamically using regex
|
| 554 |
+
quarters_requested = []
|
| 555 |
+
|
| 556 |
+
# Pattern 1: Q1, Q2, Q3, Q4 (case insensitive)
|
| 557 |
+
import re
|
| 558 |
+
q_pattern = re.findall(r'\bq([1-4])\b', user_request_lower)
|
| 559 |
+
for q_num in q_pattern:
|
| 560 |
+
quarter_key = f'q{q_num}'
|
| 561 |
+
if quarter_key not in quarters_requested:
|
| 562 |
+
quarters_requested.append(quarter_key)
|
| 563 |
+
|
| 564 |
+
# Pattern 2: "first", "second", "third", "fourth" + "quarter"
|
| 565 |
+
quarter_words = {
|
| 566 |
+
'first': 'q1',
|
| 567 |
+
'second': 'q2',
|
| 568 |
+
'third': 'q3',
|
| 569 |
+
'fourth': 'q4',
|
| 570 |
+
'1st': 'q1',
|
| 571 |
+
'2nd': 'q2',
|
| 572 |
+
'3rd': 'q3',
|
| 573 |
+
'4th': 'q4'
|
| 574 |
+
}
|
| 575 |
+
|
| 576 |
+
for word, q_key in quarter_words.items():
|
| 577 |
+
if word in user_request_lower and 'quarter' in user_request_lower:
|
| 578 |
+
if q_key not in quarters_requested:
|
| 579 |
+
quarters_requested.append(q_key)
|
| 580 |
+
|
| 581 |
+
is_multiple_quarter_request = len(quarters_requested) > 1
|
| 582 |
+
logger.info(f"[MULTI-QUARTER DETECTION] Quarters requested: {quarters_requested}, is_multiple: {is_multiple_quarter_request}")
|
| 583 |
+
|
| 584 |
+
try:
|
| 585 |
+
# Convert search results to a more readable format for LLM analysis
|
| 586 |
+
formatted_results = []
|
| 587 |
+
for i, result in enumerate(search_results[:10]): # Limit to top 10 results
|
| 588 |
+
formatted_results.append({
|
| 589 |
+
"index": i,
|
| 590 |
+
"title": result.get("title", ""),
|
| 591 |
+
"link": result.get("link", ""),
|
| 592 |
+
"snippet": result.get("snippet", "")
|
| 593 |
+
})
|
| 594 |
+
|
| 595 |
+
# Create prompt for LLM to analyze search results
|
| 596 |
+
prompt = f"""
|
| 597 |
+
You are a financial report analysis expert. Your task is to analyze search results and identify the most relevant download link for a user's specific request.
|
| 598 |
+
|
| 599 |
+
User Request: {user_request}
|
| 600 |
+
|
| 601 |
+
Search Results:
|
| 602 |
+
{json.dumps(formatted_results, indent=2)}
|
| 603 |
+
|
| 604 |
+
Please analyze these search results and identify the most relevant financial report that matches the user's request. Consider factors such as:
|
| 605 |
+
1. **CRITICAL: Prefer direct PDF download links (.pdf URLs) over web pages** - Users want downloadable files, not landing pages
|
| 606 |
+
2. Relevance to the user's specific request (company name, report type, quarter/year, etc.)
|
| 607 |
+
3. Source credibility (official company websites, SEC.gov, etc.)
|
| 608 |
+
4. Match the exact period requested (e.g., if user asks for Q1 2025, prioritize Q1 2025 reports over annual reports)
|
| 609 |
+
5. Avoid generic index pages or landing pages - look for specific report PDFs
|
| 610 |
+
|
| 611 |
+
Priority Rules:
|
| 612 |
+
- Direct PDF link for the exact period requested = HIGHEST PRIORITY
|
| 613 |
+
- Direct PDF link for a related period = HIGH PRIORITY
|
| 614 |
+
- Web page or landing page = LOW PRIORITY (only if no PDF available)
|
| 615 |
+
|
| 616 |
+
Respond with a JSON object in the following format:
|
| 617 |
+
{{
|
| 618 |
+
"selected_index": 0,
|
| 619 |
+
"reasoning": "Explanation of why this result was selected",
|
| 620 |
+
"confidence": "high|medium|low"
|
| 621 |
+
}}
|
| 622 |
+
|
| 623 |
+
If none of the results are relevant, respond with:
|
| 624 |
+
{{
|
| 625 |
+
"selected_index": -1,
|
| 626 |
+
"reasoning": "Explanation of why no results are relevant",
|
| 627 |
+
"confidence": "low"
|
| 628 |
+
}}
|
| 629 |
+
"""
|
| 630 |
+
|
| 631 |
+
# Call LLM for analysis
|
| 632 |
+
try:
|
| 633 |
+
import sys
|
| 634 |
+
import os
|
| 635 |
+
print(f"[LLM-DEBUG] About to initialize InferenceClient...", file=sys.stderr)
|
| 636 |
+
|
| 637 |
+
# Get token from environment
|
| 638 |
+
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
|
| 639 |
+
if hf_token:
|
| 640 |
+
print(f"[LLM-DEBUG] Found HUGGING_FACE_HUB_TOKEN (length: {len(hf_token)})", file=sys.stderr)
|
| 641 |
+
else:
|
| 642 |
+
print(f"[LLM-DEBUG] WARNING: No token found", file=sys.stderr)
|
| 643 |
+
|
| 644 |
+
# Initialize the Hugging Face Inference Client with explicit endpoint
|
| 645 |
+
from huggingface_hub import InferenceClient
|
| 646 |
+
client = InferenceClient(
|
| 647 |
+
token=hf_token,
|
| 648 |
+
base_url="https://api-inference.huggingface.co/models/Qwen/Qwen2.5-72B-Instruct"
|
| 649 |
+
)
|
| 650 |
+
print(f"[LLM-DEBUG] InferenceClient initialized successfully", file=sys.stderr)
|
| 651 |
+
|
| 652 |
+
|
| 653 |
+
messages = [
|
| 654 |
+
{"role": "system", "content": "You are a precise JSON generator that helps analyze financial report search results. You are also helpful in guiding users to find the most relevant financial reports. You should ONLY generate valid JSON responses in the specified format."},
|
| 655 |
+
{"role": "user", "content": prompt}
|
| 656 |
+
]
|
| 657 |
+
|
| 658 |
+
# Get response from LLM
|
| 659 |
+
response = client.chat.completions.create(
|
| 660 |
+
model="Qwen/Qwen2.5-72B-Instruct",
|
| 661 |
+
messages=messages,
|
| 662 |
+
max_tokens=500,
|
| 663 |
+
temperature=0.3,
|
| 664 |
+
)
|
| 665 |
+
|
| 666 |
+
# Extract the JSON response
|
| 667 |
+
if hasattr(response, 'choices') and len(response.choices) > 0:
|
| 668 |
+
content = response.choices[0].message.content if hasattr(response.choices[0].message, 'content') else str(response.choices[0].message)
|
| 669 |
+
else:
|
| 670 |
+
content = str(response)
|
| 671 |
+
|
| 672 |
+
# Try to parse as JSON
|
| 673 |
+
try:
|
| 674 |
+
# Extract JSON from the response if it's wrapped in other text
|
| 675 |
+
json_match = re.search(r'\{.*\}', content, re.DOTALL)
|
| 676 |
+
if json_match:
|
| 677 |
+
json_str = json_match.group(0)
|
| 678 |
+
llm_result = json.loads(json_str)
|
| 679 |
+
|
| 680 |
+
# Extract the selected index
|
| 681 |
+
selected_index = llm_result.get("selected_index", -1)
|
| 682 |
+
reasoning = llm_result.get("reasoning", "No reasoning provided")
|
| 683 |
+
confidence = llm_result.get("confidence", "low")
|
| 684 |
+
|
| 685 |
+
# If a valid index was selected, return that result
|
| 686 |
+
if 0 <= selected_index < len(formatted_results):
|
| 687 |
+
selected_result = formatted_results[selected_index]
|
| 688 |
+
original_result = search_results[selected_index]
|
| 689 |
+
|
| 690 |
+
# CRITICAL: If LLM selected a non-PDF link, try to extract PDF from the page first
|
| 691 |
+
link = selected_result["link"]
|
| 692 |
+
if not link.lower().endswith(".pdf"):
|
| 693 |
+
# Check if it looks like an investor relations page
|
| 694 |
+
if "investor" in link or "ir." in link or "press-release" in link or "earnings" in link:
|
| 695 |
+
logger.info(f"[LLM-SELECTED] Non-PDF link detected, attempting to extract PDF from page: {link}")
|
| 696 |
+
pdf_links = await extract_pdf_links_from_page(link, user_request)
|
| 697 |
+
if pdf_links:
|
| 698 |
+
# Return the first PDF link found
|
| 699 |
+
pdf_link = pdf_links[0]
|
| 700 |
+
logger.info(f"[LLM-SELECTED] Successfully extracted PDF: {pdf_link.get('title', 'PDF Report')}")
|
| 701 |
+
return {
|
| 702 |
+
"type": "download_link_extracted",
|
| 703 |
+
"title": f"{selected_result['title']} - {pdf_link.get('title', 'PDF Report')}",
|
| 704 |
+
"link": pdf_link["url"],
|
| 705 |
+
"snippet": pdf_link.get("snippet", selected_result["snippet"]),
|
| 706 |
+
"message": f"Found the most relevant financial report for your request: {pdf_link.get('title', 'PDF Report')}",
|
| 707 |
+
"confidence": confidence,
|
| 708 |
+
"reasoning": f"{reasoning}. Extracted PDF link from the selected page."
|
| 709 |
+
}
|
| 710 |
+
else:
|
| 711 |
+
logger.warning(f"[LLM-SELECTED] No PDF links found on page: {link}")
|
| 712 |
+
|
| 713 |
+
return {
|
| 714 |
+
"type": "download_link_extracted",
|
| 715 |
+
"title": selected_result["title"],
|
| 716 |
+
"link": selected_result["link"],
|
| 717 |
+
"snippet": selected_result["snippet"],
|
| 718 |
+
"message": f"Found the most relevant financial report for your request: {selected_result['title']}",
|
| 719 |
+
"confidence": confidence,
|
| 720 |
+
"reasoning": reasoning
|
| 721 |
+
}
|
| 722 |
+
elif selected_index == -1:
|
| 723 |
+
# No relevant results found
|
| 724 |
+
if search_results:
|
| 725 |
+
first_result = search_results[0]
|
| 726 |
+
return {
|
| 727 |
+
"type": "download_link_extracted",
|
| 728 |
+
"title": first_result.get("title", ""),
|
| 729 |
+
"link": first_result.get("link", ""),
|
| 730 |
+
"snippet": first_result.get("snippet", ""),
|
| 731 |
+
"message": "Found a potential financial report, but it may not exactly match your request.",
|
| 732 |
+
"confidence": "low",
|
| 733 |
+
"reasoning": reasoning
|
| 734 |
+
}
|
| 735 |
+
else:
|
| 736 |
+
return {
|
| 737 |
+
"type": "no_results",
|
| 738 |
+
"message": "No search results available to analyze.",
|
| 739 |
+
"suggestion": "Please try a different search or provide a direct URL.",
|
| 740 |
+
"reasoning": "No search results were provided for analysis."
|
| 741 |
+
}
|
| 742 |
+
else:
|
| 743 |
+
# Invalid index, fall back to heuristic-based selection
|
| 744 |
+
raise ValueError("Invalid selected_index from LLM response")
|
| 745 |
+
else:
|
| 746 |
+
# If no JSON found, fall back to heuristic-based selection
|
| 747 |
+
raise ValueError("No valid JSON found in LLM response")
|
| 748 |
+
except (json.JSONDecodeError, ValueError) as e:
|
| 749 |
+
# If JSON parsing fails, fall back to heuristic-based selection
|
| 750 |
+
logger.warning(f"LLM response parsing failed, falling back to heuristic analysis: {str(e)}")
|
| 751 |
+
pass
|
| 752 |
+
except Exception as llm_error:
|
| 753 |
+
# If LLM call fails, fall back to heuristic-based selection
|
| 754 |
+
logger.warning(f"LLM call failed, falling back to heuristic analysis: {str(llm_error)}")
|
| 755 |
+
pass
|
| 756 |
+
|
| 757 |
+
# Fallback: Simple heuristic-based selection
|
| 758 |
+
logger.info("Using heuristic-based selection as fallback")
|
| 759 |
+
best_match_index = -1
|
| 760 |
+
best_score = -1
|
| 761 |
+
|
| 762 |
+
user_request_lower = user_request.lower()
|
| 763 |
+
|
| 764 |
+
# CRITICAL: Dynamically extract company names from search results
|
| 765 |
+
# Strategy: Identify unique domains/companies that appear in results
|
| 766 |
+
# The company mentioned in MOST results is likely the requested company
|
| 767 |
+
company_mentions = {} # {company_identifier: count}
|
| 768 |
+
domain_to_company = {} # {domain: company_name}
|
| 769 |
+
|
| 770 |
+
# First pass: Learn which companies appear in the search results
|
| 771 |
+
for result in formatted_results:
|
| 772 |
+
title = result.get("title", "").lower()
|
| 773 |
+
link = result.get("link", "").lower()
|
| 774 |
+
|
| 775 |
+
# Extract domain
|
| 776 |
+
domain_match = re.search(r'https?://(?:www\.)?([^/]+)', link)
|
| 777 |
+
if domain_match:
|
| 778 |
+
domain = domain_match.group(1)
|
| 779 |
+
|
| 780 |
+
# Extract company identifier from domain dynamically
|
| 781 |
+
# Strategy: Use the main part of domain as company key
|
| 782 |
+
# e.g., "intc.com" -> "intc", "aboutamazon.com" -> "aboutamazon", "ir.tesla.com" -> "tesla"
|
| 783 |
+
|
| 784 |
+
# Remove common prefixes/suffixes
|
| 785 |
+
domain_parts = domain.replace('www.', '').replace('ir.', '').replace('investor.', '').replace('investors.', '')
|
| 786 |
+
|
| 787 |
+
# Get the core domain name (before .com/.net/etc)
|
| 788 |
+
core_domain = domain_parts.split('.')[0]
|
| 789 |
+
|
| 790 |
+
# Use core domain as company identifier
|
| 791 |
+
company_key = core_domain
|
| 792 |
+
|
| 793 |
+
# Track company mentions
|
| 794 |
+
company_mentions[company_key] = company_mentions.get(company_key, 0) + 1
|
| 795 |
+
domain_to_company[domain] = company_key
|
| 796 |
+
|
| 797 |
+
# Determine the PRIMARY requested company (most mentioned in results)
|
| 798 |
+
primary_company = None
|
| 799 |
+
if company_mentions:
|
| 800 |
+
primary_company = max(company_mentions.items(), key=lambda x: x[1])[0]
|
| 801 |
+
logger.info(f"[COMPANY DETECTION] Detected primary company: '{primary_company}' (mentioned in {company_mentions[primary_company]} results)")
|
| 802 |
+
logger.info(f"[COMPANY DETECTION] All companies found: {company_mentions}")
|
| 803 |
+
|
| 804 |
+
for i, result in enumerate(formatted_results):
|
| 805 |
+
title = result.get("title", "").lower()
|
| 806 |
+
snippet = result.get("snippet", "").lower()
|
| 807 |
+
link = result.get("link", "")
|
| 808 |
+
|
| 809 |
+
# Get original result for metadata access
|
| 810 |
+
original_result = search_results[i] if i < len(search_results) else {}
|
| 811 |
+
|
| 812 |
+
# Calculate relevance score
|
| 813 |
+
score = 0
|
| 814 |
+
|
| 815 |
+
# CRITICAL #1: Company matching (HIGHEST PRIORITY)
|
| 816 |
+
# If we detected a primary company from search results, prioritize results from that company
|
| 817 |
+
if primary_company:
|
| 818 |
+
# Extract domain from this result
|
| 819 |
+
domain_match = re.search(r'https?://(?:www\.)?([^/]+)', link)
|
| 820 |
+
if domain_match:
|
| 821 |
+
result_domain = domain_match.group(1)
|
| 822 |
+
result_company = domain_to_company.get(result_domain, None)
|
| 823 |
+
|
| 824 |
+
if result_company == primary_company:
|
| 825 |
+
# This result is from the primary company!
|
| 826 |
+
score += 30 # HUGE bonus for matching primary company
|
| 827 |
+
logger.info(f"[SCORE] Result {i} from primary company '{primary_company}' (domain: {result_domain}) - score +30")
|
| 828 |
+
elif result_company and result_company != primary_company:
|
| 829 |
+
# This result is from a DIFFERENT company
|
| 830 |
+
score -= 100 # MASSIVE penalty for wrong company
|
| 831 |
+
logger.info(f"[SCORE] Result {i} from WRONG company '{result_company}' (expected '{primary_company}') - score -100")
|
| 832 |
+
|
| 833 |
+
# CRITICAL #2: Heavily prefer direct PDF files
|
| 834 |
+
# Check both URL extension AND mime type metadata
|
| 835 |
+
is_pdf = False
|
| 836 |
+
if link.lower().endswith(".pdf"):
|
| 837 |
+
is_pdf = True
|
| 838 |
+
score += 10 # Base PDF score
|
| 839 |
+
|
| 840 |
+
# BONUS: Check for explicit PDF metadata (mime type and fileFormat)
|
| 841 |
+
if original_result.get("mime") == "application/pdf" or original_result.get("fileFormat") == "PDF/Adobe Acrobat":
|
| 842 |
+
is_pdf = True
|
| 843 |
+
score += 12 # Even higher score for confirmed PDFs with metadata!
|
| 844 |
+
logger.info(f"[SCORE] Result {i} has PDF metadata (mime/fileFormat) - score +12")
|
| 845 |
+
|
| 846 |
+
# Check for keywords/patterns matching between user request and result
|
| 847 |
+
# Extract key terms from user request dynamically
|
| 848 |
+
request_tokens = set(user_request_lower.split())
|
| 849 |
+
title_tokens = set(title.split())
|
| 850 |
+
snippet_tokens = set(snippet.split())
|
| 851 |
+
|
| 852 |
+
# Calculate token overlap (how many words match)
|
| 853 |
+
title_overlap = len(request_tokens & title_tokens)
|
| 854 |
+
snippet_overlap = len(request_tokens & snippet_tokens)
|
| 855 |
+
|
| 856 |
+
# Bonus for word matches
|
| 857 |
+
if title_overlap > 0:
|
| 858 |
+
score += title_overlap * 2 # Each matching word in title = +2 points
|
| 859 |
+
logger.info(f"[SCORE] Result {i} has {title_overlap} matching words in title - score +{title_overlap * 2}")
|
| 860 |
+
|
| 861 |
+
if snippet_overlap > 0:
|
| 862 |
+
score += snippet_overlap # Each matching word in snippet = +1 point
|
| 863 |
+
logger.info(f"[SCORE] Result {i} has {snippet_overlap} matching words in snippet - score +{snippet_overlap}")
|
| 864 |
+
|
| 865 |
+
# Check for year patterns in user request and result
|
| 866 |
+
year_patterns = re.findall(r'\b(19|20)\d{2}\b', user_request_lower)
|
| 867 |
+
for year in year_patterns:
|
| 868 |
+
if year in title or year in snippet or year in link:
|
| 869 |
+
score += 2
|
| 870 |
+
logger.info(f"[SCORE] Result {i} matches year '{year}' - score +2")
|
| 871 |
+
|
| 872 |
+
# Penalize landing/index pages if they're NOT PDFs
|
| 873 |
+
# Dynamic check: look for common index page patterns in URL
|
| 874 |
+
if not is_pdf:
|
| 875 |
+
# Check if URL looks like an index/landing page (contains common patterns)
|
| 876 |
+
index_patterns = ['results', 'default', 'index', 'overview', 'main', 'performance']
|
| 877 |
+
if any(pattern in link for pattern in index_patterns):
|
| 878 |
+
score -= 5 # Heavy penalty for index pages
|
| 879 |
+
logger.info(f"[SCORE] Result {i} is an index/landing page - score -5")
|
| 880 |
+
|
| 881 |
+
# Prefer press-release pages over performance/overview pages
|
| 882 |
+
if 'press-release' in link or 'press_release' in link or 'webcast' in link:
|
| 883 |
+
score += 8 # Bonus for press release pages (likely to have PDFs)
|
| 884 |
+
logger.info(f"[SCORE] Result {i} is a press-release page - score +8")
|
| 885 |
+
|
| 886 |
+
# Prefer official sources (but only if it's a PDF)
|
| 887 |
+
# Dynamic check: look for credible domain indicators
|
| 888 |
+
if is_pdf:
|
| 889 |
+
credible_indicators = ['.gov', 'investor', 'ir.', 'cdn']
|
| 890 |
+
if any(indicator in link for indicator in credible_indicators):
|
| 891 |
+
score += 2
|
| 892 |
+
|
| 893 |
+
# Update best match if this score is higher
|
            if score > best_score:
                best_score = score
                best_match_index = i

        # SPECIAL HANDLING: If user requested multiple quarters, return multiple links
        if is_multiple_quarter_request and len(quarters_requested) > 1:
            logger.info(f"[MULTI-QUARTER] User requested {len(quarters_requested)} quarters, returning multiple links")

            # Group results by quarter using dynamic scoring
            quarter_results = {q: [] for q in quarters_requested}

            for i, result in enumerate(formatted_results):
                title = result.get("title", "").lower()
                snippet = result.get("snippet", "").lower()
                link = result.get("link", "")

                # Get original result for metadata access
                original_result = search_results[i] if i < len(search_results) else {}

                # CRITICAL: Check if this is a PDF link
                is_pdf = link.lower().endswith('.pdf')

                # Also check PDF metadata
                if original_result.get("mime") == "application/pdf" or original_result.get("fileFormat") == "PDF/Adobe Acrobat":
                    is_pdf = True

                # Calculate relevance score for each quarter dynamically
                # This avoids hardcoding patterns
                quarter_scores = {}
                for quarter in quarters_requested:
                    score = 0

                    # PRIORITY #1: Company matching (if we detected primary company)
                    if primary_company:
                        domain_match = re.search(r'https?://(?:www\.)?([^/]+)', link)
                        if domain_match:
                            result_domain = domain_match.group(1)
                            result_company = domain_to_company.get(result_domain, None)

                            if result_company == primary_company:
                                score += 30  # HUGE bonus for matching primary company
                            elif result_company and result_company != primary_company:
                                score -= 100  # MASSIVE penalty for wrong company

                    # PRIORITY #2: HUGE bonus for PDF files - we want direct download links!
                    if is_pdf:
                        score += 20  # PDF links get massive priority

                    # PRIORITY #3: Check if quarter appears in title/snippet/link
                    if quarter in title or quarter in snippet or quarter in link.lower():
                        score += 10

                    # Also check for numeric representation (e.g., "1" for q1)
                    quarter_num = quarter[1]  # Extract '1' from 'q1'
                    if f"q{quarter_num}" in title or f"q{quarter_num}" in snippet or f"q{quarter_num}" in link.lower():
                        score += 5

                    # Penalize index/landing pages
                    if not is_pdf:
                        index_indicators = ['default.aspx', 'investor-relations', '/overview/', 'index']
                        if any(indicator in link.lower() for indicator in index_indicators):
                            score -= 15  # Heavy penalty for index pages

                    quarter_scores[quarter] = score

                # Assign to the quarter with highest score (if score > 0)
                if quarter_scores:
                    best_quarter = max(quarter_scores.items(), key=lambda x: x[1])
                    if best_quarter[1] > 0:  # Only assign if score > 0
                        quarter_results[best_quarter[0]].append({
                            "index": i,
                            "title": result.get("title", ""),
                            "link": link,
                            "snippet": result.get("snippet", ""),
                            "score": best_quarter[1],
                            "is_pdf": is_pdf
                        })

            # Select best result for each requested quarter
            selected_links = []
            for quarter in quarters_requested:
                if quarter_results[quarter]:
                    # Sort by score and get the best result (PDF links will rank highest)
                    sorted_results = sorted(quarter_results[quarter], key=lambda x: x.get("score", 0), reverse=True)
                    best_for_quarter = sorted_results[0]
                    selected_links.append({
                        "quarter": quarter.upper(),
                        "title": best_for_quarter["title"],
                        "link": best_for_quarter["link"],
                        "snippet": best_for_quarter["snippet"]
                    })
                    is_pdf_marker = "[PDF]" if best_for_quarter.get("is_pdf", False) else "[Web Page]"
                    logger.info(f"[MULTI-QUARTER] Found result for {quarter.upper()}: {is_pdf_marker} {best_for_quarter['title'][:50]} (score: {best_for_quarter['score']})")
                else:
                    logger.warning(f"[MULTI-QUARTER] No result found for {quarter.upper()}")

            if selected_links:
                return {
                    "type": "multiple_download_links",
                    "links": selected_links,
                    "message": f"Found {len(selected_links)} financial reports for the requested quarters: {', '.join([q.upper() for q in quarters_requested])}",
                    "confidence": "high" if len(selected_links) == len(quarters_requested) else "medium",
                    "reasoning": f"Selected best result for each requested quarter. Found {len(selected_links)} out of {len(quarters_requested)} quarters."
                }
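Editor's note: to make the scoring above easier to follow, here is a minimal, self-contained re-implementation of the per-quarter heuristic applied to one hypothetical search result. The company name, domain map, and URL below are illustrative assumptions, not values from this commit; the weights (+30 primary-company match, -100 wrong company, +20 PDF, +10/+5 quarter mentions, -15 index pages) mirror the code.

import re

def score_result_for_quarter(result, quarter, primary_company, domain_to_company):
    """Illustrative re-statement of the heuristic above, for one result and one quarter."""
    title = result["title"].lower()
    snippet = result["snippet"].lower()
    link = result["link"]
    is_pdf = link.lower().endswith(".pdf")
    score = 0
    domain_match = re.search(r'https?://(?:www\.)?([^/]+)', link)
    if primary_company and domain_match:
        company = domain_to_company.get(domain_match.group(1))
        if company == primary_company:
            score += 30
        elif company:
            score -= 100
    if is_pdf:
        score += 20
    if quarter in title or quarter in snippet or quarter in link.lower():
        score += 10
    if f"q{quarter[1]}" in title or f"q{quarter[1]}" in snippet or f"q{quarter[1]}" in link.lower():
        score += 5
    if not is_pdf and any(x in link.lower() for x in ['default.aspx', 'investor-relations', '/overview/', 'index']):
        score -= 15
    return score

# Hypothetical input: a direct PDF on the primary company's investor domain.
result = {
    "title": "GlobalTech Q3 2025 Quarterly Report",
    "snippet": "Q3 2025 results and shareholder letter",
    "link": "https://investors.globaltech.com/q3-2025-report.pdf",
}
print(score_result_for_quarter(result, "q3", "globaltech",
                               {"investors.globaltech.com": "globaltech"}))  # -> 65

Under these weights a direct PDF outranks a landing page from the same company by 20 to 35 points, depending on whether the landing-page URL also hits one of the index-page penalty indicators.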
        # If we found a reasonable match (score > 0), return it
        if best_match_index >= 0 and best_score > 0:
            selected_result = formatted_results[best_match_index]
            original_result = search_results[best_match_index]

            # Check if the link is an index page that needs further parsing
            link = selected_result["link"]
            if not link.lower().endswith(".pdf") and ("investor" in link or "ir." in link or "financial-report" in link):
                # Try to extract PDF links from the index page
                pdf_links = await extract_pdf_links_from_page(link, user_request)
                if pdf_links:
                    # For requests asking for multiple reports (like "2份" or "two"), return multiple links
                    if "2份" in user_request.lower() or "two" in user_request.lower() or "2" in user_request.lower():
                        # Return up to 2 most relevant PDF links
                        relevant_links = pdf_links[:2]
                        return {
                            "type": "download_links_extracted",
                            "links": relevant_links,
                            "message": f"Found {len(relevant_links)} most relevant financial reports for your request",
                            "confidence": "high" if best_score >= 5 else ("medium" if best_score >= 2 else "low"),
                            "reasoning": f"Selected based on relevance scoring (score: {best_score}) and extracted {len(relevant_links)} PDF links from index page."
                        }
                    else:
                        # Return the first PDF link found
                        pdf_link = pdf_links[0]
                        return {
                            "type": "download_link_extracted",
                            "title": f"{selected_result['title']} - {pdf_link.get('title', 'PDF Report')}",
                            "link": pdf_link["url"],
                            "snippet": pdf_link.get("snippet", selected_result["snippet"]),
                            "message": f"Found the most relevant financial report for your request: {pdf_link.get('title', 'PDF Report')}",
                            "confidence": "high" if best_score >= 5 else ("medium" if best_score >= 2 else "low"),
                            "reasoning": f"Selected based on relevance scoring (score: {best_score}) and extracted PDF link from index page."
                        }

            return {
                "type": "download_link_extracted",
                "title": selected_result["title"],
                "link": selected_result["link"],
                "snippet": selected_result["snippet"],
                "message": f"Found the most relevant financial report for your request: {selected_result['title']}",
                "confidence": "high" if best_score >= 5 else ("medium" if best_score >= 2 else "low"),
                "reasoning": f"Selected based on relevance scoring (score: {best_score}). This result matches key terms in your request."
            }
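Editor's note: the multi-report check above treats any request containing the character "2" as asking for two reports, so a query such as "Q3 2025 report" also takes that branch because of the year. If a stricter test is wanted, one option (an assumption on my part, not part of this commit) is a word-boundary style match:

import re

def wants_two_reports(user_request: str) -> bool:
    """Stricter variant of the '2份' / 'two' / '2' check above (illustrative only).

    Matches a standalone '2', the token '2份', or the word 'two', but ignores
    digits embedded in years such as 2024 or 2025.
    """
    text = user_request.lower()
    return bool(re.search(r'(?<!\d)2(?!\d)|2份|\btwo\b', text))

print(wants_two_reports("download the Q3 2025 report"))  # False
print(wants_two_reports("give me 2 quarterly reports"))  # True
print(wants_two_reports("请下载2份财报"))                 # True

The negative lookarounds keep a standalone "2" (and "2份") while skipping the "2" inside four-digit years.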
        else:
            # If no clearly relevant results, return the first result with low confidence
            if search_results:
                first_result = search_results[0]
                link = first_result.get("link", "")

                # Check if the link is an index page that needs further parsing
                if not link.lower().endswith(".pdf") and ("investor" in link or "ir." in link or "financial-report" in link):
                    # Try to extract PDF links from the index page
                    pdf_links = await extract_pdf_links_from_page(link, user_request)
                    if pdf_links:
                        # For requests asking for multiple reports (like "2份" or "two"), return multiple links
                        if "2份" in user_request.lower() or "two" in user_request.lower() or "2" in user_request.lower():
                            # Return up to 2 most relevant PDF links
                            relevant_links = pdf_links[:2]
                            return {
                                "type": "download_links_extracted",
                                "links": relevant_links,
                                "message": f"Found {len(relevant_links)} most relevant financial reports for your request",
                                "confidence": "low",
                                "reasoning": f"Extracted {len(relevant_links)} PDF links from index page. No highly relevant results found using keyword matching."
                            }
                        else:
                            # Return the first PDF link found
                            pdf_link = pdf_links[0]
                            return {
                                "type": "download_link_extracted",
                                "title": pdf_link.get("title", f"{first_result.get('title', 'Financial Report')} - PDF"),
                                "link": pdf_link["url"],
                                "snippet": pdf_link.get("snippet", first_result.get("snippet", "")),
                                "message": f"Found a potential financial report: {pdf_link.get('title', 'PDF Report')}",
                                "confidence": "low",
                                "reasoning": "Extracted PDF link from index page. No highly relevant results found using keyword matching."
                            }

                return {
                    "type": "download_link_extracted",
                    "title": first_result.get("title", ""),
                    "link": first_result.get("link", ""),
                    "snippet": first_result.get("snippet", ""),
                    "message": "Found a potential financial report, but it may not exactly match your request.",
                    "confidence": "low",
                    "reasoning": "No highly relevant results found using keyword matching."
                }
            else:
                return {
                    "type": "no_results",
                    "message": "No search results available to analyze.",
                    "suggestion": "Please try a different search or provide a direct URL.",
                    "reasoning": "No search results were provided for analysis."
                }
    except Exception as e:
        logger.error(f"Error in deep analysis: {str(e)}")
        return {
            "type": "analysis_error",
            "error": str(e),
            "message": f"Error occurred while analyzing search results: {str(e)}",
            "suggestion": "Please try again or provide a direct URL for the financial report."
        }
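Editor's note: every exit path of this analysis helper returns a dict whose "type" field discriminates the outcome ("multiple_download_links", "download_links_extracted", "download_link_extracted", "no_results", "analysis_error"). A minimal sketch of how a caller might dispatch on that field is below; the function name handle_analysis is hypothetical, and the link-item shapes are inferred from the return statements above.

def handle_analysis(analysis: dict) -> str:
    """Hypothetical caller-side dispatch on the 'type' discriminator."""
    kind = analysis.get("type")
    if kind == "multiple_download_links":
        lines = [f"- {item['quarter']}: {item['link']}" for item in analysis["links"]]
        return analysis["message"] + "\n" + "\n".join(lines)
    if kind == "download_links_extracted":
        lines = [f"- {item.get('title', item['url'])}" for item in analysis["links"]]
        return analysis["message"] + "\n" + "\n".join(lines)
    if kind == "download_link_extracted":
        return f"{analysis['message']}\n{analysis['link']}"
    if kind == "no_results":
        return f"{analysis['message']} {analysis['suggestion']}"
    # "analysis_error" and anything unexpected fall through to the raw message.
    return analysis.get("message", "Unknown analysis result.")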
+
# Resource for accessing extracted financial report content
|
| 1104 |
+
@mcp.resource("financial-report://{filename}")
|
| 1105 |
+
def get_financial_report_content(filename: str) -> str:
|
| 1106 |
+
"""
|
| 1107 |
+
Get the content of an extracted financial report
|
| 1108 |
+
|
| 1109 |
+
Args:
|
| 1110 |
+
filename: Name of the extracted file
|
| 1111 |
+
|
| 1112 |
+
Returns:
|
| 1113 |
+
Content of the financial report
|
| 1114 |
+
"""
|
| 1115 |
+
# Use absolute path to ensure correct file access in different environments
|
| 1116 |
+
reports_dir = Path("financial_reports").absolute()
|
| 1117 |
+
file_path = reports_dir / filename
|
| 1118 |
+
|
| 1119 |
+
if not file_path.exists():
|
| 1120 |
+
# Also check with relative path as fallback
|
| 1121 |
+
relative_path = Path("financial_reports") / filename
|
| 1122 |
+
if relative_path.exists():
|
| 1123 |
+
file_path = relative_path
|
| 1124 |
+
else:
|
| 1125 |
+
raise Exception(f"File not found: {filename}. Searched in {reports_dir} and relative path {relative_path}")
|
| 1126 |
+
|
| 1127 |
+
# Handle PDF files properly
|
| 1128 |
+
if filename.lower().endswith('.pdf'):
|
| 1129 |
+
try:
|
| 1130 |
+
import pdfplumber
|
| 1131 |
+
with pdfplumber.open(file_path) as pdf:
|
| 1132 |
+
text = ""
|
| 1133 |
+
for page in pdf.pages:
|
| 1134 |
+
text += page.extract_text() or ""
|
| 1135 |
+
return text
|
| 1136 |
+
except Exception as e:
|
| 1137 |
+
# If PDF extraction fails, return error message
|
| 1138 |
+
logger.error(f"Error extracting text from PDF {filename}: {str(e)}")
|
| 1139 |
+
return f"Error extracting text from PDF {filename}: {str(e)}"
|
| 1140 |
+
else:
|
| 1141 |
+
# For text-based files, read normally
|
| 1142 |
+
with open(file_path, "r", encoding="utf-8") as f:
|
| 1143 |
+
return f.read()
|
| 1144 |
+
|
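Editor's note: a hedged sketch of reading this resource from an MCP client over stdio, assuming the official mcp Python SDK client API (StdioServerParameters, stdio_client, ClientSession.read_resource); the server launch path and the filename are placeholders. Note that the resource joins the caller-supplied filename onto financial_reports/ without sanitization, so callers should pass plain basenames.

import asyncio

from pydantic import AnyUrl
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

async def read_report(filename: str) -> None:
    # Placeholder launch command; adjust the path to wherever the server lives.
    params = StdioServerParameters(
        command="python",
        args=["chatbot/MCP_Financial_Report/financial_mcp_server.py"],
    )
    async with stdio_client(params) as (read_stream, write_stream):
        async with ClientSession(read_stream, write_stream) as session:
            await session.initialize()
            result = await session.read_resource(AnyUrl(f"financial-report://{filename}"))
            for content in result.contents:
                # Text resources carry their payload in a .text attribute.
                print(getattr(content, "text", ""))

if __name__ == "__main__":
    asyncio.run(read_report("example_report.pdf"))  # filename is illustrative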
if __name__ == "__main__":
    # Run the server with stdio transport
    # Note: We should avoid printing to stdout here as it interferes with stdio communication
    # Log to stderr instead
    import sys
    print("MCP SDK imported successfully", file=sys.stderr)
    mcp.run(transport="stdio")
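Editor's note: because the stdio transport reserves stdout for protocol traffic (as the comment above warns), server-side logging is usually routed to stderr. One minimal way to configure that, with an arbitrary logger name and format, is:

import logging
import sys

# Route all log output to stderr so it cannot corrupt the stdio protocol stream.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s %(message)s",
)
logger = logging.getLogger("financial_mcp_server")
logger.info("logging configured; stdout left untouched for the MCP stdio transport")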
chatbot/chat_main.py
ADDED
The diff for this file is too large to render. See raw diff