jing commited on
Commit
80653d3
·
0 Parent(s):

Fresh deploy: all latest files

Browse files
Files changed (11) hide show
  1. .gitignore +195 -0
  2. LICENSE +21 -0
  3. README.md +93 -0
  4. app/agent.py +28 -0
  5. app/config.py +40 -0
  6. app/memory.py +300 -0
  7. app/tool.py +184 -0
  8. app/utils.py +72 -0
  9. main.py +196 -0
  10. requirements.txt +7 -0
  11. styles.css +525 -0
.gitignore ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ archive/
6
+ deploy.sh
7
+ # C extensions
8
+ *.so
9
+ test*
10
+ # Distribution / packaging
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # UV
99
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ #uv.lock
103
+
104
+ # poetry
105
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
107
+ # commonly ignored for libraries.
108
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109
+ #poetry.lock
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ #pdm.lock
114
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
115
+ # in version control.
116
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
117
+ .pdm.toml
118
+ .pdm-python
119
+ .pdm-build/
120
+
121
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
122
+ __pypackages__/
123
+
124
+ # Celery stuff
125
+ celerybeat-schedule
126
+ celerybeat.pid
127
+
128
+ # SageMath parsed files
129
+ *.sage.py
130
+
131
+ # Environments
132
+ .env
133
+ .venv
134
+ env/
135
+ venv/
136
+ ENV/
137
+ env.bak/
138
+ venv.bak/
139
+
140
+ # Spyder project settings
141
+ .spyderproject
142
+ .spyproject
143
+
144
+ # Rope project settings
145
+ .ropeproject
146
+
147
+ # mkdocs documentation
148
+ /site
149
+
150
+ # mypy
151
+ .mypy_cache/
152
+ .dmypy.json
153
+ dmypy.json
154
+
155
+ # Pyre type checker
156
+ .pyre/
157
+
158
+ # pytype static type analyzer
159
+ .pytype/
160
+
161
+ # Cython debug symbols
162
+ cython_debug/
163
+
164
+ # PyCharm
165
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
166
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
167
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
168
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
169
+ #.idea/
170
+
171
+ # Abstra
172
+ # Abstra is an AI-powered process automation framework.
173
+ # Ignore directories containing user credentials, local state, and settings.
174
+ # Learn more at https://abstra.io/docs
175
+ .abstra/
176
+
177
+ # Visual Studio Code
178
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
179
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
180
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
181
+ # you could uncomment the following to ignore the entire vscode folder
182
+ # .vscode/
183
+
184
+ # Ruff stuff:
185
+ .ruff_cache/
186
+
187
+ # PyPI configuration file
188
+ .pypirc
189
+
190
+ # Cursor
191
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
192
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
193
+ # refer to https://docs.cursor.com/context/ignore-files
194
+ .cursorignore
195
+ .cursorindexingignore
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Jing Bi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Perceptual Copilot
3
+ emoji: 👁️
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.33.1
8
+ app_file: main.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ ## ✨ What is Perceptual Copilot?
14
+
15
+ Perceptual Copilot is a prototype that demonstrates the integration of OpenAI agents with visual tools to process real-time video streams. This experimental platform showcases both the promising potential and current limitations of equipping agents with vision capabilities to understand and interact with live visual data.
16
+
17
+
18
+ ### Architecture Overview
19
+
20
+
21
+
22
+ ```
23
+ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
24
+ │ Webcam │───▶│ Memory │◀──▶│ Gradio │
25
+ └─────────────────┘ └─────────────────┘ └─────────────────┘
26
+
27
+
28
+ ┌─────────────────┐ ┌─────────────────┐
29
+ │ Agent │◀──▶│ Tools │
30
+ └─────────────────┘ └─────────────────┘
31
+ ```
32
+
33
+ ### Available Tools
34
+
35
+ | Tool | Description | Output |
36
+ |------|-------------|---------|
37
+ | `caption` | Generate detailed image descriptions | Rich visual descriptions |
38
+ | `ocr` | Extract text from images | Extracted text content |
39
+ | `localize` | Detect and locate objects | Bounding boxes with labels |
40
+ | `qa` | Answer questions about images | Contextual answers |
41
+ | `time` | Get current timestamp | Current date and time |
42
+ | _More tools coming soon..._ | Additional capabilities in development | Various outputs |
43
+
44
+ ## 🚀 Quick Start
45
+
46
+ ### Prerequisites
47
+
48
+ - Webcam access
49
+
50
+ ### Installation
51
+
52
+ 1. **Install dependencies**
53
+ ```bash
54
+ pip install -r requirements.txt
55
+ ```
56
+
57
+ 2. **Set up environment variables**
58
+ ```bash
59
+ export HF_TOKEN="your_huggingface_token"
60
+ export API_KEY="your_openai_api_key"
61
+ export END_LANG="your_llm_endpoint"
62
+ export END_TASK="your_task_endpoint"
63
+ export MODEL_AGENT="your_agent_model"
64
+ export MODEL_MLLM="your_multimodal_model"
65
+ export MODEL_LOC="your_localization_model"
66
+ ```
67
+
68
+ 3. **Launch the application**
69
+ ```bash
70
+ python main.py
71
+ ```
72
+
73
+ ## 💡 Usage Examples
74
+
75
+ ### Basic Interaction
76
+ - **User**: "What do you see?"
77
+ - **Assistant**: *Generates detailed caption of current view*
78
+
79
+ ### OCR Functionality
80
+ - **User**: "Read the text in this document"
81
+ - **Assistant**: *Extracts and returns all visible text*
82
+
83
+ ### Object Detection
84
+ - **User**: "What objects are in front of me?"
85
+ - **Assistant**: *Identifies and localizes objects with bounding boxes*
86
+
87
+
88
+ ## Acknowledgments
89
+
90
+ - Built with [Gradio](https://gradio.app/) for the interactive web interface
91
+ - Uses [Supervision](https://supervision.roboflow.com/) for frame annotation
92
+ - WebRTC integration via [FastRTC](https://github.com/gradio-app/fastrtc)
93
+
app/agent.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from agents import Agent
3
+ from app.memory import Memory
4
+ from openai import AsyncOpenAI
5
+ from app.config import env
6
+ from agents import set_default_openai_client, set_default_openai_api, set_tracing_disabled
7
+ from app.tool import caption, ocr, localize, qa, time, video_caption, video_qa
8
+
9
def build_agent():
    """Configure the agents SDK defaults and return the chat agent.

    An ``AsyncOpenAI`` client pointed at ``env.end_lang`` is registered as the
    SDK default client (not used for tracing), the chat-completions API is
    selected, tracing is switched off, and an ``Agent[Memory]`` named
    "Assistant" is created with the perception tools listed below.
    """
    default_client = AsyncOpenAI(base_url=env.end_lang, api_key=env.api_key)
    set_default_openai_client(client=default_client, use_for_tracing=False)
    set_default_openai_api("chat_completions")
    set_tracing_disabled(disabled=True)

    # `localize` is imported by this module but is not registered as a tool here.
    enabled_tools = [caption, ocr, qa, time, video_caption, video_qa]
    system_prompt = (
        "Your name is Perceptual Copilot. As a helpful assistant, your functions include answering questions about images, "
        "Optical Character Recognition (OCR), image caption generation, object localization "
        "within images, and video caption generation and Q&A. For video-related tools, you "
        "will need to determine the appropriate time window to analyze from the past."
    )

    return Agent[Memory](
        name="Assistant",
        tools=enabled_tools,
        model=env.model_agent,
        instructions=system_prompt,
    )
28
+
app/config.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from openai import OpenAI
4
+
5
+
6
+
7
+
8
+ try:
9
+ from dotenv import load_dotenv
10
+ load_dotenv()
11
+ except ImportError:
12
+ pass
13
+
14
+
15
class Envs:
    """Runtime configuration collected from environment variables.

    Attributes set in ``__init__``:
        hf_token, api_key, end_task, end_lang, model_agent, model_mllm,
        model_loc: raw values of the matching upper-case environment
            variables (``None`` when unset).
        client: a synchronous ``OpenAI`` client, or ``None`` when END_LANG
            or API_KEY is missing.
        debug: ``True`` when DEBUG is one of "true", "1", "yes"
            (case-insensitive); defaults to enabled.
        fps: frames per second to retain, always an integer >= 1.
    """

    def __init__(self):
        self.hf_token = os.getenv("HF_TOKEN")
        self.api_key = os.getenv("API_KEY")
        self.end_task = os.getenv("END_TASK")
        self.end_lang = os.getenv("END_LANG")
        self.model_agent = os.getenv("MODEL_AGENT")
        self.model_mllm = os.getenv("MODEL_MLLM")
        self.model_loc = os.getenv("MODEL_LOC")

        # Only initialize OpenAI client if we have the required env vars
        if self.end_lang and self.api_key:
            self.client = OpenAI(base_url=self.end_lang, api_key=self.api_key)
        else:
            self.client = None
            print("WARNING: OpenAI client not initialized due to missing environment variables")

        self.debug = os.getenv("DEBUG", "1").lower() in ("true", "1", "yes")
        self.fps = self._read_fps()

    @staticmethod
    def _read_fps(default: int = 1) -> int:
        """Parse FPS from the environment, falling back to *default*.

        The value is used elsewhere as a divisor and as the basis of a slice
        step, so anything that is not a positive integer is rejected here
        instead of failing later.
        """
        raw = os.getenv("FPS", str(default))
        try:
            fps = int(raw)
        except ValueError:
            print(f"WARNING: FPS={raw!r} is not an integer; using {default}")
            return default
        if fps < 1:
            print(f"WARNING: FPS={raw!r} must be >= 1; using {default}")
            return default
        return fps
34
+
35
+
36
# Module-level singletons: the parsed environment and the 'copilot' logger.
env = Envs()

logger = logging.getLogger('copilot')
# DEBUG level when env.debug is set, INFO otherwise.
_log_level = logging.DEBUG if env.debug else logging.INFO
logger.setLevel(_log_level)
logger.addHandler(logging.StreamHandler())
app/memory.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from dataclasses import dataclass, field
3
+ from agents import Runner, RunHooks
4
+ import threading
5
+ from typing import Any, Dict, Optional, List
6
+ import traceback
7
+ import time
8
+ from datetime import datetime
9
+ import numpy as np
10
+ import gradio as gr
11
+
12
+ from .config import logger, env
13
@dataclass
class RunnerStep:
    """One timestamped record in the Runner activity log."""
    # Timestamp string supplied by the caller.
    timestamp: str
    # Short tag naming the kind of step.
    step_type: str
    # Name of the agent the step belongs to.
    agent_name: str
    # Turn counter at the time the step was recorded.
    turn_number: int
    # Free-form payload describing the step.
    details: Dict[str, Any] = field(default_factory=dict)
    # Elapsed time in milliseconds, when measured.
    duration_ms: Optional[float] = None

    def __str__(self) -> str:
        # Only timestamp, turn, type and details are rendered.
        tags = (self.timestamp, f"T{self.turn_number}", self.step_type)
        prefix = "".join(f"[{tag}]" for tag in tags)
        return f"{prefix}: {self.details}"
25
+
26
@dataclass
class Message:
    """A chat entry: role, content, a display mode and optional metadata."""
    role: str
    content: str
    # '' for plain messages, 'tool' for tool output, 'tts' for speech.
    mode: str
    metadata: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def user(cls, content: str) -> "Message":
        """Build a plain user message."""
        return cls(role="user", content=content, mode='')

    @classmethod
    def system(cls, content: str) -> "Message":
        """Build a plain system message."""
        return cls(role="system", content=content, mode='')

    @classmethod
    def tool(cls, content: str, **kwargs) -> "Message":
        """Build an assistant message in 'tool' mode; keyword args become its metadata."""
        return cls(role="assistant", content=content, mode='tool', metadata=kwargs)

    @classmethod
    def assistant(cls, content: str, mode='') -> "Message":
        """Build an assistant message with the given mode."""
        return cls(role="assistant", content=content, mode=mode)

    @classmethod
    def tts(cls, content: str) -> "Message":
        """Build an assistant message in 'tts' mode."""
        return cls(role="assistant", content=content, mode='tts')

    def to_dict(self) -> Dict[str, Any]:
        """Return the role/content dict, plus metadata for tool messages.

        For tool messages a copy of the metadata is attached, with a non-empty
        "title" entry title-cased; the stored metadata is left untouched.
        """
        payload = {"role": self.role, "content": self.content}
        if self.mode != "tool":
            return payload

        meta = dict(self.metadata)
        title = meta.get("title")
        if title:
            meta["title"] = title.title()
        payload["metadata"] = meta
        return payload
61
+
62
+
63
@dataclass
class Snapshot:
    """A piece of tool/agent output queued for display in the chat."""
    # Label of whoever produced the snapshot.
    sender: str
    # The payload itself: text or an image array.
    data: Any
    status: str = 'done'

    @property
    def gr(self):
        """Return the payload, wrapping NumPy arrays in a ``gr.Image``."""
        payload = self.data
        return gr.Image(payload) if isinstance(payload, np.ndarray) else payload
74
+
75
+
76
class RunnerLoggerHooks(RunHooks):
    """Run hooks that mirror each Runner lifecycle event into a Memory.

    Every hook writes a ``RunnerStep`` through ``memory.log_runner_step``;
    the two ``*_start`` hooks also queue a 'pending' ``Snapshot``.
    """

    def __init__(self, memory_instance):
        super().__init__()
        self.memory = memory_instance
        self.current_turn = 0
        self.turn_start_time = None

    def _record_step(self, step_type, agent, details, duration_ms=None):
        """Build a RunnerStep for the current turn and hand it to the memory log."""
        self.memory.log_runner_step(RunnerStep(
            timestamp=datetime.now().isoformat(),
            step_type=step_type,
            agent_name=agent.name,
            turn_number=self.current_turn,
            details=details,
            duration_ms=duration_ms,
        ))

    async def on_agent_start(self, context, agent):
        """Open a new turn: bump the counter, start the timer, queue a pending snapshot."""
        self.current_turn += 1
        self.turn_start_time = time.time()
        pending = Snapshot(sender='agent', data='Deciding tools', status='pending')
        self.memory.snapshots.append(pending)
        self._record_step(
            "turn_start",
            agent,
            {"message": f"Starting turn {self.current_turn} with agent {agent.name}"},
        )

    async def on_agent_end(self, context, agent, result):
        """Log agent completion with the elapsed time of the turn in milliseconds."""
        started = self.turn_start_time
        elapsed_ms = (time.time() - started) * 1000 if started else None
        self._record_step(
            "agent_call",
            agent,
            {"message": f"Agent {agent.name} completed", "result_type": type(result).__name__},
            duration_ms=elapsed_ms,
        )

    async def on_tool_start(self, context, agent, tool_call):
        """Queue a pending snapshot naming the tool and log the call."""
        tool_name = getattr(tool_call, 'name', 'unknown')
        # Take the first attribute that exists; the object's exact shape is not assumed.
        candidates = ('arguments', 'args', 'function', 'parameters')
        tool_args = next(
            (getattr(tool_call, attr) for attr in candidates if hasattr(tool_call, attr)),
            None,
        )
        self.memory.snapshots.append(Snapshot(
            sender='agent',
            status='pending',
            data=f'Calling **{tool_name}**',
        ))
        self._record_step(
            "tool_call",
            agent,
            {
                "tool_name": tool_name,
                "tool_args": tool_args,
                "message": f"Calling tool {tool_name}",
            },
        )

    async def on_tool_end(self, context, agent, tool_call, result):
        """Log tool completion together with the length of its stringified result."""
        tool_name = getattr(tool_call, 'name', 'unknown')
        result_length = len(str(result)) if result else 0
        self._record_step(
            "tool_result",
            agent,
            {
                "tool_name": tool_name,
                "result_length": result_length,
                "message": f"Tool {tool_name} completed",
            },
        )
159
+
160
+
161
class Chat:
    """Ordered list of ``Message`` objects making up one conversation."""

    def __init__(self):
        self.history = []

    def append(self, message: Message):
        """Add *message* to the end of the history."""
        self.history.append(message)

    def clear_pending_tools(self) -> None:
        """Drop every tool message whose metadata status is "pending"."""

        def _is_pending_tool(entry) -> bool:
            return entry.mode == "tool" and entry.metadata.get("status") == "pending"

        self.history = [entry for entry in self.history if not _is_pending_tool(entry)]

    @property
    def messages(self):
        """The history rendered as a list of dicts via ``Message.to_dict``."""
        return [entry.to_dict() for entry in self.history]
181
+
182
+
183
class Memory:
    """Per-session state: recent frames, queued snapshots, chat and the agent runner.

    Construction binds an agent and starts a daemon thread running a private
    asyncio event loop. ``receive`` hands user text to that loop, where
    ``_monitor_chat`` runs the agent and appends its answer to ``chat``.
    ``enqueue`` stores frames (rate limited by ``env.fps``) and hands out
    queued snapshots one at a time.
    """

    def __init__(self, agent, limit: int = 200) -> None:
        """Create the buffers and queues, then bind *agent* via ``setup``.

        Args:
            agent: the agent passed to ``Runner.run`` for every chat message.
            limit: maximum number of frames kept in ``frames``.
        """
        self.limit: int = limit
        self.frames: list[Any] = []
        self.snapshots: list[Any] = []
        self.inputs: list[Any] = []
        self.chat = Chat()

        self.runner_steps: List[RunnerStep] = []
        self.step_limit: int = 1000  # Keep last 1000 steps
        self.logger_hooks: Optional[RunnerLoggerHooks] = None

        self._chat_q: asyncio.Queue[Any] = asyncio.Queue()
        self._input_q: asyncio.Queue[Any] = asyncio.Queue()
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self.is_waiting: bool = False
        self.is_running: bool = False
        self._last_frame_time: float = 0
        self.setup(agent)

    def log_runner_step(self, step: RunnerStep) -> None:
        """Log a runner step and maintain the step history limit."""
        self.runner_steps.append(step)
        logger.debug(f"[ 🛠️ ]{step}")
        excess = len(self.runner_steps) - self.step_limit
        if excess > 0:
            del self.runner_steps[:excess]

    def enqueue(self, data: Any) -> Optional["Snapshot"]:
        """Store *data* as a frame if enough time has passed, then pop a snapshot.

        A frame is kept only when more than ``1 / env.fps`` seconds have
        elapsed since the previously kept one; the oldest frames are dropped
        once ``limit`` is exceeded.

        Returns:
            The oldest queued snapshot, or ``None`` when none is waiting.
        """
        now = time.time()
        # Guard the divisor: a non-positive fps is treated as one frame per second.
        fps = env.fps if env.fps > 0 else 1
        if now - self._last_frame_time > 1.0 / fps:
            self._last_frame_time = now
            self.frames.append(data)
            overflow = len(self.frames) - self.limit
            if overflow > 0:
                del self.frames[:overflow]
        return self.snapshots.pop(0) if self.snapshots else None

    def receive(self, text: str) -> None:
        """Record *text* as a user message and queue it for the agent loop."""
        self.chat.append(Message.user(text))
        # The queue is only touched from the loop's own thread.
        self._loop.call_soon_threadsafe(self._chat_q.put_nowait, text)

    def setup(self, agent) -> None:
        """Bind *agent* and spawn the background monitor thread."""
        self.v_agent = agent
        self.logger_hooks = RunnerLoggerHooks(self)

        # Create the loop on the calling thread so `_loop` is already set when
        # `setup` returns; previously it was assigned inside the worker thread
        # and `receive` could observe None if called right after construction.
        loop = asyncio.new_event_loop()
        self._loop = loop

        def _runner() -> None:
            asyncio.set_event_loop(loop)
            try:
                loop.create_task(self._monitor_chat())
                loop.run_forever()
            finally:
                loop.close()

        threading.Thread(target=_runner, daemon=True).start()

    async def _monitor_chat(self) -> None:
        """Run the agent on each queued chat message and append its answer.

        ``is_running`` is True while the agent is working. A failure is
        logged, recorded as an "error" step, and the loop moves on to the
        next message.
        """
        while True:
            text = await self._chat_q.get()
            logger.debug(f"Processing: {text}")
            agent_name = getattr(self.v_agent, 'name', 'unknown')
            self.log_runner_step(RunnerStep(
                timestamp=datetime.now().isoformat(),
                step_type="processing_start",
                agent_name=agent_name,
                turn_number=0,
                details={"user_input": text},
            ))

            self.is_running = True
            try:
                result = await Runner.run(
                    starting_agent=self.v_agent,
                    input=text,
                    context=self,
                    hooks=self.logger_hooks,  # Add our custom hooks here
                )
                output = result.final_output
                rendered = str(output)
                # Post-processing lives inside the try so an unexpected output
                # type cannot escape and terminate this task.
                answer = "" if output is None else rendered
                # Keep only what follows a '</think>' marker, if present.
                final = answer.split('</think>', 1)[-1]

                # Log successful completion
                self.log_runner_step(RunnerStep(
                    timestamp=datetime.now().isoformat(),
                    step_type="final_output",
                    agent_name=agent_name,
                    turn_number=self.logger_hooks.current_turn if self.logger_hooks else 0,
                    details={
                        "output_type": type(output).__name__,
                        "output_preview": rendered[:100] + "..." if len(rendered) > 100 else rendered,
                    },
                ))
            except Exception as exc:  # noqa: BLE001
                full_traceback = traceback.format_exc()
                # Error level so failures stay visible when DEBUG is off.
                logger.error(f"Error in _monitor_chat: {exc}\n{full_traceback}")
                self.chat.clear_pending_tools()

                # Log the error
                self.log_runner_step(RunnerStep(
                    timestamp=datetime.now().isoformat(),
                    step_type="error",
                    agent_name=agent_name,
                    turn_number=self.logger_hooks.current_turn if self.logger_hooks else 0,
                    details={
                        "error_type": type(exc).__name__,
                        "error_message": str(exc),
                        "traceback": full_traceback,
                    },
                ))
                continue
            finally:
                # Single reset point for both the success and the error path.
                self.is_running = False

            self.chat.clear_pending_tools()
            self.chat.append(Message.assistant(final))
            await asyncio.sleep(0)
app/tool.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import json
3
+ import cv2
4
+ import httpx
5
+ from app.config import env
6
+ from app.utils import image_w_box, encode_image
7
+ from agents import RunContextWrapper, function_tool
8
+ from app.memory import Memory,Snapshot
9
+
10
+
11
+
12
+
13
def task(name, image):
    """POST an encoded frame to the task endpoint and return its "result" field.

    Args:
        name: value sent as the "name" form field.
        image: object whose ``tobytes()`` output is uploaded as "frame.jpg".
    Raises:
        httpx.HTTPStatusError: via ``raise_for_status`` on a non-success reply.
    """
    form = {"name": name}
    upload = {"file": ("frame.jpg", image.tobytes(), "image/jpeg")}
    auth = {"Authorization": env.api_key}
    response = httpx.post(
        f"{env.end_task}",
        data=form,
        files=upload,
        timeout=10,
        headers=auth,
    )
    response.raise_for_status()
    body = response.json()
    return body['result']
22
+
23
def completion(messages, model):
    """Send *messages* to *model* through the shared client and return the reply text.

    Args:
        messages: chat messages in the chat-completions format.
        model: model identifier passed to the endpoint.
    Returns:
        The content of the first choice's message.
    Raises:
        RuntimeError: if ``env.client`` is ``None``.
    """
    # env.client is None when the endpoint or key is not configured; fail with
    # a clear message instead of an AttributeError on None.
    if env.client is None:
        raise RuntimeError("OpenAI client is not configured; set END_LANG and API_KEY.")
    response = env.client.chat.completions.create(
        model=model,
        messages=messages,
    )
    return response.choices[0].message.content
29
+
30
+
31
def completion_image(images, prompt, model):
    """Ask *model* about *images*, sending one user message per image.

    Each message carries the text *prompt* followed by that image as a
    base64 data URL produced by ``encode_image``.
    """
    messages = []
    for b64, mime in map(encode_image, images):
        text_part = {"type": "text", "text": prompt}
        image_part = {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}
        messages.append({"role": "user", "content": [text_part, image_part]})
    return completion(messages, model=model)
43
+
44
+ # ------------------------ Function Tools ------------------------
45
@function_tool
def caption(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Generate a descriptive caption for the most recent frame, record it as a snapshot, and return it.
    Returns:
        str:
            The generated caption for the current view (i.e., the latest frame).
    """
    # The docstring above is kept verbatim: function_tool exposes it to the
    # model as the tool description (per the Agents SDK docs).
    mem = wrapper.context
    if not mem.frames:
        # No frame captured yet: answer gracefully, as the video tools do,
        # instead of raising IndexError on frames[-1].
        return "No frames available for caption."
    prompt = "Describe the image with rich details but in a concise manner."
    result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='caption', data=result))
    return result
58
+
59
@function_tool
def ocr(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Perform OCR on the most recent frame, record it as a snapshot, and return the extracted text.
    Returns:
        str:
            The extracted text from the current view (i.e., the latest frame).
    """
    # The docstring above is kept verbatim: function_tool exposes it to the
    # model as the tool description (per the Agents SDK docs).
    mem = wrapper.context
    if not mem.frames:
        # No frame captured yet: answer gracefully, as the video tools do,
        # instead of raising IndexError on frames[-1].
        return "No frames available for OCR."
    prompt = "Extract all text from image/payslip without miss anything."
    result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='ocr', data=result))
    return result
72
+
73
@function_tool
def qa(wrapper: RunContextWrapper[Memory], question: str) -> str:
    """
    Answer a question based on the most recent frame, record it as a snapshot, and return the answer.

    Args:
        question (str): The question to be answered.
    Returns:
        str:
            The answer to the question based on the current view (i.e., the latest frame).
    """
    # The docstring above is kept verbatim: function_tool exposes it to the
    # model as the tool description (per the Agents SDK docs).
    mem = wrapper.context
    if not mem.frames:
        # No frame captured yet: answer gracefully, as the video tools do,
        # instead of raising IndexError on frames[-1].
        return "No frames available for Q&A."
    prompt = f"Answer the question based on the image. Question: {question}"
    result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='qa', data=result))
    return result
89
+
90
+
91
@function_tool
def localize(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Localize all objects in the most recent frame
    Returns:
        str:
            The localization result for the current view (i.e., the latest frame).
            the format is {name:list of bboxes}
    """
    # The docstring above is kept verbatim: function_tool exposes it to the
    # model as the tool description (per the Agents SDK docs).
    mem = wrapper.context
    if not mem.frames:
        # No frame captured yet: avoid IndexError on frames[-1].
        return "No frames available for localization."
    frame = mem.frames[-1]
    # Frames are stored as RGB (the video handler converts before enqueueing)
    # while cv2.imencode expects BGR, so convert first, as encode_image does.
    ok, img = cv2.imencode('.jpg', cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    if not ok:
        raise ValueError("Encoding failed")
    objxbox = task(env.model_loc, img)
    mem.snapshots.append(Snapshot(sender='localize', data=image_w_box(frame, objxbox)))
    return json.dumps(objxbox, indent=2)
106
+
107
+
108
@function_tool
def time(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Get the current time, record it as a snapshot, and return the time.
    Returns:
        str:
            The current time.
    """
    # Local wall-clock time, formatted to the second.
    stamp = f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S}"
    wrapper.context.snapshots.append(Snapshot(sender='time', data=stamp))
    return stamp
120
+
121
def sample_frames(mem: Memory, n: int) -> list:
    """
    Sample frames from the past n seconds of video.

    Args:
        mem (Memory): The memory context containing frames.
        n (int): Number of seconds to look back for video frames.
    Returns:
        list: Sampled frames from the video sequence; empty when no frames
        are stored or when n is not positive.
    """
    if not mem.frames:
        return []

    fps = max(int(env.fps), 1)
    window = min(int(n) * fps, len(mem.frames))
    if window <= 0:
        # frames[-0:] would select the whole buffer rather than nothing.
        return []

    recent_frames = mem.frames[-window:]
    # Take every (fps // 2)-th frame, but never let the step reach 0:
    # at fps == 1 the bare `fps // 2` is 0 and slicing raises ValueError.
    step = max(fps // 2, 1)
    return recent_frames[::step]
139
+
140
@function_tool
def video_caption(wrapper: RunContextWrapper[Memory], n: int = 2) -> str:
    """
    Generate a descriptive caption for a video sequence from the past n seconds of frames.
    The n is a required parameter that specifies how many seconds of video frames to consider.

    Args:
        n (int): Number of seconds to look back for video frames.
    Returns:
        str:
            The generated caption for the video sequence from the past n seconds.
    """
    # `n` is now annotated as int, matching the docstring, so the generated
    # tool schema can type it. The docstring itself is kept verbatim because
    # function_tool exposes it to the model as the tool description.
    mem = wrapper.context
    sampled_frames = sample_frames(mem, n)

    if not sampled_frames:
        return "No frames available for video caption."

    prompt = "Describe this video sequence focusing on any changes or actions that occur over time."
    result = completion_image(sampled_frames, prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='video caption', data=result))
    return result
162
+
163
@function_tool
def video_qa(wrapper: RunContextWrapper[Memory], question: str, n: int = 2) -> str:
    """
    Answer a question based on a video sequence from the past n seconds of frames.

    Args:
        question (str): The question to be answered.
        n (int): Number of seconds to look back for video frames.
    Returns:
        str:
            The answer to the question based on the video sequence from the past n seconds.
    """
    # `n` is now annotated as int, matching the docstring, so the generated
    # tool schema can type it. The docstring itself is kept verbatim because
    # function_tool exposes it to the model as the tool description.
    mem = wrapper.context
    sampled_frames = sample_frames(mem, n)

    if not sampled_frames:
        return "No frames available for video Q&A."

    prompt = f"Answer the question based on this video sequence. Question: {question}"
    result = completion_image(sampled_frames, prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='video qa', data=result))
    return result
app/utils.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import base64
3
+ import supervision as sv
4
+ import numpy as np
5
+ import cv2
6
# Ten pastel swatches; the palette lists the same ten twice, back to back.
_PASTEL_HEX = [
    "#a1c9f4",
    "#ffb482",
    "#8de5a1",
    "#ff9f9b",
    "#d0bbff",
    "#debb9b",
    "#fab0e4",
    "#cfcfcf",
    "#fffea3",
    "#b9f2f0",
]
colors = sv.ColorPalette.from_hex(_PASTEL_HEX * 2)
30
+
31
def image_w_box(image, objxbox):
    """Draw corner boxes, labels and translucent masks for detections on *image*.

    Args:
        image: RGB image array; its first two dimensions size the masks.
        objxbox: mapping of label -> list of ``(x1, y1, x2, y2)`` boxes.
    Returns:
        The annotated RGB image, or *image* itself when there are no boxes.
    """
    # One label per box, in the same order the boxes are flattened below.
    labels = [name for name, boxes in objxbox.items() for _ in boxes]
    if not labels:
        # Nothing to draw; return before allocating masks or annotators.
        return image

    xyxys = np.array([box for boxes in objxbox.values() for box in boxes])
    # Class id = position of the label among the sorted label names.
    class_of = {name: idx for idx, name in enumerate(sorted(objxbox))}
    class_id = np.array([class_of[name] for name in labels])

    masks = np.zeros((len(xyxys), image.shape[0], image.shape[1]), dtype=bool)
    for i, (x1, y1, x2, y2) in enumerate(xyxys):
        # Mark the box area; previously the label string was assigned here,
        # which relied on NumPy coercing a string into a bool array.
        masks[i, int(y1):int(y2), int(x1):int(x2)] = True

    detections = sv.Detections(
        xyxy=xyxys,
        mask=masks,
        class_id=class_id,
    )

    box_annotator = sv.BoxCornerAnnotator(thickness=10, corner_length=30, color=colors)
    label_annotator = sv.LabelAnnotator(color=colors)
    mask_annotator = sv.MaskAnnotator(opacity=0.2, color=colors)

    # Convert RGB to BGR for annotation
    image_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    annotated_image = box_annotator.annotate(scene=image_bgr.copy(), detections=detections)
    annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
    annotated_image = mask_annotator.annotate(scene=annotated_image, detections=detections)

    # After annotation, convert back to RGB
    return cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
62
+
63
+
64
def encode_image(img) -> tuple[str, str]:
    """Encode an image as base64 JPEG and return ``(base64_text, mime_type)``.

    Args:
        img: a PIL image (converted to RGB first) or a NumPy array, which is
            passed through an RGB->BGR conversion before JPEG encoding.
    Raises:
        ValueError: for any other input type, or when JPEG encoding fails.
    """
    if isinstance(img, Image.Image):
        pixels = np.array(img.convert("RGB"))
    else:
        pixels = img
    if not isinstance(pixels, np.ndarray):
        raise ValueError("Unsupported image type")

    success, jpeg = cv2.imencode('.jpg', cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR))
    if not success:
        raise ValueError("Encoding failed")

    return base64.b64encode(jpeg).decode('utf-8'), "image/jpeg"
main.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import os
3
+ import cv2
4
+ import gradio as gr
5
+ from fastrtc import WebRTC
6
+ from app.config import env
7
+ from fastrtc import AdditionalOutputs
8
+ from app.memory import Memory,Message
9
+ from fastrtc import get_cloudflare_turn_credentials
10
+ from app.agent import build_agent
11
+ from fastrtc import get_current_context
12
+ session_memories = {}
13
+
14
def get_session_memory(session_id: str = None) -> Memory:
    """Return the ``Memory`` for ``session_id``, creating it on first use.

    A newly created memory is registered in ``session_memories`` and seeded
    with a single assistant welcome message; later calls with the same id
    return the stored object unchanged.
    """
    if session_id in session_memories:
        return session_memories[session_id]

    memory = Memory(build_agent())
    session_memories[session_id] = memory
    greeting = "👋 Now I can see. Feel free to ask me about anything!"
    memory.chat.append(Message.assistant(greeting))
    return memory
20
+
21
def video_handler(frame):
    """Handle one incoming video frame for the current WebRTC connection.

    The frame is converted with ``cv2.COLOR_BGR2RGB``, queued on the
    connection's session memory, and any status returned by ``enqueue`` is
    written to the chat as a tool message: it replaces the last message when
    that one is still marked ``'pending'`` and is appended otherwise.

    Returns:
        The converted frame and an ``AdditionalOutputs`` carrying the chat
        messages and the WebRTC id.
    """
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    rtcid = get_current_context().webrtc_id
    mem = get_session_memory(rtcid)

    update = mem.enqueue(frame)
    if update:
        last_is_pending = mem.chat.history[-1].metadata.get('status') == 'pending'
        tool_message = Message.tool(update.gr, title=update.sender, status=update.status)
        if last_is_pending:
            # Overwrite the in-progress entry instead of stacking a new one.
            mem.chat.history[-1] = tool_message
        else:
            mem.chat.append(tool_message)

    return frame, AdditionalOutputs(mem.chat.messages, rtcid)
31
+
32
def chat_handler(text, webrtc_state):
    """Handle a submitted chat message.

    Args:
        text: Raw textbox content.
        webrtc_state: WebRTC id of the active stream, or ``None`` when no
            stream has reported one yet.

    Returns:
        ``(textbox_value, chat_messages, webrtc_state)``. The textbox value is
        always ``""``. Without a stream the chat is a single prompt asking the
        user to start the camera.
    """
    if webrtc_state is None:
        prompt = {
            "role": "assistant",
            "content": "Please start your camera first to begin the conversation.",
        }
        return "", [prompt], webrtc_state

    mem = get_session_memory(webrtc_state)
    if mem.is_running:
        # The text is not forwarded while the memory reports it is running.
        return "", mem.chat.messages, webrtc_state

    mem.receive(text.strip())
    return "", mem.chat.messages, webrtc_state
40
+
41
+
42
+
43
+
44
+
45
if __name__ == "__main__":
    # Startup diagnostics: print whether each setting is present (✓/✗),
    # never the values themselves.
    print("🚀 Starting Perceptual Copilot...")
    print(f"HF Spaces: {os.getenv('SPACE_ID') is not None}")
    print(f"Environment check - API_KEY: {'✓' if env.api_key else '✗'}")
    print(f"Environment check - END_LANG: {'✓' if env.end_lang else '✗'}")
    print(f"Environment check - OpenAI Client: {'✓' if env.client else '✗'}")

    # NOTE(review): the nesting of the layout below was reconstructed from a
    # flattened listing — confirm container membership (in particular where
    # the latency note sits) against the original file.
    with gr.Blocks(
        title="🤖 Perceptual Copilot - AI Vision Assistant",
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="orange",
            neutral_hue="slate",
            font=("system-ui", "sans-serif")
        ),
        # Path is relative, so styles.css is looked up in the working directory.
        css=Path("styles.css").read_text(),
    ) as demo:
        # Header section with project context
        gr.Markdown(
            """
            <div class="ultra-sleek-header">
            <h1 class="hero-title">
            <span class="title-primary">Perceptual</span>
            <span class="title-accent">Copilot</span>
            </h1>
            <p class="hero-subtitle">
            <span class="status-dot"></span>
            A visual agent system that integrates OpenAI agents with visual tools to process video streams.
            </p>
            <div class="hero-actions">
            <a class="hero-link hero-link-primary" href="https://proj-robot.jing.vision/" target="_blank" rel="noopener noreferrer">See It in Action</a>
            </div>
            <div class="feature-pills">
            <span class="pill">Embodied agent core</span>
            <span class="pill">Real-time streaming</span>
            <span class="pill">Large vision language model</span>
            <span class="pill">Reasoning</span>
            </div>
            </div>
            """,
            elem_classes="ultra-sleek-header",
        )

        # Holds the WebRTC id: written by the additional-outputs callback
        # below and read by chat_handler; None until the stream reports one.
        state = gr.State(value=None)

        # Main interface with improved layout
        with gr.Row(equal_height=True):
            with gr.Column(scale=1, elem_classes="video-container"):
                video = WebRTC(
                    label="🎥 Camera Stream",
                    rtc_configuration=get_cloudflare_turn_credentials(hf_token=env.hf_token),
                    # Request a square 600x600 camera track.
                    track_constraints={
                        "width": {"exact": 600},
                        "height": {"exact": 600},
                        "aspectRatio": {"exact": 1}},
                    mode="send",
                    modality="video",
                    mirror_webcam=True,
                    width=600,
                    height=600,
                    full_screen=False,
                )

            with gr.Column(scale=1, elem_classes="chat-container"):
                gr.Markdown(
                    """
                    <div class="chat-header">
                    <h3>💬 Chat</h3>
                    </div>
                    """
                )
                chatbot = gr.Chatbot(
                    type="messages",
                    height=450,
                    label="🤖 AI Assistant",
                    placeholder="Chat history will appear here...",
                    show_label=False,
                )

                with gr.Row(elem_classes="items-center"):
                    textbox = gr.Textbox(
                        placeholder="💭 Type a question and press Enter to send.",
                        lines=1,
                        show_label=False,
                    )
                gr.Markdown(
                    """
                    <div class="latency-note latency-note-inline">
                    We self-host our models, so responses may take slightly longer.
                    </div>
                    """
                )
        # Event handlers
        # Each streamed frame goes through video_handler.
        video.stream(
            fn=video_handler,
            inputs=[video],
            outputs=[video],
            concurrency_limit=10,
        )
        # video_handler's AdditionalOutputs (messages, webrtc id) are routed
        # to the chatbot and to the session state.
        video.on_additional_outputs(
            fn=lambda messages, webrtc_id: (messages, webrtc_id),
            outputs=[chatbot, state]
        )

        # Chat handler for textbox
        textbox.submit(
            chat_handler,
            inputs=[textbox, state],
            outputs=[textbox, chatbot, state]
        )

        # Enhanced instructions section
        with gr.Column(elem_classes="instructions-container"):
            gr.Markdown("""
            <div class="info-grid">
            <section class="info-card info-card-wide">
            <div class="info-label">The Core Layer</div>
            <p>
            Perceptual Copilot is the core perception and agent layer behind our embodied agent system, built for <a class="marker-link" href="https://proj-robot.jing.vision/" target="_blank" rel="noopener noreferrer">real-world robot deployment</a>.
            This is a lightweight live preview of that larger system, focused on the core agent and its visual capabilities.
            </p>
            </section>
            <section class="info-card">
            <div class="info-label">Quick Start</div>
            <ul>
            <li>Start the camera and grant browser access.</li>
            <li>Wait for the assistant’s welcome message.</li>
            <li>Ask questions about what is in view of the camera.</li>
            </ul>
            </section>
            <section class="info-card">
            <div class="info-label">Try Asking</div>
            <ul>
            <li>"What do you see in front of me?"</li>
            <li>"Read the text in this image."</li>
            <li>"What changed in the last few seconds?"</li>
            </ul>
            </section>
            <section class="info-card">
            <div class="info-label">In This Demo</div>
            <ul>
            <li>Captioning and open visual Q&amp;A</li>
            <li>OCR on live camera frames</li>
            <li>Short-window video reasoning</li>
            </ul>
            </section>
            </div>
            """)
    demo.queue(default_concurrency_limit=None)
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ huggingface_hub
2
+ python-dotenv
3
+ supervision
4
+ openai-agents
5
+ fastrtc
6
+ gradio
7
+ pydantic
styles.css ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Design tokens: colours and shadows defined once as custom properties. */
:root {
    --page-bg: #ecebe5;
    --page-bg-soft: #f6f5f1;
    --surface: rgba(255, 255, 255, 0.84);
    --surface-strong: rgba(255, 255, 255, 0.92);
    --surface-border: rgba(16, 19, 26, 0.07);
    --text: #10131a;
    --muted: #5f6776;
    --royal-blue: #3f63d8;
    --royal-blue-deep: #2f4fbe;
    --periwinkle: #7f8af8;
    --shadow: 0 14px 34px rgba(16, 19, 26, 0.045);
    --shadow-soft: 0 8px 22px rgba(16, 19, 26, 0.03);
}

/* Page background (soft vertical gradient) and base typography. */
.gradio-container {
    background: linear-gradient(180deg, var(--page-bg-soft) 0%, var(--page-bg) 100%) !important;
    color: var(--text);
    font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
}

/* Shared translucent "card" look for the video, chat and instructions panels. */
.video-container,
.chat-container,
.instructions-container {
    background: var(--surface);
    border: 1px solid var(--surface-border);
    border-radius: 22px;
    padding: 22px;
    box-shadow: var(--shadow);
    backdrop-filter: blur(10px);
}
31
+
32
/* Chat input: rounded, non-resizable field; !important is used on every
   declaration to win over the framework's own textarea styles. */
.chat-container textarea {
    border: 1.5px solid rgba(65, 105, 225, 0.14) !important;
    border-radius: 16px !important;
    padding: 15px 16px !important;
    font-size: 16px !important;
    font-weight: 500 !important;
    background: rgba(255, 255, 255, 0.98) !important;
    box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.85), 0 6px 18px rgba(16, 19, 26, 0.03) !important;
    transition: border-color 0.2s ease, box-shadow 0.2s ease, transform 0.2s ease !important;
    min-height: 50px !important;
    resize: none !important;
}

/* Focus state: the default outline is replaced by a blue border and a soft
   ring, plus a 1px lift. */
.chat-container textarea:focus {
    border-color: var(--royal-blue) !important;
    box-shadow: 0 0 0 4px rgba(65, 105, 225, 0.1) !important;
    outline: none !important;
    transform: translateY(-1px) !important;
}

.chat-container textarea::placeholder {
    color: #6b7280 !important;
    font-size: 15px !important;
    font-weight: 400 !important;
}

/* Strip the wrapper's own border/background so only the textarea is styled. */
.chat-container .gr-text-input {
    border: none !important;
    background: transparent !important;
}

/* Small muted note shown with the chat input. */
.latency-note {
    display: block;
    margin: 0;
    padding: 0;
    border: 0;
    background: transparent;
    color: #7b8290;
    font-size: 0.74rem;
    line-height: 1.35;
    letter-spacing: 0.01em;
    white-space: normal;
}

/* Full-width, centred variant of the note. */
.latency-note-inline {
    width: 100%;
    margin-top: 10px;
    text-align: center;
}
81
/* Instructions panel: adds vertical spacing and a slightly more opaque
   background than the shared card rule. */
.instructions-container {
    margin: 20px 0;
    background: rgba(255, 255, 255, 0.86);
}

/* Links inside the panel: bold blue, underlined only on hover. */
.instructions-container a {
    color: var(--royal-blue);
    font-weight: 700;
    text-decoration: none;
}

.instructions-container a:hover {
    text-decoration: underline;
}
95
+
96
/* Ultra-sleek header styling with modern design */
.ultra-sleek-header {
    text-align: center;
    padding: 20px 20px 5px 20px;
    margin: 0;
    background: transparent;
    border: none;
    position: relative;
    overflow: hidden;
}

/* Decorative faint radial glow behind the header; pointer-events: none keeps
   it from intercepting clicks. */
.ultra-sleek-header::before {
    content: '';
    position: absolute;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    background: radial-gradient(ellipse at center top, rgba(99, 102, 241, 0.05) 0%, transparent 70%);
    pointer-events: none;
}


/* NOTE(review): no element with class "badge-icon" appears in the header
   markup in main.py — this rule and the float keyframes may be unused;
   confirm before removing. */
.badge-icon {
    font-size: 1rem;
    animation: float 3s ease-in-out infinite;
}

/* Gentle 4px up-and-down bob. */
@keyframes float {

    0%,
    100% {
        transform: translateY(0px);
    }

    50% {
        transform: translateY(-4px);
    }
}
135
+
136
/* Large, tightly tracked page title. */
.hero-title {
    margin: 8px 0 6px;
    font-size: 4rem;
    line-height: 1;
    letter-spacing: -0.05em;
    font-weight: 800;
    color: var(--text);
    text-shadow: 0 10px 28px rgba(114, 89, 255, 0.08);
}

/* First word of the title: gradient-filled text (background clipped to the
   glyphs, fill made transparent). position: relative anchors the ::after
   underline defined further down. */
.title-primary {
    background: linear-gradient(45deg,
            #4169E1 0%,
            #8A2BE2 50%,
            #E91E63 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    position: relative;
}

/* Second word of the title: solid dark colour. */
.title-accent {
    display: inline-block;
    color: #132542;
    margin-left: 10px;
}

/* Where background-clip: text is supported this later rule replaces the
   gradient above with a horizontal one. */
@supports ((-webkit-background-clip: text) or (background-clip: text)) {
    .title-primary {
        background: linear-gradient(90deg, #4f6ef7 0%, #7a43ee 46%, #e21c74 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        background-clip: text;
        color: transparent;
    }
}

/* Pill-shaped gradient underline drawn 12px below the first title word. */
.title-primary::after {
    content: "";
    position: absolute;
    left: 0;
    right: 0;
    bottom: -12px;
    height: 6px;
    border-radius: 999px;
    background: linear-gradient(90deg, rgba(135, 139, 255, 0.9), rgba(221, 123, 255, 0.85));
    box-shadow: 0 8px 22px rgba(165, 105, 255, 0.18);
}
182
+
183
/* Subtitle row: status dot and tagline centred on one flex line. */
.hero-subtitle {
    margin: 14px auto 0;
    max-width: 560px;
    display: flex;
    align-items: center;
    justify-content: center;
    gap: 10px;
    color: var(--muted);
    font-size: 1.12rem;
    font-weight: 500;
    line-height: 1.6;
}

/* Small green dot; flex-shrink: 0 keeps it round when the text wraps. */
.status-dot {
    display: inline-block;
    width: 8px;
    height: 8px;
    border-radius: 999px;
    background: #18b77f;
    position: relative;
    flex-shrink: 0;
}

/* Same-coloured copy of the dot that runs the ping animation on top of it. */
.status-dot::before {
    content: "";
    position: absolute;
    inset: 0;
    border-radius: inherit;
    background: #18b77f;
    animation: ping 2s cubic-bezier(0, 0, 0.2, 1) infinite;
}

/* Expands to 2x and fades out by 75% of each cycle. */
@keyframes ping {
    75%,
    100% {
        transform: scale(2);
        opacity: 0;
    }
}
222
/* Row holding the header call-to-action link(s). */
.hero-actions {
    display: flex;
    justify-content: center;
    align-items: center;
    gap: 12px;
    margin-top: 18px;
}

/* Base pill-button look for header links. */
.hero-link {
    display: inline-flex;
    align-items: center;
    justify-content: center;
    padding: 12px 20px;
    border-radius: 999px;
    text-decoration: none !important;
    font-weight: 800;
    letter-spacing: -0.01em;
    border: 1px solid transparent;
}

/* Primary variant: blue gradient with a continuous pulse; overflow: hidden
   clips the moving sheen drawn by ::after. */
.hero-link-primary {
    position: relative;
    overflow: hidden;
    background: linear-gradient(135deg, var(--royal-blue), var(--royal-blue-deep) 62%, var(--periwinkle) 100%);
    color: #ffffff !important;
    box-shadow: 0 16px 34px rgba(65, 105, 225, 0.26);
    animation: buttonPulse 2.8s ease-in-out infinite;
}

/* Skewed highlight band that buttonSheen slides across the button. */
.hero-link-primary::after {
    content: "";
    position: absolute;
    top: 0;
    left: -45%;
    width: 34%;
    height: 100%;
    background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.28), transparent);
    transform: skewX(-20deg);
    animation: buttonSheen 3.2s ease-in-out infinite;
}

/* NOTE(review): buttonPulse also animates transform and box-shadow on this
   element, and running animations take precedence over these declarations —
   verify the hover effect is actually visible. */
.hero-link-primary:hover {
    transform: translateY(-1px);
    box-shadow: 0 20px 38px rgba(65, 105, 225, 0.3);
}

/* 1px lift with a stronger shadow at the midpoint of each cycle. */
@keyframes buttonPulse {

    0%,
    100% {
        transform: translateY(0);
        box-shadow: 0 16px 34px rgba(65, 105, 225, 0.26);
    }

    50% {
        transform: translateY(-1px);
        box-shadow: 0 20px 40px rgba(65, 105, 225, 0.34);
    }
}

/* The band waits off the left edge for 55% of the cycle, then sweeps past
   the right edge by 82%. */
@keyframes buttonSheen {

    0%,
    55% {
        left: -45%;
    }

    82%,
    100% {
        left: 125%;
    }
}
294
/* Centred, wrapping row of feature tags under the header. */
.feature-pills {
    display: flex;
    justify-content: center;
    gap: 10px;
    margin-top: 18px;
    flex-wrap: wrap;
}

/* Individual tag: small, bold, fully rounded. */
.pill {
    background: rgba(255, 255, 255, 0.66);
    border: 1px solid rgba(16, 19, 26, 0.06);
    border-radius: 999px;
    padding: 7px 14px;
    font-size: 0.82rem;
    font-weight: 700;
    color: #536071;
    box-shadow: none;
}
312
+
313
/* Three-column grid of info cards. The two gradient layers paint faint 1px
   guide lines every third of the width and every half of the height over a
   near-white base. */
.info-grid {
    display: grid;
    grid-template-columns: repeat(3, minmax(0, 1fr));
    gap: 0;
    margin-top: 6px;
    border: 1px solid rgba(16, 19, 26, 0.09);
    border-radius: 24px;
    overflow: hidden;
    background:
        linear-gradient(90deg, rgba(16, 19, 26, 0.05) 1px, transparent 1px) 0 0 / 33.333% 100%,
        linear-gradient(180deg, rgba(16, 19, 26, 0.04) 1px, transparent 1px) 0 0 / 100% 50%,
        rgba(255, 255, 255, 0.94);
    box-shadow: 0 20px 44px rgba(16, 19, 26, 0.035);
}

/* A cell: no box of its own, only a right-hand divider. */
.info-card {
    background: transparent;
    border: 0;
    border-right: 1px solid rgba(16, 19, 26, 0.09);
    padding: 30px 36px 32px;
    box-shadow: none;
    border-radius: 0;
}

/* Full-width cell spanning all three columns, divided from the row below. */
.info-card-wide {
    grid-column: span 3;
    border-right: 0;
    border-bottom: 1px solid rgba(16, 19, 26, 0.09);
    background: linear-gradient(180deg, rgba(255, 255, 255, 0.92) 0%, rgba(249, 250, 253, 0.96) 100%);
}

/* Small uppercase heading at the top of each card. */
.info-label {
    margin-bottom: 12px;
    color: #4268f5;
    font-size: 0.74rem;
    font-weight: 800;
    letter-spacing: 0.18em;
    text-transform: uppercase;
}

.info-card p,
.info-card li {
    color: #666d7b;
    font-size: 1rem;
    line-height: 1.62;
}

.info-card p {
    margin: 0;
}

/* Larger, darker body text in the wide card. */
.info-card-wide p {
    width: 100%;
    max-width: none;
    font-size: 1.18rem;
    line-height: 1.72;
    color: #4f5666;
    text-wrap: pretty;
}

/* Lists: native markers removed; bullets are drawn by li::before below. */
.info-card ul {
    margin: 0;
    padding: 0;
    list-style: none;
}

.info-card li+li {
    margin-top: 10px;
}

.info-card li {
    position: relative;
    padding-left: 16px;
}

/* Custom 5px round bullet. */
.info-card li::before {
    content: "";
    position: absolute;
    left: 0;
    top: 0.7em;
    width: 5px;
    height: 5px;
    border-radius: 999px;
    background: rgba(66, 104, 245, 0.5);
}
394
+
395
/* Inline link underlined with a wavy two-stroke SVG (inlined as a data URI)
   instead of text-decoration. box-decoration-break: clone repeats the
   background on every line when the link wraps. */
.marker-link {
    position: relative;
    display: inline;
    margin: 0;
    padding: 0 0.02em 0.22em;
    color: #3a5cf3 !important;
    font-weight: 700;
    text-decoration: none !important;
    background-image: url("data:image/svg+xml,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20viewBox%3D%220%200%20220%2026%22%20preserveAspectRatio%3D%22none%22%3E%3Cpath%20d%3D%22M0%2013%20C%2018%203%2036%2023%2055%2013%20S%2092%2023%20110%2013%20S%20148%203%20165%2013%20S%20202%2023%20220%2013%22%20fill%3D%22none%22%20stroke%3D%22%232145de%22%20stroke-width%3D%224.2%22%20stroke-linecap%3D%22round%22%20stroke-linejoin%3D%22round%22%2F%3E%3Cpath%20d%3D%22M0%2016%20C%2020%208%2035%2024%2055%2016%20S%2090%208%20110%2016%20S%20145%2024%20165%2016%20S%20200%208%20220%2016%22%20fill%3D%22none%22%20stroke%3D%22%235070ff%22%20stroke-width%3D%222.6%22%20stroke-linecap%3D%22round%22%20stroke-linejoin%3D%22round%22%20stroke-opacity%3D%22.82%22%2F%3E%3C%2Fsvg%3E");
    background-repeat: no-repeat;
    background-position: 0 125%;
    background-size: 100% 0.52em;
    -webkit-box-decoration-break: clone;
    box-decoration-break: clone;
    transition: color 0.22s ease, background-size 0.22s ease, text-shadow 0.22s ease;
}

/* Hover / keyboard focus: darker text, taller underline, faint glow. */
.marker-link:hover,
.marker-link:focus-visible {
    color: #133ad4 !important;
    background-size: 100% 0.7em;
    text-shadow: 0 0 16px rgba(80, 112, 255, 0.12);
}

/* NOTE(review): no rule in this stylesheet gives .marker-link::after any
   `content`, so this rule appears to have no effect — confirm. */
.marker-link:hover::after,
.marker-link:focus-visible::after {
    opacity: 1;
    transform: translateY(-10%);
}

/* The default focus outline is removed; the rule above supplies the
   focus-visible styling in its place. */
.marker-link:focus-visible {
    outline: none;
}
428
+
429
/* Position-based border overrides for the info cards: children 2 and 3 get
   no bottom border, and child 4 (the last card in main.py's markup) gets no
   right-hand divider. */
.info-card:nth-child(2),
.info-card:nth-child(3) {
    border-bottom: 0;
}
.info-card:nth-child(4) {
    border-right: 0;
}

/* Utility class (applied to the textbox row in main.py): flex row with
   vertically centred children. */
.items-center {
    display: flex !important;
    align-items: center !important;
    gap: 8px !important;
}
441
/* Hide the framework footer; several selectors are listed, presumably to
   cover different Gradio versions' markup — verify which still match. */
.gradio-container footer,
.gradio-container .api-docs,
.gradio-container .built-with,
.gradio-container .settings,
.gradio-container .footer,
.gradio-container div[data-testid="footer"],
.gradio-container .gradio-footer {
    display: none !important;
}

/* Hides every link whose href contains "gradio" anywhere in the container. */
.gradio-container a[href*="gradio"] {
    display: none !important;
}

/* Hide the chatbot's clear button (case-insensitive match on aria-label /
   title containing "clear"). */
.gradio-container .chatbot .message-wrap button[aria-label*="clear" i],
.gradio-container .chatbot .message-wrap button[title*="clear" i],
.chat-container button[aria-label*="clear" i],
.chatbot button[aria-label*="clear" i],
.message-wrap button[aria-label*="clear" i] {
    display: none !important;
}

/* Not scoped to a container: hides any button on the page whose icon's
   data-testid contains clear, delete or trash. */
button:has(svg[data-testid*="clear"]),
button:has(svg[data-testid*="delete"]),
button:has(svg[data-testid*="trash"]) {
    display: none !important;
}
468
/* Tablet and below: tighter header, stacked subtitle, single-column info grid.
   Every rule is explicitly closed so the following rules are neither nested
   inside an earlier selector nor left unbalanced at end of file. */
@media (max-width: 768px) {
    .ultra-sleek-header {
        padding: 18px 14px 12px;
    }

    .latency-note {
        white-space: normal;
    }

    .hero-title {
        font-size: 3rem;
    }

    /* Status dot sits above the tagline instead of beside it. */
    .hero-subtitle {
        flex-direction: column;
        gap: 8px;
        padding: 0 12px;
        font-size: 1.02rem;
    }

    .video-container,
    .chat-container,
    .instructions-container {
        padding: 18px;
        border-radius: 20px;
    }

    /* One column; the plain background drops the desktop guide lines. */
    .info-grid {
        grid-template-columns: 1fr;
        background: rgba(255, 255, 255, 0.95);
    }

    .info-card-wide {
        grid-column: span 1;
    }

    /* Cards stack vertically, so dividers move from the right to the bottom. */
    .info-card,
    .info-card-wide {
        border-right: 0;
        border-bottom: 1px solid rgba(16, 19, 26, 0.08);
        padding: 22px 20px 24px;
    }

    .info-card:last-child {
        border-bottom: 0;
    }
}
514
+
515
/* Small phones: smaller title, with the second title word on its own line. */
@media (max-width: 480px) {
    .hero-title {
        font-size: 2.5rem;
        line-height: 1.08;
    }

    /* display: block pushes the accent word below the first word. */
    .title-accent {
        display: block;
        margin: 4px 0 0;
    }
}