Mandark-droid committed
Commit c040b82 · 1 Parent(s): cb9eb3c

Add Trends tab with time series visualization

Files changed (3)
  1. app.py +16 -0
  2. components/__init__.py +13 -6
  3. components/analytics_charts.py +745 -0
app.py CHANGED
@@ -13,6 +13,7 @@ load_dotenv()
 # Import data loader and components
 from data_loader import create_data_loader_from_env
 from components.leaderboard_table import generate_leaderboard_html
+from components.analytics_charts import create_trends_plot
 
 # Initialize data loader
 data_loader = create_data_loader_from_env()
@@ -73,6 +74,13 @@ def load_drilldown(agent_type, provider):
     return display_df
 
 
+def load_trends():
+    """Load trends visualization"""
+    df = data_loader.load_leaderboard()
+    fig = create_trends_plot(df)
+    return fig
+
+
 # Build Gradio app
 with gr.Blocks(title="TraceMind-AI") as app:
     gr.Markdown("# 🧠 TraceMind-AI")
@@ -128,6 +136,9 @@ with gr.Blocks(title="TraceMind-AI") as app:
                 interactive=False
             )
 
+        with gr.TabItem("📈 Trends"):
+            trends_plot = gr.Plot()
+
     # Hidden textbox for row selection (JavaScript bridge)
     selected_row_index = gr.Textbox(visible=False, elem_id="selected_row_index")
 
@@ -137,6 +148,11 @@ with gr.Blocks(title="TraceMind-AI") as app:
         outputs=[leaderboard_by_model, model_filter]
     )
 
+    app.load(
+        fn=load_trends,
+        outputs=[trends_plot]
+    )
+
     apply_filters_btn.click(
         fn=apply_filters,
         inputs=[model_filter, provider_filter, sort_by],
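
Note: the wiring above is Gradio's standard pattern for a chart tab — a gr.Plot component placed inside a TabItem and filled by a callback registered with Blocks.load(), so the figure renders once at startup. A minimal, self-contained sketch of that pattern, separate from this commit (make_figure is a hypothetical stand-in for load_trends):

import gradio as gr
import plotly.graph_objects as go


def make_figure():
    # Stand-in for load_trends(): any function returning a Plotly figure works here
    return go.Figure(data=go.Scatter(x=[1, 2, 3], y=[2, 1, 3], mode="lines+markers"))


with gr.Blocks(title="Trends tab sketch") as demo:
    with gr.Tabs():
        with gr.TabItem("📈 Trends"):
            trends_plot = gr.Plot()

    # Mirrors app.load(fn=load_trends, outputs=[trends_plot]) in the diff above
    demo.load(fn=make_figure, outputs=[trends_plot])

if __name__ == "__main__":
    demo.launch()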
components/__init__.py CHANGED
@@ -21,14 +21,16 @@ from .leaderboard_table import (
     generate_filter_summary_html
 )
 
+from .analytics_charts import (
+    create_trends_plot,
+    create_performance_heatmap,
+    create_speed_accuracy_scatter,
+    create_cost_efficiency_scatter,
+    create_comparison_radar
+)
+
 # Additional components (to be added)
 # from .thought_graph import create_thought_graph
-# from .analytics_charts import (
-#     create_performance_heatmap,
-#     create_speed_accuracy_scatter,
-#     create_cost_efficiency_scatter,
-#     create_comparison_radar
-# )
 # from .report_cards import (
 #     generate_leaderboard_summary_card,
 #     generate_run_report_card,
@@ -48,4 +50,9 @@ __all__ = [
     'generate_leaderboard_html',
     'generate_empty_state_html',
     'generate_filter_summary_html',
+    'create_trends_plot',
+    'create_performance_heatmap',
+    'create_speed_accuracy_scatter',
+    'create_cost_efficiency_scatter',
+    'create_comparison_radar',
 ]
components/analytics_charts.py ADDED
@@ -0,0 +1,745 @@
+"""
+Analytics Charts Component
+Interactive visualizations for leaderboard analytics
+"""
+
+import plotly.graph_objects as go
+import pandas as pd
+import numpy as np
+from typing import List, Dict, Any, Optional
+
+
+def create_performance_heatmap(df: pd.DataFrame) -> go.Figure:
+    """
+    Create an interactive heatmap of models × metrics
+
+    Args:
+        df: Leaderboard DataFrame with metrics
+
+    Returns:
+        Plotly figure with heatmap visualization
+    """
+
+    if df.empty:
+        return _create_empty_figure("No data available for heatmap")
+
+    # Select metrics to display
+    metrics = [
+        'success_rate',
+        'avg_duration_ms',
+        'total_cost_usd',
+        'co2_emissions_g',
+        'gpu_utilization_avg',
+        'total_tokens'
+    ]
+
+    # Filter to only available metrics
+    available_metrics = [m for m in metrics if m in df.columns]
+
+    if not available_metrics:
+        return _create_empty_figure("No metrics available for analysis")
+
+    # Aggregate by model (in case of multiple runs)
+    model_stats = df.groupby('model')[available_metrics].mean()
+
+    # Prepare data matrix (rows=metrics, columns=models)
+    heatmap_data = []
+    heatmap_text = []
+    metric_labels = []
+
+    for metric in available_metrics:
+        values = model_stats[metric].values
+
+        # Normalize to 0-1 scale
+        # For metrics where lower is better (duration, cost, co2), invert the scale
+        if metric in ['avg_duration_ms', 'total_cost_usd', 'co2_emissions_g']:
+            # Invert: lower is better (green)
+            max_val = values.max()
+            if max_val > 0:
+                normalized = 1 - (values / max_val)
+            else:
+                normalized = np.zeros_like(values)
+        else:
+            # Higher is better (green)
+            max_val = values.max()
+            if max_val > 0:
+                normalized = values / max_val
+            else:
+                normalized = np.zeros_like(values)
+
+        heatmap_data.append(normalized)
+
+        # Create hover text with actual values
+        if metric == 'success_rate':
+            text_row = [f"{v:.1f}%" for v in values]
+        elif metric == 'avg_duration_ms':
+            text_row = [f"{v:.0f}ms" for v in values]
+        elif metric in ['total_cost_usd']:
+            text_row = [f"${v:.4f}" for v in values]
+        elif metric == 'co2_emissions_g':
+            text_row = [f"{v:.2f}g" for v in values]
+        elif metric == 'gpu_utilization_avg':
+            text_row = [f"{v:.1f}%" if pd.notna(v) else "N/A" for v in values]
+        else:
+            text_row = [f"{v:.0f}" for v in values]
+
+        heatmap_text.append(text_row)
+
+        # Create readable metric labels
+        label = metric.replace('_', ' ').replace('avg', 'Avg').replace('usd', 'USD').title()
+        metric_labels.append(label)
+
+    # Get model names
+    models = model_stats.index.tolist()
+
+    # Shorten model names if too long
+    model_labels = [m.split('/')[-1] if '/' in m else m for m in models]
+    model_labels = [m[:20] + '...' if len(m) > 20 else m for m in model_labels]
+
+    # Create heatmap
+    fig = go.Figure(data=go.Heatmap(
+        z=heatmap_data,
+        x=model_labels,
+        y=metric_labels,
+        text=heatmap_text,
+        texttemplate='%{text}',
+        textfont={"size": 10},
+        colorscale='RdYlGn',  # Red (bad) → Yellow → Green (good)
+        hoverongaps=False,
+        hovertemplate='<b>%{y}</b><br>Model: %{x}<br>Value: %{text}<br>Score: %{z:.2f}<extra></extra>',
+        colorbar=dict(
+            title=dict(
+                text="Performance<br>Score",
+                side="right"
+            ),
+            tickmode="linear",
+            tick0=0,
+            dtick=0.25
+        )
+    ))
+
+    fig.update_layout(
+        title={
+            'text': '🔥 Model Performance Heatmap',
+            'x': 0.5,
+            'xanchor': 'center',
+            'font': {'size': 20}
+        },
+        xaxis_title='Model',
+        yaxis_title='Metric',
+        height=500,
+        plot_bgcolor='#f8f9fa',
+        paper_bgcolor='white',
+        xaxis=dict(tickangle=-45),
+        margin=dict(l=150, r=100, t=100, b=150),
+    )
+
+    return fig
+
+
+def create_speed_accuracy_scatter(df: pd.DataFrame) -> go.Figure:
+    """
+    Speed vs Accuracy trade-off scatter plot
+
+    Args:
+        df: Leaderboard DataFrame
+
+    Returns:
+        Plotly figure with scatter plot
+    """
+
+    if df.empty:
+        return _create_empty_figure("No data available for scatter plot")
+
+    # Check required columns
+    required_cols = ['model', 'success_rate', 'avg_duration_ms']
+    if not all(col in df.columns for col in required_cols):
+        return _create_empty_figure(f"Missing required columns: {required_cols}")
+
+    # Aggregate by model; only add optional columns that actually exist,
+    # since passing a missing column name to .agg() raises a KeyError
+    agg_dict = {
+        'success_rate': 'mean',
+        'avg_duration_ms': 'mean'
+    }
+    if 'total_cost_usd' in df.columns:
+        agg_dict['total_cost_usd'] = 'mean'
+    if 'agent_type' in df.columns:
+        agg_dict['agent_type'] = 'first'
+
+    model_stats = df.groupby('model').agg(agg_dict).reset_index()
+
+    # Create figure
+    fig = go.Figure()
+
+    # Get unique agent types
+    agent_types = model_stats['agent_type'].unique() if 'agent_type' in model_stats.columns else ['all']
+
+    # Color scheme
+    colors = {
+        'tool': '#E67E22',     # Orange
+        'code': '#3498DB',     # Blue
+        'both': '#9B59B6',     # Purple
+        'all': '#1ABC9C',      # Teal
+        'unknown': '#95A5A6'   # Gray
+    }
+
+    for agent_type in agent_types:
+        if agent_type == 'all':
+            subset = model_stats
+        else:
+            subset = model_stats[model_stats['agent_type'] == agent_type]
+
+        # Prepare hover text
+        hover_texts = []
+        for _, row in subset.iterrows():
+            model_name = row['model'].split('/')[-1] if '/' in row['model'] else row['model']
+            hover = f"<b>{model_name}</b><br>"
+            hover += f"Success Rate: {row['success_rate']:.1f}%<br>"
+            hover += f"Avg Duration: {row['avg_duration_ms']:.0f}ms<br>"
+            if 'total_cost_usd' in row and pd.notna(row['total_cost_usd']):
+                hover += f"Cost: ${row['total_cost_usd']:.4f}"
+            hover_texts.append(hover)
+
+        # Bubble size based on cost (if available)
+        if 'total_cost_usd' in subset.columns:
+            sizes = subset['total_cost_usd'] * 5000  # Scale up for visibility
+            sizes = sizes.clip(lower=10, upper=100)  # Reasonable range
+        else:
+            sizes = 30  # Default size
+
+        fig.add_trace(go.Scatter(
+            x=subset['avg_duration_ms'],
+            y=subset['success_rate'],
+            mode='markers+text',
+            name=str(agent_type).title(),
+            marker=dict(
+                size=sizes,
+                color=colors.get(str(agent_type).lower(), colors['unknown']),
+                opacity=0.7,
+                line=dict(width=2, color='white')
+            ),
+            text=[m.split('/')[-1][:15] for m in subset['model']],
+            textposition='top center',
+            textfont=dict(size=9),
+            hovertext=hover_texts,
+            hoverinfo='text'
+        ))
+
+    # Add quadrant lines (median split)
+    if len(model_stats) > 1:
+        median_speed = model_stats['avg_duration_ms'].median()
+        median_accuracy = model_stats['success_rate'].median()
+
+        fig.add_hline(
+            y=median_accuracy,
+            line_dash="dash",
+            line_color="gray",
+            opacity=0.4,
+            annotation_text=f"Median Accuracy: {median_accuracy:.1f}%",
+            annotation_position="right"
+        )
+        fig.add_vline(
+            x=median_speed,
+            line_dash="dash",
+            line_color="gray",
+            opacity=0.4,
+            annotation_text=f"Median Speed: {median_speed:.0f}ms",
+            annotation_position="top"
+        )
+
+        # Add zone annotations
+        max_accuracy = model_stats['success_rate'].max()
+        min_speed = model_stats['avg_duration_ms'].min()
+
+        fig.add_annotation(
+            x=min_speed + (median_speed - min_speed) * 0.5,
+            y=max_accuracy * 0.98,
+            text="⭐ Fast & Accurate",
+            showarrow=False,
+            font=dict(size=14, color='green', family='Arial Black'),
+            bgcolor='rgba(144, 238, 144, 0.2)',
+            borderpad=5
+        )
+
+    fig.update_layout(
+        title={
+            'text': '⚡ Speed vs Accuracy Trade-off',
+            'x': 0.5,
+            'xanchor': 'center',
+            'font': {'size': 20}
+        },
+        xaxis_title='Average Duration (ms)',
+        yaxis_title='Success Rate (%)',
+        xaxis_type='log',  # Log scale for duration
+        height=600,
+        plot_bgcolor='white',
+        paper_bgcolor='#f8f9fa',
+        showlegend=True,
+        legend=dict(
+            title=dict(text='Agent Type'),
+            orientation="v",
+            yanchor="top",
+            y=0.99,
+            xanchor="right",
+            x=0.99
+        ),
+        hovermode='closest'
+    )
+
+    # Add grid for better readability
+    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
+    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
+
+    return fig
+
+
+def create_cost_efficiency_scatter(df: pd.DataFrame) -> go.Figure:
+    """
+    Cost-Performance Efficiency scatter plot
+
+    Args:
+        df: Leaderboard DataFrame
+
+    Returns:
+        Plotly figure with cost efficiency scatter
+    """
+
+    if df.empty:
+        return _create_empty_figure("No data available for cost analysis")
+
+    # Check required columns
+    if 'success_rate' not in df.columns or 'total_cost_usd' not in df.columns:
+        return _create_empty_figure("Missing required columns: success_rate, total_cost_usd")
+
+    # Aggregate by model; only add optional columns that actually exist,
+    # since passing a missing column name to .agg() raises a KeyError
+    agg_dict = {
+        'success_rate': 'mean',
+        'total_cost_usd': 'mean'
+    }
+    if 'avg_duration_ms' in df.columns:
+        agg_dict['avg_duration_ms'] = 'mean'
+    if 'provider' in df.columns:
+        agg_dict['provider'] = 'first'
+
+    model_stats = df.groupby('model').agg(agg_dict).reset_index()
+
+    # Calculate efficiency metric: success_rate / cost
+    model_stats['efficiency'] = model_stats['success_rate'] / (model_stats['total_cost_usd'] + 0.0001)  # Avoid division by zero
+
+    # Create figure
+    fig = go.Figure()
+
+    # Get unique providers
+    providers = model_stats['provider'].unique() if 'provider' in model_stats.columns else ['all']
+
+    # Color scheme
+    provider_colors = {
+        'litellm': '#3498DB',       # Blue (API)
+        'transformers': '#2ECC71',  # Green (GPU/local)
+        'all': '#9B59B6',           # Purple
+        'unknown': '#95A5A6'        # Gray
+    }
+
+    for provider in providers:
+        if provider == 'all':
+            subset = model_stats
+        else:
+            subset = model_stats[model_stats['provider'] == provider]
+
+        # Prepare hover text
+        hover_texts = []
+        for _, row in subset.iterrows():
+            model_name = row['model'].split('/')[-1] if '/' in row['model'] else row['model']
+            hover = f"<b>{model_name}</b><br>"
+            hover += f"Success Rate: {row['success_rate']:.1f}%<br>"
+            hover += f"Total Cost: ${row['total_cost_usd']:.4f}<br>"
+            hover += f"Efficiency: {row['efficiency']:.0f} (points/$)<br>"
+            if 'avg_duration_ms' in row and pd.notna(row['avg_duration_ms']):
+                hover += f"Duration: {row['avg_duration_ms']:.0f}ms"
+            hover_texts.append(hover)
+
+        # Bubble size based on duration (if available)
+        if 'avg_duration_ms' in subset.columns:
+            # Invert: smaller duration = smaller bubble
+            sizes = subset['avg_duration_ms'] / 100  # Scale down
+            sizes = sizes.clip(lower=10, upper=80)   # Reasonable range
+        else:
+            sizes = 30  # Default size
+
+        fig.add_trace(go.Scatter(
+            x=subset['total_cost_usd'],
+            y=subset['success_rate'],
+            mode='markers+text',
+            name=str(provider).title(),
+            marker=dict(
+                size=sizes,
+                color=provider_colors.get(str(provider).lower(), provider_colors['unknown']),
+                opacity=0.7,
+                line=dict(width=2, color='white')
+            ),
+            text=[m.split('/')[-1][:15] for m in subset['model']],
+            textposition='top center',
+            textfont=dict(size=9),
+            hovertext=hover_texts,
+            hoverinfo='text'
+        ))
+
+    # Add cost bands
+    if len(model_stats) > 0:
+        max_cost = model_stats['total_cost_usd'].max()
+
+        # Budget band: < $0.01
+        if max_cost > 0.01:
+            fig.add_vrect(
+                x0=0, x1=0.01,
+                fillcolor="lightgreen", opacity=0.1,
+                layer="below", line_width=0,
+                annotation_text="Budget", annotation_position="top left"
+            )
+
+        # Mid band: $0.01-$0.10
+        if max_cost > 0.10:
+            fig.add_vrect(
+                x0=0.01, x1=0.10,
+                fillcolor="yellow", opacity=0.1,
+                layer="below", line_width=0,
+                annotation_text="Mid-Range", annotation_position="top left"
+            )
+
+        # Premium band: > $0.10
+        if max_cost > 0.10:
+            fig.add_vrect(
+                x0=0.10, x1=max_cost * 1.1,
+                fillcolor="orange", opacity=0.1,
+                layer="below", line_width=0,
+                annotation_text="Premium", annotation_position="top left"
+            )
+
+    # Highlight top 3 most efficient models
+    top_efficient = model_stats.nlargest(3, 'efficiency')
+    for _, row in top_efficient.iterrows():
+        fig.add_annotation(
+            x=row['total_cost_usd'],
+            y=row['success_rate'],
+            text="⭐",
+            showarrow=False,
+            font=dict(size=20)
+        )
+
+    fig.update_layout(
+        title={
+            'text': '💰 Cost-Performance Efficiency',
+            'x': 0.5,
+            'xanchor': 'center',
+            'font': {'size': 20}
+        },
+        xaxis_title='Total Cost (USD)',
+        yaxis_title='Success Rate (%)',
+        xaxis_type='log',  # Log scale for cost
+        height=600,
+        plot_bgcolor='white',
+        paper_bgcolor='#f8f9fa',
+        showlegend=True,
+        legend=dict(
+            title=dict(text='Provider'),
+            orientation="v",
+            yanchor="top",
+            y=0.99,
+            xanchor="right",
+            x=0.99
+        ),
+        hovermode='closest'
+    )
+
+    # Add grid for better readability
+    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
+    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
+
+    return fig
+
+
+def _create_empty_figure(message: str) -> go.Figure:
+    """
+    Create an empty figure with a message
+
+    Args:
+        message: Message to display
+
+    Returns:
+        Plotly figure with annotation
+    """
+    fig = go.Figure()
+
+    fig.add_annotation(
+        text=message,
+        xref="paper", yref="paper",
+        x=0.5, y=0.5,
+        xanchor='center', yanchor='middle',
+        showarrow=False,
+        font=dict(size=16, color='gray')
+    )
+
+    fig.update_layout(
+        height=500,
+        plot_bgcolor='white',
+        paper_bgcolor='#f8f9fa',
+        xaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
+        yaxis=dict(showgrid=False, showticklabels=False, zeroline=False)
+    )
+
+    return fig
+
+
+def create_comparison_radar(runs: List[Dict[str, Any]]) -> go.Figure:
+    """
+    Create a multi-dimensional radar chart comparing 2-3 runs
+
+    Args:
+        runs: List of run data dictionaries (2-3 models)
+
+    Returns:
+        Plotly figure with radar chart comparison
+    """
+
+    if not runs or len(runs) < 2:
+        return _create_empty_figure("Please select at least 2 runs to compare")
+
+    if len(runs) > 3:
+        runs = runs[:3]  # Limit to 3 runs for readability
+
+    # Define dimensions for radar chart
+    dimensions = []
+    dimension_names = []
+
+    # Helper function to normalize values (0-1 scale)
+    def normalize(values, invert=False):
+        """Normalize values to 0-1, optionally inverting (lower is better)"""
+        values = np.array(values, dtype=float)
+        min_val, max_val = np.nanmin(values), np.nanmax(values)
+        if max_val == min_val:
+            return [0.5] * len(values)
+        normalized = (values - min_val) / (max_val - min_val)
+        if invert:
+            normalized = 1 - normalized
+        return normalized.tolist()
+
+    # Extract metrics from all runs
+    success_rates = [run.get('success_rate', 0) / 100 for run in runs]  # Already 0-1
+    durations = [run.get('avg_duration_ms', 0) for run in runs]
+    costs = [run.get('total_cost_usd', 0) for run in runs]
+    tokens = [run.get('total_tokens', 0) for run in runs]
+    co2 = [run.get('co2_emissions_g', 0) for run in runs]
+    gpu_util = [run.get('gpu_utilization_avg', None) for run in runs]
+
+    # Calculate Token Efficiency (success per 1000 tokens)
+    # Use max() to avoid division by zero
+    token_efficiency = [
+        (run.get('success_rate', 0) / 100) / max((run.get('total_tokens', 0) / 1000), 0.001)
+        for run in runs
+    ]
+
+    # Build dimensions (normalized 0-1)
+    dimensions.append(success_rates)  # Already 0-1
+    dimension_names.append('Success Rate')
+
+    dimensions.append(normalize(durations, invert=True))  # Faster is better
+    dimension_names.append('Speed')
+
+    dimensions.append(normalize(costs, invert=True))  # Cheaper is better
+    dimension_names.append('Cost Efficiency')
+
+    dimensions.append(normalize(token_efficiency))  # Higher is better
+    dimension_names.append('Token Efficiency')
+
+    dimensions.append(normalize(co2, invert=True))  # Lower CO2 is better
+    dimension_names.append('CO2 Efficiency')
+
+    # Add GPU Utilization if available
+    if any(g is not None for g in gpu_util):
+        gpu_values = [g / 100 if g is not None else 0 for g in gpu_util]  # Normalize to 0-1
+        dimensions.append(gpu_values)
+        dimension_names.append('GPU Utilization')
+
+    # Create radar chart
+    fig = go.Figure()
+
+    colors = ['#667eea', '#f093fb', '#43e97b']  # Purple, Pink, Green
+
+    for idx, run in enumerate(runs):
+        model_name = run.get('model', f'Run {idx+1}')
+        if '/' in model_name:
+            model_name = model_name.split('/')[-1]  # Show only model name, not provider
+
+        # Extract values for this run across all dimensions
+        values = [dim[idx] for dim in dimensions]
+
+        # Close the radar chart by repeating first value
+        values_closed = values + [values[0]]
+        theta_closed = dimension_names + [dimension_names[0]]
+
+        fig.add_trace(go.Scatterpolar(
+            r=values_closed,
+            theta=theta_closed,
+            name=model_name,
+            fill='toself',
+            fillcolor=colors[idx],
+            opacity=0.3,
+            line=dict(color=colors[idx], width=2),
+            marker=dict(size=8, color=colors[idx]),
+            hovertemplate='<b>%{theta}</b><br>' +
+                          'Score: %{r:.2f}<br>' +
+                          f'<b>{model_name}</b>' +
+                          '<extra></extra>'
+        ))
+
+    fig.update_layout(
+        polar=dict(
+            bgcolor='#f8f9fa',
+            radialaxis=dict(
+                visible=True,
+                range=[0, 1],
+                showticklabels=True,
+                ticks='',
+                gridcolor='rgba(100, 100, 100, 0.2)',
+                tickfont=dict(size=10)
+            ),
+            angularaxis=dict(
+                gridcolor='rgba(100, 100, 100, 0.2)',
+                linecolor='rgba(100, 100, 100, 0.4)',
+                tickfont=dict(size=12, color='#0f172a')
+            )
+        ),
+        showlegend=True,
+        legend=dict(
+            orientation="h",
+            yanchor="bottom",
+            y=-0.2,
+            xanchor="center",
+            x=0.5,
+            bgcolor='rgba(255, 255, 255, 0.8)',
+            bordercolor='#ccc',
+            borderwidth=1
+        ),
+        title=dict(
+            text='Multi-Dimensional Model Comparison',
+            x=0.5,
+            xanchor='center',
+            font=dict(size=18, color='#0f172a', family='Inter, sans-serif')
+        ),
+        height=600,
+        paper_bgcolor='white',
+        font=dict(family='Inter, sans-serif')
+    )
+
+    return fig
+
+
+def create_trends_plot(df: pd.DataFrame) -> go.Figure:
+    """
+    Create time series visualization of evaluation metrics over time
+
+    Args:
+        df: Leaderboard DataFrame with timestamp column
+
+    Returns:
+        Plotly figure with time series chart
+    """
+
+    if df.empty:
+        return _create_empty_figure("No data available for trends")
+
+    # Check if timestamp column exists
+    if 'timestamp' not in df.columns:
+        return _create_empty_figure("Missing timestamp column for trends analysis")
+
+    # Convert timestamp to datetime
+    df = df.copy()
+    df['timestamp'] = pd.to_datetime(df['timestamp'])
+
+    # Sort by timestamp
+    df = df.sort_values('timestamp')
+
+    # Aggregate by date (in case multiple runs per day)
+    df['date'] = df['timestamp'].dt.date
+
+    daily_stats = df.groupby('date').agg({
+        'success_rate': 'mean',
+        'avg_duration_ms': 'mean',
+        'total_cost_usd': 'mean',
+        'total_tokens': 'mean'
+    }).reset_index()
+
+    # Create figure with secondary y-axis
+    fig = go.Figure()
+
+    # Success Rate
+    fig.add_trace(go.Scatter(
+        x=daily_stats['date'],
+        y=daily_stats['success_rate'],
+        name='Success Rate (%)',
+        mode='lines+markers',
+        line=dict(color='#2ECC71', width=3),
+        marker=dict(size=8),
+        yaxis='y1',
+        hovertemplate='<b>Success Rate</b><br>Date: %{x}<br>Rate: %{y:.1f}%<extra></extra>'
+    ))
+
+    # Duration
+    fig.add_trace(go.Scatter(
+        x=daily_stats['date'],
+        y=daily_stats['avg_duration_ms'],
+        name='Avg Duration (ms)',
+        mode='lines+markers',
+        line=dict(color='#3498DB', width=3),
+        marker=dict(size=8),
+        yaxis='y2',
+        hovertemplate='<b>Duration</b><br>Date: %{x}<br>Time: %{y:.0f}ms<extra></extra>'
+    ))
+
+    # Cost
+    fig.add_trace(go.Scatter(
+        x=daily_stats['date'],
+        y=daily_stats['total_cost_usd'],
+        name='Avg Cost (USD)',
+        mode='lines+markers',
+        line=dict(color='#E67E22', width=3),
+        marker=dict(size=8),
+        yaxis='y2',
+        hovertemplate='<b>Cost</b><br>Date: %{x}<br>Cost: $%{y:.4f}<extra></extra>'
+    ))
+
+    fig.update_layout(
+        title={
+            'text': '📈 Evaluation Metrics Trends Over Time',
+            'x': 0.5,
+            'xanchor': 'center',
+            'font': {'size': 20}
+        },
+        xaxis=dict(
+            title='Date',
+            showgrid=True,
+            gridcolor='lightgray'
+        ),
+        yaxis=dict(
+            title=dict(text='Success Rate (%)', font=dict(color='#2ECC71')),
+            tickfont=dict(color='#2ECC71'),
+            showgrid=True,
+            gridcolor='lightgray'
+        ),
+        yaxis2=dict(
+            title=dict(text='Duration (ms) / Cost (USD)', font=dict(color='#3498DB')),
+            tickfont=dict(color='#3498DB'),
+            overlaying='y',
+            side='right'
+        ),
+        hovermode='x unified',
+        height=500,
+        plot_bgcolor='white',
+        paper_bgcolor='#f8f9fa',
+        showlegend=True,
+        legend=dict(
+            orientation="h",
+            yanchor="bottom",
+            y=1.02,
+            xanchor="right",
+            x=1
+        )
+    )
+
+    return fig
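
To review the new charts outside the app, a minimal smoke test can be run from the repository root. This is not part of the commit: the DataFrame below is synthetic and its values are made up, but the column names are the ones analytics_charts.py actually reads (timestamp, model, agent_type, provider, success_rate, avg_duration_ms, total_cost_usd, total_tokens, co2_emissions_g, gpu_utilization_avg).

import pandas as pd

from components.analytics_charts import create_performance_heatmap, create_trends_plot

# Synthetic example data for illustration only
df = pd.DataFrame({
    "timestamp": ["2025-01-01", "2025-01-02", "2025-01-03"],
    "model": ["org/model-a", "org/model-b", "org/model-a"],
    "agent_type": ["tool", "code", "tool"],
    "provider": ["litellm", "transformers", "litellm"],
    "success_rate": [72.0, 65.5, 80.0],          # percent
    "avg_duration_ms": [1800.0, 950.0, 1500.0],
    "total_cost_usd": [0.012, 0.004, 0.010],
    "total_tokens": [15000, 9000, 12000],
    "co2_emissions_g": [0.8, 0.3, 0.6],
    "gpu_utilization_avg": [None, 64.0, None],
})

create_trends_plot(df).show()           # daily averages over time (the new Trends tab figure)
create_performance_heatmap(df).show()   # models × metrics heatmap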