from inspect_viz import Data, Selection
from inspect_viz.input import checkbox_group, select
from inspect_viz.layout import vconcat, vspace
from inspect_viz.plot import plot, legend
from inspect_viz.mark import dot, rule_x, text, regression_y
from inspect_viz.table import table
from inspect_viz.transform import ci_bounds, epoch_ms
# Read the benchmark results that every input and mark below is bound to.
evals = Data.from_file("benchmarks.parquet")

# Transforms that compute the confidence-interval bounds (at the 0.95 level)
# from the score and stderr columns; used as the y1/y2 extent of the error
# bars drawn by rule_x below.
ci_lower, ci_upper = ci_bounds(
    score="score_headline_value",
    level=0.95,
    stderr="score_headline_stderr",
)

# Stack the inputs above the plot. The layout is left as a bare trailing
# expression (presumably so a notebook/document front end renders it --
# confirm against how this script is executed).
vconcat(
    # select benchmark
    select(
        evals,
        label="Eval: ",
        column="task_name",
        value="GPQA Diamond",
        width=425,
    ),
    # filter models by organization(s)
    checkbox_group(evals, column="model_organization_name"),
    # spacer between the inputs and the plot
    vspace(15),
    # dot plot w/ error bars
    plot(
        # benchmark score
        dot(
            evals,
            x=epoch_ms("model_release_date"),
            y="score_headline_value",
            r=3,
            fill="model_organization_name",
            # extra named channels carried alongside each dot
            channels={
                "Model": "model_display_name",
                "Scorer": "score_headline_name",
                "Stderr": "score_headline_stderr",
                "Log Viewer": "log_viewer",
            },
        ),
        # confidence interval
        rule_x(
            evals,
            x=epoch_ms("model_release_date"),
            y="score_headline_value",
            y1=ci_lower,
            y2=ci_upper,
            stroke="model_organization_name",
            stroke_opacity=0.4,
            marker="tick-x",
        ),
        # regression line
        regression_y(
            evals,
            x=epoch_ms("model_release_date"),
            y="score_headline_value",
            stroke="#AAAAAA",
        ),
        # frontier annotation: model-name labels restricted via the
        # "frontier" filter (presumably a boolean column in the data --
        # confirm against the parquet schema)
        text(
            evals,
            text="model_display_name",
            x=epoch_ms("model_release_date"),
            y="score_headline_value",
            line_anchor="middle",
            frame_anchor="right",
            filter="frontier",
            dx=-4,
            fill="model_organization_name",
        ),
        # color legend wired to the dataset's selection
        legend=legend("color", target=evals.selection),
        x_domain="fixed",
        y_domain=[0, 1.0],
        x_label="Release Date",
        y_label="Score",
        color_label="Organization",
        color_domain="fixed",
        x_tick_format="%b. %Y",
        grid=True,
    ),
)