Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions lib/argument_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class ArgumentParser
:skip_yjit,
:skip_zjit,
:with_pre_init,
:pvalue,
keyword_init: true
)

Expand Down Expand Up @@ -144,6 +145,10 @@ def parse(argv)
args.rss = true
end

opts.on("--pvalue", "show p-value and significance columns for each comparison") do
args.pvalue = true
end

opts.on("--graph", "generate a graph image of benchmark results") do
args.graph = true
end
Expand Down Expand Up @@ -224,6 +229,7 @@ def default_args
name_filters: [],
excludes: [],
rss: false,
pvalue: false,
graph: false,
no_pinning: false,
force_pinning: false,
Expand Down
1 change: 1 addition & 0 deletions lib/benchmark_runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def build_output_text(ruby_descriptions, table, format, bench_failures)
output_str << "- #{name} 1st itr: ratio of #{base_name}/#{name} time for the first benchmarking iteration.\n"
output_str << "- #{base_name}/#{name}: ratio of #{base_name}/#{name} time. Higher is better for #{name}. Above 1 represents a speedup.\n"
end
output_str << "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)\n"
end

output_str
Expand Down
3 changes: 2 additions & 1 deletion lib/benchmark_runner/cli.rb
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def run
builder = ResultsTableBuilder.new(
executable_names: ruby_descriptions.keys,
bench_data: bench_data,
include_rss: args.rss
include_rss: args.rss,
include_pvalue: args.pvalue
)
table, format = builder.build

Expand Down
66 changes: 62 additions & 4 deletions lib/results_table_builder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ class ResultsTableBuilder
SECONDS_TO_MS = 1000.0
BYTES_TO_MIB = 1024.0 * 1024.0

def initialize(executable_names:, bench_data:, include_rss: false)
def initialize(executable_names:, bench_data:, include_rss: false, include_pvalue: false)
@executable_names = executable_names
@bench_data = bench_data
@include_rss = include_rss
@include_pvalue = include_pvalue
@base_name = executable_names.first
@other_names = executable_names[1..]
@bench_names = compute_bench_names
Expand Down Expand Up @@ -48,6 +49,9 @@ def build_header

@other_names.each do |name|
header << "#{@base_name}/#{name}"
if @include_pvalue
header << "p-value" << "sig"
end
end

header
Expand All @@ -66,7 +70,10 @@ def build_format
end

@other_names.each do |_name|
format << "%.3f"
format << "%s"
if @include_pvalue
format << "%s" << "%s"
end
end

format
Expand Down Expand Up @@ -105,9 +112,60 @@ def build_comparison_columns(row, other_ts, other_rsss)

# Appends the ratio columns of one benchmark row to +row+ (mutated in place).
#
# Column order:
#   1. one first-iteration ratio (base_t0 / other_t0) per comparison executable;
#   2. per comparison executable, the mean-time ratio rendered by format_ratio
#      (so it carries the significance marker), followed -- only when
#      @include_pvalue is set -- by the formatted p-value and the
#      significance level text.
#
# The mean ratios are emitted exactly once, already formatted as strings.
# Pushing the raw float ratios as well would add one extra cell per
# comparison executable that has no matching header column.
#
# row       - Array of cells for the current benchmark.
# base_t0   - first-iteration value of the base executable.
# other_t0s - first-iteration values of the comparison executables.
# base_t    - samples of the base executable (passed to mean and Stats.welch_p_value).
# other_ts  - one sample collection per comparison executable.
def build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts)
  row.concat(other_t0s.map { |other_t0| base_t0 / other_t0 })

  other_ts.each do |other_t|
    pval = Stats.welch_p_value(base_t, other_t)
    row << format_ratio(mean(base_t) / mean(other_t), pval)
    next unless @include_pvalue

    row << format_p_value(pval)
    row << significance_level(pval)
  end
end

# Renders a ratio with three decimals and, when significance_symbol yields a
# non-empty marker for +pval+, appends that marker in parentheses
# (e.g. "1.235 (**)").
def format_ratio(ratio, pval)
  marker = significance_symbol(pval)
  text = "%.3f" % ratio
  return text if marker.empty?

  "#{text} (#{marker})"
end

# Formats a p-value for display.
#
# Returns "N/A" when +pval+ is nil, a fixed three-decimal string for values
# of at least 0.001, and scientific notation with one decimal otherwise.
def format_p_value(pval)
  return "N/A" if pval.nil?

  template = pval >= 0.001 ? "%.3f" : "%.1e"
  template % pval
end

# Maps a p-value to its star marker: "***" below 0.001, "**" below 0.01,
# "*" below 0.05, and an empty string otherwise (including a nil p-value).
def significance_symbol(pval)
  return "" if pval.nil?

  # Ordered from the strictest cutoff; the first one the p-value is under wins.
  cutoffs = [[0.001, "***"], [0.01, "**"], [0.05, "*"]]
  hit = cutoffs.find { |cutoff, _stars| pval < cutoff }
  hit ? hit.last : ""
end

# Describes the strictest significance threshold a p-value falls under:
# "p < 0.001", "p < 0.01" or "p < 0.05". Returns an empty string when the
# p-value is nil or not below 0.05.
def significance_level(pval)
  return "" if pval.nil?
  return "p < 0.001" if pval < 0.001
  return "p < 0.01" if pval < 0.01
  return "p < 0.05" if pval < 0.05

  ""
end

def extract_first_iteration_times(bench_name)
Expand Down
104 changes: 104 additions & 0 deletions misc/stats.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,101 @@
class Stats
class << self
# Two-tailed Welch's t-test (unequal variances) on samples +a+ and +b+.
#
# Returns the p-value as a Float, or nil when either sample has fewer than
# two observations (no sample variance can be computed then).
def welch_p_value(a, b)
  return nil if [a, b].any? { |sample| sample.size < 2 }

  first = new(a)
  second = new(b)
  n_a = a.size.to_f
  n_b = b.size.to_f

  # Each sample's contribution to the squared standard error: s^2 / n.
  term_a = first.sample_variance / n_a
  term_b = second.sample_variance / n_b
  se_sq = term_a + term_b

  # Neither sample varies at all: identical means are indistinguishable
  # (p = 1), different means are trivially different (p = 0).
  if se_sq == 0.0
    return first.mean == second.mean ? 1.0 : 0.0
  end

  t = (first.mean - second.mean) / Math.sqrt(se_sq)

  # Welch-Satterthwaite approximation of the degrees of freedom.
  df = se_sq ** 2 / (term_a ** 2 / (n_a - 1) + term_b ** 2 / (n_b - 1))

  # Two-tailed p-value: I_x(df/2, 1/2) with x = df / (df + t^2).
  x = df / (df + t * t)
  regularized_incomplete_beta(x, df / 2.0, 0.5)
end

private

# Regularized incomplete beta function I_x(alpha, beta), i.e. the probability
# that a Beta(alpha, beta)-distributed variable is <= x. The continued
# fraction itself is evaluated by beta_continued_fraction.
def regularized_incomplete_beta(x, alpha, beta)
  if x <= 0.0
    0.0
  elsif x >= 1.0
    1.0
  elsif x > (alpha + 1.0) / (alpha + beta + 2.0)
    # Symmetry I_x(a, b) = 1 - I_{1-x}(b, a): evaluate the side on which the
    # continued fraction converges faster.
    1.0 - regularized_incomplete_beta(1.0 - x, beta, alpha)
  else
    # Prefactor x^alpha * (1-x)^beta / B(alpha, beta), built in log-space to
    # avoid overflow (1/B expressed through log-gamma values).
    log_gamma = ->(value) { Math.lgamma(value).first }
    ln_prefactor = log_gamma.call(alpha + beta) - log_gamma.call(alpha) - log_gamma.call(beta) +
                   alpha * Math.log(x) + beta * Math.log(1.0 - x)

    Math.exp(ln_prefactor) * beta_continued_fraction(x, alpha, beta) / alpha
  end
end

# Continued-fraction factor of I_x(alpha, beta), evaluated with Lentz's
# algorithm. Every pass of the loop applies two coefficients of the fraction
# (the even term, then the odd term). Iteration stops once the odd-term
# correction is within 1e-10 of 1, or after 200 passes, in which case a
# warning is written and the current estimate is returned anyway.
def beta_continued_fraction(x, alpha, beta)
  tiny = 1.0e-30
  # Lentz's method divides by these running terms; keep them away from zero.
  away_from_zero = ->(value) { value.abs < tiny ? tiny : value }

  c = 1.0
  d = 1.0 / away_from_zero.call(1.0 - (alpha + beta) * x / (alpha + 1.0))
  estimate = d

  # Advances both running terms by one coefficient and returns the
  # multiplicative correction for the estimate.
  advance = lambda do |coeff|
    d = away_from_zero.call(1.0 + coeff * d)
    c = away_from_zero.call(1.0 + coeff / c)
    d = 1.0 / d
    d * c
  end

  converged = (1..200).any? do |m|
    two_m = 2 * m

    # Even coefficient of the continued fraction.
    even = m * (beta - m) * x / ((alpha + two_m - 1.0) * (alpha + two_m))
    estimate *= advance.call(even)

    # Odd coefficient of the continued fraction.
    odd = -(alpha + m) * (alpha + beta + m) * x / ((alpha + two_m) * (alpha + two_m + 1.0))
    correction = advance.call(odd)
    estimate *= correction

    (correction - 1.0).abs < 1.0e-10
  end

  warn "Stats.beta_continued_fraction: did not converge (alpha=#{alpha}, beta=#{beta}, x=#{x})" unless converged
  estimate
end
end

# Stores +data+ in @data, the collection that the instance's statistics
# methods (mean, stddev, sample_variance, median) read. The object is kept
# by reference; no copy is made here.
def initialize(data)
@data = data
end
Expand All @@ -15,13 +112,20 @@ def mean
@data.sum(0.0) / @data.size
end

# Population standard deviation of @data: the square root of the mean squared
# deviation from the mean, dividing by N (not N - 1). It describes these
# specific values rather than estimating a wider population.
def stddev
  center = mean
  squared_total = @data.sum(0.0) do |value|
    deviation = value - center
    deviation * deviation
  end
  Math.sqrt(squared_total / @data.size)
end

# Unbiased sample variance of @data (N - 1 denominator, Bessel's correction),
# the estimator used for inference such as Welch's t-test.
#
# Returns Float::NAN when there are fewer than two samples, because the
# estimator is undefined there. Without the guard an empty collection would
# yield -0.0 (0 / -1.0), which looks like "no variance" instead of
# "undefined"; a single sample already evaluated to NaN (0.0 / 0.0).
def sample_variance
  return Float::NAN if @data.size < 2

  m = mean
  @data.sum(0.0) { |v| (v - m) ** 2 } / (@data.size - 1).to_f
end

# Returns whatever compute_median yields for the full @data collection.
# NOTE(review): compute_median is defined outside this view -- presumably it
# returns the median of the collection it is given; confirm there.
def median
compute_median(@data)
end
Expand Down
10 changes: 10 additions & 0 deletions test/argument_parser_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def setup_mock_ruby(path)
assert_equal [], args.categories
assert_equal [], args.name_filters
assert_equal false, args.rss
assert_equal false, args.pvalue
assert_equal false, args.graph
assert_equal false, args.no_pinning
assert_equal false, args.turbo
Expand Down Expand Up @@ -428,6 +429,15 @@ def setup_mock_ruby(path)
end
end

# Passing --pvalue on the command line must switch the pvalue argument on.
describe '--pvalue option' do
  it 'sets pvalue flag' do
    parsed = ArgumentParser.new.parse(['--pvalue'])

    assert_equal true, parsed.pvalue
  end
end

describe '--graph option' do
it 'sets graph flag' do
parser = ArgumentParser.new
Expand Down
1 change: 1 addition & 0 deletions test/benchmark_runner_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,7 @@
assert_includes result, 'Legend:'
assert_includes result, '- ruby-yjit 1st itr: ratio of ruby-base/ruby-yjit time for the first benchmarking iteration.'
assert_includes result, '- ruby-base/ruby-yjit: ratio of ruby-base/ruby-yjit time. Higher is better for ruby-yjit. Above 1 represents a speedup.'
assert_includes result, "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)"
end

it 'includes formatted table in output' do
Expand Down
Loading
Loading