diff --git a/lib/argument_parser.rb b/lib/argument_parser.rb
index 36c2186a..df106bed 100644
--- a/lib/argument_parser.rb
+++ b/lib/argument_parser.rb
@@ -23,6 +23,7 @@ class ArgumentParser
     :skip_yjit,
     :skip_zjit,
     :with_pre_init,
+    :pvalue,
     keyword_init: true
   )
 
@@ -144,6 +145,10 @@ def parse(argv)
       args.rss = true
     end
 
+    opts.on("--pvalue", "show p-value and significance columns for each comparison") do
+      args.pvalue = true
+    end
+
     opts.on("--graph", "generate a graph image of benchmark results") do
      args.graph = true
     end
@@ -224,6 +229,7 @@ def default_args
       name_filters: [],
       excludes: [],
       rss: false,
+      pvalue: false,
       graph: false,
       no_pinning: false,
       force_pinning: false,
diff --git a/lib/benchmark_runner.rb b/lib/benchmark_runner.rb
index 487d57a8..98860ca8 100644
--- a/lib/benchmark_runner.rb
+++ b/lib/benchmark_runner.rb
@@ -66,6 +66,7 @@ def build_output_text(ruby_descriptions, table, format, bench_failures)
       output_str << "- #{name} 1st itr: ratio of #{base_name}/#{name} time for the first benchmarking iteration.\n"
       output_str << "- #{base_name}/#{name}: ratio of #{base_name}/#{name} time. Higher is better for #{name}. Above 1 represents a speedup.\n"
     end
+    output_str << "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)\n"
   end
 
   output_str
diff --git a/lib/benchmark_runner/cli.rb b/lib/benchmark_runner/cli.rb
index f73e3ea9..2a68e21f 100644
--- a/lib/benchmark_runner/cli.rb
+++ b/lib/benchmark_runner/cli.rb
@@ -69,7 +69,8 @@ def run
     builder = ResultsTableBuilder.new(
       executable_names: ruby_descriptions.keys,
       bench_data: bench_data,
-      include_rss: args.rss
+      include_rss: args.rss,
+      include_pvalue: args.pvalue
     )
     table, format = builder.build
 
diff --git a/lib/results_table_builder.rb b/lib/results_table_builder.rb
index 6bbc53f7..d066c8c8 100644
--- a/lib/results_table_builder.rb
+++ b/lib/results_table_builder.rb
@@ -5,10 +5,11 @@ class ResultsTableBuilder
   SECONDS_TO_MS = 1000.0
   BYTES_TO_MIB = 1024.0 * 1024.0
 
-  def initialize(executable_names:, bench_data:, include_rss: false)
+  def initialize(executable_names:, bench_data:, include_rss: false, include_pvalue: false)
     @executable_names = executable_names
     @bench_data = bench_data
     @include_rss = include_rss
+    @include_pvalue = include_pvalue
     @base_name = executable_names.first
     @other_names = executable_names[1..]
     @bench_names = compute_bench_names
@@ -48,6 +49,9 @@ def build_header
 
     @other_names.each do |name|
       header << "#{@base_name}/#{name}"
+      if @include_pvalue
+        header << "p-value" << "sig"
+      end
     end
 
     header
@@ -66,7 +70,10 @@ def build_format
     end
 
     @other_names.each do |_name|
-      format << "%.3f"
+      format << "%s"
+      if @include_pvalue
+        format << "%s" << "%s"
+      end
     end
 
     format
@@ -105,9 +112,60 @@ def build_comparison_columns(row, other_ts, other_rsss)
   def build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts)
     ratio_1sts = other_t0s.map { |other_t0| base_t0 / other_t0 }
-    ratios = other_ts.map { |other_t| mean(base_t) / mean(other_t) }
 
     row.concat(ratio_1sts)
-    row.concat(ratios)
+
+    other_ts.each do |other_t|
+      pval = Stats.welch_p_value(base_t, other_t)
+      row << format_ratio(mean(base_t) / mean(other_t), pval)
+      if @include_pvalue
+        row << format_p_value(pval)
+        row << significance_level(pval)
+      end
+    end
+  end
+
+  def format_ratio(ratio, pval)
+    sym = significance_symbol(pval)
+    formatted = "%.3f" % ratio
+    sym.empty? ? formatted : "#{formatted} (#{sym})"
+  end
+
+  def format_p_value(pval)
+    return "N/A" if pval.nil?
+
+    if pval >= 0.001
+      "%.3f" % pval
+    else
+      "%.1e" % pval
+    end
+  end
+
+  def significance_symbol(pval)
+    return "" if pval.nil?
+
+    if pval < 0.001
+      "***"
+    elsif pval < 0.01
+      "**"
+    elsif pval < 0.05
+      "*"
+    else
+      ""
+    end
+  end
+
+  def significance_level(pval)
+    return "" if pval.nil?
+
+    if pval < 0.001
+      "p < 0.001"
+    elsif pval < 0.01
+      "p < 0.01"
+    elsif pval < 0.05
+      "p < 0.05"
+    else
+      ""
+    end
   end
 
   def extract_first_iteration_times(bench_name)
diff --git a/misc/stats.rb b/misc/stats.rb
index 44c03dca..4c70ab38 100644
--- a/misc/stats.rb
+++ b/misc/stats.rb
@@ -1,4 +1,101 @@
 class Stats
+  class << self
+    # Welch's t-test (two-tailed). Returns the p-value, or nil if
+    # either sample is too small to compute a meaningful test.
+    def welch_p_value(a, b)
+      return nil if a.size < 2 || b.size < 2
+
+      stats_a = new(a)
+      stats_b = new(b)
+
+      n_a = a.size.to_f
+      n_b = b.size.to_f
+      var_a = stats_a.sample_variance
+      var_b = stats_b.sample_variance
+
+      se_sq = var_a / n_a + var_b / n_b
+      if se_sq == 0.0
+        # Both samples have zero variance: if the means match they're
+        # indistinguishable; otherwise they're trivially different.
+        return stats_a.mean == stats_b.mean ? 1.0 : 0.0
+      end
+
+      t = (stats_a.mean - stats_b.mean) / Math.sqrt(se_sq)
+
+      # Welch-Satterthwaite degrees of freedom
+      df = se_sq ** 2 / ((var_a / n_a) ** 2 / (n_a - 1) + (var_b / n_b) ** 2 / (n_b - 1))
+
+      # Two-tailed p-value: I_x(df/2, 1/2) where x = df/(df + t^2)
+      x = df / (df + t * t)
+      regularized_incomplete_beta(x, df / 2.0, 0.5)
+    end
+
+    private
+
+    # Regularized incomplete beta function I_x(alpha, beta) via continued fraction (Lentz's method).
+    # Returns the probability that a Beta(alpha, beta)-distributed variable is <= x.
+    def regularized_incomplete_beta(x, alpha, beta)
+      return 0.0 if x <= 0.0
+      return 1.0 if x >= 1.0
+
+      # Symmetry relation: pick the side that converges faster
+      if x > (alpha + 1.0) / (alpha + beta + 2.0)
+        return 1.0 - regularized_incomplete_beta(1.0 - x, beta, alpha)
+      end
+
+      # x^alpha * (1-x)^beta / B(alpha, beta), computed in log-space to avoid overflow
+      ln_normalizer = Math.lgamma(alpha + beta)[0] - Math.lgamma(alpha)[0] - Math.lgamma(beta)[0] +
+        alpha * Math.log(x) + beta * Math.log(1.0 - x)
+      normalizer = Math.exp(ln_normalizer)
+
+      normalizer * beta_continued_fraction(x, alpha, beta) / alpha
+    end
+
+    # Evaluates the continued fraction for I_x(alpha, beta) using Lentz's algorithm.
+    # Each iteration computes two sub-steps (even and odd terms of the fraction).
+    def beta_continued_fraction(x, alpha, beta)
+      floor = 1.0e-30 # prevent division by zero in Lentz's method
+      converged = false
+
+      numerator_term = 1.0
+      denominator_term = 1.0 - (alpha + beta) * x / (alpha + 1.0)
+      denominator_term = floor if denominator_term.abs < floor
+      denominator_term = 1.0 / denominator_term
+      fraction = denominator_term
+
+      (1..200).each do |iteration|
+        two_i = 2 * iteration
+
+        # Even sub-step: d_{2m} coefficient of the continued fraction
+        coeff = iteration * (beta - iteration) * x / ((alpha + two_i - 1.0) * (alpha + two_i))
+        denominator_term = 1.0 + coeff * denominator_term
+        denominator_term = floor if denominator_term.abs < floor
+        numerator_term = 1.0 + coeff / numerator_term
+        numerator_term = floor if numerator_term.abs < floor
+        denominator_term = 1.0 / denominator_term
+        fraction *= denominator_term * numerator_term
+
+        # Odd sub-step: d_{2m+1} coefficient of the continued fraction
+        coeff = -(alpha + iteration) * (alpha + beta + iteration) * x / ((alpha + two_i) * (alpha + two_i + 1.0))
+        denominator_term = 1.0 + coeff * denominator_term
+        denominator_term = floor if denominator_term.abs < floor
+        numerator_term = 1.0 + coeff / numerator_term
+        numerator_term = floor if numerator_term.abs < floor
+        denominator_term = 1.0 / denominator_term
+        correction = denominator_term * numerator_term
+        fraction *= correction
+
+        if (correction - 1.0).abs < 1.0e-10
+          converged = true
+          break
+        end
+      end
+
+      warn "Stats.beta_continued_fraction: did not converge (alpha=#{alpha}, beta=#{beta}, x=#{x})" unless converged
+      fraction
+    end
+  end
+
   def initialize(data)
     @data = data
   end
@@ -15,6 +112,7 @@ def mean
     @data.sum(0.0) / @data.size
   end
 
+  # Population standard deviation (N denominator); describes these specific values.
   def stddev
     mean = self.mean
     diffs_squared = @data.map { |v| (v-mean) * (v-mean) }
@@ -22,6 +120,12 @@ def stddev
     Math.sqrt(mean_squared)
   end
 
+  # Unbiased sample variance (N-1 denominator, Bessel's correction); used for inference.
+  def sample_variance
+    m = mean
+    @data.sum { |v| (v - m) ** 2 } / (@data.size - 1).to_f
+  end
+
   def median
     compute_median(@data)
   end
diff --git a/test/argument_parser_test.rb b/test/argument_parser_test.rb
index 3c49f560..442fb0cd 100644
--- a/test/argument_parser_test.rb
+++ b/test/argument_parser_test.rb
@@ -49,6 +49,7 @@ def setup_mock_ruby(path)
       assert_equal [], args.categories
       assert_equal [], args.name_filters
       assert_equal false, args.rss
+      assert_equal false, args.pvalue
       assert_equal false, args.graph
       assert_equal false, args.no_pinning
       assert_equal false, args.turbo
@@ -428,6 +429,15 @@ def setup_mock_ruby(path)
     end
   end
 
+  describe '--pvalue option' do
+    it 'sets pvalue flag' do
+      parser = ArgumentParser.new
+      args = parser.parse(['--pvalue'])
+
+      assert_equal true, args.pvalue
+    end
+  end
+
   describe '--graph option' do
     it 'sets graph flag' do
       parser = ArgumentParser.new
diff --git a/test/benchmark_runner_test.rb b/test/benchmark_runner_test.rb
index f9c6e492..00fdbd6e 100644
--- a/test/benchmark_runner_test.rb
+++ b/test/benchmark_runner_test.rb
@@ -387,6 +387,7 @@
     assert_includes result, 'Legend:'
     assert_includes result, '- ruby-yjit 1st itr: ratio of ruby-base/ruby-yjit time for the first benchmarking iteration.'
     assert_includes result, '- ruby-base/ruby-yjit: ratio of ruby-base/ruby-yjit time. Higher is better for ruby-yjit. Above 1 represents a speedup.'
+    assert_includes result, "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)"
   end
 
   it 'includes formatted table in output' do
diff --git a/test/results_table_builder_test.rb b/test/results_table_builder_test.rb
index f5b811fc..b85ff79a 100644
--- a/test/results_table_builder_test.rb
+++ b/test/results_table_builder_test.rb
@@ -58,13 +58,13 @@
     assert_equal ['bench', 'ruby (ms)', 'stddev (%)', 'ruby-yjit (ms)', 'stddev (%)', 'ruby-yjit 1st itr', 'ruby/ruby-yjit'], table[0]
-    assert_equal ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f'], format
+    assert_equal ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%s'], format
 
     assert_equal 'fib', table[1][0]
     assert_in_delta 100.0, table[1][1], 1.0
     assert_in_delta 50.0, table[1][3], 1.0
     assert_in_delta 2.0, table[1][5], 0.1
-    assert_in_delta 2.0, table[1][6], 0.1
+    assert_match(/^2\.0\d+/, table[1][6])
   end
 
   it 'includes RSS columns when include_rss is true' do
@@ -176,7 +176,7 @@
     ]
     assert_equal expected_header, table[0]
 
-    expected_format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%.3f', '%.3f']
+    expected_format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%s', '%s']
     assert_equal expected_format, format
   end
 
@@ -308,6 +308,98 @@
     assert_equal 'fib', table[1][0]
   end
 
+  it 'shows small p-value in scientific notation for clearly different distributions' do
+    executable_names = ['ruby', 'ruby-yjit']
+    bench_data = {
+      'ruby' => {
+        'fib' => {
+          'warmup' => [0.1],
+          'bench' => [0.100, 0.101, 0.099, 0.1005, 0.0995, 0.1002, 0.0998, 0.1001, 0.0999, 0.1003],
+          'rss' => 1024 * 1024 * 10
+        }
+      },
+      'ruby-yjit' => {
+        'fib' => {
+          'warmup' => [0.05],
+          'bench' => [0.050, 0.051, 0.049, 0.0505, 0.0495, 0.0502, 0.0498, 0.0501, 0.0499, 0.0503],
+          'rss' => 1024 * 1024 * 12
+        }
+      }
+    }
+
+    builder = ResultsTableBuilder.new(
+      executable_names: executable_names,
+      bench_data: bench_data,
+      include_pvalue: true
+    )
+
+    table, _format = builder.build
+    p_value_str = table[1][-2]
+    sig_str = table[1].last
+    assert_match(/e-/, p_value_str, "Expected scientific notation for very small p-value, got #{p_value_str}")
+    assert_equal "p < 0.001", sig_str
+  end
+
+  it 'shows N/A p-value when samples have fewer than 2 elements' do
+    executable_names = ['ruby', 'ruby-yjit']
+    bench_data = {
+      'ruby' => {
+        'fib' => {
+          'warmup' => [0.1],
+          'bench' => [0.1],
+          'rss' => 1024 * 1024 * 10
+        }
+      },
+      'ruby-yjit' => {
+        'fib' => {
+          'warmup' => [0.05],
+          'bench' => [0.05],
+          'rss' => 1024 * 1024 * 12
+        }
+      }
+    }
+
+    builder = ResultsTableBuilder.new(
+      executable_names: executable_names,
+      bench_data: bench_data,
+      include_pvalue: true
+    )
+
+    table, _format = builder.build
+    assert_equal 'N/A', table[1][-2]
+    assert_equal '', table[1].last
+  end
+
+  it 'always shows significance symbol but omits verbose columns without --pvalue' do
+    executable_names = ['ruby', 'ruby-yjit']
+    bench_data = {
+      'ruby' => {
+        'fib' => {
+          'warmup' => [0.1],
+          'bench' => [0.100, 0.101, 0.099],
+          'rss' => 1024 * 1024 * 10
+        }
+      },
+      'ruby-yjit' => {
+        'fib' => {
+          'warmup' => [0.05],
+          'bench' => [0.050, 0.051, 0.049],
+          'rss' => 1024 * 1024 * 12
+        }
+      }
+    }
+
+    builder = ResultsTableBuilder.new(
+      executable_names: executable_names,
+      bench_data: bench_data
+    )
+
+    table, _format = builder.build
+    refute_includes table[0], 'p-value'
+    refute_includes table[0], 'sig'
+    assert_match(/\(\*{1,3}\)$/, table[1].last)
+  end
+
   it 'handles only headline benchmarks' do
     executable_names = ['ruby']
     bench_data = {
diff --git a/test/stats_test.rb b/test/stats_test.rb
new file mode 100644
index 00000000..b614bb0d
--- /dev/null
+++ b/test/stats_test.rb
@@ -0,0 +1,72 @@
+require_relative 'test_helper'
+require_relative '../misc/stats'
+
+describe Stats do
+  describe '#sample_variance' do
+    it 'computes unbiased sample variance (n-1 denominator)' do
+      stats = Stats.new([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
+      assert_in_delta 4.571, stats.sample_variance, 0.001
+    end
+
+    it 'equals population variance * n/(n-1)' do
+      data = [10.0, 12.0, 14.0, 16.0, 18.0]
+      stats = Stats.new(data)
+      n = data.size.to_f
+      assert_in_delta stats.stddev ** 2 * n / (n - 1), stats.sample_variance, 1e-10
+    end
+  end
+
+  describe '.welch_p_value' do
+    it 'returns 1.0 for identical distributions' do
+      a = [1.0, 1.0, 1.0, 1.0, 1.0]
+      b = [1.0, 1.0, 1.0, 1.0, 1.0]
+      assert_in_delta 1.0, Stats.welch_p_value(a, b), 0.001
+    end
+
+    it 'returns small p-value for clearly different distributions' do
+      a = [100.0, 101.0, 99.0, 100.5, 99.5, 100.2, 99.8, 100.1, 99.9, 100.3]
+      b = [50.0, 51.0, 49.0, 50.5, 49.5, 50.2, 49.8, 50.1, 49.9, 50.3]
+      p_value = Stats.welch_p_value(a, b)
+      assert p_value < 0.001, "Expected p < 0.001, got #{p_value}"
+    end
+
+    it 'returns high p-value for overlapping distributions' do
+      a = [10.0, 11.0, 9.0, 10.5, 9.5]
+      b = [10.1, 10.9, 9.1, 10.6, 9.4]
+      p_value = Stats.welch_p_value(a, b)
+      assert p_value > 0.5, "Expected p > 0.5, got #{p_value}"
+    end
+
+    it 'is symmetric' do
+      a = [100.0, 102.0, 98.0, 101.0, 99.0]
+      b = [90.0, 92.0, 88.0, 91.0, 89.0]
+      assert_in_delta Stats.welch_p_value(a, b), Stats.welch_p_value(b, a), 1e-10
+    end
+
+    it 'handles different sample sizes' do
+      a = [100.0, 101.0, 99.0]
+      b = [50.0, 51.0, 49.0, 50.5, 49.5, 50.2, 49.8, 50.1, 49.9, 50.3]
+      p_value = Stats.welch_p_value(a, b)
+      assert p_value < 0.001, "Expected p < 0.001, got #{p_value}"
+    end
+
+    it 'handles unequal variances' do
+      a = [100.0, 100.0, 100.0, 100.0, 100.0] # zero variance
+      b = [50.0, 60.0, 40.0, 70.0, 30.0]      # high variance
+      p_value = Stats.welch_p_value(a, b)
+      assert p_value < 0.05, "Expected p < 0.05, got #{p_value}"
+    end
+
+    it 'returns 0.0 when both samples have zero variance but different means' do
+      a = [10.0, 10.0, 10.0, 10.0, 10.0]
+      b = [5.0, 5.0, 5.0, 5.0, 5.0]
+      assert_equal 0.0, Stats.welch_p_value(a, b)
+    end
+
+    it 'returns nil when a sample has fewer than 2 elements' do
+      assert_nil Stats.welch_p_value([10.0], [5.0, 6.0, 7.0])
+      assert_nil Stats.welch_p_value([5.0, 6.0, 7.0], [10.0])
+      assert_nil Stats.welch_p_value([10.0], [5.0])
+    end
+  end
+end
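
To try the new statistics in isolation, here is a minimal sketch (not part of the patch) that calls `Stats.welch_p_value` directly. The timing samples are invented, and the `require_relative` path assumes the script sits in the repository root:

```ruby
require_relative "misc/stats"

# Hypothetical per-iteration wall times, in seconds.
baseline  = [1.52, 1.49, 1.51, 1.50, 1.53]
candidate = [1.41, 1.43, 1.40, 1.42, 1.44]

p_value = Stats.welch_p_value(baseline, candidate)
puts "p-value: #{p_value}"
# A small p-value means the mean difference is unlikely to be noise;
# nil means a sample had fewer than 2 points, so no test was possible.
```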
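The continued-fraction helper is private, but it can be sanity-checked against closed forms of the regularized incomplete beta function. This sketch (again not part of the patch) uses `send` to reach past `private`; both expected values follow from standard identities:

```ruby
require_relative "misc/stats"

# send bypasses the private declaration, which is fine for a spot check.
def i_x(x, alpha, beta)
  Stats.send(:regularized_incomplete_beta, x, alpha, beta)
end

# I_x(1, 1) is the CDF of the uniform distribution, i.e. x itself.
puts i_x(0.37, 1.0, 1.0)                          # ~0.37

# I_x(1/2, 1/2) is the arcsine law: (2/pi) * asin(sqrt(x)).
puts i_x(0.3, 0.5, 0.5)                           # ~0.369
puts (2.0 / Math::PI) * Math.asin(Math.sqrt(0.3)) # same value
```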
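And to see the new columns end to end, a sketch mirroring the fixtures in `test/results_table_builder_test.rb`. Requiring `misc/stats.rb` before the builder is an assumption about load order, since `build_ratio_columns` now calls `Stats.welch_p_value`:

```ruby
require_relative "misc/stats"
require_relative "lib/results_table_builder"

bench_data = {
  "ruby"      => { "fib" => { "warmup" => [0.1],  "bench" => [0.100, 0.101, 0.099], "rss" => 10 * 1024 * 1024 } },
  "ruby-yjit" => { "fib" => { "warmup" => [0.05], "bench" => [0.050, 0.051, 0.049], "rss" => 12 * 1024 * 1024 } },
}

builder = ResultsTableBuilder.new(
  executable_names: ["ruby", "ruby-yjit"],
  bench_data: bench_data,
  include_pvalue: true
)
table, _format = builder.build

# The header gains "p-value" and "sig" after the ratio column, and the
# ratio cell itself carries the stars, e.g. "2.000 (**)" (illustrative).
p table[0] # => [..., "ruby/ruby-yjit", "p-value", "sig"]
p table[1]
```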