From 159a49525c54647a20108a134e66cef1785c6573 Mon Sep 17 00:00:00 2001
From: Matt Valentine-House
Date: Wed, 18 Feb 2026 20:10:56 +0000
Subject: [PATCH] Optionally interleave benchmarks across base/experiment

To better mitigate thermal drift, background system activity, cache
warmth, etc., we can alternate the benchmarks between executables.

Instead of running all the base benchmarks and then switching to the
experimental ruby and running them all again, we interleave them:
base/experimental/base/experimental, and so on.

This commit adds an --interleave option that enables this behaviour.
---
 lib/argument_parser.rb            |  6 ++
 lib/benchmark_runner/cli.rb       | 42 ++++++++++---
 lib/benchmark_suite.rb            | 67 +++++++++++++--------
 test/argument_parser_test.rb      | 10 ++++
 test/benchmark_runner_cli_test.rb | 32 ++++++++++
 test/benchmark_suite_test.rb      | 99 +++++++++++++++++++++++++++++++
 6 files changed, 222 insertions(+), 34 deletions(-)

diff --git a/lib/argument_parser.rb b/lib/argument_parser.rb
index df106bed..4e6770e7 100644
--- a/lib/argument_parser.rb
+++ b/lib/argument_parser.rb
@@ -24,6 +24,7 @@ class ArgumentParser
     :skip_zjit,
     :with_pre_init,
     :pvalue,
+    :interleave,
     keyword_init: true
   )

@@ -149,6 +150,10 @@ def parse(argv)
         args.pvalue = true
       end

+      opts.on("--interleave", "run benchmarks interleaved across executables to reduce thermal drift") do
+        args.interleave = true
+      end
+
       opts.on("--graph", "generate a graph image of benchmark results") do
         args.graph = true
       end
@@ -230,6 +235,7 @@ def default_args
       excludes: [],
       rss: false,
       pvalue: false,
+      interleave: false,
       graph: false,
       no_pinning: false,
       force_pinning: false,
diff --git a/lib/benchmark_runner/cli.rb b/lib/benchmark_runner/cli.rb
index 2812f3ee..8db164c2 100644
--- a/lib/benchmark_runner/cli.rb
+++ b/lib/benchmark_runner/cli.rb
@@ -40,19 +40,43 @@ def run
         force_pinning: args.force_pinning
       )

-      # Benchmark with and without YJIT
+      # Collect ruby version descriptions for all executables upfront
+      args.executables.each do |name, executable|
+        ruby_descriptions[name] = `#{executable.shelljoin} -v`.chomp
+      end
+
       bench_start_time = Time.now.to_f
       bench_data = {}
       bench_failures = {}
-      args.executables.each do |name, executable|
-        ruby_descriptions[name] = `#{executable.shelljoin} -v`.chomp
-        bench_data[name], failures = suite.run(
-          ruby: executable,
-          ruby_description: ruby_descriptions[name]
-        )
-        # Make it easier to query later.
-        bench_failures[name] = failures unless failures.empty?
+
+      if args.interleave
+        args.executables.each_key { |name| bench_data[name] = {} }
+        entries = suite.benchmarks
+
+        entries.each_with_index do |entry, idx|
+          # Alternate executable order to cancel cache-warming bias
+          exes = ruby_descriptions.keys
+          exes = exes.reverse if idx.odd?
+
+          exes.each do |name|
+            puts("Running benchmark \"#{entry.name}\" [#{name}] (#{idx+1}/#{entries.length})")
+            result = suite.run_benchmark(entry, ruby: args.executables[name], ruby_description: ruby_descriptions[name])
+            if result[:data]
+              bench_data[name][entry.name] = result[:data]
+            else
+              bench_failures[name] ||= {}
+              bench_failures[name][entry.name] = result[:failure]
+            end
+          end
+        end
+      else
+        args.executables.each do |name, executable|
+          bench_data[name], failures = suite.run(
+            ruby: executable,
+            ruby_description: ruby_descriptions[name]
+          )
+          bench_failures[name] = failures unless failures.empty?
+        end
       end

       bench_end_time = Time.now.to_f
diff --git a/lib/benchmark_suite.rb b/lib/benchmark_suite.rb
index a2354c2a..cb71fc78 100644
--- a/lib/benchmark_suite.rb
+++ b/lib/benchmark_suite.rb
@@ -33,41 +33,53 @@ def initialize(categories:, name_filters:, excludes: [], out_path:, harness:, ha
     @bench_dir = BENCHMARKS_DIR
   end

-  # Run all the benchmarks and record execution times
-  # Returns [bench_data, bench_failures]
-  def run(ruby:, ruby_description:)
-    bench_data = {}
-    bench_failures = {}
+  # Discovered and filtered benchmark entries, memoized.
+  def benchmarks
+    @benchmarks ||= discover_benchmarks
+  end

-    benchmark_entries = discover_benchmarks
+  # Run a single benchmark entry on a single executable.
+  # Returns { name:, data: } on success, { name:, failure: } on error.
+  def run_benchmark(entry, ruby:, ruby_description:)
     env = benchmark_env(ruby)
     caller_json_path = ENV["RESULT_JSON_PATH"]
-
-    # Capture quiet setting before entering unbundled env (which clears ENV)
     quiet = ENV['BENCHMARK_QUIET'] == '1'

-    benchmark_entries.each_with_index do |entry, idx|
-      puts("Running benchmark \"#{entry.name}\" (#{idx+1}/#{benchmark_entries.length})")
+    result_json_path = caller_json_path || File.join(out_path, "temp#{Process.pid}.json")
+    cmd_prefix = base_cmd(ruby_description, entry.name)

-      result_json_path = caller_json_path || File.join(out_path, "temp#{Process.pid}.json")
-      cmd_prefix = base_cmd(ruby_description, entry.name)
-
-      # Clear project-level Bundler environment so benchmarks run in a clean context.
-      # Benchmarks that need Bundler (e.g., railsbench) set up their own via use_gemfile.
-      # This is important when running tests under `bundle exec rake test`.
-      result = if defined?(Bundler)
-        Bundler.with_unbundled_env do
-          run_single_benchmark(entry.script_path, result_json_path, ruby, cmd_prefix, env, entry.name, quiet: quiet)
-        end
-      else
+    # Clear project-level Bundler environment so benchmarks run in a clean context.
+    # Benchmarks that need Bundler (e.g., railsbench) set up their own via use_gemfile.
+    result = if defined?(Bundler)
+      Bundler.with_unbundled_env do
        run_single_benchmark(entry.script_path, result_json_path, ruby, cmd_prefix, env, entry.name, quiet: quiet)
      end
+    else
+      run_single_benchmark(entry.script_path, result_json_path, ruby, cmd_prefix, env, entry.name, quiet: quiet)
+    end
+
+    if result[:success]
+      { name: entry.name, data: process_benchmark_result(result_json_path, result[:command], delete_file: !caller_json_path) }
+    else
+      FileUtils.rm_f(result_json_path) unless caller_json_path
+      { name: entry.name, failure: result[:status].exitstatus }
+    end
+  end

-      if result[:success]
-        bench_data[entry.name] = process_benchmark_result(result_json_path, result[:command], delete_file: !caller_json_path)
+  # Run all the benchmarks and record execution times.
+  # Returns [bench_data, bench_failures]
+  def run(ruby:, ruby_description:)
+    bench_data = {}
+    bench_failures = {}
+
+    benchmarks.each_with_index do |entry, idx|
+      puts("Running benchmark \"#{entry.name}\" (#{idx+1}/#{benchmarks.length})")
+
+      result = run_benchmark(entry, ruby: ruby, ruby_description: ruby_description)
+      if result[:data]
+        bench_data[entry.name] = result[:data]
       else
-        bench_failures[entry.name] = result[:status].exitstatus
-        FileUtils.rm_f(result_json_path) unless caller_json_path
+        bench_failures[entry.name] = result[:failure]
       end
     end

@@ -174,6 +186,11 @@ def benchmark_harness_for(benchmark_name)
   end

   def benchmark_env(ruby)
+    @benchmark_env_cache ||= {}
+    @benchmark_env_cache[ruby] ||= compute_benchmark_env(ruby)
+  end
+
+  def compute_benchmark_env(ruby)
     # When the Ruby running this script is not the first Ruby in PATH, shell commands
     # like `bundle install` in a child process will not use the Ruby being benchmarked.
     # It overrides PATH to guarantee the commands of the benchmarked Ruby will be used.
diff --git a/test/argument_parser_test.rb b/test/argument_parser_test.rb
index 442fb0cd..9ab2db9c 100644
--- a/test/argument_parser_test.rb
+++ b/test/argument_parser_test.rb
@@ -50,6 +50,7 @@ def setup_mock_ruby(path)
       assert_equal [], args.name_filters
       assert_equal false, args.rss
       assert_equal false, args.pvalue
+      assert_equal false, args.interleave
       assert_equal false, args.graph
       assert_equal false, args.no_pinning
       assert_equal false, args.turbo
@@ -438,6 +439,15 @@ def setup_mock_ruby(path)
     end
   end

+  describe '--interleave option' do
+    it 'sets interleave flag' do
+      parser = ArgumentParser.new
+      args = parser.parse(['--interleave'])
+
+      assert_equal true, args.interleave
+    end
+  end
+
   describe '--graph option' do
     it 'sets graph flag' do
       parser = ArgumentParser.new
diff --git a/test/benchmark_runner_cli_test.rb b/test/benchmark_runner_cli_test.rb
index 72452429..6bc6336c 100644
--- a/test/benchmark_runner_cli_test.rb
+++ b/test/benchmark_runner_cli_test.rb
@@ -48,6 +48,7 @@ def create_args(overrides = {})
       name_filters: [],
       excludes: [],
       rss: false,
+      interleave: false,
       graph: false,
       no_pinning: true,
       turbo: true,
@@ -318,6 +319,37 @@ def create_args(overrides = {})
       end
     end

+    it 'runs benchmarks interleaved when --interleave is set' do
+      Dir.mktmpdir do |tmpdir|
+        args = create_args(
+          name_filters: ['fib', 'respond_to'],
+          out_path: tmpdir,
+          interleave: true
+        )
+
+        cli = BenchmarkRunner::CLI.new(args)
+        output = capture_io { cli.run }.join
+
+        # Progress output should include executable names in brackets
+        assert_match(/\[.+\]/, output, "Interleaved output should include executable name in brackets")
+        assert_match(/Total time spent benchmarking:/, output)
+
+        # Verify output files were created with data from all executables
+        json_files = Dir.glob(File.join(tmpdir, "*.json"))
+        assert_equal 1, json_files.size
+
+        json_data = JSON.parse(File.read(json_files.first))
+        raw_data = json_data['raw_data']
+
+        # All executables should have results
+        args.executables.each_key do |name|
+          assert raw_data.key?(name), "Expected raw_data to contain '#{name}'"
+          assert raw_data[name].key?('fib'), "Expected '#{name}' to have 'fib' results"
+          assert raw_data[name].key?('respond_to'), "Expected '#{name}' to have 'respond_to' results"
+        end
+      end
+    end
+
     it 'creates output directory if it does not exist' do
       Dir.mktmpdir do |parent_tmpdir|
         nested_dir = File.join(parent_tmpdir, 'nested', 'output', 'dir')
diff --git a/test/benchmark_suite_test.rb b/test/benchmark_suite_test.rb
index aa6ac32b..527204ec 100644
--- a/test/benchmark_suite_test.rb
+++ b/test/benchmark_suite_test.rb
@@ -519,6 +519,105 @@
     end
   end

+  describe '#benchmarks' do
+    it 'returns discovered benchmark entries' do
+      suite = BenchmarkSuite.new(
+        categories: [],
+        name_filters: ['simple'],
+        out_path: @out_path,
+        harness: 'harness',
+        no_pinning: true
+      )
+
+      entries = suite.benchmarks
+      assert_instance_of Array, entries
+      assert_equal 1, entries.length
+      assert_equal 'simple', entries.first.name
+    end
+
+    it 'memoizes the result' do
+      suite = BenchmarkSuite.new(
+        categories: [],
+        name_filters: ['simple'],
+        out_path: @out_path,
+        harness: 'harness',
+        no_pinning: true
+      )
+
+      assert_same suite.benchmarks, suite.benchmarks
+    end
+  end
+
+  describe '#run_benchmark' do
+    it 'returns data hash on success' do
+      suite = BenchmarkSuite.new(
+        categories: [],
+        name_filters: ['simple'],
+        out_path: @out_path,
+        harness: 'harness',
+        no_pinning: true
+      )
+
+      entry = suite.benchmarks.first
+      result = nil
+      capture_io do
+        result = suite.run_benchmark(entry, ruby: [RbConfig.ruby], ruby_description: 'ruby 3.2.0')
+      end
+
+      assert_equal 'simple', result[:name]
+      assert_instance_of Hash, result[:data]
+      assert_includes result[:data], 'warmup'
+      assert_includes result[:data], 'bench'
+      assert_includes result[:data], 'rss'
+      assert_nil result[:failure]
+    end
+
+    it 'returns failure hash on error' do
+      File.write('benchmarks/failing.rb', "exit(1)\n")
+
+      suite = BenchmarkSuite.new(
+        categories: [],
+        name_filters: ['failing'],
+        out_path: @out_path,
+        harness: 'harness',
+        no_pinning: true
+      )
+
+      entry = suite.benchmarks.first
+      result = nil
+      capture_io do
+        result = suite.run_benchmark(entry, ruby: [RbConfig.ruby], ruby_description: 'ruby 3.2.0')
+      end
+
+      assert_equal 'failing', result[:name]
+      assert_nil result[:data]
+      assert_equal 1, result[:failure]
+    end
+
+    it 'produces same data as run for the same benchmark' do
+      suite = BenchmarkSuite.new(
+        categories: [],
+        name_filters: ['simple'],
+        out_path: @out_path,
+        harness: 'harness',
+        no_pinning: true
+      )
+
+      entry = suite.benchmarks.first
+      single_result = nil
+      capture_io do
+        single_result = suite.run_benchmark(entry, ruby: [RbConfig.ruby], ruby_description: 'ruby 3.2.0')
+      end
+
+      run_data = nil
+      capture_io do
+        run_data, _ = suite.run(ruby: [RbConfig.ruby], ruby_description: 'ruby 3.2.0')
+      end
+
+      assert_equal run_data['simple'].keys.sort, single_result[:data].keys.sort
+    end
+  end
+
   describe 'integration with BenchmarkFilter' do
     it 'uses BenchmarkFilter to match benchmarks' do
       # Create benchmarks with different categories