From 159a49525c54647a20108a134e66cef1785c6573 Mon Sep 17 00:00:00 2001
From: Matt Valentine-House
Date: Wed, 18 Feb 2026 20:10:56 +0000
Subject: [PATCH] Optionally interleave benchmarks across base/experiment

To better mitigate thermal drift, background system activity, cache
warmth, etc., we can alternate the benchmarks between executables.

Instead of running all the base benchmarks and then switching to the
experimental ruby and running them all again, we interleave them:
base/experimental/base/experimental, and so on.

This commit adds an --interleave option that enables this behaviour.
---
 lib/argument_parser.rb            |  6 ++
 lib/benchmark_runner/cli.rb       | 42 ++++++++++---
 lib/benchmark_suite.rb            | 67 +++++++++++++--------
 test/argument_parser_test.rb      | 10 ++++
 test/benchmark_runner_cli_test.rb | 32 ++++++++++
 test/benchmark_suite_test.rb      | 99 +++++++++++++++++++++++++++++++
 6 files changed, 222 insertions(+), 34 deletions(-)

diff --git a/lib/argument_parser.rb b/lib/argument_parser.rb
index df106bed..4e6770e7 100644
--- a/lib/argument_parser.rb
+++ b/lib/argument_parser.rb
@@ -24,6 +24,7 @@ class ArgumentParser
     :skip_zjit,
     :with_pre_init,
     :pvalue,
+    :interleave,
     keyword_init: true
   )

@@ -149,6 +150,10 @@ def parse(argv)
         args.pvalue = true
       end

+      opts.on("--interleave", "run benchmarks interleaved across executables to reduce thermal drift") do
+        args.interleave = true
+      end
+
       opts.on("--graph", "generate a graph image of benchmark results") do
         args.graph = true
       end
@@ -230,6 +235,7 @@ def default_args
       excludes: [],
       rss: false,
       pvalue: false,
+      interleave: false,
       graph: false,
       no_pinning: false,
       force_pinning: false,
diff --git a/lib/benchmark_runner/cli.rb b/lib/benchmark_runner/cli.rb
index 2812f3ee..8db164c2 100644
--- a/lib/benchmark_runner/cli.rb
+++ b/lib/benchmark_runner/cli.rb
@@ -40,19 +40,43 @@ def run
         force_pinning: args.force_pinning
       )

-      # Benchmark with and without YJIT
+      # Collect ruby version descriptions for all executables upfront
+      args.executables.each do |name, executable|
+        ruby_descriptions[name] = `#{executable.shelljoin} -v`.chomp
+      end
+
       bench_start_time = Time.now.to_f
       bench_data = {}
       bench_failures = {}
-      args.executables.each do |name, executable|
-        ruby_descriptions[name] = `#{executable.shelljoin} -v`.chomp
-        bench_data[name], failures = suite.run(
-          ruby: executable,
-          ruby_description: ruby_descriptions[name]
-        )
-        # Make it easier to query later.
-        bench_failures[name] = failures unless failures.empty?
+
+      if args.interleave
+        args.executables.each_key { |name| bench_data[name] = {} }
+        entries = suite.benchmarks
+
+        entries.each_with_index do |entry, idx|
+          # Alternate executable order to cancel cache-warming bias
+          exes = ruby_descriptions.keys
+          exes = exes.reverse if idx.odd?
+
+          exes.each do |name|
+            puts("Running benchmark \"#{entry.name}\" [#{name}] (#{idx+1}/#{entries.length})")
+            result = suite.run_benchmark(entry, ruby: args.executables[name], ruby_description: ruby_descriptions[name])
+            if result[:data]
+              bench_data[name][entry.name] = result[:data]
+            else
+              bench_failures[name] ||= {}
+              bench_failures[name][entry.name] = result[:failure]
+            end
+          end
+        end
+      else
+        args.executables.each do |name, executable|
+          bench_data[name], failures = suite.run(
+            ruby: executable,
+            ruby_description: ruby_descriptions[name]
+          )
+          bench_failures[name] = failures unless failures.empty?
+        end
       end

       bench_end_time = Time.now.to_f
diff --git a/lib/benchmark_suite.rb b/lib/benchmark_suite.rb
index a2354c2a..cb71fc78 100644
--- a/lib/benchmark_suite.rb
+++ b/lib/benchmark_suite.rb
@@ -33,41 +33,53 @@ def initialize(categories:, name_filters:, excludes: [], out_path:, harness:, ha
     @bench_dir = BENCHMARKS_DIR
   end

-  # Run all the benchmarks and record execution times
-  # Returns [bench_data, bench_failures]
-  def run(ruby:, ruby_description:)
-    bench_data = {}
-    bench_failures = {}
+  # Discovered and filtered benchmark entries, memoized.
+  def benchmarks
+    @benchmarks ||= discover_benchmarks
+  end

-    benchmark_entries = discover_benchmarks
+  # Run a single benchmark entry on a single executable.
+  # Returns { name:, data: } on success, { name:, failure: } on error.
+  def run_benchmark(entry, ruby:, ruby_description:)
     env = benchmark_env(ruby)
     caller_json_path = ENV["RESULT_JSON_PATH"]
-
-    # Capture quiet setting before entering unbundled env (which clears ENV)
     quiet = ENV['BENCHMARK_QUIET'] == '1'

-    benchmark_entries.each_with_index do |entry, idx|
-      puts("Running benchmark \"#{entry.name}\" (#{idx+1}/#{benchmark_entries.length})")
+    result_json_path = caller_json_path || File.join(out_path, "temp#{Process.pid}.json")
+    cmd_prefix = base_cmd(ruby_description, entry.name)

-      result_json_path = caller_json_path || File.join(out_path, "temp#{Process.pid}.json")
-      cmd_prefix = base_cmd(ruby_description, entry.name)
-
-      # Clear project-level Bundler environment so benchmarks run in a clean context.
-      # Benchmarks that need Bundler (e.g., railsbench) set up their own via use_gemfile.
-      # This is important when running tests under `bundle exec rake test`.
-      result = if defined?(Bundler)
-        Bundler.with_unbundled_env do
-          run_single_benchmark(entry.script_path, result_json_path, ruby, cmd_prefix, env, entry.name, quiet: quiet)
-        end
-      else
+    # Clear project-level Bundler environment so benchmarks run in a clean context.
+    # Benchmarks that need Bundler (e.g., railsbench) set up their own via use_gemfile.
+    result = if defined?(Bundler)
+      Bundler.with_unbundled_env do
        run_single_benchmark(entry.script_path, result_json_path, ruby, cmd_prefix, env, entry.name, quiet: quiet)
      end
+    else
+      run_single_benchmark(entry.script_path, result_json_path, ruby, cmd_prefix, env, entry.name, quiet: quiet)
+    end
+
+    if result[:success]
+      { name: entry.name, data: process_benchmark_result(result_json_path, result[:command], delete_file: !caller_json_path) }
+    else
+      FileUtils.rm_f(result_json_path) unless caller_json_path
+      { name: entry.name, failure: result[:status].exitstatus }
+    end
+  end

-      if result[:success]
-        bench_data[entry.name] = process_benchmark_result(result_json_path, result[:command], delete_file: !caller_json_path)
+  # Run all the benchmarks and record execution times.
+  # Returns [bench_data, bench_failures]
+  def run(ruby:, ruby_description:)
+    bench_data = {}
+    bench_failures = {}
+
+    benchmarks.each_with_index do |entry, idx|
+      puts("Running benchmark \"#{entry.name}\" (#{idx+1}/#{benchmarks.length})")
+
+      result = run_benchmark(entry, ruby: ruby, ruby_description: ruby_description)
+      if result[:data]
+        bench_data[entry.name] = result[:data]
       else
-        bench_failures[entry.name] = result[:status].exitstatus
-        FileUtils.rm_f(result_json_path) unless caller_json_path
+        bench_failures[entry.name] = result[:failure]
       end
     end

@@ -174,6 +186,11 @@ def benchmark_harness_for(benchmark_name)
   end

   def benchmark_env(ruby)
+    @benchmark_env_cache ||= {}
+    @benchmark_env_cache[ruby] ||= compute_benchmark_env(ruby)
+  end
+
+  def compute_benchmark_env(ruby)
     # When the Ruby running this script is not the first Ruby in PATH, shell commands
     # like `bundle install` in a child process will not use the Ruby being benchmarked.
     # It overrides PATH to guarantee the commands of the benchmarked Ruby will be used.
diff --git a/test/argument_parser_test.rb b/test/argument_parser_test.rb
index 442fb0cd..9ab2db9c 100644
--- a/test/argument_parser_test.rb
+++ b/test/argument_parser_test.rb
@@ -50,6 +50,7 @@ def setup_mock_ruby(path)
       assert_equal [], args.name_filters
       assert_equal false, args.rss
       assert_equal false, args.pvalue
+      assert_equal false, args.interleave
       assert_equal false, args.graph
       assert_equal false, args.no_pinning
       assert_equal false, args.turbo
@@ -438,6 +439,15 @@ def setup_mock_ruby(path)
     end
   end

+  describe '--interleave option' do
+    it 'sets interleave flag' do
+      parser = ArgumentParser.new
+      args = parser.parse(['--interleave'])
+
+      assert_equal true, args.interleave
+    end
+  end
+
   describe '--graph option' do
     it 'sets graph flag' do
       parser = ArgumentParser.new
diff --git a/test/benchmark_runner_cli_test.rb b/test/benchmark_runner_cli_test.rb
index 72452429..6bc6336c 100644
--- a/test/benchmark_runner_cli_test.rb
+++ b/test/benchmark_runner_cli_test.rb
@@ -48,6 +48,7 @@ def create_args(overrides = {})
       name_filters: [],
       excludes: [],
       rss: false,
+      interleave: false,
       graph: false,
       no_pinning: true,
       turbo: true,
@@ -318,6 +319,37 @@ def create_args(overrides = {})
       end
     end

+    it 'runs benchmarks interleaved when --interleave is set' do
+      Dir.mktmpdir do |tmpdir|
+        args = create_args(
+          name_filters: ['fib', 'respond_to'],
+          out_path: tmpdir,
+          interleave: true
+        )
+
+        cli = BenchmarkRunner::CLI.new(args)
+        output = capture_io { cli.run }.join
+
+        # Progress output should include executable names in brackets
+        assert_match(/\[.+\]/, output, "Interleaved output should include executable name in brackets")
+        assert_match(/Total time spent benchmarking:/, output)
+
+        # Verify output files were created with data from all executables
+        json_files = Dir.glob(File.join(tmpdir, "*.json"))
+        assert_equal 1, json_files.size
+
+        json_data = JSON.parse(File.read(json_files.first))
+        raw_data = json_data['raw_data']
+
+        # All executables should have results
+        args.executables.each_key do |name|
+          assert raw_data.key?(name), "Expected raw_data to contain '#{name}'"
+          assert raw_data[name].key?('fib'), "Expected '#{name}' to have 'fib' results"
+          assert raw_data[name].key?('respond_to'), "Expected '#{name}' to have 'respond_to' results"
+        end
+      end
+    end
+
     it 'creates output directory if it does not exist' do
       Dir.mktmpdir do |parent_tmpdir|
         nested_dir = File.join(parent_tmpdir, 'nested', 'output', 'dir')
diff --git a/test/benchmark_suite_test.rb b/test/benchmark_suite_test.rb
index aa6ac32b..527204ec 100644
--- a/test/benchmark_suite_test.rb
+++ b/test/benchmark_suite_test.rb
@@ -519,6 +519,105 @@
     end
   end

+  describe '#benchmarks' do
+    it 'returns discovered benchmark entries' do
+      suite = BenchmarkSuite.new(
+        categories: [],
+        name_filters: ['simple'],
+        out_path: @out_path,
+        harness: 'harness',
+        no_pinning: true
+      )
+
+      entries = suite.benchmarks
+      assert_instance_of Array, entries
+      assert_equal 1, entries.length
+      assert_equal 'simple', entries.first.name
+    end
+
+    it 'memoizes the result' do
+      suite = BenchmarkSuite.new(
+        categories: [],
+        name_filters: ['simple'],
+        out_path: @out_path,
+        harness: 'harness',
+        no_pinning: true
+      )
+
+      assert_same suite.benchmarks, suite.benchmarks
+    end
+  end
+
+  describe '#run_benchmark' do
+    it 'returns data hash on success' do
+      suite = BenchmarkSuite.new(
+        categories: [],
+        name_filters: ['simple'],
+        out_path: @out_path,
+        harness: 'harness',
+        no_pinning: true
+      )
+
+      entry = suite.benchmarks.first
+      result = nil
+      capture_io do
+        result = suite.run_benchmark(entry, ruby: [RbConfig.ruby], ruby_description: 'ruby 3.2.0')
+      end
+
+      assert_equal 'simple', result[:name]
+      assert_instance_of Hash, result[:data]
+      assert_includes result[:data], 'warmup'
+      assert_includes result[:data], 'bench'
+      assert_includes result[:data], 'rss'
+      assert_nil result[:failure]
+    end
+
+    it 'returns failure hash on error' do
+      File.write('benchmarks/failing.rb', "exit(1)\n")
+
+      suite = BenchmarkSuite.new(
+        categories: [],
+        name_filters: ['failing'],
+        out_path: @out_path,
+        harness: 'harness',
+        no_pinning: true
+      )
+
+      entry = suite.benchmarks.first
+      result = nil
+      capture_io do
+        result = suite.run_benchmark(entry, ruby: [RbConfig.ruby], ruby_description: 'ruby 3.2.0')
+      end
+
+      assert_equal 'failing', result[:name]
+      assert_nil result[:data]
+      assert_equal 1, result[:failure]
+    end
+
+    it 'produces same data as run for the same benchmark' do
+      suite = BenchmarkSuite.new(
+        categories: [],
+        name_filters: ['simple'],
+        out_path: @out_path,
+        harness: 'harness',
+        no_pinning: true
+      )
+
+      entry = suite.benchmarks.first
+      single_result = nil
+      capture_io do
+        single_result = suite.run_benchmark(entry, ruby: [RbConfig.ruby], ruby_description: 'ruby 3.2.0')
+      end
+
+      run_data = nil
+      capture_io do
+        run_data, _ = suite.run(ruby: [RbConfig.ruby], ruby_description: 'ruby 3.2.0')
+      end
+
+      assert_equal run_data['simple'].keys.sort, single_result[:data].keys.sort
+    end
+  end
+
   describe 'integration with BenchmarkFilter' do
     it 'uses BenchmarkFilter to match benchmarks' do
       # Create benchmarks with different categories