From 6c4804e046433fa4ba46c953370f14f7e220a4f8 Mon Sep 17 00:00:00 2001
From: prithvi <prithvisivasankar@gmail.com>
Date: Tue, 21 Apr 2026 22:13:29 +0530
Subject: [PATCH 1/2] Eliminate redundant cardinality() pass in
 MaxScoreBulkScorer

Signed-off-by: prithvi <prithvisivasankar@gmail.com>
---
 lucene/CHANGES.txt                            |   3 +
 .../jmh/WindowExtractionBenchmark.java        | 219 ++++++++++++++++++
 .../lucene/search/MaxScoreBulkScorer.java     |   4 +-
 3 files changed, 225 insertions(+), 1 deletion(-)
 create mode 100644 lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/WindowExtractionBenchmark.java
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 94b05c92f883..c7e9056905b3 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -125,6 +125,9 @@ Improvements
 
 Optimizations
 ---------------------
+* Eliminate redundant cardinality() pass in MaxScoreBulkScorer by pre-allocating the
+  doc/score buffer to the max window size. (Prithvi S)
+
 * GITHUB#15681, GITHUB#15833: Replace pre-sized array or empty array with lambda expression to call Collection#toArray. (Zhou Hui)
 
 * GITHUB#13782: Replace handwritten loops compare with Arrays.compareUnsigned in TermsEnum and TermsEnumFrame classes. (Zhou Hui)
diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/WindowExtractionBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/WindowExtractionBenchmark.java
new file mode 100644
index 000000000000..988489c5759c
--- /dev/null
+++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/WindowExtractionBenchmark.java
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.benchmark.jmh;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.SplittableRandom;
+import java.util.concurrent.TimeUnit;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOIntConsumer;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * Benchmark comparing bitset extraction strategies used in
+ * MaxScoreBulkScorer.scoreInnerWindowMultipleEssentialClauses().
+ *
+ * <p>Three strategies are compared:
+ *
+ * <ol>
+ *   <li><b>oldCardinalityForEach</b>: cardinality() + forEach(lambda) + clear() — 3 passes
+ *   <li><b>newForEachNoCardinality</b>: forEach(lambda) + clear() with pre-allocated buffer — 2
+ *       passes (eliminates cardinality)
+ *   <li><b>newIntoArray</b>: intoArray() + score gather loop + clear() — single extraction pass
+ * </ol>
+ *
+ * <p>Both benchmarks include the populate step (setting bits + scores) to simulate the full
+ * inner-window lifecycle.
+ */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 5, time = 1)
+@Measurement(iterations = 5, time = 1)
+@Fork(
+    value = 1,
+    jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
+public class WindowExtractionBenchmark {
+
+  static final int INNER_WINDOW_SIZE = 1 << 12; // 4096, same as MaxScoreBulkScorer
+
+  /**
+   * Number of matching documents in the window. Realistic values range from very sparse (10) to
+   * moderately dense (2000). Multi-term boolean queries typically match 50-500 docs per window.
+   */
+  @Param({"10", "50", "128", "500", "1000", "2000"})
+  int matchCount;
+
+  private final SplittableRandom random = new SplittableRandom(42);
+
+  // Simulates MaxScoreBulkScorer's fields
+  private FixedBitSet windowMatches;
+  private double[] windowScores;
+  private int innerWindowMin;
+
+  // Output buffers (pre-allocated to max size, like MaxScoreBulkScorer reuses them)
+  private int[] outDocs;
+  private double[] outScores;
+  private int outSize;
+
+  // Pre-computed match positions and scores for deterministic setup
+  private int[] matchPositions;
+  private double[] matchScoreValues;
+
+  @Setup(Level.Trial)
+  public void setupTrial() {
+    windowMatches = new FixedBitSet(INNER_WINDOW_SIZE);
+    windowScores = new double[INNER_WINDOW_SIZE];
+    // +1 for denseWord2Array sentinel slot
+    outDocs = new int[INNER_WINDOW_SIZE + 1];
+    outScores = new double[INNER_WINDOW_SIZE + 1];
+    outSize = 0;
+    innerWindowMin = 100_000; // arbitrary base doc ID
+
+    // Pre-compute random match positions
+    matchPositions = new int[matchCount];
+    matchScoreValues = new double[matchCount];
+    FixedBitSet temp = new FixedBitSet(INNER_WINDOW_SIZE);
+    int count = 0;
+    while (count < matchCount) {
+      int pos = random.nextInt(INNER_WINDOW_SIZE);
+      if (!temp.get(pos)) {
+        temp.set(pos);
+        matchPositions[count] = pos;
+        matchScoreValues[count] = random.nextDouble() * 10.0;
+        count++;
+      }
+    }
+    Arrays.sort(matchPositions);
+  }
+
+  /** Populate the bitset and windowScores — simulates what the essential clause collection does. */
+  private void populateWindow() {
+    for (int i = 0; i < matchPositions.length; i++) {
+      int pos = matchPositions[i];
+      windowMatches.set(pos);
+      windowScores[pos] = matchScoreValues[i];
+    }
+  }
+
+  /**
+   * ORIGINAL: cardinality() + forEach(lambda) + clear(). This is what the code did before any
+   * optimization — 3 passes over the bitset.
+   */
+  @Benchmark
+  public int oldCardinalityForEach(Blackhole bh) throws IOException {
+    populateWindow();
+    int innerWindowSize = INNER_WINDOW_SIZE;
+
+    // Pass 1: count bits to pre-size buffer
+    int card = windowMatches.cardinality(0, innerWindowSize);
+    // In original code: docAndScoreAccBuffer.growNoCopy(card)
+    // We simulate with pre-allocated buffer, but cardinality() cost is still measured
+
+    // Pass 2: forEach with lambda to extract docs + scores + zero scores
+    outSize = 0;
+    windowMatches.forEach(
+        0,
+        innerWindowSize,
+        0,
+        (IOIntConsumer)
+            index -> {
+              outDocs[outSize] = innerWindowMin + index;
+              outScores[outSize] = windowScores[index];
+              outSize++;
+              windowScores[index] = 0d;
+            });
+
+    // Pass 3: clear the bitset
+    windowMatches.clear(0, innerWindowSize);
+
+    bh.consume(card);
+    bh.consume(outScores);
+    return outSize;
+  }
+
+  /**
+   * OPTIMIZED: forEach(lambda) + clear() with pre-allocated buffer. Eliminates the cardinality()
+   * pass — 2 passes over the bitset. This is the current implementation.
+   */
+  @Benchmark
+  public int newForEachNoCardinality(Blackhole bh) throws IOException {
+    populateWindow();
+    int innerWindowSize = INNER_WINDOW_SIZE;
+
+    // No cardinality pass needed — buffer pre-allocated to INNER_WINDOW_SIZE
+
+    // Single extraction pass: forEach with lambda
+    outSize = 0;
+    windowMatches.forEach(
+        0,
+        innerWindowSize,
+        0,
+        (IOIntConsumer)
+            index -> {
+              outDocs[outSize] = innerWindowMin + index;
+              outScores[outSize] = windowScores[index];
+              outSize++;
+              windowScores[index] = 0d;
+            });
+
+    // Clear the bitset
+    windowMatches.clear(0, innerWindowSize);
+
+    bh.consume(outScores);
+    return outSize;
+  }
+
+  /**
+   * ALTERNATIVE: intoArray() + score gather loop + clear(). Uses the optimized branchless
+   * denseWord2Array for bit extraction — best for dense windows.
+   */
+  @Benchmark
+  public int newIntoArray(Blackhole bh) {
+    populateWindow();
+    int innerWindowSize = INNER_WINDOW_SIZE;
+
+    // Single pass: extract doc IDs and get count
+    int count = windowMatches.intoArray(0, innerWindowSize, innerWindowMin, outDocs);
+
+    // Gather scores using extracted indices + zero used entries
+    for (int i = 0; i < count; ++i) {
+      int index = outDocs[i] - innerWindowMin;
+      outScores[i] = windowScores[index];
+      windowScores[index] = 0d;
+    }
+
+    // Clear the bitset
+    windowMatches.clear(0, innerWindowSize);
+
+    bh.consume(outScores);
+    return count;
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java
index 9bc732ba19e3..3dbfbc95dec7 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java
@@ -263,7 +263,9 @@ private void scoreInnerWindowMultipleEssentialClauses(
       top = essentialQueue.updateTop();
     } while (top.doc < innerWindowMax);
 
-    docAndScoreAccBuffer.growNoCopy(windowMatches.cardinality(0, innerWindowSize));
+    // Pre-allocate to max window size to avoid a separate cardinality() pass.
+    // This buffer is reused across windows so the allocation is a one-time cost.
+    docAndScoreAccBuffer.growNoCopy(INNER_WINDOW_SIZE);
     docAndScoreAccBuffer.size = 0;
     windowMatches.forEach(
         0,

From 3051268f5087e024720820b98a1791bb24863e71 Mon Sep 17 00:00:00 2001
From: prithvi <prithvisivasankar@gmail.com>
Date: Fri, 1 May 2026 22:37:09 +0530
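Subject: [PATCH 2/2] Pre-size docAndScoreAccBuffer once; remove benchmark

Address review feedback:

* Construct docAndScoreAccBuffer in the MaxScoreBulkScorer constructor and
  call growNoCopy(INNER_WINDOW_SIZE) there, instead of calling growNoCopy in
  scoreInnerWindowMultipleEssentialClauses for every inner window.
* Delete WindowExtractionBenchmark.java, which was added by the previous
  patch in this series.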
---
 .../jmh/WindowExtractionBenchmark.java        | 219 ------------------
 .../lucene/search/MaxScoreBulkScorer.java     |   7 +-
 2 files changed, 3 insertions(+), 223 deletions(-)
 delete mode 100644 lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/WindowExtractionBenchmark.java

diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/WindowExtractionBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/WindowExtractionBenchmark.java
deleted file mode 100644
index 988489c5759c..000000000000
--- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/WindowExtractionBenchmark.java
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.benchmark.jmh;
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.SplittableRandom;
-import java.util.concurrent.TimeUnit;
-import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.IOIntConsumer;
-import org.openjdk.jmh.annotations.Benchmark;
-import org.openjdk.jmh.annotations.BenchmarkMode;
-import org.openjdk.jmh.annotations.Fork;
-import org.openjdk.jmh.annotations.Level;
-import org.openjdk.jmh.annotations.Measurement;
-import org.openjdk.jmh.annotations.Mode;
-import org.openjdk.jmh.annotations.OutputTimeUnit;
-import org.openjdk.jmh.annotations.Param;
-import org.openjdk.jmh.annotations.Scope;
-import org.openjdk.jmh.annotations.Setup;
-import org.openjdk.jmh.annotations.State;
-import org.openjdk.jmh.annotations.Warmup;
-import org.openjdk.jmh.infra.Blackhole;
-
-/**
- * Benchmark comparing bitset extraction strategies used in
- * MaxScoreBulkScorer.scoreInnerWindowMultipleEssentialClauses().
- *
- * <p>Three strategies are compared:
- *
- * <ol>
- *   <li><b>oldCardinalityForEach</b>: cardinality() + forEach(lambda) + clear() — 3 passes
- *   <li><b>newForEachNoCardinality</b>: forEach(lambda) + clear() with pre-allocated buffer — 2
- *       passes (eliminates cardinality)
- *   <li><b>newIntoArray</b>: intoArray() + score gather loop + clear() — single extraction pass
- * </ol>
- *
- * <p>Both benchmarks include the populate step (setting bits + scores) to simulate the full
- * inner-window lifecycle.
- */
-@BenchmarkMode(Mode.Throughput)
-@OutputTimeUnit(TimeUnit.MICROSECONDS)
-@State(Scope.Benchmark)
-@Warmup(iterations = 5, time = 1)
-@Measurement(iterations = 5, time = 1)
-@Fork(
-    value = 1,
-    jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
-public class WindowExtractionBenchmark {
-
-  static final int INNER_WINDOW_SIZE = 1 << 12; // 4096, same as MaxScoreBulkScorer
-
-  /**
-   * Number of matching documents in the window. Realistic values range from very sparse (10) to
-   * moderately dense (2000). Multi-term boolean queries typically match 50-500 docs per window.
-   */
-  @Param({"10", "50", "128", "500", "1000", "2000"})
-  int matchCount;
-
-  private final SplittableRandom random = new SplittableRandom(42);
-
-  // Simulates MaxScoreBulkScorer's fields
-  private FixedBitSet windowMatches;
-  private double[] windowScores;
-  private int innerWindowMin;
-
-  // Output buffers (pre-allocated to max size, like MaxScoreBulkScorer reuses them)
-  private int[] outDocs;
-  private double[] outScores;
-  private int outSize;
-
-  // Pre-computed match positions and scores for deterministic setup
-  private int[] matchPositions;
-  private double[] matchScoreValues;
-
-  @Setup(Level.Trial)
-  public void setupTrial() {
-    windowMatches = new FixedBitSet(INNER_WINDOW_SIZE);
-    windowScores = new double[INNER_WINDOW_SIZE];
-    // +1 for denseWord2Array sentinel slot
-    outDocs = new int[INNER_WINDOW_SIZE + 1];
-    outScores = new double[INNER_WINDOW_SIZE + 1];
-    outSize = 0;
-    innerWindowMin = 100_000; // arbitrary base doc ID
-
-    // Pre-compute random match positions
-    matchPositions = new int[matchCount];
-    matchScoreValues = new double[matchCount];
-    FixedBitSet temp = new FixedBitSet(INNER_WINDOW_SIZE);
-    int count = 0;
-    while (count < matchCount) {
-      int pos = random.nextInt(INNER_WINDOW_SIZE);
-      if (!temp.get(pos)) {
-        temp.set(pos);
-        matchPositions[count] = pos;
-        matchScoreValues[count] = random.nextDouble() * 10.0;
-        count++;
-      }
-    }
-    Arrays.sort(matchPositions);
-  }
-
-  /** Populate the bitset and windowScores — simulates what the essential clause collection does. */
-  private void populateWindow() {
-    for (int i = 0; i < matchPositions.length; i++) {
-      int pos = matchPositions[i];
-      windowMatches.set(pos);
-      windowScores[pos] = matchScoreValues[i];
-    }
-  }
-
-  /**
-   * ORIGINAL: cardinality() + forEach(lambda) + clear(). This is what the code did before any
-   * optimization — 3 passes over the bitset.
-   */
-  @Benchmark
-  public int oldCardinalityForEach(Blackhole bh) throws IOException {
-    populateWindow();
-    int innerWindowSize = INNER_WINDOW_SIZE;
-
-    // Pass 1: count bits to pre-size buffer
-    int card = windowMatches.cardinality(0, innerWindowSize);
-    // In original code: docAndScoreAccBuffer.growNoCopy(card)
-    // We simulate with pre-allocated buffer, but cardinality() cost is still measured
-
-    // Pass 2: forEach with lambda to extract docs + scores + zero scores
-    outSize = 0;
-    windowMatches.forEach(
-        0,
-        innerWindowSize,
-        0,
-        (IOIntConsumer)
-            index -> {
-              outDocs[outSize] = innerWindowMin + index;
-              outScores[outSize] = windowScores[index];
-              outSize++;
-              windowScores[index] = 0d;
-            });
-
-    // Pass 3: clear the bitset
-    windowMatches.clear(0, innerWindowSize);
-
-    bh.consume(card);
-    bh.consume(outScores);
-    return outSize;
-  }
-
-  /**
-   * OPTIMIZED: forEach(lambda) + clear() with pre-allocated buffer. Eliminates the cardinality()
-   * pass — 2 passes over the bitset. This is the current implementation.
-   */
-  @Benchmark
-  public int newForEachNoCardinality(Blackhole bh) throws IOException {
-    populateWindow();
-    int innerWindowSize = INNER_WINDOW_SIZE;
-
-    // No cardinality pass needed — buffer pre-allocated to INNER_WINDOW_SIZE
-
-    // Single extraction pass: forEach with lambda
-    outSize = 0;
-    windowMatches.forEach(
-        0,
-        innerWindowSize,
-        0,
-        (IOIntConsumer)
-            index -> {
-              outDocs[outSize] = innerWindowMin + index;
-              outScores[outSize] = windowScores[index];
-              outSize++;
-              windowScores[index] = 0d;
-            });
-
-    // Clear the bitset
-    windowMatches.clear(0, innerWindowSize);
-
-    bh.consume(outScores);
-    return outSize;
-  }
-
-  /**
-   * ALTERNATIVE: intoArray() + score gather loop + clear(). Uses the optimized branchless
-   * denseWord2Array for bit extraction — best for dense windows.
-   */
-  @Benchmark
-  public int newIntoArray(Blackhole bh) {
-    populateWindow();
-    int innerWindowSize = INNER_WINDOW_SIZE;
-
-    // Single pass: extract doc IDs and get count
-    int count = windowMatches.intoArray(0, innerWindowSize, innerWindowMin, outDocs);
-
-    // Gather scores using extracted indices + zero used entries
-    for (int i = 0; i < count; ++i) {
-      int index = outDocs[i] - innerWindowMin;
-      outScores[i] = windowScores[index];
-      windowScores[index] = 0d;
-    }
-
-    // Clear the bitset
-    windowMatches.clear(0, innerWindowSize);
-
-    bh.consume(outScores);
-    return count;
-  }
-}
diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java
index 3dbfbc95dec7..81ebc6d0cfe7 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java
@@ -51,7 +51,7 @@ final class MaxScoreBulkScorer extends BulkScorer {
   private final double[] windowScores = new double[INNER_WINDOW_SIZE];
 
   private final DocAndFloatFeatureBuffer docAndScoreBuffer = new DocAndFloatFeatureBuffer();
-  private final DocAndScoreAccBuffer docAndScoreAccBuffer = new DocAndScoreAccBuffer();
+  private final DocAndScoreAccBuffer docAndScoreAccBuffer;
 
   MaxScoreBulkScorer(int maxDoc, List<Scorer> scorers, Scorer filter) throws IOException {
     this.maxDoc = maxDoc;
@@ -68,6 +68,8 @@ final class MaxScoreBulkScorer extends BulkScorer {
     this.cost = cost;
     essentialQueue = DisiPriorityQueue.ofMaxSize(allScorers.length);
     maxScoreSums = new double[allScorers.length];
+    docAndScoreAccBuffer = new DocAndScoreAccBuffer();
+    docAndScoreAccBuffer.growNoCopy(INNER_WINDOW_SIZE);
   }
 
   // Number of outer windows that have been evaluated
@@ -263,9 +265,6 @@ private void scoreInnerWindowMultipleEssentialClauses(
       top = essentialQueue.updateTop();
     } while (top.doc < innerWindowMax);
 
-    // Pre-allocate to max window size to avoid a separate cardinality() pass.
-    // This buffer is reused across windows so the allocation is a one-time cost.
-    docAndScoreAccBuffer.growNoCopy(INNER_WINDOW_SIZE);
     docAndScoreAccBuffer.size = 0;
     windowMatches.forEach(
         0,