diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 8ef68343e5fb..25b007b3955d 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -131,6 +131,9 @@ Improvements
 
 Optimizations
 ---------------------
+* GITHUB#15519: Use filter doc ID runs to bulk-score filtered MaxScoreBulkScorer windows.
+  (Prithvi S)
+
 * GITHUB#15681, GITHUB#15833: Replace pre-sized array or empty array with lambda expression to call Collection#toArray. (Zhou Hui)
 
 * GITHUB#13782: Replace handwritten loops compare with Arrays.compareUnsigned in TermsEnum and TermsEnumFrame classes. (Zhou Hui)
diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/MaxScoreFilterBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/MaxScoreFilterBenchmark.java
new file mode 100644
index 000000000000..e3b18f69d4cd
--- /dev/null
+++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/MaxScoreFilterBenchmark.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.benchmark.jmh;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Comparator;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Stream;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.TopScoreDocCollectorManager;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+
+/**
+ * Benchmarks filtered top-score disjunctions that use {@link
+ * org.apache.lucene.search.MaxScoreBulkScorer}.
+ */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@State(Scope.Thread)
+@Warmup(iterations = 3, time = 3)
+@Measurement(iterations = 5, time = 5)
+@Fork(value = 1, warmups = 1)
+public class MaxScoreFilterBenchmark {
+
+  private Directory dir;
+  private IndexReader reader;
+  private IndexSearcher searcher;
+  private Path path;
+  private Query query;
+
+  @Param({"1000000"})
+  public int docCount;
+
+  @Param({"100", "1000"})
+  public int essentialInterval;
+
+  @Param({"1", "10", "1000"})
+  public int filterInterval;
+
+  @Param({"2"})
+  public int lowInterval;
+
+  /**
+   * Builds a single-segment index in a temporary directory and opens a searcher on it. A doc
+   * gets {@code foo:high} when {@code i % essentialInterval == 0}, {@code foo:low} when {@code
+   * i % lowInterval == 1}, and {@code filter:yes} when {@link #matchesFilter} accepts it.
+   */
+  @Setup(Level.Trial)
+  public void setup() throws IOException {
+    path = Files.createTempDirectory("maxScoreFilterBench");
+    dir = MMapDirectory.open(path);
+
+    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
+      for (int i = 0; i < docCount; ++i) {
+        Document doc = new Document();
+        if (matchesFilter(i, filterInterval)) {
+          doc.add(new StringField("filter", "yes", Field.Store.NO));
+        }
+        if (i % essentialInterval == 0) {
+          doc.add(new StringField("foo", "high", Field.Store.NO));
+        }
+        if (i % lowInterval == 1) {
+          doc.add(new StringField("foo", "low", Field.Store.NO));
+        }
+        writer.addDocument(doc);
+      }
+      writer.forceMerge(1);
+    }
+
+    reader = DirectoryReader.open(dir);
+    searcher = new IndexSearcher(reader);
+    query = buildQuery();
+  }
+
+  /**
+   * Returns whether {@code doc} carries the filter term. An interval of 1 accepts every doc;
+   * otherwise the doc ID is bit-mixed before the modulo test, presumably so that the matching
+   * docs are scattered instead of evenly spaced.
+   */
+  private static boolean matchesFilter(int doc, int interval) {
+    if (interval == 1) {
+      return true;
+    }
+    int h = doc;
+    h ^= h >>> 16;
+    h *= 0x7feb352d;
+    h ^= h >>> 15;
+    h *= 0x846ca68b;
+    h ^= h >>> 16;
+    return (h & 0x7fffffff) % interval == 0;
+  }
+
+  /** Builds {@code foo:high^4 OR foo:low^1}, filtered by {@code filter:yes}. */
+  private static Query buildQuery() {
+    BooleanQuery.Builder builder = new BooleanQuery.Builder();
+    builder.add(new BoostQuery(new TermQuery(new Term("foo", "high")), 4f), Occur.SHOULD);
+    builder.add(new BoostQuery(new TermQuery(new Term("foo", "low")), 1f), Occur.SHOULD);
+    builder.add(new TermQuery(new Term("filter", "yes")), Occur.FILTER);
+    return builder.build();
+  }
+
+  /** Closes the reader and directory, then deletes the temporary index directory. */
+  @TearDown(Level.Trial)
+  public void tearDown() throws IOException {
+    reader.close();
+    dir.close();
+    if (Files.exists(path)) {
+      try (Stream<Path> walk = Files.walk(path)) {
+        walk.sorted(Comparator.reverseOrder())
+            .forEach(
+                p -> {
+                  try {
+                    Files.delete(p);
+                  } catch (IOException _) {
+                    // Best-effort cleanup: a leftover temp file must not fail the benchmark.
+                  }
+                });
+      }
+    }
+  }
+
+  /** Runs the filtered disjunction and returns its top hits. */
+  @Benchmark
+  public TopDocs searchTopScores() throws IOException {
+    return searcher.search(query, new TopScoreDocCollectorManager(10, 1));
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java
index 9bc732ba19e3..f91e3132c3d8 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java
@@ -148,17 +148,26 @@ private void scoreInnerWindow(
     if (filter != null) {
       scoreInnerWindowWithFilter(collector, acceptDocs, max, filter);
     } else {
-      DisiWrapper top = essentialQueue.top();
-      DisiWrapper top2 = essentialQueue.top2();
-      if (top2 == null) {
-        scoreInnerWindowSingleEssentialClause(collector, acceptDocs, max);
-      } else if (top2.doc - INNER_WINDOW_SIZE / 2 >= top.doc) {
-        // The first half of the window would match a single clause. Let's collect this single
-        // clause until the next doc ID of the next clause.
-        scoreInnerWindowSingleEssentialClause(collector, acceptDocs, Math.min(max, top2.doc));
-      } else {
-        scoreInnerWindowMultipleEssentialClauses(collector, acceptDocs, max);
-      }
+      scoreInnerWindowWithoutFilter(collector, acceptDocs, max);
+    }
+  }
+
+  /**
+   * Scores an inner window of essential clauses up to {@code max} without consulting any filter,
+   * choosing between the single-clause and the multiple-clause strategy.
+   */
+  private void scoreInnerWindowWithoutFilter(LeafCollector collector, Bits acceptDocs, int max)
+      throws IOException {
+    DisiWrapper top = essentialQueue.top();
+    DisiWrapper top2 = essentialQueue.top2();
+    if (top2 == null) {
+      scoreInnerWindowSingleEssentialClause(collector, acceptDocs, max);
+    } else if (top2.doc - INNER_WINDOW_SIZE / 2 >= top.doc) {
+      // The first half of the window would match a single clause. Let's collect this single
+      // clause until the next doc ID of the next clause.
+      scoreInnerWindowSingleEssentialClause(collector, acceptDocs, Math.min(max, top2.doc));
+    } else {
+      scoreInnerWindowMultipleEssentialClauses(collector, acceptDocs, max);
     }
   }
 
@@ -195,9 +204,24 @@ private void scoreInnerWindowWithFilter(
         } while (top.doc < filter.doc);
       } else {
         int doc = top.doc;
-        boolean match =
-            (acceptDocs == null || acceptDocs.get(doc))
-                && (filter.twoPhaseView == null || filter.twoPhaseView.matches());
+        boolean filterMatch = filter.twoPhaseView == null || filter.twoPhaseView.matches();
+        if (filterMatch) {
+          int filterRunEnd =
+              filter.twoPhaseView == null
+                  ? filter.approximation.docIDRunEnd()
+                  : filter.twoPhaseView.docIDRunEnd();
+          int upTo = MathUtil.unsignedMin(innerWindowMax, filterRunEnd);
+          if (upTo - doc >= INNER_WINDOW_SIZE / 2) {
+            // The filter run covers at least half an inner window, so use the unfiltered bulk
+            // paths. Flush first: those paths reuse docAndScoreAccBuffer, which may still hold
+            // matches gathered earlier in this window. NOTE(review): the buffering code is
+            // outside this hunk - confirm against the full method.
+            scoreNonEssentialClauses(collector, docAndScoreAccBuffer, firstEssentialScorer);
+            scoreInnerWindowWithoutFilter(collector, acceptDocs, upTo);
+            return;
+          }
+        }
+        boolean match = (acceptDocs == null || acceptDocs.get(doc)) && filterMatch;
         double score = 0;
         do {
           if (match) {
@@ -239,6 +263,12 @@ private void scoreInnerWindowSingleEssentialClause(
 
   private void scoreInnerWindowMultipleEssentialClauses(
       LeafCollector collector, Bits acceptDocs, int max) throws IOException {
+    collectInnerWindowMultipleEssentialClauses(acceptDocs, max);
+    scoreNonEssentialClauses(collector, docAndScoreAccBuffer, firstEssentialScorer);
+  }
+
+  private void collectInnerWindowMultipleEssentialClauses(Bits acceptDocs, int max)
+      throws IOException {
     DisiWrapper top = essentialQueue.top();
 
     int innerWindowMin = top.doc;
@@ -276,8 +306,6 @@ private void scoreInnerWindowMultipleEssentialClauses(
           windowScores[index] = 0d;
         });
     windowMatches.clear(0, innerWindowSize);
-
-    scoreNonEssentialClauses(collector, docAndScoreAccBuffer, firstEssentialScorer);
   }
 
   private int computeOuterWindowMax(int windowMin) throws IOException {
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorer.java
index c49dd6d7da66..cd8fa659733d 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorer.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorer.java
@@ -279,6 +279,94 @@ public void collect(int doc) throws IOException {
     }
   }
 
+  // Every doc except 10 and 30 passes the filter; clause A matches 10, 20 and 30 and clause C
+  // matches 20 and 40, so only docs 20 (A + C) and 40 (C) may be collected.
+  public void testFilteredDisjunctionWithFilterRun() throws Exception {
+    try (Directory dir = newDirectory()) {
+      try (IndexWriter w =
+          new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()))) {
+        for (int i = 0; i < MaxScoreBulkScorer.INNER_WINDOW_SIZE; ++i) {
+          Document doc = new Document();
+          if (i != 10 && i != 30) {
+            doc.add(new StringField("filter", "yes", Field.Store.NO));
+          }
+          if (i == 10 || i == 20 || i == 30) {
+            doc.add(new StringField("foo", "A", Field.Store.NO));
+          }
+          if (i == 20 || i == 40) {
+            doc.add(new StringField("foo", "C", Field.Store.NO));
+          }
+          w.addDocument(doc);
+        }
+        w.forceMerge(1);
+      }
+
+      try (IndexReader reader = DirectoryReader.open(dir)) {
+        IndexSearcher searcher = newSearcher(reader);
+
+        Query clause1 =
+            new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2);
+        Query clause2 =
+            new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("foo", "C"))), 3);
+        Query filter =
+            new RandomApproximationQuery(new TermQuery(new Term("filter", "yes")), random());
+        LeafReaderContext context = searcher.getIndexReader().leaves().get(0);
+        Scorer scorer1 =
+            searcher
+                .createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f)
+                .scorer(context);
+        Scorer scorer2 =
+            searcher
+                .createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f)
+                .scorer(context);
+        Scorer filterScorer =
+            searcher
+                .createWeight(searcher.rewrite(filter), ScoreMode.TOP_SCORES, 1f)
+                .scorer(context);
+
+        BulkScorer scorer =
+            new MaxScoreBulkScorer(
+                context.reader().maxDoc(), Arrays.asList(scorer1, scorer2), filterScorer);
+
+        LeafCollector collector =
+            new LeafCollector() {
+
+              private int i;
+              private Scorable scorer;
+
+              @Override
+              public void setScorer(Scorable scorer) throws IOException {
+                this.scorer = scorer;
+              }
+
+              @Override
+              public void collect(int doc) throws IOException {
+                switch (i++) {
+                  case 0:
+                    assertEquals(20, doc);
+                    assertEquals(2 + 3, scorer.score(), 0);
+                    break;
+                  case 1:
+                    assertEquals(40, doc);
+                    assertEquals(3, scorer.score(), 0);
+                    break;
+                  default:
+                    fail();
+                    break;
+                }
+              }
+
+              @Override
+              public void finish() throws IOException {
+                assertEquals(2, i);
+              }
+            };
+        scorer.score(collector, null, 0, DocIdSetIterator.NO_MORE_DOCS);
+        collector.finish();
+      }
+    }
+  }
+
   public void testBasicsWithTwoDisjunctionClausesAndSkipping() throws Exception {
     try (Directory dir = newDirectory()) {
       writeDocuments(dir);