Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ Improvements

Optimizations
---------------------
* GITHUB#15519: Apply broad filters after essential clauses in MaxScoreBulkScorer when the
filter cost is substantially higher than essential clause costs. (Prithvi S)

* GITHUB#15681, GITHUB#15833: Replace pre-sized array or empty array with lambda expression to call Collection#toArray. (Zhou Hui)

* GITHUB#13782: Replace handwritten loops compare with Arrays.compareUnsigned in TermsEnum and TermsEnumFrame classes. (Zhou Hui)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollectorManager;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;

/**
* Benchmarks filtered top-score disjunctions that use {@link
* org.apache.lucene.search.MaxScoreBulkScorer}.
*/
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Thread)
@Warmup(iterations = 3, time = 3)
@Measurement(iterations = 5, time = 5)
@Fork(value = 1, warmups = 1)
public class MaxScoreFilterBenchmark {

private Directory dir;
private IndexReader reader;
private IndexSearcher searcher;
private Path path;
private Query query;

@Param({"1000000"})
public int docCount;

@Param({"100", "1000"})
public int essentialInterval;

@Param({"1", "10", "1000"})
public int filterInterval;

@Param({"2"})
public int lowInterval;

@Setup(Level.Trial)
public void setup() throws IOException {
path = Files.createTempDirectory("maxScoreFilterBench");
dir = MMapDirectory.open(path);

try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
for (int i = 0; i < docCount; ++i) {
Document doc = new Document();
if (matchesFilter(i, filterInterval)) {
doc.add(new StringField("filter", "yes", Field.Store.NO));
}
if (i % essentialInterval == 0) {
doc.add(new StringField("foo", "high", Field.Store.NO));
}
if (i % lowInterval == 1) {
doc.add(new StringField("foo", "low", Field.Store.NO));
}
writer.addDocument(doc);
}
writer.forceMerge(1);
}

reader = DirectoryReader.open(dir);
searcher = new IndexSearcher(reader);
query = buildQuery();
}

private static boolean matchesFilter(int doc, int interval) {
if (interval == 1) {
return true;
}
int h = doc;
h ^= h >>> 16;
h *= 0x7feb352d;
h ^= h >>> 15;
h *= 0x846ca68b;
h ^= h >>> 16;
return (h & 0x7fffffff) % interval == 0;
}

private static Query buildQuery() {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new BoostQuery(new TermQuery(new Term("foo", "high")), 4f), Occur.SHOULD);
builder.add(new BoostQuery(new TermQuery(new Term("foo", "low")), 1f), Occur.SHOULD);
builder.add(new TermQuery(new Term("filter", "yes")), Occur.FILTER);
return builder.build();
}

@TearDown(Level.Trial)
public void tearDown() throws IOException {
reader.close();
dir.close();
if (Files.exists(path)) {
try (Stream<Path> walk = Files.walk(path)) {
walk.sorted(Comparator.reverseOrder())
.forEach(
p -> {
try {
Files.delete(p);
} catch (IOException _) {
}
});
}
}
}

@Benchmark
public TopDocs searchTopScores() throws IOException {
return searcher.search(query, new TopScoreDocCollectorManager(10, 1));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
final class MaxScoreBulkScorer extends BulkScorer {

static final int INNER_WINDOW_SIZE = 1 << 12;
private static final int MIN_FILTER_TO_ESSENTIAL_COST_RATIO_FOR_POST_FILTERING = 16;

private final int maxDoc;
// All scorers, sorted by increasing max score.
Expand Down Expand Up @@ -146,7 +147,11 @@ public int score(LeafCollector collector, Bits acceptDocs, int min, int max) thr
private void scoreInnerWindow(
LeafCollector collector, Bits acceptDocs, int max, DisiWrapper filter) throws IOException {
if (filter != null) {
scoreInnerWindowWithFilter(collector, acceptDocs, max, filter);
if (shouldPostFilter(filter)) {
scoreInnerWindowWithPostFilter(collector, acceptDocs, max, filter);
} else {
scoreInnerWindowWithFilter(collector, acceptDocs, max, filter);
}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would much prefer if we continued with one function.

Really, the optimization is just providing bulk scoring on contiguous blocks of IDs, right?

If that is the case, I think we can use the filter iterator's "docEndRunID" or whatever to get a block of matching ids, allowing for bulk scoring of the competitive scorers.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, updated. The fast path now only uses filter docIDRunEnd() when the matching run covers at least half an inner window, similar in spirit to DenseConjunctionBulkScorer’s run-size guard. Otherwise it keeps the existing leap-frog path. Thanks!

} else {
DisiWrapper top = essentialQueue.top();
DisiWrapper top2 = essentialQueue.top2();
Expand All @@ -162,6 +167,39 @@ private void scoreInnerWindow(
}
}

/**
 * Decides whether the filter should be applied after scoring the essential clauses instead of
 * being leap-frogged against them. Post-filtering only pays off when the filter is much more
 * expensive than every essential clause combined, since it adds per-window buffering work.
 *
 * @return true when the filter's cost exceeds the sum of essential-clause costs by at least
 *     {@code MIN_FILTER_TO_ESSENTIAL_COST_RATIO_FOR_POST_FILTERING}x
 */
private boolean shouldPostFilter(DisiWrapper filter) {
  final long ratio = MIN_FILTER_TO_ESSENTIAL_COST_RATIO_FOR_POST_FILTERING;
  long budget = filter.cost;
  for (int i = firstEssentialScorer; i < allScorers.length; ++i) {
    final long clauseCost = allScorers[i].cost;
    // Bail out before clauseCost * ratio could overflow a long.
    if (clauseCost > Long.MAX_VALUE / ratio) {
      return false;
    }
    budget -= clauseCost * ratio;
    if (budget <= 0) {
      // The filter is not expensive enough relative to this clause to justify buffering.
      return false;
    }
  }
  return true;
}

/**
 * Scores one inner window, applying {@code filter} after the essential clauses have produced
 * candidate docs. Mirrors the unfiltered inner-window dispatch: a single-clause fast path when
 * only one essential clause is active (or the second is far enough ahead), otherwise the
 * multiple-clauses path.
 */
private void scoreInnerWindowWithPostFilter(
    LeafCollector collector, Bits acceptDocs, int max, DisiWrapper filter) throws IOException {
  final DisiWrapper lead = essentialQueue.top();
  final DisiWrapper runnerUp = essentialQueue.top2();
  final int windowMax = MathUtil.unsignedMin(max, lead.doc + INNER_WINDOW_SIZE);
  if (runnerUp == null) {
    // Only one essential clause left: drain it over the whole window.
    scoreInnerWindowSingleEssentialClauseWithPostFilter(collector, acceptDocs, windowMax, filter);
    return;
  }
  if (runnerUp.doc - INNER_WINDOW_SIZE / 2 >= lead.doc) {
    // The second clause is at least half a window ahead: treat the gap as single-clause,
    // capped at the runner-up's current doc.
    scoreInnerWindowSingleEssentialClauseWithPostFilter(
        collector, acceptDocs, Math.min(windowMax, runnerUp.doc), filter);
  } else {
    scoreInnerWindowMultipleEssentialClausesWithPostFilter(collector, acceptDocs, windowMax, filter);
  }
}

private void scoreInnerWindowWithFilter(
LeafCollector collector, Bits acceptDocs, int max, DisiWrapper filter) throws IOException {

Expand Down Expand Up @@ -237,8 +275,38 @@ private void scoreInnerWindowSingleEssentialClause(
essentialQueue.updateTop();
}

/**
 * Single-essential-clause inner window with post-filtering: pulls batches of (doc, score) pairs
 * from the lone essential clause, removes docs the filter rejects, then lets non-essential
 * clauses contribute to the survivors. Finally re-syncs the queue top with its iterator.
 */
private void scoreInnerWindowSingleEssentialClauseWithPostFilter(
    LeafCollector collector, Bits acceptDocs, int upTo, DisiWrapper filter) throws IOException {
  final DisiWrapper top = essentialQueue.top();

  // Priming read, then loop until the clause produces no more docs below upTo.
  top.scorer.nextDocsAndScores(upTo, acceptDocs, docAndScoreBuffer);
  while (docAndScoreBuffer.size > 0) {
    docAndScoreAccBuffer.copyFrom(docAndScoreBuffer);
    // Drop candidates that the filter does not accept before doing any more scoring work.
    applyFilter(docAndScoreAccBuffer, filter);
    scoreNonEssentialClauses(collector, docAndScoreAccBuffer, firstEssentialScorer);
    top.scorer.nextDocsAndScores(upTo, acceptDocs, docAndScoreBuffer);
  }

  top.doc = top.iterator.docID();
  essentialQueue.updateTop();
}

/**
 * Scores one inner window driven by several essential clauses, without a filter: collect the
 * window's candidate docs and accumulated essential scores, then add non-essential contributions.
 */
private void scoreInnerWindowMultipleEssentialClauses(
LeafCollector collector, Bits acceptDocs, int max) throws IOException {
collectInnerWindowMultipleEssentialClauses(acceptDocs, max);
scoreNonEssentialClauses(collector, docAndScoreAccBuffer, firstEssentialScorer);
}

/**
 * Same as {@code scoreInnerWindowMultipleEssentialClauses}, but intersects the collected window
 * with {@code filter} before non-essential clauses are scored, so the (broad) filter is only
 * advanced to docs that already match an essential clause.
 */
private void scoreInnerWindowMultipleEssentialClausesWithPostFilter(
LeafCollector collector, Bits acceptDocs, int max, DisiWrapper filter) throws IOException {
collectInnerWindowMultipleEssentialClauses(acceptDocs, max);
applyFilter(docAndScoreAccBuffer, filter);
scoreNonEssentialClauses(collector, docAndScoreAccBuffer, firstEssentialScorer);
}

private void collectInnerWindowMultipleEssentialClauses(Bits acceptDocs, int max)
throws IOException {
DisiWrapper top = essentialQueue.top();

int innerWindowMin = top.doc;
Expand Down Expand Up @@ -276,8 +344,25 @@ private void scoreInnerWindowMultipleEssentialClauses(
windowScores[index] = 0d;
});
windowMatches.clear(0, innerWindowSize);
}

scoreNonEssentialClauses(collector, docAndScoreAccBuffer, firstEssentialScorer);
/**
 * Intersects {@code buffer} in place with the docs accepted by {@code filter}: entries whose doc
 * the filter does not match are compacted away, surviving scores are kept aligned with their
 * docs, and the filter's current position is stored back so the next window resumes from there.
 */
private void applyFilter(DocAndScoreAccBuffer buffer, DisiWrapper filter) throws IOException {
  int kept = 0;
  int filterDoc = filter.doc;
  for (int i = 0; i < buffer.size; ++i) {
    final int candidate = buffer.docs[i];
    if (filterDoc < candidate) {
      // Candidates are ascending, so the filter approximation only ever moves forward.
      filterDoc = filter.approximation.advance(candidate);
    }
    // Only consult the (potentially expensive) two-phase check when the approximation matches.
    final boolean accepted =
        filterDoc == candidate && (filter.twoPhaseView == null || filter.twoPhaseView.matches());
    if (accepted) {
      buffer.docs[kept] = candidate;
      buffer.scores[kept] = buffer.scores[i];
      ++kept;
    }
  }
  filter.doc = filterDoc;
  buffer.size = kept;
}

private int computeOuterWindowMax(int windowMin) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,92 @@ public void collect(int doc) throws IOException {
}
}

// Exercises the post-filter path of MaxScoreBulkScorer: a two-clause constant-score disjunction
// plus a FILTER clause. Docs 10 and 30 match "foo:A" but are excluded by the filter, so only
// doc 20 (A and C, score 2+3) and doc 40 (C only, score 3) must be collected, in doc-id order.
public void testFilteredDisjunctionWithPostFilter() throws Exception {
try (Directory dir = newDirectory()) {
try (IndexWriter w =
new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()))) {
for (int i = 0; i < 512; ++i) {
Document doc = new Document();
// Every doc matches the filter except 10 and 30.
if (i != 10 && i != 30) {
doc.add(new StringField("filter", "yes", Field.Store.NO));
}
if (i == 10 || i == 20 || i == 30) {
doc.add(new StringField("foo", "A", Field.Store.NO));
}
if (i == 20 || i == 40) {
doc.add(new StringField("foo", "C", Field.Store.NO));
}
w.addDocument(doc);
}
// Single segment so one MaxScoreBulkScorer covers all docs.
w.forceMerge(1);
}

try (IndexReader reader = DirectoryReader.open(dir)) {
IndexSearcher searcher = newSearcher(reader);

// Constant scores with distinct boosts make the expected per-doc scores easy to pin down.
Query clause1 =
new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2);
Query clause2 =
new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("foo", "C"))), 3);
// RandomApproximationQuery also exercises the filter's two-phase path.
Query filter =
new RandomApproximationQuery(new TermQuery(new Term("filter", "yes")), random());
LeafReaderContext context = searcher.getIndexReader().leaves().get(0);
Scorer scorer1 =
searcher
.createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
Scorer scorer2 =
searcher
.createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
Scorer filterScorer =
searcher
.createWeight(searcher.rewrite(filter), ScoreMode.TOP_SCORES, 1f)
.scorer(context);

// Build the bulk scorer directly so the test targets MaxScoreBulkScorer itself.
BulkScorer scorer =
new MaxScoreBulkScorer(
context.reader().maxDoc(), Arrays.asList(scorer1, scorer2), filterScorer);

LeafCollector collector =
new LeafCollector() {

// Index of the next expected hit; also counts total collected docs.
private int i;
private Scorable scorer;

@Override
public void setScorer(Scorable scorer) throws IOException {
this.scorer = scorer;
}

@Override
public void collect(int doc) throws IOException {
switch (i++) {
case 0:
// Doc 20 matches both clauses: 2 (A) + 3 (C).
assertEquals(20, doc);
assertEquals(2 + 3, scorer.score(), 0);
break;
case 1:
// Doc 40 matches only C.
assertEquals(40, doc);
assertEquals(3, scorer.score(), 0);
break;
default:
// Any additional doc (e.g. filtered-out 10 or 30) is a failure.
fail();
break;
}
}

@Override
public void finish() throws IOException {
// Exactly two docs must have been collected.
assertEquals(2, i);
}
};
scorer.score(collector, null, 0, DocIdSetIterator.NO_MORE_DOCS);
collector.finish();
}
}
}

public void testBasicsWithTwoDisjunctionClausesAndSkipping() throws Exception {
try (Directory dir = newDirectory()) {
writeDocuments(dir);
Expand Down
Loading