Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ Improvements

Optimizations
---------------------
* GITHUB#15519: Use filter doc ID runs to bulk-score filtered MaxScoreBulkScorer windows.
(Prithvi S)

* GITHUB#15681, GITHUB#15833: Replace pre-sized array or empty array with lambda expression to call Collection#toArray. (Zhou Hui)

* GITHUB#13782: Replace handwritten loops compare with Arrays.compareUnsigned in TermsEnum and TermsEnumFrame classes. (Zhou Hui)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollectorManager;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;

/**
 * Benchmarks filtered top-score disjunctions that exercise {@link
 * org.apache.lucene.search.MaxScoreBulkScorer}, varying how dense the filter clause and the
 * essential (high-boost) disjunct are relative to the low-boost disjunct.
 */
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
@State(Scope.Thread)
@Warmup(iterations = 3, time = 3)
@Measurement(iterations = 5, time = 5)
@Fork(value = 1, warmups = 1)
public class MaxScoreFilterBenchmark {

  private Directory dir;
  private IndexReader reader;
  private IndexSearcher searcher;
  private Path path;
  private Query query;

  /** Number of documents in the synthetic index. */
  @Param({"1000000"})
  public int docCount;

  /** Every essentialInterval-th doc carries the high-boost {@code foo:high} term. */
  @Param({"100", "1000"})
  public int essentialInterval;

  /** Roughly one in filterInterval docs matches the FILTER clause; 1 means every doc matches. */
  @Param({"1", "10", "1000"})
  public int filterInterval;

  /** Every lowInterval-th doc (at offset 1) carries the low-boost {@code foo:low} term. */
  @Param({"2"})
  public int lowInterval;

  /** Builds a single-segment index in a temp directory and prepares the benchmark query. */
  @Setup(Level.Trial)
  public void setup() throws IOException {
    path = Files.createTempDirectory("maxScoreFilterBench");
    dir = MMapDirectory.open(path);

    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
      for (int i = 0; i < docCount; ++i) {
        Document doc = new Document();
        if (matchesFilter(i, filterInterval)) {
          doc.add(new StringField("filter", "yes", Field.Store.NO));
        }
        if (i % essentialInterval == 0) {
          doc.add(new StringField("foo", "high", Field.Store.NO));
        }
        if (i % lowInterval == 1) {
          doc.add(new StringField("foo", "low", Field.Store.NO));
        }
        writer.addDocument(doc);
      }
      // A single segment keeps the measurement focused on scoring, not per-segment overhead.
      writer.forceMerge(1);
    }

    reader = DirectoryReader.open(dir);
    searcher = new IndexSearcher(reader);
    query = buildQuery();
  }

  /**
   * Pseudo-randomly decides whether {@code doc} matches the filter so that matches are spread
   * out rather than strictly periodic. Applies an integer bit-mixing finalizer before the modulo
   * test to decorrelate doc IDs from the interval.
   */
  private static boolean matchesFilter(int doc, int interval) {
    if (interval == 1) {
      return true;
    }
    int h = doc;
    h ^= h >>> 16;
    h *= 0x7feb352d;
    h ^= h >>> 15;
    h *= 0x846ca68b;
    h ^= h >>> 16;
    // Mask the sign bit so the modulo result is non-negative.
    return (h & 0x7fffffff) % interval == 0;
  }

  /** Two SHOULD clauses with different boosts plus a FILTER clause over {@code filter:yes}. */
  private static Query buildQuery() {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(new BoostQuery(new TermQuery(new Term("foo", "high")), 4f), Occur.SHOULD);
    builder.add(new BoostQuery(new TermQuery(new Term("foo", "low")), 1f), Occur.SHOULD);
    builder.add(new TermQuery(new Term("filter", "yes")), Occur.FILTER);
    return builder.build();
  }

  /** Closes index resources and removes the temp directory, tolerating a partially-run setup. */
  @TearDown(Level.Trial)
  public void tearDown() throws IOException {
    // Null guards let teardown run even if setup() failed partway through; the try/finally
    // guarantees the directory is closed even when closing the reader throws.
    try {
      if (reader != null) {
        reader.close();
      }
    } finally {
      if (dir != null) {
        dir.close();
      }
    }
    if (path != null && Files.exists(path)) {
      // Reverse order deletes children before their parent directories.
      try (Stream<Path> walk = Files.walk(path)) {
        walk.sorted(Comparator.reverseOrder())
            .forEach(
                p -> {
                  try {
                    Files.delete(p);
                  } catch (IOException _) {
                    // Best-effort cleanup of benchmark temp files; failures are not actionable.
                  }
                });
      }
    }
  }

  @Benchmark
  public TopDocs searchTopScores() throws IOException {
    return searcher.search(query, new TopScoreDocCollectorManager(10, 1));
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -148,17 +148,22 @@ private void scoreInnerWindow(
if (filter != null) {
scoreInnerWindowWithFilter(collector, acceptDocs, max, filter);
} else {
DisiWrapper top = essentialQueue.top();
DisiWrapper top2 = essentialQueue.top2();
if (top2 == null) {
scoreInnerWindowSingleEssentialClause(collector, acceptDocs, max);
} else if (top2.doc - INNER_WINDOW_SIZE / 2 >= top.doc) {
// The first half of the window would match a single clause. Let's collect this single
// clause until the next doc ID of the next clause.
scoreInnerWindowSingleEssentialClause(collector, acceptDocs, Math.min(max, top2.doc));
} else {
scoreInnerWindowMultipleEssentialClauses(collector, acceptDocs, max);
}
scoreInnerWindowWithoutFilter(collector, acceptDocs, max);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why all these unnecessary changes?

The idea is to improve scoreInnerWindowWithFilter, right? That is, we want to bulk-score the run of doc IDs up to the end of the filter's doc-ID run?

Please, we need way more than a JMH benchmark. We need to ensure correctness & please benchmark with lucene util to see the performance on a realistic dataset.

}
}

private void scoreInnerWindowWithoutFilter(LeafCollector collector, Bits acceptDocs, int max)
    throws IOException {
  DisiWrapper lead = essentialQueue.top();
  DisiWrapper runnerUp = essentialQueue.top2();

  // A single essential clause: score it directly over the whole window.
  if (runnerUp == null) {
    scoreInnerWindowSingleEssentialClause(collector, acceptDocs, max);
    return;
  }

  // The first half of the window would match a single clause. Let's collect this single
  // clause until the next doc ID of the next clause.
  if (runnerUp.doc - INNER_WINDOW_SIZE / 2 >= lead.doc) {
    scoreInnerWindowSingleEssentialClause(collector, acceptDocs, Math.min(max, runnerUp.doc));
    return;
  }

  // Otherwise several essential clauses overlap in this window.
  scoreInnerWindowMultipleEssentialClauses(collector, acceptDocs, max);
}

Expand Down Expand Up @@ -195,9 +200,19 @@ private void scoreInnerWindowWithFilter(
} while (top.doc < filter.doc);
} else {
int doc = top.doc;
boolean match =
(acceptDocs == null || acceptDocs.get(doc))
&& (filter.twoPhaseView == null || filter.twoPhaseView.matches());
boolean filterMatch = filter.twoPhaseView == null || filter.twoPhaseView.matches();
if (filterMatch) {
int filterRunEnd =
filter.twoPhaseView == null
? filter.approximation.docIDRunEnd()
: filter.twoPhaseView.docIDRunEnd();
int upTo = MathUtil.unsignedMin(innerWindowMax, filterRunEnd);
if (upTo - doc >= INNER_WINDOW_SIZE / 2) {
scoreInnerWindowWithoutFilter(collector, acceptDocs, upTo);
return;
}
}
boolean match = (acceptDocs == null || acceptDocs.get(doc)) && filterMatch;
double score = 0;
do {
if (match) {
Expand Down Expand Up @@ -239,6 +254,12 @@ private void scoreInnerWindowSingleEssentialClause(

// Scores a window in which several essential clauses have candidate docs: first accumulate the
// essential-clause matches and partial scores for the window, then fold in non-essential
// clauses and hand surviving docs to the collector.
private void scoreInnerWindowMultipleEssentialClauses(
LeafCollector collector, Bits acceptDocs, int max) throws IOException {
collectInnerWindowMultipleEssentialClauses(acceptDocs, max);
scoreNonEssentialClauses(collector, docAndScoreAccBuffer, firstEssentialScorer);
}

private void collectInnerWindowMultipleEssentialClauses(Bits acceptDocs, int max)
throws IOException {
DisiWrapper top = essentialQueue.top();

int innerWindowMin = top.doc;
Expand Down Expand Up @@ -276,8 +297,6 @@ private void scoreInnerWindowMultipleEssentialClauses(
windowScores[index] = 0d;
});
windowMatches.clear(0, innerWindowSize);

scoreNonEssentialClauses(collector, docAndScoreAccBuffer, firstEssentialScorer);
}

private int computeOuterWindowMax(int windowMin) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,92 @@ public void collect(int doc) throws IOException {
}
}

// Verifies a filtered two-clause disjunction over MaxScoreBulkScorer where the filter matches a
// long run of doc IDs (all docs except 10 and 30). Term "A" (boost 2) is on docs 10, 20, 30 and
// term "C" (boost 3) on docs 20, 40; since 10 and 30 are filtered out, only doc 20
// (score 2 + 3) and doc 40 (score 3) may be collected, in that order.
public void testFilteredDisjunctionWithFilterRun() throws Exception {
try (Directory dir = newDirectory()) {
try (IndexWriter w =
new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()))) {
for (int i = 0; i < 512; ++i) {
Document doc = new Document();
// Docs 10 and 30 are the only ones excluded by the filter.
if (i != 10 && i != 30) {
doc.add(new StringField("filter", "yes", Field.Store.NO));
}
if (i == 10 || i == 20 || i == 30) {
doc.add(new StringField("foo", "A", Field.Store.NO));
}
if (i == 20 || i == 40) {
doc.add(new StringField("foo", "C", Field.Store.NO));
}
w.addDocument(doc);
}
// Single segment so a single leaf/bulk scorer covers all docs.
w.forceMerge(1);
}

try (IndexReader reader = DirectoryReader.open(dir)) {
IndexSearcher searcher = newSearcher(reader);

// ConstantScoreQuery + BoostQuery give each clause a fixed, predictable score.
Query clause1 =
new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("foo", "A"))), 2);
Query clause2 =
new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("foo", "C"))), 3);
// RandomApproximationQuery randomly exercises the two-phase-iterator code path of the filter.
Query filter =
new RandomApproximationQuery(new TermQuery(new Term("filter", "yes")), random());
LeafReaderContext context = searcher.getIndexReader().leaves().get(0);
Scorer scorer1 =
searcher
.createWeight(searcher.rewrite(clause1), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
Scorer scorer2 =
searcher
.createWeight(searcher.rewrite(clause2), ScoreMode.TOP_SCORES, 1f)
.scorer(context);
Scorer filterScorer =
searcher
.createWeight(searcher.rewrite(filter), ScoreMode.TOP_SCORES, 1f)
.scorer(context);

// Build the bulk scorer under test directly so the filter clause is applied by
// MaxScoreBulkScorer itself.
BulkScorer scorer =
new MaxScoreBulkScorer(
context.reader().maxDoc(), Arrays.asList(scorer1, scorer2), filterScorer);

LeafCollector collector =
new LeafCollector() {

// Counts collected docs; also indexes the expected (doc, score) pairs below.
private int i;
private Scorable scorer;

@Override
public void setScorer(Scorable scorer) throws IOException {
this.scorer = scorer;
}

@Override
public void collect(int doc) throws IOException {
switch (i++) {
case 0:
// Doc 20 matches both clauses: 2 + 3.
assertEquals(20, doc);
assertEquals(2 + 3, scorer.score(), 0);
break;
case 1:
// Doc 40 matches only clause "C": 3.
assertEquals(40, doc);
assertEquals(3, scorer.score(), 0);
break;
default:
// Docs 10 and 30 must have been filtered out; nothing else matches.
fail();
break;
}
}

@Override
public void finish() throws IOException {
// Exactly the two expected docs were collected.
assertEquals(2, i);
}
};
scorer.score(collector, null, 0, DocIdSetIterator.NO_MORE_DOCS);
collector.finish();
}
}
}

public void testBasicsWithTwoDisjunctionClausesAndSkipping() throws Exception {
try (Directory dir = newDirectory()) {
writeDocuments(dir);
Expand Down
Loading