LUCENE-8921: IndexSearcher.termStatistics API change

Closes #797
This commit is contained in:
Bruno Roustant 2019-09-17 16:48:24 -04:00 коммит произвёл David Smiley
Родитель 93d3e5d666
Коммит fd0c8b9e81
16 изменённых файлов: 65 добавлений и 73 удалений

Просмотреть файл

@ -77,6 +77,10 @@ API Changes
* LUCENE-8956: QueryRescorer now only sorts the first topN hits instead of all
initial hits. (Paul Sanwald via Adrien Grand)
* LUCENE-8921: IndexSearcher.termStatistics() no longer takes a TermStates; it takes the docFreq and totalTermFreq instead.
Callers must not invoke it when docFreq <= 0. The previous implementation survives as deprecated and final; it is removed in 9.0.
(Bruno Roustant, David Smiley, Alan Woodward)
New Features
* LUCENE-8936: Add SpanishMinimalStemFilter (vinod kumar via Tomoko Uchida)

Просмотреть файл

@ -41,7 +41,6 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
@ -865,19 +864,20 @@ public class IndexSearcher {
}
/**
* Returns {@link TermStatistics} for a term, or {@code null} if
* the term does not exist.
* Returns {@link TermStatistics} for a term.
*
* This can be overridden, for example, to return a term's statistics
* across a distributed collection.
*
* @param docFreq The document frequency of the term. It must be greater than or equal to 1.
* @param totalTermFreq The total term frequency.
* @return A {@link TermStatistics} (never null).
*
* @lucene.experimental
*/
public TermStatistics termStatistics(Term term, TermStates context) throws IOException {
if (context.docFreq() == 0) {
return null;
} else {
return new TermStatistics(term.bytes(), context.docFreq(), context.totalTermFreq());
}
public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq) throws IOException {
// This constructor will throw an exception if docFreq <= 0.
return new TermStatistics(term.bytes(), docFreq, totalTermFreq);
}
/**

Просмотреть файл

@ -233,11 +233,8 @@ public class MultiPhraseQuery extends Query {
ts = TermStates.build(context, term, scoreMode.needsScores());
termStates.put(term, ts);
}
if (scoreMode.needsScores()) {
TermStatistics termStatistics = searcher.termStatistics(term, ts);
if (termStatistics != null) {
allTermStats.add(termStatistics);
}
if (scoreMode.needsScores() && ts.docFreq() > 0) {
allTermStats.add(searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq()));
}
}
}

Просмотреть файл

@ -428,9 +428,9 @@ public class PhraseQuery extends Query {
final Term term = terms[i];
states[i] = TermStates.build(context, term, scoreMode.needsScores());
if (scoreMode.needsScores()) {
TermStatistics termStatistics = searcher.termStatistics(term, states[i]);
if (termStatistics != null) {
termStats[termUpTo++] = termStatistics;
TermStates ts = states[i];
if (ts.docFreq() > 0) {
termStats[termUpTo++] = searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq());
}
}
}

Просмотреть файл

@ -205,9 +205,10 @@ public final class SynonymQuery extends Query {
long totalTermFreq = 0;
termStates = new TermStates[terms.length];
for (int i = 0; i < termStates.length; i++) {
termStates[i] = TermStates.build(searcher.getTopReaderContext(), terms[i].term, true);
TermStatistics termStats = searcher.termStatistics(terms[i].term, termStates[i]);
if (termStats != null) {
TermStates ts = TermStates.build(searcher.getTopReaderContext(), terms[i].term, true);
termStates[i] = ts;
if (ts.docFreq() > 0) {
TermStatistics termStats = searcher.termStatistics(terms[i].term, ts.docFreq(), ts.totalTermFreq());
docFreq = Math.max(termStats.docFreq(), docFreq);
totalTermFreq += termStats.totalTermFreq();
}

Просмотреть файл

@ -60,7 +60,7 @@ public class TermQuery extends Query {
final TermStatistics termStats;
if (scoreMode.needsScores()) {
collectionStats = searcher.collectionStatistics(term.field());
termStats = searcher.termStatistics(term, termStates);
termStats = termStates.docFreq() > 0 ? searcher.termStatistics(term, termStates.docFreq(), termStates.totalTermFreq()) : null;
} else {
// we do not need the actual stats, use fake stats with docFreq=maxDoc=ttf=1
collectionStats = new CollectionStatistics(term.field(), 1, 1, 1, 1);

Просмотреть файл

@ -103,9 +103,9 @@ public abstract class SpanWeight extends Weight {
TermStatistics[] termStats = new TermStatistics[termStates.size()];
int termUpTo = 0;
for (Map.Entry<Term, TermStates> entry : termStates.entrySet()) {
TermStatistics termStatistics = searcher.termStatistics(entry.getKey(), entry.getValue());
if (termStatistics != null) {
termStats[termUpTo++] = termStatistics;
TermStates ts = entry.getValue();
if (ts.docFreq() > 0) {
termStats[termUpTo++] = searcher.termStatistics(entry.getKey(), ts.docFreq(), ts.totalTermFreq());
}
}
CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField());

Просмотреть файл

@ -329,10 +329,10 @@ public class TestMinShouldMatch2 extends LuceneTestCase {
if (ord >= 0) {
boolean success = ords.add(ord);
assert success; // no dups
TermStates context = TermStates.build(reader.getContext(), term, true);
TermStates ts = TermStates.build(reader.getContext(), term, true);
SimScorer w = weight.similarity.scorer(1f,
searcher.collectionStatistics("field"),
searcher.termStatistics(term, context));
searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq()));
sims[(int)ord] = new LeafSimScorer(w, reader, "field", true);
}
}

Просмотреть файл

@ -281,9 +281,10 @@ public final class BM25FQuery extends Query implements Accountable {
termStates = new TermStates[fieldTerms.length];
for (int i = 0; i < termStates.length; i++) {
FieldAndWeight field = fieldAndWeights.get(fieldTerms[i].field());
termStates[i] = TermStates.build(searcher.getTopReaderContext(), fieldTerms[i], true);
TermStatistics termStats = searcher.termStatistics(fieldTerms[i], termStates[i]);
if (termStats != null) {
TermStates ts = TermStates.build(searcher.getTopReaderContext(), fieldTerms[i], true);
termStates[i] = ts;
if (ts.docFreq() > 0) {
TermStatistics termStats = searcher.termStatistics(fieldTerms[i], ts.docFreq(), ts.totalTermFreq());
docFreq = Math.max(termStats.docFreq(), docFreq);
totalTermFreq += (double) field.weight * termStats.totalTermFreq();
}

Просмотреть файл

@ -360,9 +360,9 @@ public class TermAutomatonQuery extends Query implements Accountable {
for(Map.Entry<Integer,BytesRef> ent : idToTerm.entrySet()) {
Integer termID = ent.getKey();
if (ent.getValue() != null) {
TermStatistics stats = searcher.termStatistics(new Term(field, ent.getValue()), termStates.get(termID));
if (stats != null) {
allTermStats.add(stats);
TermStates ts = termStates.get(termID);
if (ts.docFreq() > 0) {
allTermStats.add(searcher.termStatistics(new Term(field, ent.getValue()), ts.docFreq(), ts.totalTermFreq()));
}
}
}

Просмотреть файл

@ -186,8 +186,10 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
}
try {
for(Term term : terms) {
final TermStates termStates = TermStates.build(s.getIndexReader().getContext(), term, true);
stats.put(term, s.termStatistics(term, termStates));
final TermStates ts = TermStates.build(s.getIndexReader().getContext(), term, true);
if (ts.docFreq() > 0) {
stats.put(term, s.termStatistics(term, ts.docFreq(), ts.totalTermFreq()));
}
}
} finally {
node.searchers.release(s);
@ -262,36 +264,31 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
}
@Override
public TermStatistics termStatistics(Term term, TermStates context) throws IOException {
public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq) throws IOException {
assert term != null;
long docFreq = 0;
long totalTermFreq = 0;
long distributedDocFreq = 0;
long distributedTotalTermFreq = 0;
for(int nodeID=0;nodeID<nodeVersions.length;nodeID++) {
final TermStatistics subStats;
if (nodeID == myNodeID) {
subStats = super.termStatistics(term, context);
subStats = super.termStatistics(term, docFreq, totalTermFreq);
} else {
final TermAndShardVersion key = new TermAndShardVersion(nodeID, nodeVersions[nodeID], term);
subStats = termStatsCache.get(key);
if (subStats == null) {
continue; // term not found
}
}
if (subStats == null) {
continue; // term not found
}
long nodeDocFreq = subStats.docFreq();
docFreq += nodeDocFreq;
distributedDocFreq += nodeDocFreq;
long nodeTotalTermFreq = subStats.totalTermFreq();
totalTermFreq += nodeTotalTermFreq;
}
if (docFreq == 0) {
return null; // term not found in any node whatsoever
} else {
return new TermStatistics(term.bytes(), docFreq, totalTermFreq);
distributedTotalTermFreq += nodeTotalTermFreq;
}
assert distributedDocFreq > 0;
return new TermStatistics(term.bytes(), distributedDocFreq, distributedTotalTermFreq);
}
@Override

Просмотреть файл

@ -48,7 +48,6 @@ import org.apache.lucene.index.MultiPostingsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.*;
@ -324,15 +323,15 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable, SolrI
* Override these two methods to provide a way to use global collection stats.
*/
@Override
public TermStatistics termStatistics(Term term, TermStates context) throws IOException {
public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq) throws IOException {
final SolrRequestInfo reqInfo = SolrRequestInfo.getRequestInfo();
if (reqInfo != null) {
final StatsSource statsSrc = (StatsSource) reqInfo.getReq().getContext().get(STATS_SOURCE);
if (statsSrc != null) {
return statsSrc.termStatistics(this, term, context);
return statsSrc.termStatistics(this, term, docFreq, totalTermFreq);
}
}
return localTermStatistics(term, context);
return localTermStatistics(term, docFreq, totalTermFreq);
}
@Override
@ -347,8 +346,8 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable, SolrI
return localCollectionStatistics(field);
}
public TermStatistics localTermStatistics(Term term, TermStates context) throws IOException {
return super.termStatistics(term, context);
public TermStatistics localTermStatistics(Term term, int docFreq, long totalTermFreq) throws IOException {
return super.termStatistics(term, docFreq, totalTermFreq);
}
public CollectionStatistics localCollectionStatistics(String field) throws IOException {

Просмотреть файл

@ -28,7 +28,6 @@ import java.util.Set;
import com.google.common.collect.Lists;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@ -170,11 +169,8 @@ public class ExactStatsCache extends StatsCache {
}
@Override
public TermStatistics termStatistics(Term term, TermStates context) throws IOException {
TermStatistics ts = super.termStatistics(term, context);
if (ts == null) {
return null;
}
public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq) throws IOException {
TermStatistics ts = super.termStatistics(term, docFreq, totalTermFreq);
terms.add(term);
statsMap.put(term.toString(), new TermStats(term.field(), ts));
return ts;
@ -328,7 +324,7 @@ public class ExactStatsCache extends StatsCache {
this.colStatsCache = colStatsCache;
}
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, TermStates context)
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq)
throws IOException {
TermStats termStats = termStatsCache.get(term.toString());
// TermStats == null is also true if term has no docFreq anyway,
@ -336,7 +332,7 @@ public class ExactStatsCache extends StatsCache {
// Not sure we need a warning here
if (termStats == null) {
log.debug("Missing global termStats info for term={}, using local stats", term);
return localSearcher.localTermStatistics(term, context);
return localSearcher.localTermStatistics(term, docFreq, totalTermFreq);
} else {
return termStats.toTermStatistics();
}

Просмотреть файл

@ -24,7 +24,6 @@ import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.solr.core.PluginInfo;
@ -132,12 +131,12 @@ public class LRUStatsCache extends ExactStatsCache {
this.colStatsCache = colStatsCache;
}
@Override
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, TermStates context)
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq)
throws IOException {
TermStats termStats = termStatsCache.get(term.toString());
if (termStats == null) {
log.debug("## Missing global termStats info: {}, using local", term);
return localSearcher.localTermStatistics(term, context);
return localSearcher.localTermStatistics(term, docFreq, totalTermFreq);
} else {
return termStats.toTermStatistics();
}

Просмотреть файл

@ -19,7 +19,6 @@ package org.apache.solr.search.stats;
import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.solr.search.SolrIndexSearcher;
@ -34,9 +33,9 @@ public final class LocalStatsSource extends StatsSource {
}
@Override
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, TermStates context)
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq)
throws IOException {
return localSearcher.localTermStatistics(term, context);
return localSearcher.localTermStatistics(term, docFreq, totalTermFreq);
}
@Override

Просмотреть файл

@ -19,7 +19,6 @@ package org.apache.solr.search.stats;
import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermStatistics;
@ -34,7 +33,7 @@ import org.apache.solr.search.SolrIndexSearcher;
*/
public abstract class StatsSource {
public abstract TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, TermStates context)
public abstract TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq)
throws IOException;
public abstract CollectionStatistics collectionStatistics(SolrIndexSearcher localSearcher, String field)