下面是用于 shingle 分词和排序的代码。不过我认为,与其强制按人口排序,不如在自定义评分中把城市规模考虑进去,这样会更有意义。另请注意,这里使用了 FieldCache,就内存占用而言,这可能不是最佳方案。
/**
 * Demonstrates searching a small in-memory index of city names that was indexed with a
 * shingle-producing analyzer, with hits ordered by descending population through a custom
 * {@link FieldComparatorSource}.
 *
 * <p>The test only prints the results; it makes no assertions.
 */
public class ShingleFilterTests {
    /** Maximum number of hits requested for each query in {@link #printSearch(String)}. */
    private static final int MAX_HITS = 4;

    /** Stop words removed by the analyzers built in {@link #createAnalyzer(int)}. */
    private static final ImmutableSet<String> STOP_WORDS = ImmutableSet.of("de", "la", "en");

    private Directory dir;
    private Analyzer analyzer;
    private IndexSearcher searcher;
    private IndexReader reader;
    private QueryParser qp;
    private Sort sort;

    /**
     * Builds an analyzer that splits on whitespace, drops the stop words and, when requested,
     * wraps the stream in a {@link ShingleFilter}.
     *
     * @param shingles value passed to the {@link ShingleFilter} constructor; when it is
     *                 {@code 0} or negative no shingle filter is added
     * @return a new analyzer instance
     */
    public static Analyzer createAnalyzer(final int shingles) {
        return new Analyzer() {
            @Override
            public TokenStream tokenStream(String fieldName, Reader reader) {
                TokenStream stream = new WhitespaceTokenizer(reader);
                stream = new StopFilter(false, stream, STOP_WORDS);
                if (shingles > 0) {
                    stream = new ShingleFilter(stream, shingles);
                }
                return stream;
            }
        };
    }

    /**
     * Comparator source that orders hits by the integer value of the sort field, largest first.
     *
     * <p>The {@code sortPos} and {@code reversed} arguments are not used by this implementation.
     */
    public class PopulationComparatorSource extends FieldComparatorSource {
        @Override
        public FieldComparator newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException {
            return new PopulationComparator(fieldname, numHits);
        }

        /**
         * Comparator over per-document integers loaded through {@link FieldCache}.
         * A larger value sorts before a smaller one.
         */
        private class PopulationComparator extends FieldComparator {
            /** Name of the field whose values are loaded in {@link #setNextReader}. */
            private final String fieldName;
            /** Values copied into the comparator slots; primitive to avoid boxing and null slots. */
            private final int[] values;
            /** Per-document values for the reader most recently passed to {@link #setNextReader}. */
            private int[] populations;
            /** Value of the slot most recently passed to {@link #setBottom(int)}. */
            private int bottom;

            public PopulationComparator(String fieldname, int numHits) {
                this.fieldName = fieldname;
                this.values = new int[numHits];
            }

            /** Returns -1 when {@code left} is larger, 1 when it is smaller, 0 when equal. */
            private int descending(int left, int right) {
                if (left > right) {
                    return -1;
                }
                if (left < right) {
                    return 1;
                }
                return 0;
            }

            @Override
            public int compare(int slot1, int slot2) {
                return descending(values[slot1], values[slot2]);
            }

            @Override
            public void setBottom(int slot) {
                bottom = values[slot];
            }

            @Override
            public int compareBottom(int doc) throws IOException {
                return descending(bottom, populations[doc]);
            }

            @Override
            public void copy(int slot, int doc) throws IOException {
                values[slot] = populations[doc];
            }

            @Override
            public void setNextReader(IndexReader reader, int docBase) throws IOException {
                /* XXX uses field cache */
                // Load the field this comparator was created for rather than a hard-coded name.
                populations = FieldCache.DEFAULT.getInts(reader, fieldName);
            }

            @Override
            public Comparable value(int slot) {
                return Integer.valueOf(values[slot]);
            }
        }
    }

    /**
     * Indexes four cities with their populations into a {@link RAMDirectory} and prepares the
     * query parser, sort and searcher used by the test.
     *
     * @throws Exception if indexing or opening the searcher fails
     */
    @Before
    public void setUp() throws Exception {
        dir = new RAMDirectory();
        analyzer = createAnalyzer(3);

        ImmutableList<String> cities = ImmutableList.of("Bosc de Planavilla", "Planavilla", "Bosc de la Planassa",
                "Bosc de Plana en Blanca");
        ImmutableList<Integer> populations = ImmutableList.of(5000, 20000, 1000, 100000);

        IndexWriter writer = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            for (int id = 0; id < cities.size(); id++) {
                Document doc = new Document();
                doc.add(new Field("id", String.valueOf(id), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("city", cities.get(id), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("population", String.valueOf(populations.get(id)),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));
                writer.addDocument(doc);
            }
        } finally {
            // Close the writer even when adding a document fails.
            writer.close();
        }

        // Queries are parsed with an analyzer that does not add a shingle filter.
        qp = new QueryParser(Version.LUCENE_30, "city", createAnalyzer(0));
        sort = new Sort(new SortField("population", new PopulationComparatorSource()));
        searcher = new IndexSearcher(dir);
        searcher.setDefaultFieldSortScoring(true, true);
        reader = searcher.getIndexReader();
    }

    /**
     * Closes the searcher and the directory opened in {@link #setUp()}.
     * Each is skipped when {@code setUp} failed before creating it.
     *
     * @throws Exception if closing a resource fails
     */
    @After
    public void tearDown() throws Exception {
        try {
            if (searcher != null) {
                searcher.close();
            }
        } finally {
            if (dir != null) {
                dir.close();
            }
        }
    }

    /**
     * Runs three sample queries and prints their results.
     *
     * @throws Exception if parsing or searching fails
     */
    @Test
    public void testShingleFilter() throws Exception {
        System.out.println("shingle filter");
        printSearch("city:\"Bosc de Planavilla\"");
        printSearch("city:Planavilla");
        printSearch("city:Bosc");
    }

    /**
     * Parses {@code query}, searches with the population sort and prints the parsed query,
     * the total hit count and one line per returned hit.
     *
     * @param query query string in the syntax accepted by the configured {@link QueryParser}
     * @throws ParseException if the query string cannot be parsed
     * @throws IOException if the search or document retrieval fails
     */
    private void printSearch(String query) throws ParseException, IOException {
        Query q = qp.parse(query);
        System.out.println("query " + q);
        TopDocs hits = searcher.search(q, null, MAX_HITS, sort);
        System.out.println("results " + hits.totalHits);
        int rank = 1;
        for (ScoreDoc hit : hits.scoreDocs) {
            Document doc = reader.document(hit.doc);
            System.out.println(rank++ + ". " + hit + " \"" + doc.get("city") + "\" population: " + doc.get("population"));
        }
        System.out.println();
    }
}
这得出以下结果:
query city:"Bosc Planavilla"
results 1
1. doc=0 score=1.143841[5000] "Bosc de Planavilla" population: 5000
query city:Planavilla
results 2
1. doc=1 score=1.287682[20000] "Planavilla" population: 20000
2. doc=0 score=0.643841[5000] "Bosc de Planavilla" population: 5000
query city:Bosc
results 3
1. doc=3 score=0.375[100000] "Bosc de Plana en Blanca" population: 100000
2. doc=0 score=0.5[5000] "Bosc de Planavilla" population: 5000
3. doc=2 score=0.5[1000] "Bosc de la Planassa" population: 1000
非常感谢!你的方法和我最终采用的方法类似,效果也不错。但它并不完美……在一个 300 万文档的索引上,响应时间最高会达到 1 秒(单机)。此外,我经常遇到一些奇怪的结果,比如搜索“印度酒吧巴黎”时,它会返回“Rich Bar Indian Reserve”,这显然不是我想要的 :)。如果可能的话,我会尝试根据地点(feature)类型,利用评分和索引时加权(boost)来改进这一点。感谢您的热心帮助! – azpublic 2012-01-13 02:31:56
300 万个文档要 1 秒,听起来太慢了。你是怎么排序的?可以用性能分析器(profiler)看看 CPU 时间都花在了哪里。我在一个 4000 万文档的索引上搜索,使用了复杂查询、分面和自定义排序,耗时大约 70 毫秒。 – wesen 2012-01-13 08:31:06