想象:
假设一个文本有以下几部分组成:
title: "Hadoop: The Definitive Guide"
content: "Hadoop got its start in Nutch"
unbreakable: "united kingdom" (先不要理会unbreakable的意义)
ignored: "Nonsense" (注释同上)
如果按下列语句来建索引,索引大概会是什么样?
Document doc = new Document();
doc.add(new Field("ignored", passage.ignored, Field.Store.YES, Field.Index.NO));
doc.add(new Field("content", passage.content, Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("unbreakable", passage.unbreakable, Field.Store.YES, Index.NOT_ANALYZED));
doc.add(new Field("title", passage.title, Field.Store.YES, Field.Index.ANALYZED));
indexWriter.addDocument(doc);
我觉得,索引的逻辑结构可以想象为:
Token | Field | Stored Text | 备注 |
Hadoop | content | ||
got | content | ||
its | content | ||
start | content | ||
in | content | ||
Nutch | content | ||
Hadoop | title | Hadoop: The Definitive Guide | |
The | title | Hadoop: The Definitive Guide | |
Definitive | title | Hadoop: The Definitive Guide | |
Guide | title | Hadoop: The Definitive Guide | |
united kingdom | unbreakable | united kingdom | "united kingdom"整体作为一个token |
Nonsense | ignored | Nonsense | "Nonsense"这个Token未被收录 |
验证代码:
package player.kent.chen.temp.lucene.indexcomponent;

import static player.kent.chen.temp.lucene.indexcomponent.MiscIndexCreator.MyPassage.SAMPLE_PASSAGE;

import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a tiny Lucene 3.x index for one sample passage, giving each field a
 * different combination of {@code Field.Store} / {@code Field.Index} options so
 * the resulting index structure can be inspected.
 */
public class MiscIndexCreator {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene/indexOptions";
        File indexDir = new File(rootDir, "index");
        FileUtils.deleteDirectory(indexDir); // always start from a clean index directory
        indexDir.mkdirs();
        doIndex(indexDir);
    }

    /**
     * Indexes {@code SAMPLE_PASSAGE} into the given directory.
     *
     * @param id filesystem directory that will hold the index
     * @throws IOException on any index-writing failure
     */
    private static void doIndex(File id) throws IOException {
        Directory indexDir = FSDirectory.open(id);
        IndexWriter writer = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        System.out.println("Indexing ... \n" + SAMPLE_PASSAGE.title);
        Document doc = new Document();
        // stored but NOT indexed: retrievable from matched docs, never searchable
        doc.add(new Field("ignored", SAMPLE_PASSAGE.ignored, Field.Store.YES, Field.Index.NO));
        // indexed (analyzed) but NOT stored: searchable per token, not retrievable
        doc.add(new Field("content", SAMPLE_PASSAGE.content, Field.Store.NO, Field.Index.ANALYZED));
        // indexed WITHOUT analysis: the whole value becomes a single token
        doc.add(new Field("unbreakable", SAMPLE_PASSAGE.unbreakable, Field.Store.YES, Index.NOT_ANALYZED));
        // stored AND analyzed: searchable per token, retrievable as a whole
        doc.add(new Field("title", SAMPLE_PASSAGE.title, Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();
    }

    /** Immutable-ish sample data holder; built only through {@link #newInstance}. */
    public static final class MyPassage {
        String title;
        String content;
        String unbreakable;
        String ignored;

        public static final MyPassage SAMPLE_PASSAGE = MyPassage.newInstance(
                "Hadoop: The Definitive Guide",
                "Hadoop got its start in Nutch",
                "united kingdom",
                "Nonsense");

        private MyPassage() {
        }

        public static final MyPassage newInstance(String title, String content, String unbreakable,
                String ignored) {
            MiscIndexCreator.MyPassage instance = new MiscIndexCreator.MyPassage();
            instance.title = title;
            instance.content = content;
            instance.unbreakable = unbreakable;
            instance.ignored = ignored;
            return instance;
        }
    }
}
package player.kent.chen.temp.lucene.indexcomponent;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Queries the index built by {@link MiscIndexCreator} to demonstrate how the
 * Store/Index options chosen at indexing time affect searchability and
 * retrievability of each field.
 */
public class MiscIndexInspector {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene/indexOptions";
        Directory indexDir = FSDirectory.open(new File(rootDir, "index"));
        IndexSearcher indexSearcher = new IndexSearcher(indexDir);

        // Search on the "ignored" field
        TopDocs searchOnIgnored = doSearch(indexSearcher, "ignored", "nonsense");
        println("Num of result matching 'ignored:nonsense' is :" + searchOnIgnored.totalHits);
        // expect 0: the "ignored" field was not indexed

        // Search on the "content" field
        TopDocs searchOnContent = doSearch(indexSearcher, "content", "hadoop");
        println("Result matching 'content:hadoop' is :");
        Document docOfContentMatching = indexSearcher.doc(searchOnContent.scoreDocs[0].doc);
        println("  content : " + docOfContentMatching.get("content"));
        // expect null: "content" was indexed but not stored
        println("  ignored : " + docOfContentMatching.get("ignored"));
        // expect non-null: "ignored" was stored even though it was not indexed

        // Search on the "unbreakable" field
        TopDocs searchOnUnbreakable = doSearch(indexSearcher, "unbreakable", "united kingdom");
        println("Num of result matching 'unbreakable:united kingdom' is :" + searchOnUnbreakable.totalHits);
        // expect 1: the whole value is a single token
        TopDocs searchOnUnbreakable2 = doSearch(indexSearcher, "unbreakable", "kingdom");
        println("Num of result matching 'unbreakable:kingdom' is :" + searchOnUnbreakable2.totalHits);
        // expect 0: "united kingdom" was not analyzed, so "kingdom" alone is not a token

        // Search on the "title" field
        TopDocs searchOnTitle = doSearch(indexSearcher, "title", "hadoop");
        println("Result matching 'title:hadoop' is :");
        Document docOfTitleMatching = indexSearcher.doc(searchOnTitle.scoreDocs[0].doc);
        println("  title : " + docOfTitleMatching.get("title"));
        // expect the full "Hadoop: The Definitive Guide" title, since "title" was stored

        indexSearcher.close();
    }

    private static void println(String o) {
        System.out.println(o);
    }

    /** Runs a single-term query against the given field, returning at most 2 hits. */
    private static TopDocs doSearch(IndexSearcher indexSearcher, String field, String keyword)
            throws IOException, ParseException {
        return indexSearcher.search(buildQp(field, keyword), 2);
    }

    /** Builds a TermQuery (exact token match; the keyword is NOT analyzed). */
    private static Query buildQp(String field, String keyword) throws ParseException {
        Query query = new TermQuery(new Term(field, keyword));
        return query;
    }
}