想象:
假设一个文本有以下几部分组成:
title: "Hadoop: The Definitive Guide"
content: "Hadoop got its start in Nutch"
unbreakable: "united kingdom" (先不要理会unbreakable的意义)
ignored: "Nonsense" (注释同上)
如果按下列语句来建索引,索引大概会是什么样?
Document doc = new Document();
doc.add(new Field("ignored", passage.ignored, Field.Store.YES, Field.Index.NO));
doc.add(new Field("content", passage.content, Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("unbreakable", passage.unbreakable, Field.Store.YES, Index.NOT_ANALYZED));
doc.add(new Field("title", passage.title, Field.Store.YES, Field.Index.ANALYZED));
indexWriter.addDocument(doc);
我觉得,索引的逻辑结构可以想象为:
Token | Field | Stored Text | 备注 |
Hadoop | content | ||
got | content | ||
its | content | ||
start | content | ||
in | content | ||
Nutch | content | ||
Hadoop | title | Hadoop: The Definitive Guide | |
The | title | Hadoop: The Definitive Guide | |
Definitive | title | Hadoop: The Definitive Guide | |
Guide | title | Hadoop: The Definitive Guide | |
united kingdom | unbreakable | united kingdom | "united kingdom"整体作为一个token |
Nonsense | ignored | Nonsense | "Nonsense"这个Token未被收录 |
验证代码:
package player.kent.chen.temp.lucene.indexcomponent;

import static player.kent.chen.temp.lucene.indexcomponent.MiscIndexCreator.MyPassage.SAMPLE_PASSAGE;

import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a tiny Lucene 3.x index for one sample passage, giving each field a
 * different combination of {@code Field.Store} / {@code Field.Index} options so
 * the resulting index structure can be inspected.
 */
public class MiscIndexCreator {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene/indexOptions";
        File indexDir = new File(rootDir, "index");
        FileUtils.deleteDirectory(indexDir); // always start from a clean index directory
        indexDir.mkdirs();
        doIndex(indexDir);
    }

    /**
     * Indexes {@code SAMPLE_PASSAGE} into the given directory.
     *
     * @param id filesystem directory that will hold the index
     * @throws IOException on any index-writing failure
     */
    private static void doIndex(File id) throws IOException {
        Directory indexDir = FSDirectory.open(id);
        IndexWriter writer = new IndexWriter(indexDir, new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        System.out.println("Indexing ... \n" + SAMPLE_PASSAGE.title);
        Document doc = new Document();
        // stored but NOT indexed: retrievable from matched docs, never searchable
        doc.add(new Field("ignored", SAMPLE_PASSAGE.ignored, Field.Store.YES, Field.Index.NO));
        // indexed (analyzed) but NOT stored: searchable per token, not retrievable
        doc.add(new Field("content", SAMPLE_PASSAGE.content, Field.Store.NO, Field.Index.ANALYZED));
        // indexed WITHOUT analysis: the whole value becomes a single token
        doc.add(new Field("unbreakable", SAMPLE_PASSAGE.unbreakable, Field.Store.YES, Index.NOT_ANALYZED));
        // stored AND analyzed: searchable per token, retrievable as a whole
        doc.add(new Field("title", SAMPLE_PASSAGE.title, Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();
    }

    /** Immutable-ish sample data holder; built only through {@link #newInstance}. */
    public static final class MyPassage {
        String title;
        String content;
        String unbreakable;
        String ignored;

        public static final MyPassage SAMPLE_PASSAGE = MyPassage.newInstance(
                "Hadoop: The Definitive Guide",
                "Hadoop got its start in Nutch",
                "united kingdom",
                "Nonsense");

        private MyPassage() {
        }

        public static final MyPassage newInstance(String title, String content, String unbreakable,
                String ignored) {
            MiscIndexCreator.MyPassage instance = new MiscIndexCreator.MyPassage();
            instance.title = title;
            instance.content = content;
            instance.unbreakable = unbreakable;
            instance.ignored = ignored;
            return instance;
        }
    }
}
package player.kent.chen.temp.lucene.indexcomponent;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Queries the index built by {@link MiscIndexCreator} to demonstrate how the
 * Store/Index options chosen at indexing time affect searchability and
 * retrievability of each field.
 */
public class MiscIndexInspector {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene/indexOptions";
        Directory indexDir = FSDirectory.open(new File(rootDir, "index"));
        IndexSearcher indexSearcher = new IndexSearcher(indexDir);

        // Search on the "ignored" field
        TopDocs searchOnIgnored = doSearch(indexSearcher, "ignored", "nonsense");
        println("Num of result matching 'ignored:nonsense' is :" + searchOnIgnored.totalHits);
        // expect 0: the "ignored" field was not indexed

        // Search on the "content" field
        TopDocs searchOnContent = doSearch(indexSearcher, "content", "hadoop");
        println("Result matching 'content:hadoop' is :");
        Document docOfContentMatching = indexSearcher.doc(searchOnContent.scoreDocs[0].doc);
        println("  content : " + docOfContentMatching.get("content"));
        // expect null: "content" was indexed but not stored
        println("  ignored : " + docOfContentMatching.get("ignored"));
        // expect non-null: "ignored" was stored even though it was not indexed

        // Search on the "unbreakable" field
        TopDocs searchOnUnbreakable = doSearch(indexSearcher, "unbreakable", "united kingdom");
        println("Num of result matching 'unbreakable:united kingdom' is :" + searchOnUnbreakable.totalHits);
        // expect 1: the whole value is a single token
        TopDocs searchOnUnbreakable2 = doSearch(indexSearcher, "unbreakable", "kingdom");
        println("Num of result matching 'unbreakable:kingdom' is :" + searchOnUnbreakable2.totalHits);
        // expect 0: "united kingdom" was not analyzed, so "kingdom" alone is not a token

        // Search on the "title" field
        TopDocs searchOnTitle = doSearch(indexSearcher, "title", "hadoop");
        println("Result matching 'title:hadoop' is :");
        Document docOfTitleMatching = indexSearcher.doc(searchOnTitle.scoreDocs[0].doc);
        println("  title : " + docOfTitleMatching.get("title"));
        // expect the full "Hadoop: The Definitive Guide" title, since "title" was stored

        indexSearcher.close();
    }

    private static void println(String o) {
        System.out.println(o);
    }

    /** Runs a single-term query against the given field, returning at most 2 hits. */
    private static TopDocs doSearch(IndexSearcher indexSearcher, String field, String keyword)
            throws IOException, ParseException {
        return indexSearcher.search(buildQp(field, keyword), 2);
    }

    /** Builds a TermQuery (exact token match; the keyword is NOT analyzed). */
    private static Query buildQp(String field, String keyword) throws ParseException {
        Query query = new TermQuery(new Term(field, keyword));
        return query;
    }
}