这里用的是 FastVectorHighlighter，可以高效地处理大文件。注意：被高亮的字段在建索引时必须保存带位置和偏移量的词向量（TermVector.WITH_POSITIONS_OFFSETS），否则无法生成高亮片段。
<!-- pom.xml -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-fast-vector-highlighter</artifactId>
    <version>3.0.0</version>
</dependency>
package player.kent.chen.temp.lucene.highlight; import java.io.File; import org.apache.commons.io.FileUtils; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.TermVector; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class MyHighlightIndexer { public static void main(String[] args) throws Exception { String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene-sanguo"; File contentDir = new File(rootDir, "content"); File indexDir = new File(rootDir, "index"); FileUtils.deleteDirectory(indexDir); indexDir.mkdirs(); Directory indexDir1 = FSDirectory.open(indexDir); IndexWriter writer = new IndexWriter(indexDir1, new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.UNLIMITED); File[] files = contentDir.listFiles(); for (File file : files) { System.out.println("Indexing ... " + file.getAbsolutePath()); String text = FileUtils.readFileToString(file, "UTF-8"); Document doc = new Document(); doc.add(new Field("contents", text, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS)); writer.addDocument(doc); } writer.numDocs(); writer.close(); } }
package player.kent.chen.temp.lucene.highlight; import java.io.File; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; import org.apache.lucene.search.vectorhighlight.FieldQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class MyHighlighter { public static void main(String[] args) throws Exception { String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene-sanguo"; File id = new File(rootDir, "index"); String keyword = "heed"; Directory indexDir = FSDirectory.open(id); IndexSearcher searcher = new IndexSearcher(indexDir); QueryParser qp = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer( Version.LUCENE_30)); Query query = qp.parse(keyword); TopDocs hits = searcher.search(query, 10); FastVectorHighlighter highlighter = new FastVectorHighlighter(true, true); FieldQuery fieldQuery = highlighter.getFieldQuery(query); for (ScoreDoc scoreDoc : hits.scoreDocs) { String snippet = highlighter.getBestFragment(fieldQuery, searcher.getIndexReader(), scoreDoc.doc, "contents", 200); System.out.println(snippet); } searcher.close(); } }