package player.kent.chen.temp.lucene.synonymon;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * TokenFilter that injects synonyms for each incoming token.
 *
 * <p>For every token read from the wrapped stream, the aliases returned by
 * {@link MySynonymRepository#getAliasGroup(String)} are queued and then emitted
 * on subsequent {@link #incrementToken()} calls with a position increment of 0,
 * so each synonym is stacked on the same position as the original token.
 */
public class MySynonymFilter extends TokenFilter {

    private final TermAttribute termAttr;
    private final PositionIncrementAttribute piAttr;

    // Synonyms still to be emitted for the token whose state is in attrsState.
    private final Queue<String> synonyms = new LinkedList<String>();

    // Attribute state captured when the original token was read; restored for
    // each synonym so offsets/type etc. are copied from the original token.
    private AttributeSource.State attrsState;

    protected MySynonymFilter(TokenStream input) {
        super(input);
        this.piAttr = addAttribute(PositionIncrementAttribute.class);
        this.termAttr = addAttribute(TermAttribute.class);
    }

    @Override
    public boolean incrementToken() throws IOException {
        String syn = synonyms.poll();
        if (syn == null) {
            // No synonyms left over from the previous call: advance the
            // underlying stream normally.
            if (!input.incrementToken()) {
                return false; // end of input
            }
            String term = termAttr.term();
            // BUGFIX: the repository's alias group contains the term itself.
            // The original code queued the whole group, which re-emitted the
            // current token a second time as its own "synonym". Skip it.
            List<String> synGroup = MySynonymRepository.getAliasGroup(term);
            for (String alias : synGroup) {
                if (!alias.equals(term)) {
                    synonyms.add(alias);
                }
            }
            // Remember the current attribute state so the queued synonyms can
            // reuse it on later calls.
            attrsState = captureState();
            return true;
        }
        // Emit one pending synonym: restore the captured state, then override
        // the term text and stack it on the original token's position.
        restoreState(attrsState);
        termAttr.setTermBuffer(syn);
        piAttr.setPositionIncrement(0);
        return true;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        // Drop any leftover synonyms so a reused stream does not emit stale
        // tokens from the previous document/field.
        synonyms.clear();
        attrsState = null;
    }
}
package player.kent.chen.temp.lucene.synonymon;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * Analyzer that decorates a {@link StandardAnalyzer}'s token stream with
 * {@link MySynonymFilter}, so synonyms are injected during analysis.
 */
public class MySynonymAnalyzer extends Analyzer {

    // Delegate that performs the base tokenization.
    private final StandardAnalyzer standardAnalyzer;

    public MySynonymAnalyzer(StandardAnalyzer standardAnalyzer) {
        this.standardAnalyzer = standardAnalyzer;
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream base = standardAnalyzer.tokenStream(fieldName, reader);
        return new MySynonymFilter(base);
    }
}
package player.kent.chen.temp.lucene.synonymon;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * In-memory synonym lookup: a word maps to the whole group of words it belongs
 * to (the group includes the word itself).
 */
public class MySynonymRepository {

    // Hard-coded synonym groups; each array is one equivalence class.
    private static final List<String[]> wordGroups = new ArrayList<String[]>();

    static {
        wordGroups.add(new String[] { "hello", "hi", "aloha", "nihao" });
        wordGroups.add(new String[] { "goodbye", "bye", "farewell", "ciao" });
    }

    // Utility class: not meant to be instantiated.
    private MySynonymRepository() {
    }

    /**
     * Returns the full synonym group containing {@code word} (including the
     * word itself), or an empty list if the word belongs to no group.
     *
     * BUGFIX: the original file used List/Arrays/Collections without importing
     * them (only java.util.ArrayList was imported), so it did not compile.
     */
    public static List<String> getAliasGroup(String word) {
        for (String[] wordGroup : wordGroups) {
            List<String> wordGroupList = Arrays.asList(wordGroup);
            if (wordGroupList.contains(word)) {
                return wordGroupList;
            }
        }
        return Collections.emptyList();
    }
}
package player.kent.chen.temp.lucene.synonymon;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a fresh index over the files in {@code content/}, analyzing with
 * {@link MySynonymAnalyzer} so synonyms are expanded at index time.
 *
 * BUGFIX: the original file imported only java.io.File while using FileUtils,
 * IOException, FileReader and many Lucene classes, so it did not compile.
 */
public class MySynonymIndexer {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene";
        File contentDir = new File(rootDir, "content");
        File indexDir = new File(rootDir, "index");

        // Rebuild the index from scratch each run.
        FileUtils.deleteDirectory(indexDir);
        indexDir.mkdirs();

        long begin = now();
        doIndex(contentDir, indexDir);
        System.out.println("Done in miliseconds of : " + (now() - begin));
    }

    /**
     * Indexes every file under {@code cd} into the index directory {@code id}.
     * The "contents" field is analyzed (and synonym-expanded); "filepath" is
     * stored so search results can point back at the file.
     */
    private static void doIndex(File cd, File id) throws IOException {
        Directory indexDir = FSDirectory.open(id);
        IndexWriter writer = new IndexWriter(indexDir, new MySynonymAnalyzer(new StandardAnalyzer(
                Version.LUCENE_30)), true, IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            File[] files = cd.listFiles();
            for (File file : files) {
                System.out.println("Indexing ... " + file.getAbsolutePath());
                Document doc = new Document();
                doc.add(new Field("contents", new FileReader(file)));
                doc.add(new Field("filepath", file.getAbsolutePath(), Field.Store.YES,
                        Field.Index.ANALYZED));
                writer.addDocument(doc);
            }
            // The original called writer.numDocs() and discarded the result;
            // report it instead so the call has a purpose.
            System.out.println("Indexed docs: " + writer.numDocs());
        } finally {
            // Close the writer even if indexing fails, to release the lock.
            writer.close();
        }
    }

    private static long now() {
        return System.currentTimeMillis();
    }
}
package player.kent.chen.temp.lucene.synonymon;

import java.io.File;
import java.text.MessageFormat;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Searches the synonym-expanded index with a plain StandardAnalyzer at query
 * time; because synonyms were stacked at index time, "ciao" also matches
 * documents containing "bye".
 */
public class MySynonymSearcher {

    public static void main(String[] args) throws Exception {
        String rootDir = "/home/kent/diskD/home-kent-dev/workspace/kent-temp/data/lucene";
        File id = new File(rootDir, "index");
        String keyword = "ciao";

        Directory indexDir = FSDirectory.open(id);
        IndexSearcher is = new IndexSearcher(indexDir);

        // Parse the keyword against the analyzed "contents" field.
        QueryParser qp = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(
                Version.LUCENE_30));
        Query query = qp.parse(keyword);

        long begin = now();
        TopDocs hits = is.search(query, 10);
        long elapsed = now() - begin;

        System.out.println(MessageFormat.format("Found {0} matches in {1} milliseconds",
                hits.totalHits, elapsed));
        System.out.println("They are:");

        ScoreDoc[] scoreDocs = hits.scoreDocs;
        for (ScoreDoc scoreDoc : scoreDocs) {
            Document doc = is.doc(scoreDoc.doc);
            String file = doc.get("filepath");
            // Can hit documents containing "bye" (synonym of the keyword).
            String grepCmd = MessageFormat.format("cat {0}", file);
            System.out.println("Please do: " + grepCmd);
        }
        is.close();
    }

    private static long now() {
        return System.currentTimeMillis();
    }
}