package synonym;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

public class MMSegAnalyzer extends Analyzer {

    protected Dictionary dic;

    /**
     * @see Dictionary#getInstance()
     */
    public MMSegAnalyzer() {
        dic = Dictionary.getInstance();
    }

    /**
     * @param path dictionary path
     * @see Dictionary#getInstance(String)
     */
    public MMSegAnalyzer(String path) {
        dic = Dictionary.getInstance(path);
    }

    /**
     * @param path dictionary directory
     * @see Dictionary#getInstance(File)
     */
    public MMSegAnalyzer(File path) {
        dic = Dictionary.getInstance(path);
    }

    public MMSegAnalyzer(Dictionary dic) {
        super();
        this.dic = dic;
    }

    protected Seg newSeg() {
        return new MaxWordSeg(dic);
    }

    public Dictionary getDict() {
        return dic;
    }

    // Custom stop words
    private static final String[] stopWords = {"and", "of", "the", "to", "is", "their", "can", "all"};

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // Create the tokenizer
        //Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_47, reader);
        Tokenizer tokenizer = new MMSegTokenizer(newSeg(), reader);
        // Build the filter chain: lowercase -> synonyms -> stop words -> Porter stemming
        TokenFilter lowerCaseFilter = new LowerCaseFilter(Version.LUCENE_47, tokenizer);
        TokenFilter synonymFilter = new SynonymFilter(lowerCaseFilter, getSynonymMap(), true);
        TokenFilter stopFilter = new StopFilter(Version.LUCENE_47, synonymFilter, buildCharArraySetFromArray(stopWords));
        TokenFilter stemFilter = new PorterStemFilter(stopFilter);
        // Wrapper around the TokenStream chain; in Lucene 2.2 this was a plain TokenStream
        return new TokenStreamComponents(tokenizer, stemFilter);
    }

    // Convert the array into a CharArraySet that Lucene understands (similar to java.util.Set)
    private CharArraySet buildCharArraySetFromArray(String[] array) {
        CharArraySet set = new CharArraySet(Version.LUCENE_47, array.length, true);
        for (String value : array) {
            set.add(value);
        }
        return set;
    }

    // Build the synonym map
    private SynonymMap getSynonymMap() {
        String base1 = "fast";
        String syn1 = "rapid";
        String base2 = "slow";
        String syn2 = "sluggish";
        String base3 = "中国";
        String syn3 = "天朝";
        SynonymMap.Builder sb = new SynonymMap.Builder(true); // true = deduplicate entries
        // add(input, output, includeOrig): emit the synonym at the same position, keeping the original
        sb.add(new CharsRef(base1), new CharsRef(syn1), true);
        sb.add(new CharsRef(base2), new CharsRef(syn2), true);
        sb.add(new CharsRef(base3), new CharsRef(syn3), true);
        // add the reverse mapping so 中国/天朝 are interchangeable in both directions
        sb.add(new CharsRef(syn3), new CharsRef(base3), true);
        SynonymMap smap = null;
        try {
            smap = sb.build();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return smap;
    }

    // Test method
    public static void testPorterStemmingAnalyzer() throws IOException {
        Analyzer analyzer = new MMSegAnalyzer();
        String text = "Collective slow intelligence and Web2.0, fast and rapid2 天朝";
        Reader reader = new StringReader(text);
        TokenStream ts = null;
        try {
            ts = analyzer.tokenStream(null, reader);
            CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // required in Lucene 4.x before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(ta.toString());
            }
            ts.end();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (ts != null) {
                ts.close();
            }
        }
    }

    public static void main(String[] args) throws IOException {
        testPorterStemmingAnalyzer();
    }
}
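For context, here is a minimal, hypothetical sketch of how this analyzer could be plugged into an index so that the synonym expansion above is baked into the inverted index at index time. The class name IndexWithMMSegAnalyzer, the in-memory RAMDirectory, and the "content" field are assumptions for illustration, not part of the original code.

package synonym;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class IndexWithMMSegAnalyzer {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory(); // in-memory index, for the demo only
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, new MMSegAnalyzer());
        IndexWriter writer = new IndexWriter(dir, config);
        Document doc = new Document();
        // "content" is a made-up field name; MMSegAnalyzer runs over its value at index time
        doc.add(new TextField("content", "fast 中国", Field.Store.YES));
        writer.addDocument(doc);
        writer.close();
        // Because the SynonymFilter emitted "rapid" and "天朝" alongside the originals,
        // a search for either synonym should also hit this document.
    }
}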
package luncen;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.apache.lucene.util.Version;

/**
 * @author hankcs
 */
public class TestSynonyms {

    private static void displayTokens(TokenStream ts) throws IOException {
        CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String token = termAttr.toString();
            System.out.print(offsetAttribute.startOffset() + "-" + offsetAttribute.endOffset() + "[" + token + "] ");
        }
        System.out.println();
        ts.end();
        ts.close();
    }

    public static void main(String[] args) throws Exception {
        String testInput = "其实 hankcs 似 好人 luncen";
        Version ver = Version.LUCENE_47;
        Map<String, String> filterArgs = new HashMap<String, String>();
        filterArgs.put("luceneMatchVersion", ver.toString());
        filterArgs.put("synonyms", "f:\\synonym.txt");
        filterArgs.put("expand", "true");
        SynonymFilterFactory factory = new SynonymFilterFactory(filterArgs);
        // the factory loads the synonyms file through a ResourceLoader
        factory.inform(new FilesystemResourceLoader());
        WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer(ver);
        TokenStream ts = factory.create(whitespaceAnalyzer.tokenStream("someField", testInput));
        displayTokens(ts);
    }
}
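The f:\synonym.txt file loaded above is not shown. By default SynonymFilterFactory parses the Solr synonyms format, so a hypothetical file matching the test input could look like the sketch below; the specific entries are illustrative, not the original file's contents.

# Solr synonym format: comma-separated terms form an equivalence group;
# with expand=true, each term in the group is expanded to all the others.
其实, 事实上
好人, 善人
luncen, lucene
# "=>" rewrites the left-hand side to the right-hand side only (no expansion back).
似 => 像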