博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
MMSegAnalyzer 自定义 同义词分词器
阅读量:6693 次
发布时间:2019-06-25

本文共 5753 字,大约阅读时间需要 19 分钟。

hot3.png

MMSegAnalyzer 自定义 同义词分词器(博客分类:搜索引擎、爬虫、Java)
package synonym;import java.io.File;import java.io.IOException;import java.io.Reader;import java.io.StringReader;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenFilter;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.Tokenizer;import org.apache.lucene.analysis.core.LowerCaseFilter;import org.apache.lucene.analysis.core.StopFilter;import org.apache.lucene.analysis.en.PorterStemFilter;import org.apache.lucene.analysis.synonym.SynonymFilter;import org.apache.lucene.analysis.synonym.SynonymMap;import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;import org.apache.lucene.analysis.util.CharArraySet;import org.apache.lucene.util.CharsRef;import org.apache.lucene.util.Version;import com.chenlb.mmseg4j.Dictionary;import com.chenlb.mmseg4j.MaxWordSeg;import com.chenlb.mmseg4j.Seg;import com.chenlb.mmseg4j.analysis.MMSegTokenizer;public class MMSegAnalyzer extends Analyzer {		protected Dictionary dic;	/**	 * @see Dictionary#getInstance()	 */	public MMSegAnalyzer() {		dic = Dictionary.getInstance();	}	/**	 * @param path 词库路径	 * @see Dictionary#getInstance(String)	 */	public MMSegAnalyzer(String path) {		dic = Dictionary.getInstance(path);	}	/**	 * @param path 词库目录	 * @see Dictionary#getInstance(File)	 */	public MMSegAnalyzer(File path) {		dic = Dictionary.getInstance(path);	}	public MMSegAnalyzer(Dictionary dic) {		super();		this.dic = dic;	}	protected Seg newSeg() {		return new MaxWordSeg(dic);	}	public Dictionary getDict() {		return dic;	}	// 自定义停用词	private static final String[] stopWords = {"and", "of", "the", "to", "is", "their", "can", "all"};	@Override	protected TokenStreamComponents createComponents(String fieldName, Reader reader) {		// 创建一个分词器		//Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_47, reader);				Tokenizer tokenizer = new MMSegTokenizer(newSeg(), reader);						// 创建一系列的分词过滤器		TokenFilter lowerCaseFilter = new LowerCaseFilter(Version.LUCENE_47, tokenizer);		
TokenFilter synonymFilter = new SynonymFilter(lowerCaseFilter, getSynonymMap(), true);		TokenFilter stopFilter = new StopFilter(Version.LUCENE_47, synonymFilter, buildCharArraySetFromArry(stopWords));		TokenFilter stemFilter = new PorterStemFilter(stopFilter);				// TokenStream的包装类 在2.2之中 是TokenStream		return new TokenStreamComponents(tokenizer, stemFilter);	}		// 将数组转成lucene可识别的CharArraySet对象 CharArraySet类似java.util.set	private CharArraySet buildCharArraySetFromArry(String[] array) {		CharArraySet set = new CharArraySet(Version.LUCENE_47, array.length, true);		for(String value : array) {			set.add(value);		}		return set;	}		// 创建一个同义词表	private SynonymMap getSynonymMap() {		String base1 = "fast";		String syn1 = "rapid";				String base2 = "slow";		String syn2 = "sluggish";				String base3 = "中国";		String syn3 = "天朝";				SynonymMap.Builder sb = new SynonymMap.Builder(true);		sb.add(new CharsRef(base1), new CharsRef(syn1), true);		sb.add(new CharsRef(base2), new CharsRef(syn2), true);		sb.add(new CharsRef(base3), new CharsRef(syn3), true);				sb.add(new CharsRef(syn3), new CharsRef(base3), true);				SynonymMap smap = null;		try {			smap = sb.build();		} catch (IOException e) {			e.printStackTrace();		}		return smap;	}		// 测试方法	public static void testPorterStemmingAnalyzer() throws IOException {		Analyzer analyzer = new MMSegAnalyzer();		String text = "Collective slow intelligence and Web2.0, fast and rapid2  天朝";		Reader reader = new StringReader(text);		TokenStream ts = null;		try {			ts = analyzer.tokenStream(null, reader);			//ts.reset();			while(ts.incrementToken()) {				CharTermAttribute ta = ts.getAttribute(CharTermAttribute.class);  				System.out.println(ta.toString());			}		} catch (IOException e) {			e.printStackTrace();		} 			}		public static void main(String[] args) throws IOException {		testPorterStemmingAnalyzer();	}}

 

package luncen; import java.io.IOException;import java.util.HashMap;import java.util.Map;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.core.WhitespaceAnalyzer;import org.apache.lucene.analysis.synonym.SynonymFilterFactory;import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;import org.apache.lucene.analysis.util.FilesystemResourceLoader;import org.apache.lucene.util.Version; /** * @author hankcs */public class TestSynonyms{    private static void displayTokens(TokenStream ts) throws IOException    {        CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);        OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);        ts.reset();        while (ts.incrementToken())        {            String token = termAttr.toString();            System.out.print(offsetAttribute.startOffset() + "-" + offsetAttribute.endOffset() + "[" + token + "] ");        }        System.out.println();        ts.end();        ts.close();    }     public static void main(String[] args) throws Exception    {        String testInput = "其实 hankcs 似 好人 luncen";        Version ver = Version.LUCENE_47;        Map
filterArgs = new HashMap
(); filterArgs.put("luceneMatchVersion", ver.toString()); filterArgs.put("synonyms", "f:\\synonym.txt"); filterArgs.put("expand", "true"); SynonymFilterFactory factory = new SynonymFilterFactory(filterArgs); factory.inform(new FilesystemResourceLoader()); WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer(ver); TokenStream ts = factory.create(whitespaceAnalyzer.tokenStream("someField", testInput)); displayTokens(ts); }}

 

转载于:https://my.oschina.net/xiaominmin/blog/1597106

你可能感兴趣的文章
史上最清晰易懂的babel配置解析
查看>>
spring boot2.x 整合Mybatis
查看>>
我的前端那些事 --less进阶
查看>>
面试总结1
查看>>
Python pymysql数据库之建库建表、增删改查
查看>>
webpack打包多页面的方式
查看>>
如何学习游戏开发?游戏开发如何入门?Unity3D好学吗?怎么学习,学习路线是什么?...
查看>>
2018/12/29
查看>>
postgresql中的查询(query)二
查看>>
yii2控制台执行
查看>>
height()内容自适应,超出显示滚动条
查看>>
MySql模糊查询like通配符
查看>>
JDBC连接数据库步骤
查看>>
Shell脚本监控服务器pts登录情况记录为日志并邮件通知【CentOS 6.5】
查看>>
[leetcode] Jump Game II
查看>>
iOS开发技巧(系列十四:iOS7导航栏和iOS6的区别)
查看>>
Js针对window窗体大小设置
查看>>
MySQL 使用SELECT ... FOR UPDATE
查看>>
MYSQL级联查询,包括向上向下的级联
查看>>
Apache优化:修改最大并发连接数
查看>>