Lucene使用IK中文分词
环境
也可以通过Maven或Gradle构建工程测试和验证
对于Lucene的最新版本,需要找到IK Analyzer对应的兼容版。
传送门 Lucene 6.6.6 Documentation
IK中文分词配置
Lucene 6.x使用IK分词需要继承Analyzer、Tokenizer,重新编写逻辑配置,再使用。分别配置子类IKAnalyzer6x、IKTokenizer6x
IKTokenizer6x.java
package com.liuyu.lucene.ik;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
/**
* @author huangliuyu
* @description
* @date 2021-04-21
*/
/**
 * Lucene 6.x Tokenizer adapter for the IK Chinese segmenter.
 *
 * <p>Wraps {@link org.wltea.analyzer.core.IKSegmenter} and exposes its lexemes
 * through Lucene's attribute-based TokenStream API.
 */
public class IKTokenizer6x extends Tokenizer {
    // Underlying IK segmenter implementation.
    private IKSegmenter _IKImplement;
    // Token text attribute.
    private final CharTermAttribute termAtt;
    // Token offset attribute (start/end character positions).
    private final OffsetAttribute offsetAtt;
    // Token type attribute
    // (type constants are defined in org.wltea.analyzer.core.Lexeme).
    private final TypeAttribute typeAtt;
    // End offset of the last emitted token, reported by end().
    private int endPosition;

    /**
     * Creates the adapter.
     *
     * @param useSmart {@code true} for smart (coarse) segmentation,
     *                 {@code false} for fine-grained segmentation
     */
    public IKTokenizer6x(boolean useSmart) {
        super();
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        typeAtt = addAttribute(TypeAttribute.class);
        _IKImplement = new IKSegmenter(input, useSmart);
    }

    @Override
    public boolean incrementToken() throws IOException {
        // Clear all token attributes left over from the previous token.
        clearAttributes();
        Lexeme nextLexeme = _IKImplement.next();
        if (null != nextLexeme) {
            // Copy the Lexeme into the Lucene attributes.
            // Token text.
            termAtt.append(nextLexeme.getLexemeText());
            // Token length.
            termAtt.setLength(nextLexeme.getLength());
            // Token character offsets.
            offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
            // Remember the end position for end().
            endPosition = nextLexeme.getEndPosition();
            // BUGFIX: record the lexeme's *type* string, not its text.
            typeAtt.setType(nextLexeme.getLexemeTypeString());
            // true: another token is available.
            return true;
        }
        // false: no more tokens.
        return false;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        _IKImplement.reset(input);
    }

    @Override
    public void end() throws IOException {
        // TokenStream contract: call super.end() so downstream consumers
        // (e.g. position-increment bookkeeping) finish correctly.
        super.end();
        int finalOffset = correctOffset(this.endPosition);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}
IKAnalyzer6x.java
package com.liuyu.lucene.ik;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
/**
* @author huangliuyu
* @date 2021-04-21
* @description
*/
/**
 * Lucene {@link Analyzer} implementation backed by the IK Chinese segmenter.
 *
 * <p>Delegates tokenization to {@code IKTokenizer6x}; the {@code useSmart}
 * flag selects smart (coarse) vs. fine-grained segmentation.
 */
public class IKAnalyzer6x extends Analyzer {

    private boolean useSmart;

    /**
     * Creates an analyzer using the default fine-grained segmentation.
     */
    public IKAnalyzer6x() {
        this(false);
    }

    /**
     * Creates an analyzer with an explicit segmentation mode.
     *
     * @param useSmart {@code true} enables smart (coarse) segmentation
     */
    public IKAnalyzer6x(boolean useSmart) {
        super();
        this.useSmart = useSmart;
    }

    /**
     * Switches the segmentation mode for tokenizers created afterwards.
     *
     * @param useSmart {@code true} enables smart (coarse) segmentation
     */
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Hand the field off to the IK-backed tokenizer configured above.
        Tokenizer ikTokenizer = new IKTokenizer6x(this.useSmart);
        return new TokenStreamComponents(ikTokenizer);
    }
}
使用和比较
这里使用 Lucene 6.X自带的中文智能分词器 SmartChineseAnalyzer与IK Analyzer作比较,演示使用情况
代码
package com.liuyu.lucene.ik;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
import java.io.StringReader;
/**
* @author huangliuyu
* @date 2021-04-21
* @description
*/
/**
 * Side-by-side comparison of Lucene's built-in SmartChineseAnalyzer and the
 * IK Analyzer adapter on two sample Chinese sentences.
 */
public class IkVSSmartCn {

    private static String str1 = "公路局正在治理解放大道路面积水问题。";
    private static String str2 = "IKAnalyzer是一个开源的,基于java语言开发的轻量级的中文分词工具包。";

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = null;
        System.out.println("句子一:" + str1);
        System.out.println("SmartChineseAnalyzer分词结果:");
        analyzer = new SmartChineseAnalyzer();
        printAnalyzer(analyzer, str1);
        System.out.println("IKAnalyzer分词结果:");
        analyzer = new IKAnalyzer6x(true);
        printAnalyzer(analyzer, str1);
        System.out.println();
        System.out.println("-------------------------------------------------");
        System.out.println();
        System.out.println("句子二:" + str2);
        System.out.println("SmartChineseAnalyzer分词结果:");
        analyzer = new SmartChineseAnalyzer();
        printAnalyzer(analyzer, str2);
        System.out.println("IKAnalyzer分词结果:");
        analyzer = new IKAnalyzer6x(true);
        printAnalyzer(analyzer, str2);
    }

    /**
     * Tokenizes {@code str} with the given analyzer and prints the tokens
     * separated by {@code |}.
     *
     * @param analyzer analyzer to exercise (closed by this method)
     * @param str      text to tokenize
     * @throws IOException if token stream consumption fails
     */
    public static void printAnalyzer(Analyzer analyzer, String str) throws IOException {
        StringReader reader = new StringReader(str);
        // BUGFIX: the first argument of tokenStream() is a *field name*,
        // not the content; the original passed the text itself.
        // try-with-resources guarantees the stream is closed (it was leaked before).
        try (TokenStream toStream = analyzer.tokenStream("text", reader)) {
            toStream.reset(); // reset the stream before consuming
            CharTermAttribute teAttribute = toStream.getAttribute(CharTermAttribute.class);
            while (toStream.incrementToken()) {
                System.out.print(teAttribute.toString() + "|");
            }
            // TokenStream contract: end() must be called after the last increment.
            toStream.end();
        }
        System.out.println("\n");
        analyzer.close();
    }
}
效果
由效果可见IK Analyzer的中文分词效果要比Lucene SmartChineseAnalyzer的好。