Terry.Li-彬

虚其心，可解天下之问；专其心，可治天下之学；静其心，可悟天下之理；恒其心，可成天下之业。

BlogJava :: 首页 :: 新随笔 :: 联系 :: 聚合

:: 管理 ::

143 随笔 :: 344 文章 :: 130 评论 :: 0 Trackbacks

基于词典的正向最大匹配中文分词算法，能实现中英文数字混合分词

基于词典的正向最大匹配中文分词算法，能实现中英文数字混合分词。比如能分出这样的词：bb霜、3室、乐phone、touch4、mp3、T恤

第一次写中文分词程序，欢迎拍砖。

public class MM2

{

private static final Log log = LogFactory.getLog(MM2.class);

private static HashMap<String, Integer> dictionary = null;

private static final int WORD_MAX_LENGTH = 9;

private Reader reader;

static

{

loadDictionary();

}

public MM2(Reader reader)

{

this.reader = reader;

}

//切分出由中文、字母、数字组成的句子

public ArrayList<Sentence> getSentence() throws IOException

{

ArrayList<Sentence> list=new ArrayList<Sentence>();

StringBuffer cb=new StringBuffer();

int d=reader.read();

int offset=0;

boolean b=false;

while(d>-1)

{

int type=Character.getType(d);

if(type==2 || type==9 || type==5)

{

d=toAscii(d);

cb.append((char)d);

}

else

{

b=true;

}

d=reader.read();

if(d==-1 || b)

{

if(d==-1) offset++;

b=false;

char[] ioBuffer = new char[cb.length()];

cb.getChars(0, cb.length(), ioBuffer, 0);

Sentence sen=new Sentence(ioBuffer,offset-cb.length());

list.add(sen);

cb.setLength(0);

}

offset++;

}

return list;

}

//将句子切分出词

public ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException

{

ArrayList<Token> tokenlist=new ArrayList<Token>();

for(Sentence sen:list)

{

StringBuffer word = new StringBuffer();

int offset=sen.getStartOffset();

int bufferIndex = 0;

char c;

boolean b=false;

while(bufferIndex<sen.getText().length)

{

offset++;

c=sen.getText()[bufferIndex++];

if(word.length()==0)

word.append(c);

else

{

String temp = (word.toString() + c).intern();

if(dictionary.containsKey(temp) && dictionary.get(temp)==1)

word.append(c);

else if(dictionary.containsKey(temp) && bufferIndex<sen.getText().length)

word.append(c);

else

{

bufferIndex--;

offset--;

while(word.length()>1 && dictionary.get(word.toString())!=null && dictionary.get(word.toString())==2)

{

word.deleteCharAt(word.length()-1);

bufferIndex--;

offset--;

}

b=true;

}

if(b || bufferIndex==sen.getText().length)

{

Token token = new Token(word.toString(),offset-word.length(),offset,"word");

word.setLength(0);

tokenlist.add(token);

b=false;

}

return tokenlist;

}

//将相连的单个英文或数字组合成词

public ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException

{

ArrayList<Token> tokenlist=new ArrayList<Token>();

Token word=null;

for(int i=0;i<list.size();i++)

{

Token t=list.get(i);

if(t.getWord().length()==1 && Character.getType((int)t.getWord().charAt(0))!=5)

{

if(word==null)

word=t;

else if(word.getEnd()==t.getStart())

{

word.setEnd(t.getEnd());

word.setWord(word.getWord()+t.getWord());

}

else

{

tokenlist.add(word);

word=t;

}

else if(word!=null)

{

tokenlist.add(word);

word=null;

tokenlist.add(t);

}

else

tokenlist.add(t);

}

if(word!=null)

tokenlist.add(word);

return tokenlist;

}

//双角转单角

public static int toAscii(int codePoint)

{

if((codePoint>=65296 && codePoint<=65305) //０-９

|| (codePoint>=65313 && codePoint<=65338) //Ａ-Ｚ

|| (codePoint>=65345 && codePoint<=65370) //ａ-ｚ

)

{

codePoint -= 65248;

}

return codePoint;

}

//加载词典

public static void loadDictionary()

{

if (dictionary == null)

{

dictionary = new HashMap<String, Integer>();

InputStream is = null;

BufferedReader br = null;

try

{

is = new FileInputStream(new File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));

br = new BufferedReader(new InputStreamReader(is, "UTF-8"));

String word = null;

while ((word = br.readLine()) != null)

{

word=word.toLowerCase();

if ((word.indexOf("#") == -1) && (word.length() <= WORD_MAX_LENGTH))

{

dictionary.put(word.intern(), 1);

int i = word.length()-1;

while(i >= 2)

{

String temp = word.substring(0, i).intern();

if (!dictionary.containsKey(temp))

dictionary.put(temp,2);

i--;

}

catch (Exception e)

{

log.info(e);

}

finally

{

try

{

if(br!=null)

br.close();

if(is!=null)

is.close();

}

catch (IOException e)

{

log.info(e);

}

public static String[] segWords(Reader input)

{

ArrayList<String> list=new ArrayList<String>();

try

{

MM2 f=new MM2(input);

ArrayList<Token> tlist= f.getNewToken(f.getToken(f.getSentence()));

for(Token t:tlist)

{

list.add(t.getWord());

}

catch(IOException e)

{

log.info(e);

}

return (String[])list.toArray(new String[0]);

}

public static void main(String[] args)

{

String[] cc=MM2.segWords(new StringReader("ibm商务机t60p".toLowerCase()));

for(String c:cc)

{

System.out.println(c);

}

posted on 2011-08-05 08:34 礼物 阅读(2128) 评论(2) 编辑 收藏

# re: 基于词典的正向最大匹配中文分词算法，能实现中英文数字混合分词 2013-07-25 22:09 yi

这是全的么，楼主？我导入到MyEclipse里好多错误呀，除了import包之外还有好多错，看不懂。。。 回复 更多评论

# re: 基于词典的正向最大匹配中文分词算法，能实现中英文数字混合分词 2013-08-22 20:01 love code

麻烦博主把dictionary.txt发给我吧，让我学习学习
1182787467@qq.com
谢谢 回复 更多评论

新用户注册刷新评论列表


只有注册用户登录后才能发表评论。




网站导航: 博客园 IT新闻 Chat2DB C++博客博问管理

Terry.Li-彬

常用链接

留言簿(19)

随笔分类(107)

随笔档案(141)

文章分类(284)

文章档案(342)

相册

收藏夹(58)

家装

最新随笔

搜索

积分与排名

最新评论

阅读排行榜

评论排行榜

评论