随笔-23  评论-58  文章-0  trackbacks-0
本隐马可夫(HMM)中文分词词性标注程序中的隐马可夫(HMM)概率模型,是由 PFR 人民日报标注语料库(199801)生成的。
public class HMM
{
    
static final String[] states = new String[52];
    
static final HashMap<String, Double> start_probability = new HashMap<String, Double>();
    
static final HashMap<String, HashMap<String, Double>> transition_probability = new HashMap<String, HashMap<String, Double>>();
    
static final HashMap<String, HashMap<String, Double>> emission_probability =new HashMap<String, HashMap<String, Double>>();
    
    
static
    
{
        
for(int i=0;i<52;i++)
            states[i]
=CountPOS.getPOSFromId(i);
    
        InputStream is 
= Viterbi.class.getClassLoader().getResourceAsStream("startprob.txt");
        FileUtil.readFileByLine(is, 
"UTF-8"new Callback(){
             
int ss=0;
             
public void execute(String line) {
                 start_probability.put(states[ss], Double.parseDouble(line));
                 ss
++;
             }

        }
);

        is 
= Viterbi.class.getClassLoader().getResourceAsStream("tranprob.txt");
        FileUtil.readFileByLine(is, 
"UTF-8"new Callback(){
            
int ss=0;
            
public void execute(String line) {
                HashMap
<String, Double> t = new HashMap<String, Double>();
                String[] cc
=line.split("\t");
                
for(int j=0;j<cc.length;j++)
                    t.put(states[j], Double.parseDouble(cc[j]));
                transition_probability.put(states[ss], t);
                ss
++;
            }

        }
);

        is 
= Viterbi.class.getClassLoader().getResourceAsStream("emissionprob.txt");
        FileUtil.readFileByLine(is, 
"UTF-8"new Callback(){
            
public void execute(String line) {
                String[] cc
=line.split("\t");
                String[] nn
=cc[1].split(" ");
                
for(String n:nn)
                
{
                    HashMap
<String, Double> e=null;
                    String[] bb
=n.split(":");
                    
if(emission_probability.containsKey(bb[0]))
                        e
=emission_probability.get(bb[0]);
                    
else
                        e
=new HashMap<String, Double>();
                    e.put(cc[
0], Double.parseDouble(bb[1]));
                    emission_probability.put(bb[
0], e);
                }

            }

        }
);
    }

    
    
public static String[] tagging(String[] observations)
    
{
        
return forward_viterbi(observations,states,start_probability,transition_probability,emission_probability);
    }

    
    
public static String[]  forward_viterbi(String[] observations, String[] states,HashMap<String, Double> start_probability, HashMap<String, HashMap<String, Double>> transition_probability, HashMap<String, HashMap<String, Double>> emission_probability)
    
{
        
int[][] path=new int[observations.length][states.length];
        
double[][] r=new double[observations.length][states.length];
        
for(int j=0;j<states.length;j++)
        
{
            
if(emission_probability.get(states[j])!=null && emission_probability.get(states[j]).get(observations[0])!=null)
                r[
0][j]=start_probability.get(states[j])*emission_probability.get(states[j]).get(observations[0]);
            path[
0][j]=0;
        }

        
        
for(int t=1;t<observations.length;t++)
        
{
            
for(int i=0;i<states.length;i++)
            
{
                
double tmp=0;int m=0;
                
for(int j=0;j<states.length;j++)
                
{
                    
double tem=0;
                    
if(emission_probability.get(states[i])!=null && emission_probability.get(states[i]).get(observations[t])!=null)
                        tem
=r[t-1][j]*transition_probability.get(states[j]).get(states[i]) *emission_probability.get(states[i]).get(observations[t]);
                    
if(tem>tmp)
                    
{
                        tmp
=tem;
                        m
=j;
                    }

                }

                r[t][i]
=tmp;
                path[t][i]
=m;
            }

        }

        
        
double p=0;int m=0;
        
for(int i=0;i<r[0].length;i++)
        
{
            
if(r[r.length-1][i]>p)
            
{
                p
=r[r.length-1][i];
                m
=i;
            }

        }

        
//System.out.println("p="+p);
        int[] trace=new int[observations.length];
        trace[observations.length
-1]=m;
        
for(int t=observations.length-1;t>0;t--)
        
{
            trace[t
-1]=path[t][m];
            m
=path[t][m];
        }

        
        String[] ret
=new String[observations.length];
        
for(int i=0;i<trace.length;i++)
            ret[i]
=states[trace[i]];
        
return ret;
    }

    
    
public static void main(String[] args)
    
{
       
//String[] observations = new String[] {"这些","服务","实体","改","由","当地","有关","部门","管理"};
        String[] observations = new String[] {"研究","生命","","起源"};
        String[] ret
=tagging(observations);
        
for(String c:ret)
            System.out.print(c
+",");
    }

}

posted on 2012-09-14 17:08 nianzai 阅读(3809) 评论(0)  编辑  收藏 所属分类: 中文分词

只有注册用户登录后才能发表评论。


网站导航: