Lucene全文检索小试 - 生命科学领域的专业信息解决方案！

HTML 解析器
package com.rain.util;

import Java.io.FileInputStream;
import Java.io.FileNotFoundException;
import Java.io.IOException;
import Java.io.InputStream;
import Java.io.InputStreamReader;
import Java.io.Reader;
import Java.io.UnsupportedEncodingException;

import org.apache.lucene.demo.html.HTMLParser;

/**
 * Wraps the Lucene demo {@link HTMLParser} for a single HTML file and exposes
 * the file's path, title and text content.
 *
 * <p>The file is decoded as UTF-8. Failures are reported with
 * {@code printStackTrace()} and leave the parser unset, in which case
 * {@link #getTitle()} returns an empty string and {@link #getContent()}
 * returns {@code null}.
 */
public class HTMLDocParser {

private String htmlPath;
private HTMLParser htmlParser;

/**
 * Creates a parser for the given file and immediately tries to open it.
 *
 * @param htmlPath path of the HTML file to parse
 */
public HTMLDocParser(String htmlPath){
  this.htmlPath=htmlPath;
  initHtmlParser();
}

/**
 * Opens the file at the configured path and builds the underlying
 * {@link HTMLParser}. If the path is {@code null} or the file cannot be
 * opened, the parser is left as it was.
 */
public void initHtmlParser(){
  // A null path would make FileInputStream throw a NullPointerException.
  if(null==htmlPath){
   return;
  }
  InputStream inputStream;
  try{
   inputStream=new FileInputStream(htmlPath);
  }catch(FileNotFoundException e){
   e.printStackTrace();
   return;
  }
  try{
   htmlParser=new HTMLParser(new InputStreamReader(inputStream,"utf-8"));
  }catch(UnsupportedEncodingException e){
   e.printStackTrace();
   // No reader took ownership of the stream, so release it here.
   closeQuietly(inputStream);
  }
}

/**
 * Returns the title reported by the HTML parser.
 *
 * @return the parsed title, or an empty string when the parser is not
 *         available or parsing fails
 */
public String getTitle(){
  if(null==htmlParser){
   return "";
  }
  try{
   return htmlParser.getTitle();
  }catch(IOException e){
   e.printStackTrace();
  }catch(InterruptedException e){
   // Restore the interrupt flag so callers further up can still see it.
   Thread.currentThread().interrupt();
   e.printStackTrace();
  }
  return "";
}

/**
 * Returns a reader over the content produced by the HTML parser.
 *
 * @return the content reader, or {@code null} when the parser is not
 *         available or the reader cannot be obtained
 */
public Reader getContent(){
  if(null==htmlParser){
   return null;
  }
  try{
   return htmlParser.getReader();
  }catch(IOException e){
   e.printStackTrace();
  }
  return null;
}

/**
 * @return the path this parser was created with
 */
public String getPath(){
  return this.htmlPath;
}

/** Closes the stream, reporting (but not propagating) any failure. */
private static void closeQuietly(InputStream stream){
  try{
   stream.close();
  }catch(IOException e){
   e.printStackTrace();
  }
}
}

描述搜索结果的结构实体Bean
package com.rain.search;

/**
 * Simple mutable holder for one search hit: the path of the matching HTML
 * document and its title. Both values are {@code null} until set.
 */
public class SearchResultBean {

    /** Title of the matching document. */
    private String htmlTitle;

    /** Path of the matching document. */
    private String htmlPath;

    /**
     * @return the stored document title, or {@code null} if none was set
     */
    public String getHtmlTitle() {
        return this.htmlTitle;
    }

    /**
     * @param htmlTitle the document title to store
     */
    public void setHtmlTitle(String htmlTitle) {
        this.htmlTitle = htmlTitle;
    }

    /**
     * @return the stored document path, or {@code null} if none was set
     */
    public String getHtmlPath() {
        return this.htmlPath;
    }

    /**
     * @param htmlPath the document path to store
     */
    public void setHtmlPath(String htmlPath) {
        this.htmlPath = htmlPath;
    }
}

索引子系统的实现

package com.rain.index;

import Java.io.File;
import Java.io.IOException;
import Java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.document.Field;

import com.rain.util.HTMLDocParser;

/**
 * Builds the Lucene index for the HTML files found directly inside the data
 * directory and reports whether an index is already present.
 */
public class IndexManager {

//the directory that stores HTML files
private final String dataDir="E:\\dataDir";

//the directory that is used to store a Lucene index
private final String indexDir="E:\\indexDir";

/**
 * Creates the index if the index directory does not already contain files.
 * Only files whose absolute path ends with {@code .html} or {@code .htm}
 * are indexed.
 *
 * @return {@code true} if an index already existed or was built;
 *         {@code false} if the data directory cannot be listed
 * @throws IOException if the index cannot be created, optimized or closed
 */
public boolean creatIndex()throws IOException{
  if(inIndexExist()){
   return true;
  }
  // listFiles() returns null when the path is missing or is not a directory.
  File[] htmls=new File(dataDir).listFiles();
  if(null==htmls){
   return false;
  }
  Directory fsDirectory=FSDirectory.getDirectory(indexDir,true);
  Analyzer analyzer=new StandardAnalyzer();
  IndexWriter indexWriter=new IndexWriter(fsDirectory,analyzer,true);
  try{
   for(int i=0;i<htmls.length;i++){
    String htmlPath=htmls[i].getAbsolutePath();
    // Both suffixes include the dot so names such as "fooxhtm" are not picked up.
    if(htmlPath.endsWith(".html")||htmlPath.endsWith(".htm")){
     addDocument(htmlPath,indexWriter);
    }
   }
   indexWriter.optimize();
  }finally{
   // Always release the writer, even when indexing fails part-way.
   indexWriter.close();
  }
  return true;
}

/**
 * Parses one HTML file and adds it to the index with three fields:
 * {@code path} (stored, not indexed), {@code title} (stored, tokenized) and
 * {@code content} (read from the parser's reader). Files whose content
 * cannot be read are skipped. An {@link IOException} raised while adding
 * the document is printed and not propagated.
 *
 * @param htmlPath    path of the HTML file to index
 * @param indexWriter writer that receives the document
 */
public void addDocument(String htmlPath,IndexWriter indexWriter){
  HTMLDocParser htmlParser=new HTMLDocParser(htmlPath);
  String path=htmlParser.getPath();
  String title=htmlParser.getTitle();
  Reader content=htmlParser.getContent();
  // getContent() returns null when the file could not be opened or parsed;
  // such a file cannot be indexed, so skip it instead of failing the run.
  if(null==content){
   System.err.println("Skipping unreadable HTML file: "+htmlPath);
   return;
  }

  Document document=new Document();
  document.add(new Field("path",path,Field.Store.YES,Field.Index.NO));
  document.add(new Field("title",title,Field.Store.YES,Field.Index.TOKENIZED));
  document.add(new Field("content",content));
  try{
   indexWriter.addDocument(document);
  }catch(IOException e){
   e.printStackTrace();
  }
}

/**
 * @return the directory that stores the HTML files
 */
public String getDataDir(){
  return this.dataDir;
}

/**
 * @return the directory that stores the Lucene index
 */
public String getIndexDir(){
  return this.indexDir;
}

/**
 * Reports whether the index directory exists and contains at least one
 * entry.
 *
 * @return {@code true} if the index directory holds any file or directory;
 *         {@code false} if it is empty, missing or not a directory
 */
public boolean inIndexExist(){
  // listFiles() returns null for a missing or non-directory path.
  File[] entries=new File(indexDir).listFiles();
  return null!=entries&&0<entries.length;
}
}

搜索功能的实现
package com.rain.search;

import Java.io.IOException;
import Java.util.ArrayList;
import Java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

import com.rain.index.IndexManager;

public class SearchManager {
private String searchWord;
private IndexManager indexManager;
private Analyzer analyzer;

public SearchManager(String searchWord){
  this.searchWord=searchWord;
  this.indexManager=new IndexManager();
  this.analyzer=new StandardAnalyzer();
}

/**
     * do search
     */
public List search(){
  List searchResult=new ArrayList();
  if(false==indexManager.inIndexExist()){
   try{
    if(false==indexManager.creatIndex()){
     return searchResult;
    }
   }catch(IOException e){
    e.printStackTrace();
    return searchResult;
   }
  }
  IndexSearcher indexSearcher=null;
  try{
   indexSearcher=new IndexSearcher(indexManager.getIndexDir());
  }catch(IOException e){
   e.printStackTrace();
  }
  QueryParser queryParser=new QueryParser("content",analyzer);
  Query query=null;
  try{
   query=queryParser.parse(searchWord);
  }catch(ParseException e){
   e.printStackTrace();
  }
  if(null!=query&&null!=indexSearcher){
   try{
    Hits hits=indexSearcher.search(query);
    for(int i=0;i<hits.length();i++){
     SearchResultBean resultBean=new SearchResultBean();
     resultBean.setHtmlPath(hits.doc(i).get("path"));
     resultBean.setHtmlTitle(hits.doc(i).get("title"));
     searchResult.add(resultBean);
    }
   }catch(IOException e){
    e.printStackTrace();
   }
  }
   return searchResult;
}

}

请求管理器的实现

package com.rain.servlet;

import Java.io.IOException;
import Java.util.List;

import javax.servlet.RequestDispatcher;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.rain.search.SearchManager;

/**
 * Servlet that receives the search form submission, runs the search and
 * forwards the results to {@code search.jsp}.
 *
 * @author zhourui
 * 2007-1-28
 */
public class SearchController extends HttpServlet {
private static final long serialVersionUID=1L;

/**
 * Reads the {@code searchWord} request parameter, performs the search and
 * forwards to {@code search.jsp} with the results stored in the
 * {@code searchResult} request attribute. A missing or blank parameter
 * yields an empty result list without invoking the search.
 *
 * @param arg0 the incoming request
 * @param arg1 the response passed on to the forwarded page
 * @throws ServletException if forwarding fails
 * @throws IOException      if forwarding fails with an I/O error
 * @see javax.servlet.http.HttpServlet#doPost(javax.servlet.http.HttpServletRequest, javax.servlet.http.HttpServletResponse)
 */
@Override
protected void doPost(HttpServletRequest arg0, HttpServletResponse arg1) throws ServletException, IOException {
  String searchWord=arg0.getParameter("searchWord");
  List searchResult;
  // getParameter returns null when the field is absent from the request.
  if(null==searchWord||0==searchWord.trim().length()){
   searchResult=new java.util.ArrayList();
  }else{
   searchResult=new SearchManager(searchWord).search();
  }
  arg0.setAttribute("searchResult",searchResult);
  RequestDispatcher dispatcher=arg0.getRequestDispatcher("search.jsp");
  dispatcher.forward(arg0, arg1);
}

}

向Web服务器提交搜索请求
<!--
  Search form: submits the text field "searchWord" by HTTP POST to the
  relative URL "SearchController".
-->
<form action="SearchController" method="post">
      <table>
        <tr>
          <td colspan="3">
            SearchWord:<input type="text" name="searchWord" id="searchWord" size="40">
            <input id="doSearch" type="submit" value="search">
          </td>
        </tr>
      </table>
    </form>
显示搜索结果
<table class="result">
      <%--
        Renders one table row per SearchResultBean found in the
        "searchResult" request attribute; renders no rows when the attribute
        is absent. Title and path come from indexed documents, so both are
        HTML-escaped before being written into the page.
      --%>
      <%!
        /** Escapes HTML metacharacters; returns an empty string for null. */
        private static String escapeHtml(String text){
          if(null==text){
            return "";
          }
          StringBuffer escaped=new StringBuffer(text.length()+16);
          for(int i=0;i<text.length();i++){
            char c=text.charAt(i);
            switch(c){
              case '&': escaped.append("&amp;"); break;
              case '<': escaped.append("&lt;"); break;
              case '>': escaped.append("&gt;"); break;
              case '"': escaped.append("&quot;"); break;
              case '\'': escaped.append("&#39;"); break;
              default: escaped.append(c);
            }
          }
          return escaped.toString();
        }
      %>
      <%
        List searchResult=(List)request.getAttribute("searchResult");
        int resultCount=0;
        if(null!=searchResult){
        resultCount=searchResult.size();
        }
        for(int i=0;i<resultCount;i++){
        SearchResultBean resultBean=(SearchResultBean)searchResult.get(i);
        String path=escapeHtml(resultBean.getHtmlPath());
        String title=escapeHtml(resultBean.getHtmlTitle());
        // A document without a title would otherwise produce an empty, unclickable link.
        if(0==title.length()){
        title=path;
        }
        %>
        <tr>
           <td class="title"><h3><a href="<%=path%>"><%=title%></a></h3></td>
        </tr>
        <%
        }
      %>
    </table>

posted on 2007-01-29 09:57 周锐 阅读(852) 评论(0) 编辑 收藏 所属分类: Lucene

生命科学领域的专业信息解决方案！

留言簿(15)

随笔分类(1019)

文章分类(3)

文章档案(21)

收藏夹

Link

好友博客

最新随笔

搜索

积分与排名

最新评论

阅读排行榜


只有注册用户登录后才能发表评论。




网站导航: 博客园 IT新闻 Chat2DB C++博客 博问 管理
相关文章: 当前几个主要的Lucene中文分词器的比较【转载】 Lucene全文检索小试 Lucene基本使用介绍