用 Lucene 做一个简单的 Java 搜索工具

初学 Lucene ，刚接触搜索引擎。知道了一点点，想做个小工具，实现根据 “ 单词 ” 搜索 java 源文件。比如输入 “String” ，就能查出哪些 java 源文件里用到了这个类。

这个想法的来源是，在以前刚学 java 时，有一本 java 基础教程的书的附带光盘里有作者写的一个程序，可以方便初学者查找某些类在哪个实例里出现。当时没有太在意，觉得作者的代码很长。所以现在想自己也写一个这样的小程序。

开发工具与运行环境：

使用 Lucene2.0 的包， jdk1.5 ，在 WindowsXP 下运行。

思路分析与设计：

整个程序里，除了 Lucene 的必要操作外，就是 IO 的基本操作了。因为要对某目录下及其子目录下的所有 Java 源文件进行索引，就要用到递归，同时要过滤掉非 Java 源文件。根据这种情况，设计了以下 5 个类。

主类：索引类（ IndexJavaFiles ），搜索类（ SearchJavaFiles ）

异常类：索引异常类 (IndexException) ，搜索异常类 (SearchException)

还有一个文件过滤工厂类（ FileFilterFactory ）。

异常类不是必要的，特意设计来包装 IO 异常、文件异常和 Lucene 的异常。文件过滤工厂类的出现并不是故弄玄虚，只是不想太多代码集中一起，就把文件过滤器的设计放到一个类里。下面是程序的完整代码及注释。

IndexJavaFiles.java

/**
 * Index the java source files.
 */

package powerwind;

import java.io.*;

import java.util.Date;

import org.apache.lucene.document.*;

import org.apache.lucene.index.IndexWriter;

/**

* @author Powerwind

* @version 1.0

public class IndexJavaFiles {

/**

* 默认构造方法

public IndexJavaFiles() {

}

/**

* 这个私有递归方法由 index 方法调用，保证 index 传入的 file 是目录不是文件

* @param writer

* @param file

* @param ff

* @throws IndexException

private void indexDirectory(IndexWriter writer, File file, FileFilter filter) throws IndexException {

if (file.isDirectory()) {

// 有选择地（过滤）获取目录下的文件和目录

File[] files = file.listFiles(filter);

// 非空目录

if (files != null ) {

for ( int i = 0; i < files. length ; i++) {

indexDirectory(writer, files[i], filter);

}

} else {

try {

　　 // 这里的 file 经过先前的过滤

writer.addDocument(parseFile(file));

System. out .println( " 增加文件： " + file);

} catch (IOException ioe) {

throw new IndexException(ioe.getMessage());

}

/**

* 传参数是文件就直接索引，若是目录则交给 indexDirectory 递归

* @param writer

* @param file

* @param ff

* @throws IndexException

public void index(IndexWriter writer, File file, FileFilter filter) throws IndexException {

// 确定可读

if (file.exists() && file.canRead()) {

if (file.isDirectory()) {

indexDirectory(writer, file, filter);

} else if (filter.accept(file)) {

try {

writer.addDocument(parseFile(file));

System. out .println( " 增加文件： " + file);

} catch (IOException ioe) {

throw new IndexException(ioe.getMessage());

}

} else {

System. out .println( " 指定文件或目录错误，没有完成索引 " );

}

/**

* @param file

* 把 File 变成 Document

private Document parseFile(File file) throws IndexException {

Document doc = new Document();

doc.add( new Field( "path" , file.getAbsolutePath(), Field.Store. YES ,

Field.Index. UN_TOKENIZED ));

try {

doc.add( new Field( "contents" , new FileReader(file)));

} catch (FileNotFoundException fnfe) {

throw new IndexException(fnfe.getMessage());

}

return doc;

}

index(IndexWriter writer, File file, FileFilter filter) 调用私有方法 indexDirectory(IndexWriter writer, File file, FileFilter filter) 完成文件的索引。

下面是 IndexException 异常类。

IndexException.java

package powerwind;

/**
 * Checked exception raised while indexing files. Every message is prefixed with
 * a fixed indexing context string.
 */
public class IndexException extends Exception {

    private static final long serialVersionUID = 1L;

    /**
     * @param message detail message of the underlying failure
     */
    public IndexException(String message) {
        super("Throw IndexException while indexing files: " + message);
    }

    /**
     * Variant that keeps the original exception as the cause, so its stack trace
     * is not lost.
     *
     * @param message detail message of the underlying failure
     * @param cause   the exception being wrapped
     */
    public IndexException(String message, Throwable cause) {
        super("Throw IndexException while indexing files: " + message, cause);
    }
}

下面是 FileFilterFactory 类，返回一个特定的文件过滤器（ FileFilter ）。

FileFilterFactory.java

package powerwind;

import java.io.*;

/**
 * Supplies the {@link FileFilter} used to pick what gets indexed.
 */
public class FileFilterFactory {

    /** Exclusive upper size bound, in bytes, for an accepted source file (1 MiB). */
    private static final long MAX_SOURCE_BYTES = 1024 * 1024;

    /**
     * Single shared filter instance (static anonymous inner class). It accepts every
     * directory, and every file whose name ends with {@code .java} and whose length
     * is greater than zero and below {@link #MAX_SOURCE_BYTES}.
     */
    private static final FileFilter JAVA_SOURCE_FILTER = new FileFilter() {
        // No @Override: the article targets JDK 1.5, which rejects it on interface methods.
        public boolean accept(File file) {
            if (file.isDirectory()) {
                return true;
            }
            if (!file.getName().endsWith(".java")) {
                return false;
            }
            long len = file.length();
            return len > 0 && len < MAX_SOURCE_BYTES;
        }
    };

    /**
     * @return the shared Java-source filter; the same instance on every call
     */
    public static FileFilter getFilter() {
        return JAVA_SOURCE_FILTER;
    }
}

IndexJavaFiles 类的 main 方法：

/**

* main 方法

public static void main(String[] args) throws Exception {

IndexJavaFiles ijf = new IndexJavaFiles();

Date start = new Date();

try {

IndexWriter writer = IndexWriterFactory.newInstance().createWriter( "./index" , true );

System. out .println( "Indexing ..." );

ijf.index(writer, new File( "." ), FileFilterFactory.getFilter());

System. out .println( "Optimizing..." );

writer.optimize();

writer.close();

Date end = new Date();

System. out .println(end.getTime() - start.getTime() + " total milliseconds" );

} catch (IOException e) {

System. out .println( " caught a " + e.getClass() + "\n with message: " + e.getMessage());

}

SearchJavaFiles.java

package powerwind;

import java.io.*;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryParser.*;

import org.apache.lucene.search.*;

public class SearchJavaFiles {

private IndexSearcher searcher ;

private QueryParser parser ;

/**

* @param searcher

public SearchJavaFiles(IndexSearcher searcher) {

this . searcher = searcher;

}

/**

* @param field

* @param analyzer

public void setParser(String field, Analyzer analyzer) {

setParser( new QueryParser(field, analyzer));

}

/**

* @param parser

public void setParser(QueryParser parser) {

this . parser = parser;

}

/**

* @param query

* @return Hits

* @throws SearchException

public Hits serach(Query query) throws SearchException {

try {

return searcher .search(query);

} catch (IOException ioe) {

throw new SearchException(ioe.getMessage());

}

/**

* @param queryString

* @return Hits

* @throws SearchException

public Hits serach(String queryString) throws SearchException {

if ( parser == null )

throw new SearchException( "parser is null!" );

try {

return searcher .search( parser .parse(queryString));

} catch (IOException ioe) {

throw new SearchException(ioe.getMessage());

} catch (ParseException pe) {

throw new SearchException(pe.getMessage());

}

/**

* 输出 hits 的结果，从 start 开始到 end ，不包括 end

* @param hits

* @param start

* @param end

* @throws SearchException

public static Hits display(Hits hits, int start, int end) throws SearchException {

try {

while (start < end) {

Document doc = hits.doc(start);

String path = doc.get( "path" );

if (path != null ) {

System. out .println((start + 1) + "- " + path);

} else {

System. out .println((start + 1) + "- " + "No such path" );

}

start++;

}

} catch (IOException ioe) {

throw new SearchException(ioe.getMessage());

}

return hits;

}

SearchJavaFiles 类的 main 方法：

/**

* @param args

public static void main(String[] args) throws Exception {

String field = "contents" ;

String index = "./index" ;

final int rows_per_page = 2;

final char NO = 'n' ;

SearchJavaFiles sjf = new SearchJavaFiles( new IndexSearcher(IndexReader.open(index)));

sjf.setParser(field, new StandardAnalyzer());

BufferedReader in = new BufferedReader( new InputStreamReader(System. in , "UTF-8" ));

while ( true ) {

System. out .println( "Query: " );

String line = in.readLine();

if (line == null || line.length() < 2) {

System. out .println( "eixt query" );

break ;

}

Hits hits = sjf.serach(line);

System. out .println( "searching for " + line + " Result is " );

int len = hits.length();

int i = 0;

if (len > 0)

while ( true ) {

if (i + rows_per_page >= len) {

SearchJavaFiles.display(hits, i, len);

break ;

} else {

SearchJavaFiles.display(hits, i, i += rows_per_page);

System. out .println( "more y/n?" );

line = in.readLine();

if (line.length() < 1 || line.charAt(0) == NO)

break ;

}

else

System. out .println( "not found" );

}

SearchException.java

package powerwind;

/**
 * Checked exception raised while searching the index. Every message is prefixed
 * with a fixed searching context string.
 */
public class SearchException extends Exception {

    private static final long serialVersionUID = 1L;

    /**
     * @param message detail message of the underlying failure
     */
    public SearchException(String message) {
        super("Throw SearchException while searching files: " + message);
    }

    /**
     * Variant that keeps the original exception as the cause, so its stack trace
     * is not lost.
     *
     * @param message detail message of the underlying failure
     * @param cause   the exception being wrapped
     */
    public SearchException(String message, Throwable cause) {
        super("Throw SearchException while searching files: " + message, cause);
    }
}

完善设想：

1 、文件格式：

能够处理 Zip 文件 Jar 文件，索引里面的 java 源文件。

通过反射机制索引 class 类文件。

2 、输入输出：

除控制台输入输出外，还可以选择从文件读取查询关键字，输出查询结果到文件。

3 、用户界面：

图形界面操作，双击查询结果的某条记录可以打开相应文件。

4 、性能方面

索引文件时，用缓存和多线程处理

这是个简易的小程序。不管怎样，这是一个开始。呵呵～

本文的PDF格式下载版：http://pickup.mofile.com/8248225466821319

posted on 2006-12-12 01:00 powerwind 阅读(240) 评论(0) 编辑收藏所属分类: java基础

新用户注册刷新评论列表


只有注册用户登录后才能发表评论。




网站导航: 博客园博客园最新博文博问

剑心博客

公告

常用链接

留言簿(1)

随笔分类

随笔档案(1)

文章分类(41)

文章档案(29)

新闻分类

Myself

搜索

最新评论

用 Lucene 做一个简单的 Java 搜索工具

完善设想：

1 、文件格式：

2 、输入输出：

3 、用户界面：

4 、性能方面