| |
常用链接
留言簿(1)
随笔档案(28)
文章档案(1)
我最爱的网站
搜索
最新评论

阅读排行榜
评论排行榜
Powered by: 博客园
模板提供:沪江博客
|
|
|
|
|
发新文章 |
|
|
HTML Parser 简述:这是 SourceForge.net 上比较活跃的项目之一,目前的最新版本是 1.6 发行版(我现在用在自己 NBA 网站上的也是 1.6)。它是一个对现有的 HTML 进行分析的快速、实时的解析器,事实上在应用过程中你会更加惊叹于 HTML Parser 给你带来的一些周到的处理。它主要用在这几个方面:
文本信息抽取,
链接提取,用于自动给页面的链接文本加上链接的标签
资源提取,例如对一些图片、声音的资源的处理
链接检查,用于检查HTML中的链接是否有效
页面内容的监控
呵呵.废话少说:)上代码.
 public class BaseAction {

/** Logger shared by every method of this class; obtained for {@code BaseAction.class}. */
public static final Logger logger = Logger.getLogger(BaseAction.class);
/**
 * Replacement text that {@code getMetaInfo} substitutes for "Hoopchina" in the
 * content of a "keywords" meta tag (it is passed as the replacement argument of
 * {@code String.replaceAll}).
 */
public String keyWords = "姚明|姚明NBA";
 /**
  * Parses a URL or file and returns the nodes accepted by the given filter.
  *
  * @param urlOrfile resource string handed to the {@code Parser} constructor
  * @param filter    filter passed to {@code Parser.parse}
  * @return the nodes produced by the parser, or {@code null} when a
  *         {@code ParserException} is raised while creating the parser,
  *         setting the encoding or parsing
  */
 public static NodeList getAllNodeList(String urlOrfile, NodeFilter filter) {
     if (logger.isDebugEnabled()) {
         logger.debug("BaseAction getAllNodeList(" + urlOrfile + ")");
     }
     try {
         Parser parser = new Parser(urlOrfile);
         parser.setEncoding(Constent.Encode);
         return parser.parse(filter);
     } catch (ParserException e) {
         // Callers in this class test for null, so keep that contract, but record
         // the failure (with its cause) in the log instead of dumping it to stderr.
         logger.error("BaseAction getAllNodeList(" + urlOrfile + ") failed", e);
         return null;
     }
 }

 /**
  * Collects, in node order, the text of every {@code TextNode} and the link of
  * every {@code LinkTag} among the nodes selected by the filter.
  *
  * <p>Values for which {@code HttpParserUtil.isTrimEmpty} returns true are
  * skipped, and nodes of any other type are ignored.
  *
  * @param file   URL or file to parse, forwarded to {@link #getAllNodeList}
  * @param filter node filter forwarded to {@link #getAllNodeList}
  * @return the collected strings; {@code null} when {@link #getAllNodeList}
  *         returned {@code null}; a possibly partial list when an exception
  *         interrupts the scan
  */
 public List<String> parseLink(String file, NodeFilter filter) {
     if (logger.isDebugEnabled()) {
         logger.debug("BaseAction parseLink(" + file + ")");
     }
     List<String> hrefList = new ArrayList<String>();
     try {
         NodeList nodelist = getAllNodeList(file, filter);
         if (nodelist == null) {
             return null;
         }
         for (Node node : nodelist.toNodeArray()) {
             // Declared per iteration so a node of an unrelated type can never
             // re-add the value extracted from the previous node.
             String line;
             if (node instanceof TextNode) {
                 line = ((TextNode) node).getText();
                 logger.debug("textnode=" + line);
             } else if (node instanceof LinkTag) {
                 line = ((LinkTag) node).getLink();
                 logger.debug("link=" + line);
             } else {
                 continue;
             }
             if (HttpParserUtil.isTrimEmpty(line)) {
                 continue;
             }
             hrefList.add(line);
         }
     } catch (Exception e) {
         // Best effort: keep what was gathered so far, but log the cause.
         logger.error("BaseAction parseLink(" + file + ") failed", e);
     }
     return hrefList;
 }

 /**
  * Builds a map from matched entries to the entry that follows them.
  *
  * <p>First the non-empty text of every {@code TextNode} and the non-empty link
  * of every {@code LinkTag} are gathered in node order. The gathered list is
  * then scanned: an entry that fully matches {@code pHtml} or {@code pPhp} is
  * paired with the next entry when that next entry does not match the same
  * pattern; the matched entry becomes the key and the next entry the value
  * (presumably URL to link text; confirm against callers). A paired next entry
  * is consumed and not examined as a key itself.
  *
  * @param file   URL or file to parse, forwarded to {@link #getAllNodeList}
  * @param filter node filter forwarded to {@link #getAllNodeList}
  * @param pHtml  first pattern an entry must fully match to become a key
  * @param pPhp   second pattern an entry must fully match to become a key
  * @return the pairs found; {@code null} when {@link #getAllNodeList} returned
  *         {@code null}; a possibly partial map when an exception interrupts
  *         processing
  */
 public Map<String, String> parseLinkWithText(String file,
         NodeFilter filter, Pattern pHtml, Pattern pPhp) {
     if (logger.isDebugEnabled()) {
         logger.debug("BaseAction parseLinkWithText(" + file + ")");
     }
     Map<String, String> map = new HashMap<String, String>();
     List<String> list = new ArrayList<String>();
     try {
         NodeList nodelist = getAllNodeList(file, filter);
         if (nodelist == null) {
             return null;
         }
         for (Node node : nodelist.toNodeArray()) {
             String line;
             String label;
             if (node instanceof TextNode) {
                 line = ((TextNode) node).getText();
                 label = "textnode=";
             } else if (node instanceof LinkTag) {
                 line = ((LinkTag) node).getLink();
                 label = "link=";
             } else {
                 continue;
             }
             if (HttpParserUtil.isTrimEmpty(line)) {
                 continue;
             }
             if (logger.isDebugEnabled()) {
                 logger.debug(label + line);
             }
             list.add(line);
         }

         // The last entry has no successor, so it can never become a key.
         for (int i = 0; i + 1 < list.size(); i++) {
             String current = list.get(i);
             boolean currentIsHtml = pHtml.matcher(current).matches();
             boolean currentIsPhp = pPhp.matcher(current).matches();
             if (!currentIsHtml && !currentIsPhp) {
                 continue;
             }
             String next = list.get(i + 1);
             boolean nextIsHtml = pHtml.matcher(next).matches();
             boolean nextIsPhp = pPhp.matcher(next).matches();
             if ((currentIsHtml && !nextIsHtml) || (currentIsPhp && !nextIsPhp)) {
                 map.put(current, next);
                 i++; // the successor was consumed as the value; skip it
             }
         }
     } catch (Exception e) {
         // Best effort: keep what was paired so far, but log the cause.
         logger.error("BaseAction parseLinkWithText(" + file + ") failed", e);
     }
     return map;
 }
 /**
  * Extracts content from the given node list without line separators.
  *
  * <p>Shorthand for {@code parserContent(list, false)}.
  *
  * @param list nodes whose children are rendered
  * @return whatever the two-argument overload returns for {@code false}
  */
 public String parserContent(NodeList list) {
     final boolean createFile = false;
     return parserContent(list, createFile);
 }

 /**
  * Extracts content from every node of the list.
  *
  * <p>Delegates to the three-argument overload with an index of
  * {@code list.size() + 1}; an index larger than the list size makes that
  * overload walk the whole list instead of a single element.
  *
  * @param list         nodes whose children are rendered
  * @param isCreateFile when true, a newline follows each rendered child
  * @return whatever the three-argument overload returns
  */
 public String parserContent(NodeList list, boolean isCreateFile) {
     final int wholeListIndex = list.size() + 1;
     return parserContent(list, isCreateFile, wholeListIndex);
 }
 /**
  * Extracts content for the given index without line separators.
  *
  * <p>Shorthand for {@code parserContent(list, false, listIndex)}.
  *
  * @param list      nodes whose children are rendered
  * @param listIndex index forwarded unchanged to the three-argument overload
  * @return whatever the three-argument overload returns
  */
 public String parserContent(NodeList list, int listIndex) {
     final boolean createFile = false;
     return parserContent(list, createFile, listIndex);
 }
 /**
  * Concatenates the HTML of the children of one node, or of every node, of the list.
  *
  * <p>When {@code listIndex} is at or beyond {@code list.size()} the children of
  * every node in the list are rendered (nodes without children are skipped).
  * Otherwise only the children of the node at {@code listIndex} are rendered.
  * Children for which {@code HttpParserUtil.isTrimEmpty(child.getText())} is
  * true are left out.
  *
  * @param list         nodes whose children are rendered
  * @param isCreateFile when true, a newline is appended after each rendered child
  * @param listIndex    position of the single node to render; any value
  *                     {@code >= list.size()} selects the whole list
  * @return the concatenated HTML (possibly empty), or {@code null} when
  *         {@code listIndex} is negative or the selected node is missing or
  *         has no children
  */
 public String parserContent(NodeList list, boolean isCreateFile, int listIndex) {
     if (logger.isDebugEnabled()) {
         logger.debug("BaseAction parserContent()");
     }
     StringBuilder content = new StringBuilder();
     int size = list.size();
     // ">=" rather than ">": an index equal to the size is not a valid element
     // position, so it must take the whole-list branch as well.
     if (listIndex >= size) {
         for (int i = 0; i < size; i++) {
             NodeList sublist = list.elementAt(i).getChildren();
             if (sublist == null) {
                 continue;
             }
             appendChildrenHtml(sublist.toNodeArray(), isCreateFile, content);
         }
     } else {
         if (listIndex < 0) {
             logger.warn("the listIndex may is wrong! please do it");
             return null;
         }
         Node node = list.elementAt(listIndex);
         if (node == null) {
             logger.warn("the listIndex may is wrong! please do it");
             return null;
         }
         NodeList sublist = node.getChildren();
         if (sublist == null) {
             logger.warn("the listIndex may is wrong! please do it");
             return null;
         }
         Node[] children = sublist.toNodeArray();
         if (children == null) {
             logger.warn("the listIndex may is wrong! please do it");
             return null;
         }
         appendChildrenHtml(children, isCreateFile, content);
     }
     // toString() never yields null, so test for emptiness to make the warning reachable.
     if (content.length() == 0) {
         logger.warn("you get the text is null");
     }
     return content.toString();
 }

 /** Appends the HTML of every child with non-empty text to {@code content}, optionally newline-terminated. */
 private void appendChildrenHtml(Node[] children, boolean isCreateFile, StringBuilder content) {
     for (Node child : children) {
         if (HttpParserUtil.isTrimEmpty(child.getText())) {
             continue;
         }
         String html = child.toHtml();
         logger.debug(html);
         content.append(html);
         if (isCreateFile) {
             content.append("\n");
         }
     }
 }
 /** *//**
* 抓取meta标签
* @param list
* @return
*/
 public MetaModel getMetaInfo(NodeList list) {
MetaModel metaModel = new MetaModel();
 for (int index = 0; index < list.size(); index++) {
Node firstNode = list.elementAt(index);
if(!(firstNode instanceof Html))
continue;
NodeList htmlList = firstNode.getChildren();
 for (int i = 0; i < htmlList.size(); i++) {
Node htmlNode = htmlList.elementAt(i);
if (!(htmlNode instanceof HeadTag))
continue;
NodeList headList = htmlNode.getChildren();
 for(int j = 0; j < headList.size(); j++) {
Node headNode = headList.elementAt(j);
 if(headNode instanceof TitleTag) {
TitleTag titleTag = (TitleTag) headNode;
metaModel.setTitle(titleTag.getTitle());
}
if (!(headNode instanceof MetaTag))
continue;
MetaTag it = (MetaTag) headNode;
if(it.getMetaTagName()==null)
continue;
String keywords = it.getMetaTagName().toLowerCase();
 if ("keywords".equals(keywords)) {
metaModel.setKeywords(it.getMetaContent().replaceAll("Hoopchina", keyWords));
 } else if&n | |