﻿<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:trackback="http://madskills.com/public/xml/rss/module/trackback/" xmlns:wfw="http://wellformedweb.org/CommentAPI/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/"><channel><title>BlogJava-沙漠中的鱼-文章分类-数据挖掘</title><link>http://www.blogjava.net/aoneany/category/53204.html</link><description>欲上天堂，先下地狱</description><language>zh-cn</language><lastBuildDate>Tue, 08 Jan 2013 07:38:32 GMT</lastBuildDate><pubDate>Tue, 08 Jan 2013 07:38:32 GMT</pubDate><ttl>60</ttl><item><title>Carrot2对文章进行聚类代码示例</title><link>http://www.blogjava.net/aoneany/articles/393951.html</link><dc:creator>沙漠中的鱼</dc:creator><author>沙漠中的鱼</author><pubDate>Tue, 08 Jan 2013 07:08:00 GMT</pubDate><guid>http://www.blogjava.net/aoneany/articles/393951.html</guid><wfw:comment>http://www.blogjava.net/aoneany/comments/393951.html</wfw:comment><comments>http://www.blogjava.net/aoneany/articles/393951.html#Feedback</comments><slash:comments>0</slash:comments><wfw:commentRss>http://www.blogjava.net/aoneany/comments/commentRss/393951.html</wfw:commentRss><trackback:ping>http://www.blogjava.net/aoneany/services/trackbacks/393951.html</trackback:ping><description><![CDATA[<div style="background-color:#eeeeee;font-size:13px;border:1px solid #CCCCCC;padding-right: 5px;padding-bottom: 4px;padding-left: 4px;padding-top: 4px;width: 98%;word-break:break-all"><!--<br /><br />Code highlighting produced by Actipro CodeHighlighter (freeware)<br />http://www.CodeHighlighter.com/<br /><br />--><span style="color: #0000FF; ">public</span>&nbsp;<span style="color: #0000FF; ">class</span>&nbsp;ClusteringFlyStoneDocument&nbsp;{<br /><br />&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #0000FF; ">public</span>&nbsp;<span style="color: #0000FF; ">static</span>&nbsp;<span style="color: #0000FF; ">void</span>&nbsp;main(String[]&nbsp;args)&nbsp;{<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #0000FF; ">final</span>&nbsp;Controller&nbsp;controller&nbsp;=&nbsp;ControllerFactory.createCachingPooling(IDocumentSource.<span style="color: #0000FF; ">class</span>);<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #008000; ">//</span><span style="color: #008000; ">创建需要处理的属性对象</span><span style="color: #008000; "><br /></span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #0000FF; ">final</span>&nbsp;Map&lt;String,&nbsp;Object&gt;&nbsp;attributes&nbsp;=&nbsp;Maps.newHashMap();<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #008000; ">//</span><span style="color: #008000; ">需要处理的文档集合</span><span style="color: #008000; "><br /></span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;List&lt;Document&gt;&nbsp;documents&nbsp;=&nbsp;SampleDocumentData.DOCUMENTS_DATA_MINING;<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #008000; ">//</span><span style="color: #008000; ">将文档添加到聚类属性中</span><span style="color: #008000; "><br /></span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;CommonAttributesDescriptor.attributeBuilder(attributes).documents(documents);<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #008000; ">//</span><span style="color: #008000; ">设置处理的语言（）</span><span style="color: #008000; "><br /></span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;MultilingualClusteringDescriptor.attributeBuilder(attributes).defaultLanguage(LanguageCode.CHINESE_SIMPLIFIED);<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #008000; ">//</span><span style="color: #008000; ">设置需要处理的对象，以及聚类的算法</span><span style="color: #008000; "><br /></span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #0000FF; ">final</span>&nbsp;ProcessingResult&nbsp;englishResult&nbsp;=&nbsp;controller.process(attributes,&nbsp;LingoClusteringAlgorithm.<span style="color: #0000FF; ">class</span>);<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #008000; ">//</span><span style="color: #008000; ">获取结果，打印聚类主题及关联信息</span><span style="color: #008000; "><br /></span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #0000FF; ">final</span>&nbsp;List&lt;Cluster&gt;&nbsp;clustersByTopic&nbsp;=&nbsp;englishResult.getClusters();<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;System.out.println("=======聚类主题=====");<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #0000FF; ">for</span>(Cluster&nbsp;cluster&nbsp;:&nbsp;clustersByTopic){<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;System.out.println("【主题&nbsp;】"&nbsp;+cluster.getLabel());<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;List&lt;Document&gt;&nbsp;cDocLst&nbsp;=&nbsp;cluster.getAllDocuments();<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #0000FF; ">for</span>(Document&nbsp;doc&nbsp;:&nbsp;cDocLst){<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;System.out.println("--------"&nbsp;+&nbsp;doc.getTitle());<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;}<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;}<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #008000; ">//</span><span style="color: #008000; ">通过URL进行聚类</span><span style="color: #008000; "><br /></span>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #0000FF; ">final</span>&nbsp;ProcessingResult&nbsp;byDomainClusters&nbsp;=&nbsp;controller.process(documents,&nbsp;<span style="color: #0000FF; ">null</span>,ByUrlClusteringAlgorithm.<span style="color: #0000FF; ">class</span>);<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #0000FF; ">final</span>&nbsp;List&lt;Cluster&gt;&nbsp;clustersByDomain&nbsp;=&nbsp;byDomainClusters.getClusters();<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;System.out.println("=======URL聚类=======");<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #0000FF; ">for</span>(Cluster&nbsp;cluster&nbsp;:&nbsp;clustersByDomain){<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;System.out.println("【URL】"&nbsp;+cluster.getLabel());<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;List&lt;Document&gt;&nbsp;cDocLst&nbsp;=&nbsp;cluster.getAllDocuments();<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span style="color: #0000FF; ">for</span>(Document&nbsp;doc&nbsp;:&nbsp;cDocLst){<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;System.out.println("----"&nbsp;+&nbsp;doc.getTitle());<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;}<br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;}<br />&nbsp;&nbsp;&nbsp;&nbsp;}<br /><br />}</div><img src ="http://www.blogjava.net/aoneany/aggbug/393951.html" width = "1" height = "1" /><br><br><div align=right><a style="text-decoration:none;" href="http://www.blogjava.net/aoneany/" target="_blank">沙漠中的鱼</a> 2013-01-08 15:08 <a href="http://www.blogjava.net/aoneany/articles/393951.html#Feedback" target="_blank" style="text-decoration:none;">发表评论</a></div>]]></description></item></channel></rss>