Skynet

---------- ---------- 我的新 blog : liukaiyi.cublog.cn ---------- ----------

BlogJava :: 首页 :: 联系 :: 聚合

:: 管理

112 Posts :: 1 Stories :: 49 Comments :: 0 Trackbacks

k-means （python）算法

转:http://www.daniweb.com/forums/thread31449.html
什么都不说了，直接看代码吧。
注解应该写得比较详细

# liukaiyi
# 注 k-means ，维度类型 - 数值形式 ( 199 或 23.13 )
import sys, math, random

# -- 类化 '数据'
# 在 n-维度空间
class Point:
    """A point in n-dimensional space.

    Attributes (all set in the constructor):
        coords: the coordinate sequence, stored exactly as passed in.
        n: number of dimensions, taken as len(coords).
        reference: optional caller-supplied object attached to the point;
            defaults to None.
    """

    def __init__(self, coords, reference=None):
        # Keep the caller's payload, then the coordinates and the
        # dimensionality derived from them.
        self.reference = reference
        self.coords = coords
        self.n = len(self.coords)

    def __repr__(self):
        # A point is displayed as its bare coordinate sequence.
        coords_text = str(self.coords)
        return coords_text

# -- 类化 '聚集点 / 聚类平均距离点 '
# -- 在 n-维度空间
# -- k-means 核心类
# -- 每次聚集各点围绕她进行聚集
# -- 并提供方法求-聚集后的计算中心点，同时记入此次中心点(聚集各点平均距离)，为下一次聚集提供中心点.
class Cluster:
    """A group of points of equal dimensionality together with their centroid.

    Attributes:
        points: the list of member points, stored as given.
        n: the dimensionality shared by every member (taken from the first).
        centroid: a Point holding the per-dimension mean of the members.
    """

    def __init__(self, points):
        """Build a cluster around `points` and compute its centroid.

        Raises:
            Exception: "ILLEGAL: EMPTY CLUSTER" if `points` is empty, or
                "ILLEGAL: MULTISPACE CLUSTER" if the members do not all
                have the same number of dimensions.
        """
        if len(points) == 0:
            raise Exception("ILLEGAL: EMPTY CLUSTER")
        self.points = points
        self.n = points[0].n
        for p in points:
            if p.n != self.n:
                raise Exception("ILLEGAL: MULTISPACE CLUSTER")
        # Mean position of the members; this is the cluster's centre.
        self.centroid = self.calculateCentroid()

    def __repr__(self):
        return str(self.points)

    def update(self, points):
        """Replace the members with `points` and recompute the centroid.

        Returns the distance between the previous centroid and the new one,
        which the caller can use as a convergence measure.

        Unlike the constructor, this method does not validate `points`:
        an empty list makes the centroid computation divide by zero.
        """
        old_centroid = self.centroid
        self.points = points
        self.centroid = self.calculateCentroid()
        return getDistance(old_centroid, self.centroid)

    def calculateCentroid(self):
        """Return a new Point whose coordinates are the member averages."""
        centroid_coords = []
        # One pass per dimension: sum that coordinate over all members,
        # then divide by the member count.
        for i in range(self.n):
            total = 0.0
            for p in self.points:
                total = total + p.coords[i]
            centroid_coords.append(total / len(self.points))
        return Point(centroid_coords)

# -- 返回根据 k-means 聚集形成的数据集
def kmeans(points, k, cutoff):
    """Partition `points` into `k` clusters with the k-means algorithm.

    Args:
        points: list of Point objects to cluster.
        k: number of clusters; the initial centres are `k` points drawn
            with random.sample, so `k` must not exceed len(points).
        cutoff: convergence threshold. Iteration stops once the largest
            centroid movement in a round is strictly below this value.
            The shift is never negative, so a cutoff <= 0 never
            satisfies the exit test.

    Returns:
        The list of Cluster objects after the final assignment round.
    """
    # Seed each cluster with one randomly chosen point.
    initial = random.sample(points, k)
    clusters = [Cluster([p]) for p in initial]

    while True:
        # One collection bucket per cluster for this round.
        lists = [[] for _ in clusters]

        # Assign every point to the cluster with the nearest centroid.
        # The comparison is strict, so ties go to the lowest index.
        for p in points:
            index = 0
            smallest_distance = getDistance(p, clusters[0].centroid)
            for i in range(1, len(clusters)):
                distance = getDistance(p, clusters[i].centroid)
                if distance < smallest_distance:
                    smallest_distance = distance
                    index = i
            lists[index].append(p)

        # Move each centroid to the mean of its new members and record
        # the largest movement seen in this round.
        biggest_shift = 0.0
        for cluster, members in zip(clusters, lists):
            if not members:
                # No point chose this cluster. Updating it with an empty
                # list would divide by zero, so it keeps its previous
                # members and centroid (a shift of zero).
                continue
            biggest_shift = max(biggest_shift, cluster.update(members))

        if biggest_shift < cutoff:
            break
    return clusters

# -- 得到欧几里德距离两点之间
def getDistance(a, b):
    """Return the Euclidean distance between points `a` and `b`.

    Both arguments must expose `n` (dimension count) and `coords`
    (indexable coordinates).

    Raises:
        Exception: "ILLEGAL: NON-COMPARABLE POINTS" when the two points
            have different dimension counts.
    """
    same_space = a.n == b.n
    if not same_space:
        raise Exception("ILLEGAL: NON-COMPARABLE POINTS")

    # Accumulate the squared per-axis differences, then take the root.
    squared_sum = 0.0
    for axis in range(a.n):
        delta = a.coords[axis] - b.coords[axis]
        squared_sum += delta ** 2
    return math.sqrt(squared_sum)

# -- 在 n-维度空间中创建随机点
# -- 随机生成测试数据
def makeRandomPoint(n, lower, upper):
    """Return a Point with `n` coordinates for use as test data.

    Each coordinate is drawn independently with
    random.uniform(lower, upper).
    """
    return Point([random.uniform(lower, upper) for _ in range(n)])

# main
def main(args):
    """Run a small k-means demo on random points and print the result.

    `args` is accepted so the function can be handed sys.argv, but it is
    not read; every setting is fixed below.
    """
    # num_points: how many random points to generate
    # n:          number of dimensions per point
    # k:          number of clusters
    # cutoff:     centroid shift below which iteration stops
    # lower:      minimum value of each coordinate
    # upper:      maximum value of each coordinate
    num_points, n, k, cutoff, lower, upper = 10, 2, 3, 0.5, -200, 200

    # Generate the test data: num_points random points in n dimensions.
    points = [makeRandomPoint(n, lower, upper) for _ in range(num_points)]

    # Cluster the points (entry point of the algorithm).
    clusters = kmeans(points, k, cutoff)

    # Single-argument print(...) calls give the same output whether print
    # is a statement (Python 2) or a function (Python 3).
    print("\nPOINTS:")
    for p in points:
        print("P: %s" % (p,))
    print("\nCLUSTERS:")
    for c in clusters:
        print("C: %s" % (c,))


if __name__ == "__main__":
    main(sys.argv)

整理 www.blogjava.net/Good-Game

posted on 2009-08-07 16:20 刘凯毅 阅读(2075) 评论(0) 编辑 收藏 所属分类: python、算法/函数

新用户注册刷新评论列表


只有注册用户登录后才能发表评论。




网站导航: 博客园 IT新闻 Chat2DB C++博客博问管理
相关文章: MoinMoin wiki 服务器搭建与尝试给自己的图片处理工具 (py2exe) 跟我一起学 - 算法导论 - 快速排序 python pil 使用(转) shell txt 分析小结跟我一起学 - 算法导论 - 递归式理解高斯函数，以及在推荐算法中的应用跟我一起学 - 算法导论 - 插入排序文件存储 - 数据结构( py ) beanstalkd 消息队列的第一手资料

Skynet

常用链接

留言簿(13)

我参与的团队

随笔分类

随笔档案

相册

搜索

最新评论

阅读排行榜

评论排行榜