2008年9月1日月曜日

Python: RPyで階層的クラスタリング

このブログ記事をはてなブックマークに追加

Python+RPy+SciPyで階層的クラスタリングを行う。下記のtest.dat(30個の5次元データ)に対して閾値0.5として計算する。

clustering.py test.dat 0.5

標準出力:

Data file: test.dat Element file: element.out Output file: data.out Cluster file: cluster.out EPS file: cluster.eps Threshold: 0.500000 (distance) 30 x 30 = 900 Cluster 1 : 5 Cluster 2 : 8 Cluster 3 : 1 Cluster 4 : 1 Cluster 5 : 12 Cluster 6 : 2 Cluster 7 : 1 Total : 30 Number of clusters: 7

test.dat: 30個の5次元データ。

-2.39257 0.39147 0.21229 -0.84501 -0.02255 -2.13082 0.44121 0.12561 -0.91345 -0.12832 -2.23498 0.26590 -0.35356 -0.03484 -0.17437 -2.27747 0.29900 -0.05188 -0.63788 -0.13000 -2.11555 0.39492 0.11465 -0.86242 -0.34207 -2.31947 0.26036 0.47846 -0.71782 0.07270 -2.32941 0.38637 0.44304 -0.75346 -0.04302 -2.30690 0.59296 0.22947 -0.82801 -0.09882 -2.29088 0.53157 0.03204 -0.75087 -0.05802 -2.36682 0.34835 0.23964 -0.72942 -0.09720 -2.11368 0.52013 0.27528 -0.81122 -0.20050 -2.27156 0.21167 0.57165 -0.66193 -0.03302 -2.15076 0.42352 0.39504 -0.76157 -0.12652 -2.21785 0.36763 0.32698 -0.77082 -0.14381 -2.27547 0.37300 0.27672 -0.84605 -0.11913 -2.21365 0.37827 0.12720 -0.72738 -0.16452 -2.22145 0.53808 0.36045 -0.75871 -0.13850 -2.12462 0.43006 0.16845 -0.87023 -0.09789 -2.20644 0.53991 0.19739 -0.82177 -0.29343 -2.10046 0.77307 0.21435 -0.86245 -0.15118 -2.37488 0.50979 0.17556 -0.78224 -0.14136 -2.31240 0.48578 0.20484 -0.85149 -0.12238 -2.19001 0.33037 0.09407 -0.91014 -0.09473 -2.31135 0.29111 0.38171 -0.73491 -0.03447 -2.23133 0.35395 0.29436 -0.48961 -0.06276 -2.19413 0.36376 0.21777 -0.18686 -0.20020 -2.06165 0.38107 0.43080 -0.76940 -0.38291 -2.34113 0.44097 0.42368 -0.73850 0.02570 -2.41489 0.40263 0.32783 -0.80521 0.03404 -2.34918 0.48361 0.16130 -0.87229 -0.09810

cluster.eps: image file



element.out: height and merge data

0.0653862929061 [-22 -30] 0.0692241388534 [ -2 -18] 0.0918671214309 [ -7 -28] 0.107376254824 [-21 1] 0.110198113414 [-14 -15] 0.136962358698 [ -1 -29] 0.139532210618 [-13 -17] 0.142665365804 [-8 4] 0.146113402192 [-23 2] 0.148827203159 [ -6 -24] 0.154304262741 [-11 -19] 0.184603648393 [-10 5] 0.196790849635 [ 3 10] 0.205812622791 [-16 12] 0.224987104075 [-9 8] 0.253271677848 [-5 11] 0.285769571333 [ 7 14] 0.297698839601 [-12 13] 0.319598069143 [ 9 16] 0.34335611295 [-25 -26] 0.356127683001 [ 6 17] 0.386487913265 [15 19] 0.466665096402 [-4 22] 0.500220834332 [18 21] 0.514176928693 [-20 -27] 0.614681702347 [23 25] 0.710561224034 [20 24] 0.796960005207 [26 27] 1.13283489335 [-3 28]

data.out: averages and member of groups

-2.314584 0.318096 0.459708 -0.721324 -0.002422 12 7 28 6 24 -2.281683 0.402869 0.283269 -0.780521 -0.097274 1 29 13 17 16 10 14 15 -2.100460 0.773070 0.214350 -0.862450 -0.151180 20 -2.061650 0.381070 0.430800 -0.769400 -0.382910 27 -2.232736 0.463276 0.143898 -0.826001 -0.150468 4 9 8 21 22 30 23 2 18 5 11 19 -2.212730 0.358855 0.256065 -0.338235 -0.131480 25 26 -2.234980 0.265900 -0.353560 -0.034840 -0.174370 3

cluster.out: order and member of groups

001: 12 7 28 6 24 002: 1 29 13 17 16 10 14 15 003: 20 004: 27 005: 4 9 8 21 22 30 23 2 18 5 11 19 006: 25 26 007: 3

clustering.py

#!/usr/bin/env python import sys, os from rpy import * from scipy import * def create_data(data, is_square_root=False): """Create distance data. Arguments: data: position data is_square_root: whether to apply square root or not Return values: in_data: distance data ndim: number of dimensions """ in_data = [] num = len(data) ndim = len(data[0]) for i in range(num): for j in range(num): if i < j: dist = 0.0 for k in range(ndim): dist += (data[i][k] - data[j][k])**2 if is_square_root: dist = sqrt(dist) in_data.append(dist) elif i > j: in_data.append(in_data[j*num+i]) elif i == j: in_data.append(0.0) return in_data, ndim def clustering(in_data, data, ele_file, cluster_method="complete", eps_file=None): """Clustering. Arguments: in_data: distance data data: position data ele_file: element filename (height/merge data) cluster_method: clustering method [complete] eps_file: EPS filename [None] """ efile = file(ele_file, "w") if len(in_data) == 2: efile.write("%5.3f [-1 -2]\n" % in_data[1]) efile.close() return mat = in_data[:] print "%d x %d = %d" % (len(data), len(data), len(mat)) mat = with_mode(NO_CONVERSION, r.matrix)(mat, nrow=len(data)) d = with_mode(NO_CONVERSION, r.as_dist)(mat) hc = r.hclust(d, method=cluster_method) for h, m in zip(r["$"](hc, "height"), r["$"](hc, "merge")): efile.write("%s %s\n" % (h, m)) efile.close() if eps_file: r.postscript(eps_file, title=eps_file) r.plclust(hc, hang=-0.1, ann=r.FALSE) r.dev_off() def expand_group(m, n): """Expand group list.""" group = [] for x in m[n-1]: if x < 0: group.append(-x) else: group.extend(expand_group(m, x)) return group def arrange_cluster(ele_file, data, ndim, threshold, out_file): """Arrange clustering data. Arguments: ele_file: element filename (height/merge data) data: position data ndim: number of dimensions threshold: threshold of clustering out_file: output data filename Return value: groups: pair list of hierarchical clustering """ if len(data) == 2: print "Cluster 1 : 1" print "Total : 1" return [] elif len(data) < 2: print "No cluster" print "Total : 0" return [] efile = file(ele_file) ofile = file(out_file, "w") h = [] m = [] for l in efile: e = l.replace("[", "").replace("]", "").replace(",", " ").strip().rstrip("\n").split() h.append(float(e[0])) m.append([int(e[1]), int(e[2])]) tt = [[i, x] for i, (x, d) in enumerate(zip(m, h)) if d > threshold] t = [] for d in tt: for x in d[1]: if x < tt[0][0]+1: t.append(x) groups = [] all = [] for x in t: g = [] if x < 0: g = [-x] else: g = expand_group(m, x) groups.append(g) if not groups: groups = [range(1, len(h) + 2)] for i, g in enumerate(groups): print "Cluster %-3d: %d" % (i+1, len(g)) g_data = [data[j-1] for j in g] for d in range(ndim): ofile.write("%f " % average(mat(g_data).T[d])) for j in g: ofile.write("%d " % j) ofile.write("\n") efile.close() ofile.close() s = 0 for x in groups: s += len(x) print "Total : %d" % s return groups def write_cluster(groups, cluster_out): """Write clustering result.""" clust_file = file(cluster_out, "w") print "Number of clusters: %d" % len(groups) count = 1 for c in groups: clust_file.write("%03d:" % count) for d in c: clust_file.write(" %d" % d) clust_file.write("\n") count += 1 clust_file.close() def main(args): if len(args) < 2: print >>sys.stderr, "Usage: %s data_file [threshold=5.0]>" % os.path.basename(args[0]) sys.exit(1) elif len(args) == 2: th = 5.0 elif len(args) > 2: th = float(args[2]) DATA_FILE = args[1] ELEMENT_FILE = "element.out" OUT_FILE = "data.out" CLUSTER_FILE = "cluster.out" EPS_FILE = "cluster.eps" IS_SQUARE_ROOT = True print "Data file: %s" % DATA_FILE print "Element file: %s" % ELEMENT_FILE print "Output file: %s" % OUT_FILE print "Cluster file: %s" % CLUSTER_FILE print "EPS file: %s" % EPS_FILE if IS_SQUARE_ROOT: print "Threshold: %f (distance)" % th else: th = th**2 print "Threshold: %f (distance**2)" % th # position data data = [[float(d) for d in l.split()] for l in file(DATA_FILE)] if len(data) > 1: # distance data in_data, ndim = create_data(data, is_square_root=IS_SQUARE_ROOT) # clustering by position and distance data and output element file clustering(in_data, data, ELEMENT_FILE, eps_file=EPS_FILE) else: print >>sys.stderr, "Error: no data." sys.exit(1) # arrange clustering data by element file and threshold, and output group data groups = arrange_cluster(ELEMENT_FILE, data, ndim, th, OUT_FILE) # output ordered clustering data by using group data write_cluster(groups, CLUSTER_FILE) if __name__ == "__main__": main(sys.argv)

仕事で使おうと思って一応書いたけど、コード量が多くてなんだかスッキリしないなぁ。Linuxでは問題なかったけど、WindowsではEPSファイルでエラーが出るし。使用環境、RとRPyの相性の問題もあるようだ。Rは2.1.1、RPyは0.4.3でちょっと古いかもしれない。最近はrpy2が出ているようだけどどうなんだろう? 使ってみるかな。

0 コメント: