Python+RPy+SciPyで階層的クラスタリングを行う。下記のtest.dat(30個の5次元データ)に対して閾値0.5として計算する。
clustering.py test.dat 0.5
標準出力:
Data file: test.dat Element file: element.out Output file: data.out Cluster file: cluster.out EPS file: cluster.eps Threshold: 0.500000 (distance) 30 x 30 = 900 Cluster 1 : 5 Cluster 2 : 8 Cluster 3 : 1 Cluster 4 : 1 Cluster 5 : 12 Cluster 6 : 2 Cluster 7 : 1 Total : 30 Number of clusters: 7
clustering.py test.dat 0.5
標準出力:
Data file: test.dat Element file: element.out Output file: data.out Cluster file: cluster.out EPS file: cluster.eps Threshold: 0.500000 (distance) 30 x 30 = 900 Cluster 1 : 5 Cluster 2 : 8 Cluster 3 : 1 Cluster 4 : 1 Cluster 5 : 12 Cluster 6 : 2 Cluster 7 : 1 Total : 30 Number of clusters: 7
test.dat: 30個の5次元データ。
cluster.eps: image file
element.out: height and merge data
data.out: averages and member of groups
cluster.out: order and member of groups
clustering.py
仕事で使おうと思って一応書いたけど、コード量が多くてなんだかスッキリしないなぁ。Linuxでは問題なかったけど、WindowsではEPSファイルでエラーが出るし。使用環境、RとRPyの相性の問題もあるようだ。Rは2.1.1、RPyは0.4.3でちょっと古いかもしれない。最近はrpy2が出ているようだけどどうなんだろう? 使ってみるかな。
-2.39257 0.39147 0.21229 -0.84501 -0.02255
-2.13082 0.44121 0.12561 -0.91345 -0.12832
-2.23498 0.26590 -0.35356 -0.03484 -0.17437
-2.27747 0.29900 -0.05188 -0.63788 -0.13000
-2.11555 0.39492 0.11465 -0.86242 -0.34207
-2.31947 0.26036 0.47846 -0.71782 0.07270
-2.32941 0.38637 0.44304 -0.75346 -0.04302
-2.30690 0.59296 0.22947 -0.82801 -0.09882
-2.29088 0.53157 0.03204 -0.75087 -0.05802
-2.36682 0.34835 0.23964 -0.72942 -0.09720
-2.11368 0.52013 0.27528 -0.81122 -0.20050
-2.27156 0.21167 0.57165 -0.66193 -0.03302
-2.15076 0.42352 0.39504 -0.76157 -0.12652
-2.21785 0.36763 0.32698 -0.77082 -0.14381
-2.27547 0.37300 0.27672 -0.84605 -0.11913
-2.21365 0.37827 0.12720 -0.72738 -0.16452
-2.22145 0.53808 0.36045 -0.75871 -0.13850
-2.12462 0.43006 0.16845 -0.87023 -0.09789
-2.20644 0.53991 0.19739 -0.82177 -0.29343
-2.10046 0.77307 0.21435 -0.86245 -0.15118
-2.37488 0.50979 0.17556 -0.78224 -0.14136
-2.31240 0.48578 0.20484 -0.85149 -0.12238
-2.19001 0.33037 0.09407 -0.91014 -0.09473
-2.31135 0.29111 0.38171 -0.73491 -0.03447
-2.23133 0.35395 0.29436 -0.48961 -0.06276
-2.19413 0.36376 0.21777 -0.18686 -0.20020
-2.06165 0.38107 0.43080 -0.76940 -0.38291
-2.34113 0.44097 0.42368 -0.73850 0.02570
-2.41489 0.40263 0.32783 -0.80521 0.03404
-2.34918 0.48361 0.16130 -0.87229 -0.09810
cluster.eps: image file
element.out: height and merge data
0.0653862929061 [-22 -30]
0.0692241388534 [ -2 -18]
0.0918671214309 [ -7 -28]
0.107376254824 [-21 1]
0.110198113414 [-14 -15]
0.136962358698 [ -1 -29]
0.139532210618 [-13 -17]
0.142665365804 [-8 4]
0.146113402192 [-23 2]
0.148827203159 [ -6 -24]
0.154304262741 [-11 -19]
0.184603648393 [-10 5]
0.196790849635 [ 3 10]
0.205812622791 [-16 12]
0.224987104075 [-9 8]
0.253271677848 [-5 11]
0.285769571333 [ 7 14]
0.297698839601 [-12 13]
0.319598069143 [ 9 16]
0.34335611295 [-25 -26]
0.356127683001 [ 6 17]
0.386487913265 [15 19]
0.466665096402 [-4 22]
0.500220834332 [18 21]
0.514176928693 [-20 -27]
0.614681702347 [23 25]
0.710561224034 [20 24]
0.796960005207 [26 27]
1.13283489335 [-3 28]
data.out: averages and member of groups
-2.314584 0.318096 0.459708 -0.721324 -0.002422 12 7 28 6 24
-2.281683 0.402869 0.283269 -0.780521 -0.097274 1 29 13 17 16 10 14 15
-2.100460 0.773070 0.214350 -0.862450 -0.151180 20
-2.061650 0.381070 0.430800 -0.769400 -0.382910 27
-2.232736 0.463276 0.143898 -0.826001 -0.150468 4 9 8 21 22 30 23 2 18 5 11 19
-2.212730 0.358855 0.256065 -0.338235 -0.131480 25 26
-2.234980 0.265900 -0.353560 -0.034840 -0.174370 3
cluster.out: order and member of groups
001: 12 7 28 6 24
002: 1 29 13 17 16 10 14 15
003: 20
004: 27
005: 4 9 8 21 22 30 23 2 18 5 11 19
006: 25 26
007: 3
clustering.py
#!/usr/bin/env python
import sys, os
from rpy import *
from scipy import *
def create_data(data, is_square_root=False):
"""Create distance data.
Arguments:
data: position data
is_square_root: whether to apply square root or not
Return values:
in_data: distance data
ndim: number of dimensions
"""
in_data = []
num = len(data)
ndim = len(data[0])
for i in range(num):
for j in range(num):
if i < j:
dist = 0.0
for k in range(ndim): dist += (data[i][k] - data[j][k])**2
if is_square_root: dist = sqrt(dist)
in_data.append(dist)
elif i > j:
in_data.append(in_data[j*num+i])
elif i == j:
in_data.append(0.0)
return in_data, ndim
def clustering(in_data, data, ele_file, cluster_method="complete", eps_file=None):
"""Clustering.
Arguments:
in_data: distance data
data: position data
ele_file: element filename (height/merge data)
cluster_method: clustering method [complete]
eps_file: EPS filename [None]
"""
efile = file(ele_file, "w")
if len(in_data) == 2:
efile.write("%5.3f [-1 -2]\n" % in_data[1])
efile.close()
return
mat = in_data[:]
print "%d x %d = %d" % (len(data), len(data), len(mat))
mat = with_mode(NO_CONVERSION, r.matrix)(mat, nrow=len(data))
d = with_mode(NO_CONVERSION, r.as_dist)(mat)
hc = r.hclust(d, method=cluster_method)
for h, m in zip(r["$"](hc, "height"), r["$"](hc, "merge")):
efile.write("%s %s\n" % (h, m))
efile.close()
if eps_file:
r.postscript(eps_file, title=eps_file)
r.plclust(hc, hang=-0.1, ann=r.FALSE)
r.dev_off()
def expand_group(m, n):
"""Expand group list."""
group = []
for x in m[n-1]:
if x < 0: group.append(-x)
else: group.extend(expand_group(m, x))
return group
def arrange_cluster(ele_file, data, ndim, threshold, out_file):
"""Arrange clustering data.
Arguments:
ele_file: element filename (height/merge data)
data: position data
ndim: number of dimensions
threshold: threshold of clustering
out_file: output data filename
Return value:
groups: pair list of hierarchical clustering
"""
if len(data) == 2:
print "Cluster 1 : 1"
print "Total : 1"
return []
elif len(data) < 2:
print "No cluster"
print "Total : 0"
return []
efile = file(ele_file)
ofile = file(out_file, "w")
h = []
m = []
for l in efile:
e = l.replace("[", "").replace("]", "").replace(",", " ").strip().rstrip("\n").split()
h.append(float(e[0]))
m.append([int(e[1]), int(e[2])])
tt = [[i, x] for i, (x, d) in enumerate(zip(m, h)) if d > threshold]
t = []
for d in tt:
for x in d[1]:
if x < tt[0][0]+1: t.append(x)
groups = []
all = []
for x in t:
g = []
if x < 0: g = [-x]
else: g = expand_group(m, x)
groups.append(g)
if not groups: groups = [range(1, len(h) + 2)]
for i, g in enumerate(groups):
print "Cluster %-3d: %d" % (i+1, len(g))
g_data = [data[j-1] for j in g]
for d in range(ndim): ofile.write("%f " % average(mat(g_data).T[d]))
for j in g: ofile.write("%d " % j)
ofile.write("\n")
efile.close()
ofile.close()
s = 0
for x in groups: s += len(x)
print "Total : %d" % s
return groups
def write_cluster(groups, cluster_out):
"""Write clustering result."""
clust_file = file(cluster_out, "w")
print "Number of clusters: %d" % len(groups)
count = 1
for c in groups:
clust_file.write("%03d:" % count)
for d in c: clust_file.write(" %d" % d)
clust_file.write("\n")
count += 1
clust_file.close()
def main(args):
if len(args) < 2:
print >>sys.stderr, "Usage: %s data_file [threshold=5.0]>" % os.path.basename(args[0])
sys.exit(1)
elif len(args) == 2:
th = 5.0
elif len(args) > 2:
th = float(args[2])
DATA_FILE = args[1]
ELEMENT_FILE = "element.out"
OUT_FILE = "data.out"
CLUSTER_FILE = "cluster.out"
EPS_FILE = "cluster.eps"
IS_SQUARE_ROOT = True
print "Data file: %s" % DATA_FILE
print "Element file: %s" % ELEMENT_FILE
print "Output file: %s" % OUT_FILE
print "Cluster file: %s" % CLUSTER_FILE
print "EPS file: %s" % EPS_FILE
if IS_SQUARE_ROOT:
print "Threshold: %f (distance)" % th
else:
th = th**2
print "Threshold: %f (distance**2)" % th
# position data
data = [[float(d) for d in l.split()] for l in file(DATA_FILE)]
if len(data) > 1:
# distance data
in_data, ndim = create_data(data, is_square_root=IS_SQUARE_ROOT)
# clustering by position and distance data and output element file
clustering(in_data, data, ELEMENT_FILE, eps_file=EPS_FILE)
else:
print >>sys.stderr, "Error: no data."
sys.exit(1)
# arrange clustering data by element file and threshold, and output group data
groups = arrange_cluster(ELEMENT_FILE, data, ndim, th, OUT_FILE)
# output ordered clustering data by using group data
write_cluster(groups, CLUSTER_FILE)
if __name__ == "__main__": main(sys.argv)
仕事で使おうと思って一応書いたけど、コード量が多くてなんだかスッキリしないなぁ。Linuxでは問題なかったけど、WindowsではEPSファイルでエラーが出るし。使用環境、RとRPyの相性の問題もあるようだ。Rは2.1.1、RPyは0.4.3でちょっと古いかもしれない。最近はrpy2が出ているようだけどどうなんだろう? 使ってみるかな。
コメント