Attachment 'sheet08.m'
function sheet08
% SHEET08  Driver for the clustering-stability exercise.
%
% Generates 2-D Gaussian blobs, measures the (in-)stability of k-means and
% of a random clusterer for K = 2..MAXK, plots both, plots the k-means
% values normalized by the random baseline, and finally shows a k-means
% clustering for the K with the smallest mean normalized value.

% Generate data
centers = [0, 0; 7, 3; -2, 4; 0, 10; -5, -5];
% (2): Your centers here
% centers = [...]
X = generate_data(centers, 50);

MAXK = 8;    % maximal number of clusters
ITERS = 10;  % how many restarts.

% Cluster counts that are evaluated. Column j of DKMEANS / DRAND / D
% belongs to KS(j); deriving the axis from MAXK keeps the plots valid
% when MAXK is changed.
KS = 2:MAXK;

% compute (in-)stability for k-means and a random clustering
DKMEANS = evaluate_clusterer(@k_means_clustering, X, MAXK, ITERS);
DRAND = evaluate_clusterer(@random_clustering, X, MAXK, ITERS);

% plot the two stability values
% (explicit dimension arguments so that ITERS == 1 still yields one value
% per K instead of collapsing the single row to a scalar)
figure(1)
errorbar(KS, mean(DKMEANS, 1), std(DKMEANS, 0, 1));
hold on
errorbar(KS, mean(DRAND, 1), std(DRAND, 0, 1), 'r');
hold off
legend('k-means stability', 'random clusterer')

% normalize the stability by the random clusterer
D = normalize(DKMEANS, DRAND);

figure(2)
errorbar(KS, mean(D, 1), std(D, 0, 1));
title('normalized k-means')

% show clustering for most stable number of clusters
% (smallest mean normalized distance between repeated clusterings)
[dummy, BEST] = min(mean(D, 1));
OPTK = KS(BEST);

Y = k_means_clustering(OPTK, X);
figure(3);
plot_clustering(X, Y);
37
% Draw N points around every center: for each row of C, N samples of
% 2-D standard normal noise are shifted by that row. The per-center
% groups are stacked vertically in the order of the rows of C.
function X = generate_data(C, N)
NCENTERS = size(C, 1);
BLOBS = cell(NCENTERS, 1);
for B = 1:NCENTERS
  % index row B of C N times to get an N-row offset matrix
  OFFSET = C(B * ones(N, 1), :);
  BLOBS{B} = randn(N, 2) + OFFSET;
end
X = vertcat(BLOBS{:});
44
% A variant of the k-means clustering algorithm which runs for at most 100
% iterations and stops early once the assignments no longer change (at
% that point further iterations would reproduce the same result).
%
%   K - number of clusters; must not exceed the number of rows of X
%   X - data, one point per row
%   Y - 1-by-N row vector of labels in 1..K, one per row of X
function Y = k_means_clustering(K, X)
N = size(X, 1);
if K > N
  error('k_means_clustering:tooManyClusters', ...
        'Cannot pick K = %d initial centers from only %d data points.', K, N);
end

% randomly select K distinct data points as the initial centers
P = randperm(N);
MEANS = X(P(1:K), :);

YPREV = [];
for I = 1:100
  % compute nearest neighbor assignments; the explicit dimension keeps
  % one label per point even when K == 1 (dist is then a single row)
  dist = pwdist(MEANS, X);
  [dummy, Y] = min(dist, [], 1);

  % same assignments as last round => the means would not move either
  if isequal(Y, YPREV)
    break;
  end
  YPREV = Y;

  % compute new means; a center that lost all its points keeps its
  % previous position
  for J = 1:K
    MEMBERS = (Y == J);
    NJ = sum(MEMBERS);
    if NJ > 0
      % sum along dimension 1 explicitly: with a single member the
      % selection is a row vector and a plain sum() would add up its
      % coordinates instead of returning the point itself
      MEANS(J, :) = sum(X(MEMBERS, :), 1) / NJ;
    end
  end
end
68
% Scatter-plot the data, grouped by cluster label.
% The first two columns of X are used as plot coordinates and Y as the
% grouping variable; only the data points are drawn.
function plot_clustering(X, Y)
gscatter(X(:, 1), ...  % horizontal coordinate
         X(:, 2), ...  % vertical coordinate
         Y);           % group (cluster label) of each point
72
% Compute all pairwise SQUARED Euclidean distances quickly.
%
%   X - N-by-d matrix, one point per row
%   Y - M-by-d matrix, one point per row
%   D - N-by-M matrix with D(i, j) = ||X(i, :) - Y(j, :)||^2
%
% Uses the expansion ||x - y||^2 = ||x||^2 + ||y||^2 - 2*x*y'. No square
% root is taken.
function D = pwdist(X, Y)
XX = sum(X.*X, 2);   % N-by-1 squared norms of the rows of X
YY = sum(Y.*Y, 2);   % M-by-1 squared norms of the rows of Y
D = bsxfun(@plus, XX, YY') - 2*X*Y';

% The expansion can produce tiny negative values through round-off for
% (nearly) identical points; a squared distance is never negative.
% Logical indexing leaves NaN entries untouched.
D(D < 0) = 0;
82
% Evaluate a clustering algorithm for 2..MAXK clusters with ITERS
% restarts each. For every restart the method is run twice on X and the
% two labelings are compared with cocluster_distance.
%
%   method - clusterer, called as feval(method, K, X)
%   D      - ITERS-by-(MAXK-1) matrix; D(r, c) is the value of restart r
%            for c + 1 clusters
function D = evaluate_clusterer(method, X, MAXK, ITERS)
D = zeros(ITERS, MAXK-1);
for COL = 1:MAXK-1
  NCLUSTERS = COL + 1;   % column COL holds the results for COL + 1 clusters
  fprintf('K = %d\n', NCLUSTERS);
  for RUN = 1:ITERS
    FIRST = feval(method, NCLUSTERS, X);
    SECOND = feval(method, NCLUSTERS, X);
    D(RUN, COL) = cocluster_distance(FIRST, SECOND);
  end
end
95
96 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
97 % Fill in your solutions below
98 %
99
% 1 (a). The random clusterer. It returns random labels from 1..K
% for each data point in X (stored in the rows).
%
%   K - number of clusters
%   X - data, one point per row (only the number of rows is used)
%   Y - 1-by-N row vector of labels drawn uniformly from 1..K, the same
%       shape k_means_clustering returns
function Y = random_clustering(K, X)
N = size(X, 1);
Y = randi(K, 1, N);
104
% 1 (b). Compute the co-cluster distance for two labelings Y1 and Y2.
%
% Two points are "co-clustered" by a labeling if they carry the same
% label. The value returned here is the fraction of unordered point pairs
% on which the two labelings disagree about being co-clustered, so it is
% 0 for labelings that are equal up to renaming the labels and at most 1.
% NOTE(review): the exercise text does not fix the normalization (count
% vs. fraction of pairs) - confirm against the sheet. A constant factor
% cancels in normalize(), which divides by the random-clusterer baseline.
%
%   Y1, Y2 - label vectors of equal length (row or column)
%   D      - scalar in [0, 1]; 0 when there are fewer than two points
function D = cocluster_distance(Y1, Y2)
Y1 = Y1(:);
Y2 = Y2(:);
N = numel(Y1);
if numel(Y2) ~= N
  error('cocluster_distance:sizeMismatch', ...
        'Y1 and Y2 must label the same number of points (%d vs. %d).', ...
        N, numel(Y2));
end

if N < 2
  % no pairs to compare
  D = 0;
  return;
end

% N-by-N co-membership matrices: entry (i, j) is true iff points i and j
% share a label
SAME1 = bsxfun(@eq, Y1, Y1');
SAME2 = bsxfun(@eq, Y2, Y2');

% count each unordered pair once (strict upper triangle)
NDISAGREE = sum(sum(triu(SAME1 ~= SAME2, 1)));
D = NDISAGREE / (N * (N - 1) / 2);
108
% 1 (c). Normalize DKMEANS by dividing the result for a fixed
% K by the mean value of the corresponding values in DRAND.
%
%   DKMEANS - ITERS-by-(MAXK-1) matrix, one column per K
%   DRAND   - matrix with the same number of columns, one column per K
%   D       - same size as DKMEANS; column j is DKMEANS(:, j) divided by
%             the mean of DRAND(:, j)
%
% A column whose baseline mean is 0 yields Inf/NaN entries.
function D = normalize(DKMEANS, DRAND)
% explicit dimension so a single-row DRAND is not collapsed to a scalar
BASELINE = mean(DRAND, 1);
D = bsxfun(@rdivide, DKMEANS, BASELINE);
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.