attachment:sheet08.m of Main/MaschinellesLernenW08

   function sheet08
   % SHEET08  Compare the (in-)stability of k-means against a random clusterer.
   %
   % Generates Gaussian data around a set of centers, evaluates both
   % clusterers for K = 2..MAXK over ITERS restarts, plots the raw and the
   % normalized values, and finally plots a k-means clustering for the K
   % with the smallest mean normalized value.

   % Generate data
   centers = [0, 0; 7, 3; -2, 4; 0, 10; -5, -5];
   % (2): Your centers here
   % centers = [...]
   X = generate_data(centers, 50);

   MAXK = 8; % maximal number of clusters
   ITERS = 10; % how many restarts.

   % Cluster counts that are evaluated; column J of the result matrices
   % belongs to KS(J). Derived from MAXK so the plots follow any change.
   KS = 2:MAXK;

   % compute (in-)stability for k-means and a random clustering
   DKMEANS = evaluate_clusterer(@k_means_clustering, X, MAXK, ITERS);
   DRAND = evaluate_clusterer(@random_clustering, X, MAXK, ITERS);

   % plot the two stability values
   % (explicit dimension 1 keeps the statistics per-K even when ITERS == 1)
   figure(1)
   errorbar(KS, mean(DKMEANS, 1), std(DKMEANS, 0, 1));
   hold on
   errorbar(KS, mean(DRAND, 1), std(DRAND, 0, 1), 'r');
   hold off
   legend('k-means stability', 'random clusterer')

   % normalize the stability by the random clusterer
   D = normalize(DKMEANS, DRAND);

   figure(2)
   errorbar(KS, mean(D, 1), std(D, 0, 1));
   title('normalized k-means')

   % show clustering for the K with the smallest mean normalized value
   [dummy, BEST] = min(mean(D, 1));
   OPTK = KS(BEST);

   Y = k_means_clustering(OPTK, X);
   figure(3);
   plot_clustering(X, Y);
  37 
  % Generate data from some centers.
  %
  %   C  - one center per row; the number of columns sets the dimension.
  %   N  - number of points drawn per center.
  %
  % Returns X with size(C, 1) * N rows: for each center (in row order) N
  % points equal to the center plus standard-normal noise (randn).
  function X = generate_data(C, N)
  [NC, DIM] = size(C);

  % Preallocate instead of growing X inside the loop.
  X = zeros(NC * N, DIM);
  for I = 1:NC
    ROWS = (I - 1) * N + (1:N);
    X(ROWS, :) = randn(N, DIM) + repmat(C(I, :), N, 1);
  end
  44 
  % A variant of the k-means clustering algorithm which runs for at most
  % 100 iterations.
  %
  %   K  - number of clusters (1 <= K <= number of rows of X).
  %   X  - data, one point per row.
  %
  % Returns Y, a 1-by-N row vector with the cluster index (1..K) of each
  % row of X. Initial centers are K distinct rows of X picked at random.
  function Y = k_means_clustering(K, X)
  N = size(X, 1);
  if K < 1 || K > N
    error('k_means_clustering:badK', ...
          'K must be between 1 and the number of data points (%d).', N);
  end

  % randomly select K points as centers
  P = randperm(N);
  MEANS = X(P(1:K), :);

  Y = [];
  for I = 1:100
    % compute nearest neighbor assignments; the explicit dimension keeps
    % this a per-point minimum even when K == 1 (dist is then a row vector)
    dist = pwdist(MEANS, X);
    YOLD = Y;
    [dummy, Y] = min(dist, [], 1);

    % Unchanged assignments reproduce the same means, so every further
    % iteration would return the same Y.
    if isequal(Y, YOLD)
      break;
    end

    % compute new means; empty clusters keep their previous center
    for J = 1:K
      MEMBERS = (Y == J);
      NJ = sum(MEMBERS);
      if NJ > 0
        % sum along dimension 1: without it a single-member cluster
        % (one row) would be summed across its coordinates instead
        MEANS(J, :) = sum(X(MEMBERS, :), 1) / NJ;
      end
    end
  end
  68 
  % Scatter-plot the first two columns of X, grouped by the labels in Y.
  % Uses gscatter; only the data points are drawn.
  function plot_clustering(X, Y)
  XCOORD = X(:, 1);
  YCOORD = X(:, 2);
  gscatter(XCOORD, YCOORD, Y);
  72 
  % Compute all pairwise squared Euclidean distances quickly.
  %
  %   X  - N points, one per row.
  %   Y  - M points, one per row (same number of columns as X).
  %
  % Returns the N-by-M matrix D with D(i, j) = ||X(i, :) - Y(j, :)||^2,
  % computed via the expansion ||x||^2 + ||y||^2 - 2*x*y'.
  function D = pwdist(X, Y)
  N = size(X, 1);
  M = size(Y, 1);

  XX = sum(X.*X, 2);
  YY = sum(Y.*Y, 2);
  D = repmat(XX, 1, M) + repmat(YY', N, 1) - 2*X*Y';

  % The expansion can yield tiny negative values through round-off;
  % clamp them to zero (logical indexing leaves NaN entries untouched).
  D(D < 0) = 0;
  82 
  % Evaluate a clustering algorithm for 2..MAXK clusters with ITERS
  % restarts each.
  %
  %   method - clusterer, invoked as feval(method, K, X).
  %
  % Returns D of size ITERS-by-(MAXK-1); D(I, K-1) is the co-cluster
  % distance between two independent runs of the clusterer with K clusters.
  function D = evaluate_clusterer(method, X, MAXK, ITERS)
  NUMK = MAXK - 1;
  D = zeros(ITERS, NUMK);
  for COL = 1:NUMK
    K = COL + 1;
    fprintf('K = %d\n', K);
    for REP = 1:ITERS
      FIRST = feval(method, K, X);
      SECOND = feval(method, K, X);
      D(REP, COL) = cocluster_distance(FIRST, SECOND);
    end
  end
  95 
  96 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  97 % Fill in your solutions below
  98 %
  99 
 % 1 (a). The random clusterer. It returns random labels from 1..K
 % for each data point in X (stored in the rows).
 %
 % Y is a 1-by-N row vector (N = number of rows of X), the same layout
 % that k_means_clustering returns. Labels are drawn uniformly and
 % independently; the values in X are not used.
 function Y = random_clustering(K, X)
 N = size(X, 1);
 % rand lies strictly between 0 and 1, so floor(K * rand) is in 0..K-1.
 Y = floor(K * rand(1, N)) + 1;
 104 
 % 1 (b). Compute the co-cluster distance for two labelings Y1 and Y2.
 %
 % Returns the fraction of unordered point pairs (i, j), i ~= j, that are
 % placed in the same cluster by one labeling but in different clusters
 % by the other. The value is 0 when both labelings induce the same
 % partition, regardless of how the clusters are numbered.
 %
 % NOTE(review): the exercise sheet's exact definition is not part of this
 % file; a different constant scaling (e.g. the raw pair count) may be
 % intended - confirm against the sheet.
 function D = cocluster_distance(Y1, Y2)
 Y1 = Y1(:);
 Y2 = Y2(:);
 N = numel(Y1);
 if numel(Y2) ~= N
   error('cocluster_distance:sizeMismatch', ...
         'Y1 and Y2 must label the same number of points.');
 end
 if N < 2
   % no pairs to compare
   D = 0;
   return;
 end

 % N-by-N co-membership indicators: entry (i, j) is true when points i
 % and j share a label.
 SAME1 = repmat(Y1, 1, N) == repmat(Y1', N, 1);
 SAME2 = repmat(Y2, 1, N) == repmat(Y2', N, 1);

 % The disagreement matrix is symmetric with a zero diagonal, so its sum
 % counts every unordered pair twice; N*(N-1) is twice the pair count.
 D = sum(sum(SAME1 ~= SAME2)) / (N * (N - 1));
 108 
 % 1 (c). Normalize DKMEANS by dividing the result for a fixed
 % K by the mean value of the corresponding values in DRAND.
 %
 % Both inputs hold one column per K and one row per restart. Every entry
 % of column J of DKMEANS is divided by the mean of column J of DRAND;
 % the result has the same size as DKMEANS.
 function D = normalize(DKMEANS, DRAND)
 % explicit dimension 1 so a single-row DRAND is still averaged per column
 RANDMEAN = mean(DRAND, 1);
 D = DKMEANS ./ repmat(RANDMEAN, size(DKMEANS, 1), 1);
Attachment 'sheet08.m'

Attached Files