ourMELONS/matlab/graph/linkage.m

function Z = linkage(Y, method)
%LINKAGE Create hierarchical cluster tree. 
%   Z = LINKAGE(Y) creates a hierarchical cluster tree, using the complete
%   linkage algorithm (the default in this version of the function).  The
%   input Y is a distance matrix such as is generated by PDIST, i.e. a
%   1-by-m*(m-1)/2 row vector holding the pairwise distances in the order
%   (1,2),(1,3),...,(1,m),(2,3),...,(m-1,m).  Y may also be a more general
%   dissimilarity matrix conforming to the output format of PDIST.
%
%   Z = LINKAGE(Y, method) creates a hierarchical cluster tree using
%   the specified algorithm. Only the first two characters of method are
%   significant (case-insensitive). The available methods are:
%
%      'single'   --- nearest distance
%      'complete' --- furthest distance
%      'average'  --- average distance
%      'centroid' --- center of mass distance (the output Z is meaningful
%                     only if Y contains Euclidean distances)
%      'ward'     --- inner squared distance
%
%   Cluster information will be returned in the matrix Z with size m-1
%   by 3, where m is the number of observations in the original data. 
%   Column 1 and 2 of Z contain cluster indices linked in pairs
%   to form a binary tree. The leaf nodes are numbered from 1 to
%   m. They are the singleton clusters from which all higher clusters
%   are built. Each newly-formed cluster, corresponding to Z(i,:), is
%   assigned the index m+i, where m is the total number of initial
%   leaves. Z(i,1:2) contains the indices of the two component
%   clusters which form cluster m+i. There are m-1 higher clusters
%   which correspond to the interior nodes of the output clustering
%   tree. Z(i,3) contains the corresponding linkage distances between
%   the two clusters which are merged in Z(i,:), e.g. if there are
%   total of 30 initial nodes, and at step 12, cluster 5 and cluster 7
%   are combined and their distance at this time is 1.5, then row 12
%   of Z will be (5,7,1.5). The newly formed cluster will have an
%   index 12+30=42. If cluster 42 shows up in a later row, that means
%   this newly formed cluster is being combined again into some bigger
%   cluster.
%
%   The centroid method can produce a cluster tree that is not monotonic.
%   This occurs when the distance from the union of two clusters to a third
%   cluster is less than the distance from either individual cluster to
%   that third cluster. In such a case, sections of the dendrogram change
%   direction.  This is an indication that another method should be used.
%
%   An error is raised if Y does not have the size of a PDIST output, or
%   if method is not one of the algorithms listed above.
%
%   See also PDIST, INCONSISTENT, COPHENET, DENDROGRAM, CLUSTER, CLUSTERDATA,
%   KMEANS, SILHOUETTE.

%   Copyright 1993-2002 The MathWorks, Inc. 
%   $Revision: 1.16 $

[k, n] = size(Y);

% Y must be a row vector whose length is a triangular number m*(m-1)/2.
m = (1+sqrt(1+8*n))/2;
if k ~= 1 || m ~= fix(m)
  error('The first input has to match the output of the PDIST function in size.');   
end

if nargin == 1 % set default switch to be 'co' 
   method = 'co';
end

% Reject inputs for which method(1:2) would fail with an opaque index error.
if ~ischar(method) || length(method) < 2
   error('The switch given by the second argument is invalid.');
end

method = lower(method(1:2)); % simplify the switch string.

% An unrecognised method would fall through the switch below without ever
% updating Y, silently producing a meaningless tree; fail loudly instead.
if ~any(strcmp(method, {'si', 'av', 'co', 'ce', 'wa'}))
   error('The switch given by the second argument is invalid.');
end

Z = zeros(m-1,3); % allocate the output matrix.

% during updating clusters, cluster index is constantly changing, R is
% a index vector mapping the original index to the current (row, column)
% index in Y.  N denotes how many points are contained in each cluster.

N = zeros(1,2*m-1);
N(1:m) = 1;
n = m; % since m is changing, we need to save m in n. 
R = 1:n;

for s = 1:(n-1)
   if strcmp(method, 'av')
      % For average linkage Y stores the SUM of all pairwise distances
      % between two clusters (see the update below), so it has to be
      % divided by the number of contributing pairs, N(a)*N(b), before the
      % closest pair is selected.  find on the strict lower triangle walks
      % the pairs column by column, which is exactly the PDIST ordering
      % (1,2),(1,3),...,(m-1,m): lo is the smaller and hi the larger
      % position of each pair.
      [hi, lo] = find(tril(ones(m), -1));
      W = N(R(lo.')) .* N(R(hi.'));
      X = Y ./ W;
   else
      X = Y;
   end
        
   [v, k] = min(X);
      
   % Convert the linear index k in the condensed vector back to the
   % (i,j) pair of current cluster positions, i < j.
   i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1)));
   j = k - (i-1)*(m-i/2)+i;
   
   Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix Z
   
   % Update Y.  In order to vectorize the computation, we need to compute all
   % the indices corresponding to cluster i and j in Y, denoted by I and J.
   I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables.
   U = [I1 I2 I3];
   I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3];
   J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3];
   
   % The merged cluster takes over position i; its distances to every
   % other cluster U are written into Y(I).
   switch method
   case 'si' %single linkage
      Y(I) = min(Y(I),Y(J));
   case 'av' % average linkage (running sums, normalised at selection time)
      Y(I) = Y(I) + Y(J);
   case 'co' %complete linkage
      Y(I) = max(Y(I),Y(J));
   case 'ce' % centroid linkage
      % NOTE(review): this update combines v^2 with the unsquared values
      % Y(I) and Y(J); Y is never squared in this function.  Left as is to
      % keep existing results -- confirm against the intended formula.
      K = N(R(i))+N(R(j));
      Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K;
   case 'wa'
      % NOTE(review): the update is applied to Y exactly as supplied by
      % the caller; no squaring or square root is done in this function.
      Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ...
	  N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U)));
   end
   J = [J i*(m-(i+1)/2)-m+j];
   Y(J) = []; % no need for the cluster information about j.
   
   % update m, N, R
   m = m-1; 
   N(n+s) = N(R(i)) + N(R(j));
   R(i) = n+s;
   R(j:(n-1))=R((j+1):n); 
end
Added source Matlab code for reference 2019-12-16 16:47:21 +01:00			`function Z = linkage(Y, method)`
			`%LINKAGE Create hierarchical cluster tree.`
			`% Z = LINKAGE(Y) creates a hierarchical cluster tree, using the single`
			`% linkage algorithm. The input Y is a distance matrix such as is`
			`% generated by PDIST. Y may also be a more general dissimilarity`
			`% matrix conforming to the output format of PDIST.`
			`%`
			`% Z = LINKAGE(Y, method) creates a hierarchical cluster tree using`
			`% the specified algorithm. The available methods are:`
			`%`
			`% 'single' --- nearest distance`
			`% 'complete' --- furthest distance`
			`% 'average' --- average distance`
			`% 'centroid' --- center of mass distance (the output Z is meaningful`
			`% only if Y contains Euclidean distances)`
			`% 'ward' --- inner squared distance`
			`%`
			`% Cluster information will be returned in the matrix Z with size m-1`
			`% by 3, where m is the number of observations in the original data.`
			`% Column 1 and 2 of Z contain cluster indices linked in pairs`
			`% to form a binary tree. The leaf nodes are numbered from 1 to`
			`% m. They are the singleton clusters from which all higher clusters`
			`% are built. Each newly-formed cluster, corresponding to Z(i,:), is`
			`% assigned the index m+i, where m is the total number of initial`
			`% leaves. Z(i,1:2) contains the indices of the two component`
			`% clusters which form cluster m+i. There are n-1 higher clusters`
			`% which correspond to the interior nodes of the output clustering`
			`% tree. Z(i,3) contains the corresponding linkage distances between`
			`% the two clusters which are merged in Z(i,:), e.g. if there are`
			`% total of 30 initial nodes, and at step 12, cluster 5 and cluster 7`
			`% are combined and their distance at this time is 1.5, then row 12`
			`% of Z will be (5,7,1.5). The newly formed cluster will have an`
			`% index 12+30=42. If cluster 42 shows up in a latter row, that means`
			`% this newly formed cluster is being combined again into some bigger`
			`% cluster.`
			`%`
			`% The centroid method can produce a cluster tree that is not monotonic.`
			`% This occurs when the distance from the union of two clusters to a third`
			`% cluster is less than the distance from either individual cluster to`
			`% that third cluster. In such a case, sections of the dendrogram change`
			`% direction. This is an indication that another method should be used.`
			`%`
			`% See also PDIST, INCONSISTENT, COPHENET, DENDROGRAM, CLUSTER, CLUSTERDATA,`
			`% KMEANS, SILHOUETTE.`

			`% Copyright 1993-2002 The MathWorks, Inc.`
			`% $Revision: 1.16 $`

			`[k, n] = size(Y);`


			`m = (1+sqrt(1+8*n))/2;`
			`if k ~= 1 \| m ~= fix(m)`
			`error('The first input has to match the output of the PDIST function in size.');`
			`end`

			`if nargin == 1 % set default switch to be 'co'`
			`method = 'co';`
			`end`

			`method = lower(method(1:2)); % simplify the switch string.`

			`% a flag for non-monotonic distances in tree. this can only happen with`
			`% the centroid method`
			`monotonic = 1;`

			`Z = zeros(m-1,3); % allocate the output matrix.`

			`% during updating clusters, cluster index is constantly changing, R is`
			`% a index vector mapping the original index to the current (row, column)`
			`% index in Y. N denotes how many points are contained in each cluster.`

			`N = zeros(1,2*m-1);`
			`N(1:m) = 1;`
			`n = m; % since m is changing, we need to save m in n.`
			`R = 1:n;`

			`for s = 1:(n-1)`
			`X = Y;`

			`[v, k] = min(X);`

			`i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1)));`
			`j = k - (i-1)*(m-i/2)+i;`

			`Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A`

			`% Update Y. In order to vectorize the computation, we need to compute all`
			`% the indices corresponding to cluster i and j in Y, denoted by I and J.`
			`I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables.`
			`U = [I1 I2 I3];`
			`I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3];`
			`J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3];`

			`switch method`
			`case 'si' %single linkage`
			`Y(I) = min(Y(I),Y(J));`
			`case 'av' % average linkage`
			`Y(I) = Y(I) + Y(J);`
			`case 'co' %complete linkage`
			`Y(I) = max(Y(I),Y(J));`
			`case 'ce' % centroid linkage`
			`K = N(R(i))+N(R(j));`
			`Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K;`
			`case 'wa'`
			`Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ...`
			`N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U)));`
			`end`
			`J = [J i*(m-(i+1)/2)-m+j];`
			`Y(J) = []; % no need for the cluster information about j.`

			`% update m, N, R`
			`m = m-1;`
			`N(n+s) = N(R(i)) + N(R(j));`
			`R(i) = n+s;`
			`R(j:(n-1))=R((j+1):n);`
			`end`