ourMELONS/matlab/graph/@phyTree/pDist.m
2019-12-16 16:47:21 +01:00

153 lines
5.2 KiB
Matlab

function [dist,comm] = pdist(tr,varargin)
%PDIST computes the pairwise patristic distance.
%
% D = PDIST(TREE) returns a vector D containing the patristic
% distances between every possible pair of leaf nodes in the phylogenic
% tree object TREE. The distance is computed following the path through
% the branches of the tree.
%
% The output vector D is arranged in the order of ((2,1),(3,1),...,
% (M,1),(3,2),...(M,3),.....(M,M-1)), i.e. the lower left triangle of the
% full M-by-M distance matrix. To get the distance between the Ith and
% Jth nodes (I > J) use the formula D((J-1)*(M-J/2)+I-J). M is the
% number of leaves)
%
% D = PDIST(...,'NODES',N) indicates the nodes to be included in the
% computation. N can be 'leaves' (default) or 'all'. In the former
% case the output will be order as before, but M is the total number of
% nodes in the tree, i.e. NUMLEAVES+NUMBRANCHES.
%
% D = PDIST(...,'SQUAREFORM',true) coverts the output into a square
% format, so that D(I,J) denotes the distance between the Ith and the Jth
% node. The output matrix is symmetric and has a zero diagonal.
%
% D = PDIST(...,'CRITERIA',C) changes the criteria used to relate pairs.
% C can be 'distance' (default) or 'levels'.
%
% [D,C] = PDIST(TREE) returns in C the index of the closest common parent
% nodes for every possible pair of query nodes.
%
% Example:
%
% % get the tree distances between every leaf
% tr = phytreeread('pf00002.tree')
% dist = pdist(tr,'nodes','leaves','squareform',true)
%
% See also SEQPDIST, SEQLINKAGE, PHYTREE, PHYTREETOOL.
% Copyright 2003-2005 The MathWorks, Inc.
% $Revision: 1.1.6.6 $ $Author: batserve $ $Date: 2005/06/09 21:55:59 $
if numel(tr)~=1
error('Bioinfo:phytree:pdist:NoMultielementArrays',...
'Phylogenetic tree must be an 1-by-1 object.');
end
% set default
squaredOutput = false;
CriteriaIsLevels = false;
outNodes = 'leaves';
numBranches = size(tr.tree,1);
numLeaves = numBranches + 1;
numLabels = numBranches + numLeaves;
% process input arguments
if nargin > 1
if rem(nargin,2) == 0
error('Bioinfo:phytree:pdist:IncorrectNumberOfArguments',...
'Incorrect number of arguments to %s.',mfilename);
end
okargs = {'nodes','squareform','criteria'};
for ind = 2 : 2: nargin
pname = varargin{ind-1};
pval = varargin{ind};
k = find(strncmpi(pname,okargs,numel(pname)));
if isempty(k)
error('Bioinfo:phytree:pdist:UnknownParameterName',...
'Unknown parameter name: %s.',pname);
elseif length(k)>1
error('Bioinfo:phytree:pdist:AmbiguousParameterName',...
'Ambiguous parameter name: %s.',pname);
else
switch(k)
case 1 % nodes
okNodes = {'leaves','all','branches'};
h = strmatch(lower(pval),okNodes ); %#ok
if isempty(h)
error('Bioinfo:phytree:pdist:IncorrectReferenceNode',...
'Incorrect node selection')
else
outNodes = okNodes{h};
end
case 2 % squareform
squaredOutput = (pval == true);
case 3 % criteria
h = strmatch(lower(pval),{'levels','distance'}); %#ok
if numel(h)
CriteriaIsLevels = (h == 1);
else error('Bioinfo:phytree:pdist:InvalidCriteria',...
'Invalid string for criteria.');
end
end
end
end
end
% create indexes to work only the lower leff triangle
m = numLabels*(numLabels-1)/2;
p = cumsum([1 (numLabels-1):-1:2]);
I = ones(m,1); I(p) = [2 3-numLabels:0];
J = zeros(m,1); J(p) = 1;
H = I; H(p) = 2:numLabels;
I = cumsum(I); J = cumsum(J); H = cumsum(H);
switch outNodes
case 'leaves'
outSelection = (I <= numLeaves) & (J <= numLeaves);
case 'all'
outSelection = (I>0);
case 'branches'
outSelection = (I > numLeaves) & (J > numLeaves);
end
% find closest common branch for every pair of nodes
% diagonal is invalid ! but not needed
% initializing full matrix
commf = zeros(numLabels,'int16');
children = false(1,numLabels);
for ind = numBranches:-1:1
children(:) = false;
children(ind+numLeaves) = true;
for ind2 = ind:-1:1
if children(ind2+numLeaves)
children(tr.tree(ind2,:))=true;
end
end
commf(children,children)=int16(ind);
end
% output vector with the lower leff triangle closest common branches
comm = double(commf(H));
% compute the distance to root for every node
cdist = tr.dist;
if CriteriaIsLevels % set to count levels instead
cdist(:) = 1;
cdist(end) = 0;
end
for ind = numBranches:-1:1
cdist(tr.tree(ind,:)) = cdist(tr.tree(ind,:)) + cdist(ind+numLeaves);
end
% compute pairwise distance
dist = cdist(I)+cdist(J)-2*cdist(comm+numLeaves);
dist = dist(outSelection);
comm = comm(outSelection);
if squaredOutput
dist = squareform(dist);
comm = squareform(comm);
end