ourMELONS/matlab/graph/@phyTree/Select.m
2019-12-16 16:47:21 +01:00

313 lines
12 KiB
Matlab

function [sel,sell,selb] = select(tr,varargin)
%SELECT Selects tree branches and leaves.
%
% S = SELECT(T,N) returns a logical vector S of size [NUMNODES x 1]
% indicating N closest nodes to the root node of the phylogenetic tree
% object T (NUMNODES = NUMLEAVES + NUMBRANCHES). The first criteria used
% is branch levels, then patristic distance, also known as tree distance.
% By default SELECT, uses INF as the value of N, therefore SELECT(T) will
% return a vector of 'trues'.
%
% S = SELECT(...,'REFERENCE',R) changes the reference point(s) to measure
% the closeness. R can be 'root' (default) or 'leaves'. When using
% 'leaves', a node to be tested may have different distances to its
% descendant leaves, which are the references (e.g. a non-ultrametric
% tree), if this the case the minimum distance to any descendant leaf
% will be considered. R may also be an index which points to any node of
% the tree.
%
% S = SELECT(...,'CRITERIA',C) changes the criteria used to measure
% closeness. If C='levels' (default) then the first criteria is branch
% levels and then patristic distance. If C='distance' then the first
% criteria is patristic distance and then branch levels.
%
% S = SELECT(...,'THRESHOLD',V) selects all the nodes which closeness is
% less or equal than the threshold value V. Observe that either
% 'criteria' or either 'reference' can be used. If N is not specified
% N = INF, otherwise the output can be further size limited by N.
%
% S = SELECT(...,'EXCLUDE',E) sets a post-filter which excludes all the
% branch nodes from S when E=='branches' or all the leaf nodes when
% E=='leaves'. The default is 'none'.
%
% S = SELECT(...,'PROPAGATE',P) activates a post-functionality which
% propagates the selected nodes to the leaves when P=='toleaves' or
% towards the root finding a common ancestor when P=='toroot'. The
% default is 'none', P may also be 'both'. 'PROPAGATE' switch acts after
% 'EXCLUDE' switch.
%
% [S,SELLEAVES,SELBRANCHES] = SELECT(...) returns two additional logical
% vectors, one for the selected leaves and one for the selected branches.
%
% Examples:
%
% % Load a phylogenetic tree created from a protein family:
% tr = phytreeread('pf00002.tree');
%
% % To find close products for a given protein (e.g. vips_human):
% ind = getbyname(tr,'vips_human');
% [sel,sel_leaves] = select(tr,'criteria','distance',...
% 'threshold',0.6,'reference',ind);
% view(tr,sel_leaves)
%
% % To find potential outliers in the tree use:
% [sel,sel_leaves] = select(tr,'criteria','distance','threshold',.3,...
% 'reference','leaves','exclude','leaves','propagate','toleaves');
% view(tr,~sel_leaves)
%
%
% See also PHYTREE, PHYTREE/GET, PHYTREE/PDIST, PHYTREE/PRUNE, PHYTREETOOL.
%
% Copyright 2003-2006 The MathWorks, Inc.
% $Revision: 1.1.6.7.2.1 $ $Author: batserve $ $Date: 2006/07/27 21:37:50 $
if numel(tr)~=1
error('Bioinfo:phytree:select:NoMultielementArrays',...
'Phylogenetic tree must be an 1-by-1 object.');
end
% set defaults
V=inf;
CriteriaIsDistance = false;
ReferenceIs = 'root';
ExcludeSwitch = false;
PostPropagate = false;
% check is first argument is N, otherwise set N to default
if (nargin>1 && isnumeric(varargin{1}) && isreal(varargin{1}))
N = floor(varargin{1});
first_arg = 3;
else
N = inf;
first_arg = 2;
end
numBranches = size(tr.tree,1);
numLeaves = numBranches + 1;
numLabels = numBranches + numLeaves;
% identify input arguments
if nargin - first_arg + 1 > 0
if rem(nargin - first_arg,2) == 0
error('Bioinfo:phytree:select:IncorrectNumberOfArguments',...
'Incorrect number of arguments to %s.',mfilename);
end
okargs = {'reference','criteria','threshold','exclude','propagate'};
for j=first_arg - 1 : 2 : nargin - first_arg + 1
pname = varargin{j};
pval = varargin{j+1};
k = find(strncmpi(pname,okargs,numel(pname)));
if isempty(k)
error('Bioinfo:phytree:select:UnknownParameterName',...
'Unknown parameter name: %s.',pname);
elseif length(k)>1
error('Bioinfo:phytree:select:AmbiguousParameterName',...
'Ambiguous parameter name: %s.',pname);
else
switch(k)
case 1 % reference
if islogical(pval)
if numel(pval)==numLabels
pval = pval(:)==true;
elseif numel(pval)==numLeaves
pval = [pval(:);false(numBranches,1)];
elseif numel(pval)==numBranches
pval = [false(numLeaves,1);pval(:)];
else
error('Bioinfo:phytree:select:InvalidSizeLogicalReferenceNode',...
'When reference node is a logical vector it must contain NUMNODES, NUMLEAVES or NUMBRANCHES elements.')
end
pval = find(pval);
ReferenceIs = 'node';
if numel(pval) ~= 1
error('Bioinfo:phytree:select:InvalidValuesLogicalReferenceNode',...
'When reference node is a logical vector one element must be true and all others false.')
else
ReferenceNode = pval;
end
elseif isnumeric(pval)
ReferenceIs = 'node';
if numel(pval) ~= 1
error('Bioinfo:phytree:select:InvalidSizeReferenceNode',...
'Reference node must be scalar.')
elseif all(pval~=1:numLabels)
error('Bioinfo:phytree:select:InvalidValueReferenceNode',...
'Incorrect reference node.')
else
ReferenceNode = pval;
end
else
h = strmatch(lower(pval),{'root','leaves'}); %#ok
if numel(h)
switch(h)
case 1
ReferenceIs = 'root';
case 2
ReferenceIs = 'leaves';
end
else error('Bioinfo:phytree:select:InvalidStringReferenceNode',...
'Invalid string for the reference node.');
end
end
case 2 % criteria
h = strmatch(lower(pval),{'distance','levels'}); %#ok
if numel(h)
CriteriaIsDistance = (h == 1);
else error('Bioinfo:phytree:select:InvalidCriteria',...
'Invalid string for criteria.');
end
case 3 % threshold
V = pval;
if (~isnumeric(V) || ~isreal(V) || numel(V)>1)
error('Bioinfo:phytree:select:InvalidThreshold',...
'Invalid value for V.');
end
case 4 % exclude
h = strmatch(lower(pval),{'branches','leaves','none'}); %#ok
if numel(h)
switch(h)
case 1
ExcludeSwitch = true;
ExcludeType = 'branches';
case 2
ExcludeSwitch = true;
ExcludeType = 'leaves';
case 3
ExcludeSwitch = false;
end
else error('Bioinfo:phytree:select:InvalidExcludeOption',...
'Invalid string for exclude switch.');
end
case 5 % propagate
h = strmatch(lower(pval),{'toleaves','toroot','both','none'}); %#ok
if numel(h)
switch(h)
case 1
PostPropagate = true;
PostPropagateType = 'toleaves';
case 2
PostPropagate = true;
PostPropagateType = 'toroot';
case 3
PostPropagate = true;
PostPropagateType = 'both';
case 4
PostPropagate = false;
end
else error('Bioinfo:phytree:select:InvalidPostPropagate',...
'Invalid string for post-propagate switch.');
end
end % switch(k)
end % if ...
end % for j=...
end % nargin
% calculate the distance (and levels) of every node to the reference
levels2Ref = zeros(numLabels,1);
switch ReferenceIs
case 'root'
% calculate the distance to the root for every node
dist2Ref = tr.dist;
for ind = numBranches:-1:1
dist2Ref(tr.tree(ind,:)) = ...
dist2Ref(tr.tree(ind,:)) + dist2Ref(ind+numLeaves);
levels2Ref(tr.tree(ind,:)) = levels2Ref(ind+numLeaves) + 1;
end
case 'leaves'
% calculate the distance to the closest leaf for every node
dist2Ref = zeros(numLabels,1);
for ind = 1:numBranches
dist2Ref(ind+numLeaves) = ...
min(dist2Ref(tr.tree(ind,:))+tr.dist(tr.tree(ind,:)));
levels2Ref(ind+numLeaves) = min(levels2Ref(tr.tree(ind,:))) + 1;
end
case 'node'
refVector = zeros(numLabels,1);
refVector(ReferenceNode)=1;
dist2Ref = pdist(tr,'SquareForm',true,'nodes','all')...
* refVector;
tr.dist = ones(numLabels,1); % to count now levels !
levels2Ref = pdist(tr,'SquareForm',true,'nodes','all') ...
* refVector;
end
% applies the threshold value
if CriteriaIsDistance
sel = dist2Ref < V;
else % ~CriteriaIsDistance
sel = levels2Ref < V;
end % if CriteriaIsDistance
% needs to remove additional nodes because of N
if sum(sel)>N
if CriteriaIsDistance
[dum,h]=sortrows([dist2Ref levels2Ref]); %#ok
else % ~CriteriaIsDistance
[dum,h]=sortrows([levels2Ref dist2Ref]); %#ok
end % if CriteriaIsDistance
g=h(sel(h));
sel(g(N+1:end))=false;
end
% exclude option
if ExcludeSwitch
switch ExcludeType
case 'branches'
sel((1+numLeaves):numLabels) = false;
case 'leaves'
sel(1:numLeaves) = false;
end
end
% post-propagate option
if PostPropagate
% expands all the current nodes towards the leaves
if any(strcmp({'toleaves','both'},PostPropagateType))
for ind = numBranches:-1:1
if sel(ind+numLeaves)
sel(tr.tree(ind,:))=true;
end
end
end
% propagates towards the root finding the common ancestors
if any(strcmp({'toroot','both'},PostPropagateType))
% find closest common branch for every pair of nodes
% diagonal is invalid ! but not needed
% initializing full matrix
commf = zeros(numLabels,'int16');
children = false(1,numLabels);
for ind = numBranches:-1:1
children(:) = false;
children(ind+numLeaves) = true;
for ind2 = ind:-1:1
if children(ind2+numLeaves)
children(tr.tree(ind2,:))=true;
end
end
commf(children,children)=int16(ind);
end
commf = commf(sel,sel);
commf = commf - diag(diag(commf));
commf = unique(commf(commf(:)>0));
sel(commf+numLeaves) = true;
% now propagates towards the common ancestor
for ind = 1:double(max(commf))
if any(sel(tr.tree(ind,:)))
sel(ind+numLeaves) = true;
end
end
end
end
if nargout > 1
sell = sel(1:numLeaves);
end
if nargout > 2
selb = sel(numLeaves+1:numLabels);
end