314 lines
12 KiB
Mathematica
314 lines
12 KiB
Mathematica
|
|
function [sel,sell,selb] = select(tr,varargin)
|
||
|
|
%SELECT Selects tree branches and leaves.
|
||
|
|
%
|
||
|
|
% S = SELECT(T,N) returns a logical vector S of size [NUMNODES x 1]
|
||
|
|
% indicating N closest nodes to the root node of the phylogenetic tree
|
||
|
|
% object T (NUMNODES = NUMLEAVES + NUMBRANCHES). The first criteria used
|
||
|
|
% is branch levels, then patristic distance, also known as tree distance.
|
||
|
|
% By default SELECT, uses INF as the value of N, therefore SELECT(T) will
|
||
|
|
% return a vector of 'trues'.
|
||
|
|
%
|
||
|
|
% S = SELECT(...,'REFERENCE',R) changes the reference point(s) to measure
|
||
|
|
% the closeness. R can be 'root' (default) or 'leaves'. When using
|
||
|
|
% 'leaves', a node to be tested may have different distances to its
|
||
|
|
% descendant leaves, which are the references (e.g. a non-ultrametric
|
||
|
|
% tree), if this the case the minimum distance to any descendant leaf
|
||
|
|
% will be considered. R may also be an index which points to any node of
|
||
|
|
% the tree.
|
||
|
|
%
|
||
|
|
% S = SELECT(...,'CRITERIA',C) changes the criteria used to measure
|
||
|
|
% closeness. If C='levels' (default) then the first criteria is branch
|
||
|
|
% levels and then patristic distance. If C='distance' then the first
|
||
|
|
% criteria is patristic distance and then branch levels.
|
||
|
|
%
|
||
|
|
% S = SELECT(...,'THRESHOLD',V) selects all the nodes which closeness is
|
||
|
|
% less or equal than the threshold value V. Observe that either
|
||
|
|
% 'criteria' or either 'reference' can be used. If N is not specified
|
||
|
|
% N = INF, otherwise the output can be further size limited by N.
|
||
|
|
%
|
||
|
|
% S = SELECT(...,'EXCLUDE',E) sets a post-filter which excludes all the
|
||
|
|
% branch nodes from S when E=='branches' or all the leaf nodes when
|
||
|
|
% E=='leaves'. The default is 'none'.
|
||
|
|
%
|
||
|
|
% S = SELECT(...,'PROPAGATE',P) activates a post-functionality which
|
||
|
|
% propagates the selected nodes to the leaves when P=='toleaves' or
|
||
|
|
% towards the root finding a common ancestor when P=='toroot'. The
|
||
|
|
% default is 'none', P may also be 'both'. 'PROPAGATE' switch acts after
|
||
|
|
% 'EXCLUDE' switch.
|
||
|
|
%
|
||
|
|
% [S,SELLEAVES,SELBRANCHES] = SELECT(...) returns two additional logical
|
||
|
|
% vectors, one for the selected leaves and one for the selected branches.
|
||
|
|
%
|
||
|
|
% Examples:
|
||
|
|
%
|
||
|
|
% % Load a phylogenetic tree created from a protein family:
|
||
|
|
% tr = phytreeread('pf00002.tree');
|
||
|
|
%
|
||
|
|
% % To find close products for a given protein (e.g. vips_human):
|
||
|
|
% ind = getbyname(tr,'vips_human');
|
||
|
|
% [sel,sel_leaves] = select(tr,'criteria','distance',...
|
||
|
|
% 'threshold',0.6,'reference',ind);
|
||
|
|
% view(tr,sel_leaves)
|
||
|
|
%
|
||
|
|
% % To find potential outliers in the tree use:
|
||
|
|
% [sel,sel_leaves] = select(tr,'criteria','distance','threshold',.3,...
|
||
|
|
% 'reference','leaves','exclude','leaves','propagate','toleaves');
|
||
|
|
% view(tr,~sel_leaves)
|
||
|
|
%
|
||
|
|
%
|
||
|
|
% See also PHYTREE, PHYTREE/GET, PHYTREE/PDIST, PHYTREE/PRUNE, PHYTREETOOL.
|
||
|
|
|
||
|
|
%
|
||
|
|
% Copyright 2003-2006 The MathWorks, Inc.
|
||
|
|
% $Revision: 1.1.6.7.2.1 $ $Author: batserve $ $Date: 2006/07/27 21:37:50 $
|
||
|
|
|
||
|
|
if numel(tr)~=1
|
||
|
|
error('Bioinfo:phytree:select:NoMultielementArrays',...
|
||
|
|
'Phylogenetic tree must be an 1-by-1 object.');
|
||
|
|
end
|
||
|
|
|
||
|
|
% set defaults
|
||
|
|
V=inf;
|
||
|
|
CriteriaIsDistance = false;
|
||
|
|
ReferenceIs = 'root';
|
||
|
|
ExcludeSwitch = false;
|
||
|
|
PostPropagate = false;
|
||
|
|
|
||
|
|
% check is first argument is N, otherwise set N to default
|
||
|
|
if (nargin>1 && isnumeric(varargin{1}) && isreal(varargin{1}))
|
||
|
|
N = floor(varargin{1});
|
||
|
|
first_arg = 3;
|
||
|
|
else
|
||
|
|
N = inf;
|
||
|
|
first_arg = 2;
|
||
|
|
end
|
||
|
|
|
||
|
|
numBranches = size(tr.tree,1);
|
||
|
|
numLeaves = numBranches + 1;
|
||
|
|
numLabels = numBranches + numLeaves;
|
||
|
|
|
||
|
|
% identify input arguments
|
||
|
|
if nargin - first_arg + 1 > 0
|
||
|
|
if rem(nargin - first_arg,2) == 0
|
||
|
|
error('Bioinfo:phytree:select:IncorrectNumberOfArguments',...
|
||
|
|
'Incorrect number of arguments to %s.',mfilename);
|
||
|
|
end
|
||
|
|
okargs = {'reference','criteria','threshold','exclude','propagate'};
|
||
|
|
for j=first_arg - 1 : 2 : nargin - first_arg + 1
|
||
|
|
pname = varargin{j};
|
||
|
|
pval = varargin{j+1};
|
||
|
|
k = find(strncmpi(pname,okargs,numel(pname)));
|
||
|
|
if isempty(k)
|
||
|
|
error('Bioinfo:phytree:select:UnknownParameterName',...
|
||
|
|
'Unknown parameter name: %s.',pname);
|
||
|
|
elseif length(k)>1
|
||
|
|
error('Bioinfo:phytree:select:AmbiguousParameterName',...
|
||
|
|
'Ambiguous parameter name: %s.',pname);
|
||
|
|
else
|
||
|
|
switch(k)
|
||
|
|
case 1 % reference
|
||
|
|
if islogical(pval)
|
||
|
|
if numel(pval)==numLabels
|
||
|
|
pval = pval(:)==true;
|
||
|
|
elseif numel(pval)==numLeaves
|
||
|
|
pval = [pval(:);false(numBranches,1)];
|
||
|
|
elseif numel(pval)==numBranches
|
||
|
|
pval = [false(numLeaves,1);pval(:)];
|
||
|
|
else
|
||
|
|
error('Bioinfo:phytree:select:InvalidSizeLogicalReferenceNode',...
|
||
|
|
'When reference node is a logical vector it must contain NUMNODES, NUMLEAVES or NUMBRANCHES elements.')
|
||
|
|
end
|
||
|
|
pval = find(pval);
|
||
|
|
ReferenceIs = 'node';
|
||
|
|
if numel(pval) ~= 1
|
||
|
|
error('Bioinfo:phytree:select:InvalidValuesLogicalReferenceNode',...
|
||
|
|
'When reference node is a logical vector one element must be true and all others false.')
|
||
|
|
else
|
||
|
|
ReferenceNode = pval;
|
||
|
|
end
|
||
|
|
elseif isnumeric(pval)
|
||
|
|
ReferenceIs = 'node';
|
||
|
|
if numel(pval) ~= 1
|
||
|
|
error('Bioinfo:phytree:select:InvalidSizeReferenceNode',...
|
||
|
|
'Reference node must be scalar.')
|
||
|
|
elseif all(pval~=1:numLabels)
|
||
|
|
error('Bioinfo:phytree:select:InvalidValueReferenceNode',...
|
||
|
|
'Incorrect reference node.')
|
||
|
|
else
|
||
|
|
ReferenceNode = pval;
|
||
|
|
end
|
||
|
|
else
|
||
|
|
h = strmatch(lower(pval),{'root','leaves'}); %#ok
|
||
|
|
if numel(h)
|
||
|
|
switch(h)
|
||
|
|
case 1
|
||
|
|
ReferenceIs = 'root';
|
||
|
|
case 2
|
||
|
|
ReferenceIs = 'leaves';
|
||
|
|
end
|
||
|
|
else error('Bioinfo:phytree:select:InvalidStringReferenceNode',...
|
||
|
|
'Invalid string for the reference node.');
|
||
|
|
end
|
||
|
|
end
|
||
|
|
case 2 % criteria
|
||
|
|
h = strmatch(lower(pval),{'distance','levels'}); %#ok
|
||
|
|
if numel(h)
|
||
|
|
CriteriaIsDistance = (h == 1);
|
||
|
|
else error('Bioinfo:phytree:select:InvalidCriteria',...
|
||
|
|
'Invalid string for criteria.');
|
||
|
|
end
|
||
|
|
case 3 % threshold
|
||
|
|
V = pval;
|
||
|
|
if (~isnumeric(V) || ~isreal(V) || numel(V)>1)
|
||
|
|
error('Bioinfo:phytree:select:InvalidThreshold',...
|
||
|
|
'Invalid value for V.');
|
||
|
|
end
|
||
|
|
case 4 % exclude
|
||
|
|
h = strmatch(lower(pval),{'branches','leaves','none'}); %#ok
|
||
|
|
if numel(h)
|
||
|
|
switch(h)
|
||
|
|
case 1
|
||
|
|
ExcludeSwitch = true;
|
||
|
|
ExcludeType = 'branches';
|
||
|
|
case 2
|
||
|
|
ExcludeSwitch = true;
|
||
|
|
ExcludeType = 'leaves';
|
||
|
|
case 3
|
||
|
|
ExcludeSwitch = false;
|
||
|
|
end
|
||
|
|
else error('Bioinfo:phytree:select:InvalidExcludeOption',...
|
||
|
|
'Invalid string for exclude switch.');
|
||
|
|
end
|
||
|
|
case 5 % propagate
|
||
|
|
h = strmatch(lower(pval),{'toleaves','toroot','both','none'}); %#ok
|
||
|
|
if numel(h)
|
||
|
|
switch(h)
|
||
|
|
case 1
|
||
|
|
PostPropagate = true;
|
||
|
|
PostPropagateType = 'toleaves';
|
||
|
|
case 2
|
||
|
|
PostPropagate = true;
|
||
|
|
PostPropagateType = 'toroot';
|
||
|
|
case 3
|
||
|
|
PostPropagate = true;
|
||
|
|
PostPropagateType = 'both';
|
||
|
|
case 4
|
||
|
|
PostPropagate = false;
|
||
|
|
end
|
||
|
|
else error('Bioinfo:phytree:select:InvalidPostPropagate',...
|
||
|
|
'Invalid string for post-propagate switch.');
|
||
|
|
end
|
||
|
|
end % switch(k)
|
||
|
|
end % if ...
|
||
|
|
end % for j=...
|
||
|
|
end % nargin
|
||
|
|
|
||
|
|
% calculate the distance (and levels) of every node to the reference
|
||
|
|
levels2Ref = zeros(numLabels,1);
|
||
|
|
switch ReferenceIs
|
||
|
|
case 'root'
|
||
|
|
% calculate the distance to the root for every node
|
||
|
|
dist2Ref = tr.dist;
|
||
|
|
for ind = numBranches:-1:1
|
||
|
|
dist2Ref(tr.tree(ind,:)) = ...
|
||
|
|
dist2Ref(tr.tree(ind,:)) + dist2Ref(ind+numLeaves);
|
||
|
|
levels2Ref(tr.tree(ind,:)) = levels2Ref(ind+numLeaves) + 1;
|
||
|
|
end
|
||
|
|
case 'leaves'
|
||
|
|
% calculate the distance to the closest leaf for every node
|
||
|
|
dist2Ref = zeros(numLabels,1);
|
||
|
|
for ind = 1:numBranches
|
||
|
|
dist2Ref(ind+numLeaves) = ...
|
||
|
|
min(dist2Ref(tr.tree(ind,:))+tr.dist(tr.tree(ind,:)));
|
||
|
|
levels2Ref(ind+numLeaves) = min(levels2Ref(tr.tree(ind,:))) + 1;
|
||
|
|
end
|
||
|
|
case 'node'
|
||
|
|
refVector = zeros(numLabels,1);
|
||
|
|
refVector(ReferenceNode)=1;
|
||
|
|
dist2Ref = pdist(tr,'SquareForm',true,'nodes','all')...
|
||
|
|
* refVector;
|
||
|
|
tr.dist = ones(numLabels,1); % to count now levels !
|
||
|
|
levels2Ref = pdist(tr,'SquareForm',true,'nodes','all') ...
|
||
|
|
* refVector;
|
||
|
|
end
|
||
|
|
|
||
|
|
% applies the threshold value
|
||
|
|
if CriteriaIsDistance
|
||
|
|
sel = dist2Ref < V;
|
||
|
|
else % ~CriteriaIsDistance
|
||
|
|
sel = levels2Ref < V;
|
||
|
|
end % if CriteriaIsDistance
|
||
|
|
|
||
|
|
% needs to remove additional nodes because of N
|
||
|
|
if sum(sel)>N
|
||
|
|
if CriteriaIsDistance
|
||
|
|
[dum,h]=sortrows([dist2Ref levels2Ref]); %#ok
|
||
|
|
else % ~CriteriaIsDistance
|
||
|
|
[dum,h]=sortrows([levels2Ref dist2Ref]); %#ok
|
||
|
|
end % if CriteriaIsDistance
|
||
|
|
g=h(sel(h));
|
||
|
|
sel(g(N+1:end))=false;
|
||
|
|
end
|
||
|
|
|
||
|
|
% exclude option
|
||
|
|
if ExcludeSwitch
|
||
|
|
switch ExcludeType
|
||
|
|
case 'branches'
|
||
|
|
sel((1+numLeaves):numLabels) = false;
|
||
|
|
case 'leaves'
|
||
|
|
sel(1:numLeaves) = false;
|
||
|
|
end
|
||
|
|
end
|
||
|
|
|
||
|
|
% post-propagate option
|
||
|
|
if PostPropagate
|
||
|
|
|
||
|
|
% expands all the current nodes towards the leaves
|
||
|
|
if any(strcmp({'toleaves','both'},PostPropagateType))
|
||
|
|
for ind = numBranches:-1:1
|
||
|
|
if sel(ind+numLeaves)
|
||
|
|
sel(tr.tree(ind,:))=true;
|
||
|
|
end
|
||
|
|
end
|
||
|
|
end
|
||
|
|
|
||
|
|
% propagates towards the root finding the common ancestors
|
||
|
|
if any(strcmp({'toroot','both'},PostPropagateType))
|
||
|
|
|
||
|
|
% find closest common branch for every pair of nodes
|
||
|
|
% diagonal is invalid ! but not needed
|
||
|
|
|
||
|
|
% initializing full matrix
|
||
|
|
commf = zeros(numLabels,'int16');
|
||
|
|
children = false(1,numLabels);
|
||
|
|
for ind = numBranches:-1:1
|
||
|
|
children(:) = false;
|
||
|
|
children(ind+numLeaves) = true;
|
||
|
|
for ind2 = ind:-1:1
|
||
|
|
if children(ind2+numLeaves)
|
||
|
|
children(tr.tree(ind2,:))=true;
|
||
|
|
end
|
||
|
|
end
|
||
|
|
commf(children,children)=int16(ind);
|
||
|
|
end
|
||
|
|
commf = commf(sel,sel);
|
||
|
|
commf = commf - diag(diag(commf));
|
||
|
|
commf = unique(commf(commf(:)>0));
|
||
|
|
sel(commf+numLeaves) = true;
|
||
|
|
|
||
|
|
% now propagates towards the common ancestor
|
||
|
|
for ind = 1:double(max(commf))
|
||
|
|
if any(sel(tr.tree(ind,:)))
|
||
|
|
sel(ind+numLeaves) = true;
|
||
|
|
end
|
||
|
|
end
|
||
|
|
end
|
||
|
|
end
|
||
|
|
|
||
|
|
if nargout > 1
|
||
|
|
sell = sel(1:numLeaves);
|
||
|
|
end
|
||
|
|
if nargout > 2
|
||
|
|
selb = sel(numLeaves+1:numLabels);
|
||
|
|
end
|