ourMELONS/matlab/graph/seqNeighJoin.m
2019-12-16 16:47:21 +01:00

250 lines
9.8 KiB
Matlab

function t = seqNeighJoin(D, method, names, varargin)
%SEQNEIGHJOIN neighbor-joining method for phylogenetic tree reconstruction.
%
% TREE = SEQNEIGHJOIN(DIST) computes a phylogenetic tree object from the
% pairwise distances DIST between the species or products applying the
% neighbor-joining method. The input DIST is a matrix (or vector) such as
% is generated by SEQPDIST.
%
% TREE = SEQNEIGHJOIN(DIST,METHOD) selects the method to compute the
% distances of the new nodes to all other nodes at every iteration. The
% general expression to calculate the distances between the new node (n)
% (after joining i and j) and all others nodes (k) is given by:
%
% D(n,k) = a*D(i,k) + (1-a)*D(j,k) - a*D(n,i) - (1-a)*D(n,j)
%
% This expression guarantees to find the correct tree with additive data
% (a.k.a. minimum variance reduction). The options for METHOD are:
%
% 'equivar' --- Assumes equal variance and independence of
% (default) evolutionary distance estimates (a = 1/2). Such as
% in Studier and Keppler, JMBE (1988).
% 'firstorder' --- Assumes a first order model of the variances and
% covariances of evolutionary distance estimates, 'a'
% is adjusted at every iteration to a value between 0
% and 1. Such as in Gascuel, JMBE (1997).
% 'average' --- New distances are the weighted average of previous
% distances, the branch distances are ignored:
% D(n,k) = [ D(i,k) + D(j,k) ] /2
% As in the original neighbor-joining algorithm by
% Saitou and Nei, JMBE (1987)
%
% TREE = SEQNEIGHJOIN(DIST,METHOD,NAMES) passes a list of names to label
% the leaf nodes (e.g. species or products) in the phylogenetic tree
% object. NAMES can be a vector of structures with the fields 'Header' or
% 'Name' or a cell array of strings. In both cases the number of elements
% provided must comply with the number of samples used to generate the
% pairwise distances in DIST.
%
% TREE = SEQNEIGHJOIN(...,'REROOT',false) excludes rerooting the resulting
% tree. This is useful to observe the original linkage order followed by
% the algorithm. By default SEQNEIGHJOIN reroots the resulting tree using
% the mid-point method.
%
% Example:
%
% % Load a multiple alignment of amino acids:
% seqs = fastaread('pf00002.fa');
%
% % Measure the 'Jukes-Cantor' pairwise distances:
% dist = seqpdist(seqs,'method','jukes-cantor','indels','pair');
%
% % Build the phylogenetic using the neighbor-joining algorithm
% tree = seqneighjoin(dist,'equivar',seqs)
% view(tree)
%
% See also MULTIALIGN, PHYTREE, PHYTREE/REROOT, PHYTREE/VIEW, SEQLINKAGE,
% SEQPDIST.
% References:
% [1] Saitou N, Nei M.The neighbor-joining method: a new method for
% reconstructing phylogenetic trees. Mol Biol Evol.(1987) 4(4):406-25
% [2] Gascuel O. BIONJ: An improved version of the NJ algorithm based on a
% simple model of sequence data. Mol. Biol. Evol. (1997) 14:685-695
% [3] Studier JA, Keppler KJ. A note on the neighbor-joining algorithm of
% Saitou and Nei. Mol Biol Evol. (1988) 5(6):729-31.
% Copyright 2003-2006 The MathWorks, Inc.
% $Revision: 1.1.8.2 $ $Date: 2006/05/17 20:48:43 $
rerootTree = true;
checkForNames = true;
% check the input distances
if isnumeric(D)
[m, n] = size(D);
if isvector(D)
n = numel(D);
m = (1+sqrt(1+8*n))/2; % number of leaves
if m ~= fix(m)
error('Bioinfo:seqneighjoin:DbadSize',...
'Size of DIST not compatible with the output of the SEQPDIST function.');
end
D = squareform(D);
elseif m~=n
error('Bioinfo:seqneighjoin:DnotSquare',...
'Size of DIST not compatible with the output of the SEQPDIST function.');
end
else
error('Bioinfo:seqneighjoin:DnotNumeric',...
'DIST must be a numeric vector compatible with the output of the SEQPDIST function.');
end
% Selects appropiate method
if nargin == 1
method = 'e'; % set default method
checkForNames = false;
else
okmethods = {'equivar','firstorder','average','reroot'};
methodkeys = {'e','f','a'};
k = find(strncmpi(method,okmethods,numel(method)));
if isempty(k)
error('Bioinfo:seqneighjoin:UnknownMethod',...
'Unknown method name: %s.',method);
elseif length(k)>1
error('Bioinfo:seqneighjoin:IncorrectMethod',...
'Ambiguous method name: %s.',method);
elseif k<4
method = methodkeys{k};
else % case that second and third input are optional paired inputs
if numel(varargin) || nargin==2
error('Bioinfo:seqneighjoin:IncorrectInputArguments',...
'Incorrect format of input arguments.')
end
varargin = {method,names};
checkForNames = false;
method = 'e'; % set default method
end
end
% detects the names
if checkForNames && (nargin >2) % names were supplied, check validity
if iscell(names) || isfield(names,'Header') || isfield(names,'Name') || isfield(names,'LocusName')
if isfield(names,'Header') % if struct put them in a cell
names = {names(:).Header};
elseif isfield(names,'Name') % if struct put them in a cell
names = {names(:).Name};
elseif isfield(names,'LocusName') % if struct put them in a cell
names = {names(:).LocusName};
end
names = names(:);
namesSupplied = true;
if numel(names)~=m
error('Bioinfo:seqneighjoin:IncorrectSize',...
'NAMES must have the same size as number of leaves in the tree')
end
elseif strncmpi(names,{'reroot'},numel(names))
varargin = {names varargin{:}};
namesSupplied = false;
else
error('Bioinfo:seqneighjoin:IncorrectInputType',...
'NAMES must be a cell with char arrays or a vector of structures.')
end
else
namesSupplied = false;
end
% check optional input arguments
nvarargin = numel(varargin);
if nvarargin
if rem(nvarargin,2)
error('Bioinfo:seqneighjoin:IncorrectNumberOfArguments',...
'Incorrect number of arguments to %s.',mfilename);
end
okargs = {'reroot',''};
for j=1:2:nvarargin
pname = varargin{j};
pval = varargin{j+1};
k = find(strncmpi(pname,okargs,numel(pname)));
if isempty(k)
error('Bioinfo:seqneighjoin:UnknownParameterName',...
'Unknown parameter name: %s.',pname);
elseif length(k)>1
error('Bioinfo:seqneighjoin:AmbiguousParameterName',...
'Ambiguous parameter name: %s.',pname);
else
rerootTree = opttf(pval);
if isempty(rerootTree)
error('Bioinfo:seqneighjoin:rerootInputOptionNotLogical',...
'%s must be a logical value, true or false.',...
upper(char(okargs(k))));
end
end
end
end
% ------------------------------------------------------------------------
% Algorithm starts here:
%
% D - pairwise distance matrix (full form)
% method - 'a' for Saitou & Nei, 'e' for Studier & Keppler, and 'f' for
% Gascuel
% names - labels for leaf nodes
N = size(D,1);
Z = zeros(N-1,2);
bd = zeros(N*2-1,1);
p = 1:N; % pointers in matrix
bc=1; % branch counter
if method=='f'
V=D; %initialize the variance matrix for Gascuel method
end
for n = N:-1:3
R = sum(D)/(n-2); % sums of columns
Q = D-repmat(R,n,1)-repmat(R',1,n)+diag(inf(n,1)); %Studier & Keppler optimized
% S = (sum(sum(D))/(n-2)+Q)/2 % original total sum in Saitou & Nei
[m,g] = min(Q(:)); %#ok
[i,j] = ind2sub(n,g); % find minimum
if i>j
k=i;i=j;j=k; % j>i always
end
pp = p([i j]); % pointers to join
bl = (R([i j])*[1 -1;-1 1] + D(j,i))/2; % branch lengths
bl = max(bl,0);
bd(pp) = bl; % save branch lengths
Z(bc,:) = pp; % save pointers
h = [1:i-1 i+1:j-1 j+1:n];
switch method % distances to new node
case 'a' % Saitou & Nei method
d = (sum(D(h,[i j]),2))/2;
D = [[D(h,h) d];[d' 0]]; % update distance matrix
case 'e' % Studier & Keppler method
d = (sum(D(h,[i j]),2)-D(j,i))/2;
if any(d<0)
d = max(d,0);
end
D = [[D(h,h) d];[d' 0]]; % update distance matrix
case 'f' % Gascuel method
if V(i,j)
lambda = max(0,min(1,(1+sum(V(h,i)-V(h,j))/(n-2)/V(i,j))/2));
else
lambda = 1/2;
end
d = D(h,[i j])*[lambda;1-lambda] - bl*[lambda;1-lambda];
if any(d<0)
d = max(d,0);
end
D = [[D(h,h) d];[d' 0]]; % update distance matrix
v = V(h,[i j])*[lambda;1-lambda] - V(i,j)*lambda*(1-lambda);
if any(v<0)
v = max(v,0);
end
V = [[V(h,h) v];[v' 0]]; % update variance matrix
end
p = [p(h),N+bc]; % update pointers
bc = bc+1; % update branch counter
end
Z(bc,:) = p; % pointers of last branch
bd(p) = D(2)/2; % lengths of last branch
% convert data to a phylogenetic tree object
if namesSupplied
t = phyTree(Z,bd,names);
else
t = phyTree(Z,bd);
end
if rerootTree
t = reRoot(t); % reroot with mid-point method
end