Added source Matlab code for reference
This commit is contained in:
parent
b8af977117
commit
b5d99903d2
186 changed files with 61405 additions and 1 deletions
250
matlab/graph/seqNeighJoin.m
Normal file
250
matlab/graph/seqNeighJoin.m
Normal file
|
|
@ -0,0 +1,250 @@
|
|||
function t = seqNeighJoin(D, method, names, varargin)
|
||||
%SEQNEIGHJOIN neighbor-joining method for phylogenetic tree reconstruction.
|
||||
%
|
||||
% TREE = SEQNEIGHJOIN(DIST) computes a phylogenetic tree object from the
|
||||
% pairwise distances DIST between the species or products applying the
|
||||
% neighbor-joining method. The input DIST is a matrix (or vector) such as
|
||||
% is generated by SEQPDIST.
|
||||
%
|
||||
% TREE = SEQNEIGHJOIN(DIST,METHOD) selects the method to compute the
|
||||
% distances of the new nodes to all other nodes at every iteration. The
|
||||
% general expression to calculate the distances between the new node (n)
|
||||
% (after joining i and j) and all others nodes (k) is given by:
|
||||
%
|
||||
% D(n,k) = a*D(i,k) + (1-a)*D(j,k) - a*D(n,i) - (1-a)*D(n,j)
|
||||
%
|
||||
% This expression guarantees to find the correct tree with additive data
|
||||
% (a.k.a. minimum variance reduction). The options for METHOD are:
|
||||
%
|
||||
% 'equivar' --- Assumes equal variance and independence of
|
||||
% (default) evolutionary distance estimates (a = 1/2). Such as
|
||||
% in Studier and Keppler, JMBE (1988).
|
||||
% 'firstorder' --- Assumes a first order model of the variances and
|
||||
% covariances of evolutionary distance estimates, 'a'
|
||||
% is adjusted at every iteration to a value between 0
|
||||
% and 1. Such as in Gascuel, JMBE (1997).
|
||||
% 'average' --- New distances are the weighted average of previous
|
||||
% distances, the branch distances are ignored:
|
||||
% D(n,k) = [ D(i,k) + D(j,k) ] /2
|
||||
% As in the original neighbor-joining algorithm by
|
||||
% Saitou and Nei, JMBE (1987)
|
||||
%
|
||||
% TREE = SEQNEIGHJOIN(DIST,METHOD,NAMES) passes a list of names to label
|
||||
% the leaf nodes (e.g. species or products) in the phylogenetic tree
|
||||
% object. NAMES can be a vector of structures with the fields 'Header' or
|
||||
% 'Name' or a cell array of strings. In both cases the number of elements
|
||||
% provided must comply with the number of samples used to generate the
|
||||
% pairwise distances in DIST.
|
||||
%
|
||||
% TREE = SEQNEIGHJOIN(...,'REROOT',false) excludes rerooting the resulting
|
||||
% tree. This is useful to observe the original linkage order followed by
|
||||
% the algorithm. By default SEQNEIGHJOIN reroots the resulting tree using
|
||||
% the mid-point method.
|
||||
%
|
||||
% Example:
|
||||
%
|
||||
% % Load a multiple alignment of amino acids:
|
||||
% seqs = fastaread('pf00002.fa');
|
||||
%
|
||||
% % Measure the 'Jukes-Cantor' pairwise distances:
|
||||
% dist = seqpdist(seqs,'method','jukes-cantor','indels','pair');
|
||||
%
|
||||
% % Build the phylogenetic using the neighbor-joining algorithm
|
||||
% tree = seqneighjoin(dist,'equivar',seqs)
|
||||
% view(tree)
|
||||
%
|
||||
% See also MULTIALIGN, PHYTREE, PHYTREE/REROOT, PHYTREE/VIEW, SEQLINKAGE,
|
||||
% SEQPDIST.
|
||||
|
||||
% References:
|
||||
% [1] Saitou N, Nei M.The neighbor-joining method: a new method for
|
||||
% reconstructing phylogenetic trees. Mol Biol Evol.(1987) 4(4):406-25
|
||||
% [2] Gascuel O. BIONJ: An improved version of the NJ algorithm based on a
|
||||
% simple model of sequence data. Mol. Biol. Evol. (1997) 14:685-695
|
||||
% [3] Studier JA, Keppler KJ. A note on the neighbor-joining algorithm of
|
||||
% Saitou and Nei. Mol Biol Evol. (1988) 5(6):729-31.
|
||||
|
||||
% Copyright 2003-2006 The MathWorks, Inc.
|
||||
% $Revision: 1.1.8.2 $ $Date: 2006/05/17 20:48:43 $
|
||||
|
||||
rerootTree = true;
|
||||
checkForNames = true;
|
||||
|
||||
% check the input distances
|
||||
if isnumeric(D)
|
||||
[m, n] = size(D);
|
||||
if isvector(D)
|
||||
n = numel(D);
|
||||
m = (1+sqrt(1+8*n))/2; % number of leaves
|
||||
if m ~= fix(m)
|
||||
error('Bioinfo:seqneighjoin:DbadSize',...
|
||||
'Size of DIST not compatible with the output of the SEQPDIST function.');
|
||||
end
|
||||
D = squareform(D);
|
||||
elseif m~=n
|
||||
error('Bioinfo:seqneighjoin:DnotSquare',...
|
||||
'Size of DIST not compatible with the output of the SEQPDIST function.');
|
||||
end
|
||||
else
|
||||
error('Bioinfo:seqneighjoin:DnotNumeric',...
|
||||
'DIST must be a numeric vector compatible with the output of the SEQPDIST function.');
|
||||
end
|
||||
|
||||
% Selects appropiate method
|
||||
if nargin == 1
|
||||
method = 'e'; % set default method
|
||||
checkForNames = false;
|
||||
else
|
||||
okmethods = {'equivar','firstorder','average','reroot'};
|
||||
methodkeys = {'e','f','a'};
|
||||
k = find(strncmpi(method,okmethods,numel(method)));
|
||||
if isempty(k)
|
||||
error('Bioinfo:seqneighjoin:UnknownMethod',...
|
||||
'Unknown method name: %s.',method);
|
||||
elseif length(k)>1
|
||||
error('Bioinfo:seqneighjoin:IncorrectMethod',...
|
||||
'Ambiguous method name: %s.',method);
|
||||
elseif k<4
|
||||
method = methodkeys{k};
|
||||
else % case that second and third input are optional paired inputs
|
||||
if numel(varargin) || nargin==2
|
||||
error('Bioinfo:seqneighjoin:IncorrectInputArguments',...
|
||||
'Incorrect format of input arguments.')
|
||||
end
|
||||
varargin = {method,names};
|
||||
checkForNames = false;
|
||||
method = 'e'; % set default method
|
||||
end
|
||||
end
|
||||
|
||||
% detects the names
|
||||
if checkForNames && (nargin >2) % names were supplied, check validity
|
||||
if iscell(names) || isfield(names,'Header') || isfield(names,'Name') || isfield(names,'LocusName')
|
||||
if isfield(names,'Header') % if struct put them in a cell
|
||||
names = {names(:).Header};
|
||||
elseif isfield(names,'Name') % if struct put them in a cell
|
||||
names = {names(:).Name};
|
||||
elseif isfield(names,'LocusName') % if struct put them in a cell
|
||||
names = {names(:).LocusName};
|
||||
end
|
||||
names = names(:);
|
||||
namesSupplied = true;
|
||||
if numel(names)~=m
|
||||
error('Bioinfo:seqneighjoin:IncorrectSize',...
|
||||
'NAMES must have the same size as number of leaves in the tree')
|
||||
end
|
||||
elseif strncmpi(names,{'reroot'},numel(names))
|
||||
varargin = {names varargin{:}};
|
||||
namesSupplied = false;
|
||||
else
|
||||
error('Bioinfo:seqneighjoin:IncorrectInputType',...
|
||||
'NAMES must be a cell with char arrays or a vector of structures.')
|
||||
end
|
||||
else
|
||||
namesSupplied = false;
|
||||
end
|
||||
|
||||
% check optional input arguments
|
||||
nvarargin = numel(varargin);
|
||||
if nvarargin
|
||||
if rem(nvarargin,2)
|
||||
error('Bioinfo:seqneighjoin:IncorrectNumberOfArguments',...
|
||||
'Incorrect number of arguments to %s.',mfilename);
|
||||
end
|
||||
okargs = {'reroot',''};
|
||||
for j=1:2:nvarargin
|
||||
pname = varargin{j};
|
||||
pval = varargin{j+1};
|
||||
k = find(strncmpi(pname,okargs,numel(pname)));
|
||||
if isempty(k)
|
||||
error('Bioinfo:seqneighjoin:UnknownParameterName',...
|
||||
'Unknown parameter name: %s.',pname);
|
||||
elseif length(k)>1
|
||||
error('Bioinfo:seqneighjoin:AmbiguousParameterName',...
|
||||
'Ambiguous parameter name: %s.',pname);
|
||||
else
|
||||
rerootTree = opttf(pval);
|
||||
if isempty(rerootTree)
|
||||
error('Bioinfo:seqneighjoin:rerootInputOptionNotLogical',...
|
||||
'%s must be a logical value, true or false.',...
|
||||
upper(char(okargs(k))));
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
% ------------------------------------------------------------------------
|
||||
% Algorithm starts here:
|
||||
%
|
||||
% D - pairwise distance matrix (full form)
|
||||
% method - 'a' for Saitou & Nei, 'e' for Studier & Keppler, and 'f' for
|
||||
% Gascuel
|
||||
% names - labels for leaf nodes
|
||||
|
||||
N = size(D,1);
|
||||
Z = zeros(N-1,2);
|
||||
bd = zeros(N*2-1,1);
|
||||
|
||||
p = 1:N; % pointers in matrix
|
||||
bc=1; % branch counter
|
||||
if method=='f'
|
||||
V=D; %initialize the variance matrix for Gascuel method
|
||||
end
|
||||
|
||||
for n = N:-1:3
|
||||
R = sum(D)/(n-2); % sums of columns
|
||||
Q = D-repmat(R,n,1)-repmat(R',1,n)+diag(inf(n,1)); %Studier & Keppler optimized
|
||||
% S = (sum(sum(D))/(n-2)+Q)/2 % original total sum in Saitou & Nei
|
||||
[m,g] = min(Q(:)); %#ok
|
||||
[i,j] = ind2sub(n,g); % find minimum
|
||||
if i>j
|
||||
k=i;i=j;j=k; % j>i always
|
||||
end
|
||||
pp = p([i j]); % pointers to join
|
||||
bl = (R([i j])*[1 -1;-1 1] + D(j,i))/2; % branch lengths
|
||||
bl = max(bl,0);
|
||||
bd(pp) = bl; % save branch lengths
|
||||
Z(bc,:) = pp; % save pointers
|
||||
h = [1:i-1 i+1:j-1 j+1:n];
|
||||
switch method % distances to new node
|
||||
case 'a' % Saitou & Nei method
|
||||
d = (sum(D(h,[i j]),2))/2;
|
||||
D = [[D(h,h) d];[d' 0]]; % update distance matrix
|
||||
case 'e' % Studier & Keppler method
|
||||
d = (sum(D(h,[i j]),2)-D(j,i))/2;
|
||||
if any(d<0)
|
||||
d = max(d,0);
|
||||
end
|
||||
D = [[D(h,h) d];[d' 0]]; % update distance matrix
|
||||
case 'f' % Gascuel method
|
||||
if V(i,j)
|
||||
lambda = max(0,min(1,(1+sum(V(h,i)-V(h,j))/(n-2)/V(i,j))/2));
|
||||
else
|
||||
lambda = 1/2;
|
||||
end
|
||||
d = D(h,[i j])*[lambda;1-lambda] - bl*[lambda;1-lambda];
|
||||
if any(d<0)
|
||||
d = max(d,0);
|
||||
end
|
||||
D = [[D(h,h) d];[d' 0]]; % update distance matrix
|
||||
v = V(h,[i j])*[lambda;1-lambda] - V(i,j)*lambda*(1-lambda);
|
||||
if any(v<0)
|
||||
v = max(v,0);
|
||||
end
|
||||
V = [[V(h,h) v];[v' 0]]; % update variance matrix
|
||||
end
|
||||
p = [p(h),N+bc]; % update pointers
|
||||
bc = bc+1; % update branch counter
|
||||
end
|
||||
Z(bc,:) = p; % pointers of last branch
|
||||
bd(p) = D(2)/2; % lengths of last branch
|
||||
|
||||
% convert data to a phylogenetic tree object
|
||||
if namesSupplied
|
||||
t = phyTree(Z,bd,names);
|
||||
else
|
||||
t = phyTree(Z,bd);
|
||||
end
|
||||
if rerootTree
|
||||
t = reRoot(t); % reroot with mid-point method
|
||||
end
|
||||
Loading…
Add table
Add a link
Reference in a new issue