198 lines
7.6 KiB
Mathematica
198 lines
7.6 KiB
Mathematica
|
|
function tr = phyTreeRead(filename)
|
||
|
|
%PHYTREEREAD reads a NEWICK tree formatted file.
|
||
|
|
%
|
||
|
|
% TREE = PHYTREEREAD(FILENAME) reads a NEWICK tree formatted file
|
||
|
|
% FILENAME, returning the data in the file as a PHYTREE object. FILENAME
|
||
|
|
% can also be a URL or MATLAB character array that contains the text of a
|
||
|
|
% NEWICK format file.
|
||
|
|
%
|
||
|
|
% The NEWICK tree format is found at:
|
||
|
|
% http://evolution.genetics.washington.edu/phylip/newicktree.html
|
||
|
|
%
|
||
|
|
% Note: This implementation only allows binary trees, non-binary trees
|
||
|
|
% will be translated into a binary tree with extra branches of length 0.
|
||
|
|
%
|
||
|
|
% Example:
|
||
|
|
%
|
||
|
|
% tr = phytreeread('pf00002.tree')
|
||
|
|
%
|
||
|
|
% See also GETHMMTREE, PHYTREE, PHYTREETOOL, PHYTREEWRITE.
|
||
|
|
|
||
|
|
% Copyright 2003-2004 The MathWorks, Inc.
|
||
|
|
% $Revision: 1.1.6.10 $ $Author: batserve $ $Date: 2004/04/14 23:57:16 $
|
||
|
|
|
||
|
|
|
||
|
|
if nargin==0
|
||
|
|
[filename, pathname] = uigetfile({'*.tree';'*.dnd'},'Select Phylogenetic Tree File');
|
||
|
|
if ~filename
|
||
|
|
disp('Canceled, file not read.');
|
||
|
|
tr=[];
|
||
|
|
return;
|
||
|
|
end
|
||
|
|
filename = [pathname, filename];
|
||
|
|
end
|
||
|
|
|
||
|
|
% check input is char
|
||
|
|
% in a future version we may accept also cells
|
||
|
|
if ~ischar(filename)
|
||
|
|
error('Bioinfo:InvalidInput','Input must be a character array')
|
||
|
|
end
|
||
|
|
|
||
|
|
if size(filename,1)>1 % is padded string ?
|
||
|
|
strin = cellstr(filename);
|
||
|
|
strin = [strin{:}];
|
||
|
|
elseif (strfind(filename(1:min(10,end)), '://')) % is an url ?
|
||
|
|
if (~usejava('jvm'))
|
||
|
|
error('Bioinfo:NoJava','Reading from a URL requires Java.')
|
||
|
|
end
|
||
|
|
try
|
||
|
|
strin = urlread(filename);
|
||
|
|
catch
|
||
|
|
error('Bioinfo:CannotReadURL','Cannot read URL "%s".', filename);
|
||
|
|
end
|
||
|
|
strin = strread(strin,'%c','delimiter','\n')';
|
||
|
|
elseif (exist(filename,'file') || ...
|
||
|
|
exist(fullfile(cd,filename),'file') ) % is a valid filename ?
|
||
|
|
strin = textread(filename,'%c','delimiter','\n')';
|
||
|
|
else % must be single a string with '\n'
|
||
|
|
strin = strread(filename,'%c','delimiter','\n')';
|
||
|
|
end
|
||
|
|
|
||
|
|
% characterizing the string
|
||
|
|
numBranches = sum(strin==',');
|
||
|
|
numLeaves = numBranches + 1;
|
||
|
|
numLabels = numBranches + numLeaves;
|
||
|
|
|
||
|
|
if (numBranches == 0)
|
||
|
|
error('Bioinfo:NoCommaInInputString', ...
|
||
|
|
['There is not any comma in the data,\ninput string may not '...
|
||
|
|
'be in Newick style or is not a valid filename.'])
|
||
|
|
end
|
||
|
|
|
||
|
|
% find the string features: open and close parentheses and leaves
|
||
|
|
leafPositions = regexp(strin,'[(,][^(,)]')+1;
|
||
|
|
parenthesisPositions = regexp(strin,'[()]');
|
||
|
|
strFeatures = strin(sort([leafPositions parenthesisPositions]));
|
||
|
|
|
||
|
|
% some consistency checking on the parenthesis
|
||
|
|
temp = cumsum((strFeatures=='(') - (strFeatures==')'));
|
||
|
|
if any(temp(1:end-1)<1) || (temp(end)~=0)
|
||
|
|
error('Bioinfo:InconsistentParentheses','The parentheses structure is inconsistent,\ninput string may not be in Newick style or is not a valid filename.')
|
||
|
|
end
|
||
|
|
|
||
|
|
dist = zeros(numLabels,1); % allocating space for distances
|
||
|
|
tree = zeros(numBranches,2); % allocating space for tree pointers
|
||
|
|
names = cell(numLabels,1); % allocating space for tree labels
|
||
|
|
|
||
|
|
try
|
||
|
|
|
||
|
|
% extract label information for the leaves
|
||
|
|
leafData = regexp(strin,'[(,][^(,);\[\]]+','match');
|
||
|
|
for j=1:numel(leafData)
|
||
|
|
coi = find(leafData{j}==':',1,'last');
|
||
|
|
if isempty(coi) % if no colon no length, the whole label is the name
|
||
|
|
dist(j) = 0;
|
||
|
|
names{j} = leafData{j}(2:end);
|
||
|
|
else % if there is colon, get name and length
|
||
|
|
dist(j) = strread(leafData{j}(coi+1:end),'%f');
|
||
|
|
names{j} = leafData{j}(2:coi-1);
|
||
|
|
end
|
||
|
|
end
|
||
|
|
% uniformizing empty cells, value inside the brackets can never be empty
|
||
|
|
% because branch names will always be empty
|
||
|
|
[names{cellfun('isempty',names)}] = deal('');
|
||
|
|
|
||
|
|
% extract label information for the parenthesis
|
||
|
|
parenthesisData = regexp(strin,')[^(,);\[\]]*','match');
|
||
|
|
parenthesisDist = zeros(numel(parenthesisData),1);
|
||
|
|
for j=1:numel(parenthesisData)
|
||
|
|
coi = find(parenthesisData{j}==':',1,'last');
|
||
|
|
if isempty(coi) % if no colon no length, the whole label is the name
|
||
|
|
parenthesisDist(j) = 0;
|
||
|
|
parenthesisData{j} = parenthesisData{j}(2:end);
|
||
|
|
else % if there is colon, get name and length
|
||
|
|
parenthesisDist(j) = strread(parenthesisData{j}(coi+1:end),'%f');
|
||
|
|
parenthesisData{j} = parenthesisData{j}(2:coi-1);
|
||
|
|
end
|
||
|
|
end
|
||
|
|
% uniformizing empty cells, value inside brackes may be empty
|
||
|
|
if any(cellfun('isempty',parenthesisData))
|
||
|
|
[parenthesisData{cellfun('isempty',parenthesisData)}] = deal('');
|
||
|
|
end
|
||
|
|
|
||
|
|
li = 1; bi = 1; pi = 1; % indexes for leaf, branch and parentheses
|
||
|
|
queue = zeros(1,2*numLeaves); qp = 0; % setting the queue (worst case size)
|
||
|
|
|
||
|
|
j = 1;
|
||
|
|
|
||
|
|
while j <= numel(strFeatures)
|
||
|
|
switch strFeatures(j)
|
||
|
|
case ')' % close parenthesis, pull values from the queue to create
|
||
|
|
% a new branch and push the new branch # into the queue
|
||
|
|
lastOpenPar = find(queue(1:qp)==0,1,'last');
|
||
|
|
numElemInPar = min(3,qp-lastOpenPar);
|
||
|
|
switch numElemInPar
|
||
|
|
case 2 % 99% of the cases, two elements in the parenthesis
|
||
|
|
bp = bi + numLeaves;
|
||
|
|
names{bp} = parenthesisData{pi}; % set name
|
||
|
|
dist(bp) = parenthesisDist(pi); % set length
|
||
|
|
tree(bi,:) = queue(qp-1:qp);
|
||
|
|
qp = qp - 2; % writes over the open par mark
|
||
|
|
queue(qp) = bp;
|
||
|
|
bi = bi + 1;
|
||
|
|
pi = pi + 1;
|
||
|
|
case 3 % find in non-binary trees, create a phantom branch
|
||
|
|
bp = bi + numLeaves;
|
||
|
|
names{bp} = ''; % set name
|
||
|
|
dist(bp) = 0; % set length
|
||
|
|
tree(bi,:) = queue(qp-1:qp);
|
||
|
|
qp = qp - 1; % writes over the left element
|
||
|
|
queue(qp) = bp;
|
||
|
|
bi = bi + 1;
|
||
|
|
j = j - 1; %repeat this closing branch to get the rest
|
||
|
|
case 1 % parenthesis with no meaning (holds one element)
|
||
|
|
qp = qp - 1;
|
||
|
|
queue(qp) = queue(qp+1);
|
||
|
|
pi = pi + 1;
|
||
|
|
case 0 % an empty parenthesis pair
|
||
|
|
error('Bioinfo:ParenthesisPairWithNoData', ...
|
||
|
|
['Found parenthesis pair with no data,\n', ...
|
||
|
|
'input string may not be in Newick style or',...
|
||
|
|
'is not a valid filename.'])
|
||
|
|
end % switch numElemInPar
|
||
|
|
|
||
|
|
case '(' % an open parenthesis marker (0) pushed into the queue
|
||
|
|
qp = qp + 1;
|
||
|
|
queue(qp) = 0;
|
||
|
|
|
||
|
|
otherwise % a new leaf pushed into the queue
|
||
|
|
qp = qp + 1;
|
||
|
|
queue(qp) = li;
|
||
|
|
li = li + 1;
|
||
|
|
end % switch strFeatures
|
||
|
|
j = j + 1;
|
||
|
|
end % while j ...
|
||
|
|
|
||
|
|
catch
|
||
|
|
le = lasterror;
|
||
|
|
if strcmp(le.identifier,'Bioinfo:ParenthesisPairWithNoData')
|
||
|
|
rethrow(le)
|
||
|
|
else
|
||
|
|
error('Bioinfo:IncorrectString',...
|
||
|
|
['An error occurred while trying to interpret the data,\n'...
|
||
|
|
'input string may not be in Newick style or is not a '...
|
||
|
|
'valid filename.'])
|
||
|
|
end
|
||
|
|
end
|
||
|
|
|
||
|
|
% make sure all dists are greater than 0
|
||
|
|
dist = max(0,dist);
|
||
|
|
|
||
|
|
if sum(dist) == 0 % there was no distance information so force to an unitary ultrametric tree
|
||
|
|
tr = phyTree(tree,names);
|
||
|
|
elseif sum(dist(1:numLeaves)) == 0 % no dist infor for leaves, so force an ultrametric tree
|
||
|
|
tr = phyTree(tree,dist(numLeaves+1:end),names);
|
||
|
|
else % put all info into output object
|
||
|
|
tr = phyTree(tree,dist,names);
|
||
|
|
end
|