ourMELONS/matlab/graph/chooseDistance.m
2019-12-16 16:47:21 +01:00

242 lines
6.9 KiB
Matlab

function [D]=chooseDistance(aln)
% CHOOSEDISTANCE Display a quest dialog for choosing a genetic distance
% input: a structure containing part of variables from the mixture result.
% ouput: a symmetric distance matrix
if (isempty(aln))
error('Need input.');
end
if aln.npops == 1
disp('*** ERROR: no population structure is found.');
return
end
ButtonName=questdlg('Choose a type of genetic distance?', ...
'Select genetic distance', ...
'KL','Nei','Hamming','KL');
switch ButtonName,
case 'KL'
disp('Using KL distance.');
D = dn_kl(aln);
case 'Nei'
disp('Using Nei distance.');
D = dn_nei(aln);
case 'Hamming'
disp('Using Hamming distance. Please wait...');
D = dn_hamming(aln);
% case 'LogDet'
% disp('Using LogDet distance');
% D= dn_logdet(aln);
otherwise
D = [];
end
D = D + D'; % make it symmetric
%--------------------------------------------------------------------------
% SUBFUNCTIONS
%--------------------------------------------------------------------------
function [dist_mat] = dn_kl(aln)
npops = aln.npops;
COUNTS = aln.COUNTS;
adjprior = aln.adjprior;
% data = noIndex(aln.data, aln.noalle);
% partition = aln.partition;
dist_mat = zeros(npops, npops);
maxnoalle = size(COUNTS,1);
nloci = size(COUNTS,2);
d = zeros(maxnoalle, nloci, npops);
prior = adjprior;
prior(find(prior==1))=0;
nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yhtä alleelia.
prior(1,nollia)=1;
for pop1 = 1:npops
d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1);
%dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1);
end
for pop1 = 1:npops
% rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' '];
for pop2 = 1:pop1-1
dist1 = d(:,:,pop1); dist2 = d(:,:,pop2);
div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci;
div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci;
div = (div12+div21)/2;
dist_mat(pop1,pop2) = div;
end
end
%--------------------------------------------------------------------------
function [dist_mat] = dn_nei(aln)
npops = aln.npops;
COUNTS = aln.COUNTS;
% adjprior = aln.adjprior;
% data = aln.data;
% partition = aln.partition;
dist_mat = zeros(npops, npops);
maxnoalle = size(COUNTS,1);
nloci = size(COUNTS,2);
d = zeros(maxnoalle, nloci, npops);
for pop1 = 1:npops
d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))),maxnoalle,1);
%dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1);
end
for pop1 = 1:npops
% rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' '];
for pop2 = 1:pop1-1
dist1 = d(:,:,pop1); dist2 = d(:,:,pop2);
div1 = sum(sum(dist1.*dist2));
div2 = sqrt(sum(sum(dist1.^2)))*sqrt(sum(sum(dist2.^2)));
div = -log(div1/div2);
dist_mat(pop1,pop2) = div;
end
end
%--------------------------------------------------------------------------
function [dist_mat] = dn_hamming(aln)
npops = aln.npops;
data = noIndex(aln.data, aln.noalle);
partition = aln.partition;
dist_mat = zeros(npops, npops);
for pop1 = 1:npops
for pop2 = 1:pop1-1
dist_mat(pop1,pop2) = hamming_dist(data(logical(partition==pop1),[1:end-1]),...
data(logical(partition==pop2),[1:end-1]));
end
end
function dist = hamming_dist(data1,data2)
[length1,nloci] = size(data1);
length2 = size(data2,1);
dist1 = 0;
for i = 1:length1
dist2 = 0;
for j = 1:length2
dist2 = dist2 + sum(data1(i,:)~=data2(j,:))/nloci;
end
dist1 = dist1 + dist2/length2;
end
dist = dist1/length1;
%--------------------------------------------------------------------------
function [D]=dn_logdet(aln)
%DN_LOGDET - Log-det (paralinear) distance
%The LogDet model computes the distance from the determinant of the matrix of
%co-occurrence of nucleotides in the two species, according to the formula
%
% D = - 1/4(loge(|F|) - 1/2loge(fA1 fC1 fG1 fT1 fA2 fC2 fG2 fT2))
%
%Where F is a matrix whose (i,j) element is the fraction of sites at which base
%i occurs in one species and base j occurs in the other. fji is the fraction of
%sites at which species i has base j. The LogDet distance cannot cope with
%ambiguity codes. It must have completely defined sequences. One limitation of
%the LogDet distance is that it may be infinite sometimes, if there are too many
%changes between certain pairs of nucleotides. This can be particularly
%noticeable with distances computed from bootstrapped sequences.
%
% Syntax: [D]=dn_logdet(aln)
%
% Inputs:
% aln - Alignment structure
%
% Outputs:
% D - Distance matrix
%
% See also:
% Molecular Biology & Evolution Toolbox, (C) 2006
% Author: James J. Cai
% Email: jamescai@hku.hk
% Website: http://bioinformatics.org/mbetoolbox/
% Last revision: 5/3/2006
if (isstruct(aln)),
S=aln.data;
else
S=aln;
end
[n,m] = size(S);
D = zeros(n,n);
for i=1:n-1
for j=i+1:n
D(i,j) = d_logdet(S(i,:), S(j,:));
D(j,i) = D(i,j);
end
end
%--------------------------------------------------------------------------
function d=d_logdet(seq1, seq2)
[S,gap] = countntchange(seq1, seq2);
% S=count_ntchanges(seq1, seq2);
if (det(S)<=0)
d=inf;
else
f1=sum(S);
f2=sum(S');
d=(-1/4)*(log(det(S))-(1/2)*log( prod(f1)*prod(f2)));
end
%--------------------------------------------------------------------------
function [D,gap]=countntchange(s1,s2)
%COUNTNTCHANGE - Count nucleotide changes in two DNA sequences
%D is a 4x4 array, with bases in seq1 along top, seq2 along side,
%in order A,C,G,T.
%
% Syntax: [D,gap]=countntchange(s1,s2)
%
% Inputs:
% s1 - Sequence 1 vector
% s2 - Sequence 2 vector
%
% Outputs:
% D - Codon Adaptation Index value
% gap - Codon Adaptation Index value
%
%
% See also: COUNTAACHANGE COUNTCDCHANGE
% Molecular Biology & Evolution Toolbox, (C) 2006
% Author: James J. Cai
% Email: jamescai@hku.hk
% Website: http://bioinformatics.org/mbetoolbox/
% Last revision: 3/28/2006
if (nargout>1)
[D,gap]=countchange(s1,s2,4);
else
[D]=countchange(s1,s2,4);
end
% -------------------------------------------------------------------------
function [D,gap] = countchange(s1,s2,nword)
if ~(ismember(nword, [4 20 61])),
error('Wrong NWORD')
end
if (length(s1)~=length(s2)),
error('Sequences are not of same length.')
end
D=zeros(nword);
WORD=1:nword;
for j=1:nword
s1sites=(s1==WORD(j));
for i=1:nword
D(i,j)=sum(s1sites & (s2==WORD(i)));
end
end
if (nargout>1),
gap = length(s1)-sum(D(:)); % gaps
end