1034 lines
No EOL
34 KiB
Matlab
1034 lines
No EOL
34 KiB
Matlab
function linkage_admix(tietue)
|
||
|
||
global COUNTS; global PARTITION; global SUMCOUNTS;
|
||
clearGlobalVars;
|
||
|
||
PARTITION = tietue.PARTITION;
|
||
COUNTS = tietue.COUNTS;
|
||
SUMCOUNTS = tietue.SUMCOUNTS;
|
||
rowsFromInd = tietue.rowsFromInd;
|
||
data = double(tietue.data);
|
||
npops = tietue.npops;
|
||
noalle = tietue.noalle;
|
||
switch tietue.mixtureType
|
||
case 'linear_mix'
|
||
linkage_model = 'linear';
|
||
case 'codon_mix'
|
||
linkage_model = 'codon';
|
||
end
|
||
if isfield(tietue,'gene_lengths')
|
||
gene_lengths = tietue.gene_lengths;
|
||
else
|
||
[filename, pathname] = uigetfile('*.txt', 'Load file with lengths of the genes (same order as in data).');
|
||
gene_lengths = load([pathname filename]);
|
||
end
|
||
|
||
if length(unique(PARTITION(find(PARTITION>0))))==1
|
||
disp('Only one population in the input file');
|
||
disp('No admixture detected');
|
||
return
|
||
end
|
||
|
||
answers = inputdlg({['Input the minimum size of a population that will'...
|
||
' be taken into account when admixture is estimated.']},...
|
||
'Input minimum population size',1,{'5'});
|
||
if isempty(answers), return; end
|
||
alaRaja = str2double(answers{1,1});
|
||
[npops] = poistaLiianPienet(npops, rowsFromInd, alaRaja);
|
||
|
||
nloci = size(COUNTS,2);
|
||
ninds = size(data,1)/rowsFromInd;
|
||
|
||
answers = inputdlg({'Input number of iterations'},'Input',1,{'50'});
|
||
if isempty(answers), return; end
|
||
iterationCount = str2double(answers{1,1});
|
||
|
||
answers = inputdlg({'Input number of reference individuals from each population'},'Input',1,{'50'});
|
||
if isempty(answers), nrefIndsInPop = 50;
|
||
else nrefIndsInPop = str2double(answers{1,1});
|
||
end
|
||
|
||
answers = inputdlg({'Input number of iterations for reference individuals'},'Input',1,{'10'});
|
||
if isempty(answers), return; end
|
||
iterationCountRef = str2double(answers{1,1});
|
||
|
||
[cliq_data, sep_data, cliq_counts, component_mat] = createCliqData(data, gene_lengths, noalle, ...
|
||
linkage_model, rowsFromInd);
|
||
|
||
% Repeat: simulate clique frequencies and estimate proportions. Save the
|
||
% average proportions in "proportionsIt".
|
||
|
||
proportionsIt = zeros(ninds,npops);
|
||
for iterationNum = 1:iterationCount
|
||
disp(['Iter: ' num2str(iterationNum)]);
|
||
%allfreqs = simulateAllFreqs(noalle);
|
||
[cliq_freqs, sep_freqs] = simulateCliqFreqs(cliq_counts, noalle, component_mat, gene_lengths, linkage_model);
|
||
for ind=1:ninds
|
||
|
||
%omaFreqs = computePersonalAllFreqs(ind, data, allfreqs, rowsFromInd);
|
||
[ownCliqFreqs, ownSepFreqs] = computePersonalCliqueFreqs(ind, ...
|
||
cliq_data, cliq_freqs, sep_data, sep_freqs, rowsFromInd, gene_lengths, linkage_model);
|
||
osuusTaulu = zeros(1,npops);
|
||
if PARTITION(ind)==0
|
||
% Outlier individual
|
||
elseif PARTITION(ind)~=0
|
||
if PARTITION(ind)>0
|
||
osuusTaulu(PARTITION(ind)) = 1;
|
||
else
|
||
% Individuals who are not assigned to any cluster.
|
||
arvot = zeros(1,npops);
|
||
for q=1:npops
|
||
osuusTaulu = zeros(1,npops);
|
||
osuusTaulu(q) = 1;
|
||
arvot(q) = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, osuusTaulu);
|
||
end
|
||
[iso_arvo, isoimman_indeksi] = max(arvot);
|
||
osuusTaulu = zeros(1,npops);
|
||
osuusTaulu(isoimman_indeksi) = 1;
|
||
PARTITION(ind)=isoimman_indeksi;
|
||
end
|
||
logml = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, osuusTaulu);
|
||
|
||
for osuus = [0.5 0.25 0.05 0.01]
|
||
[osuusTaulu, logml] = searchBest(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml);
|
||
end
|
||
end
|
||
proportionsIt(ind,:) = proportionsIt(ind,:).*(iterationNum-1) + osuusTaulu;
|
||
proportionsIt(ind,:) = proportionsIt(ind,:)./iterationNum;
|
||
end
|
||
end
|
||
|
||
disp(['Creating ' num2str(nrefIndsInPop) ' reference individuals from ']);
|
||
disp('each population.');
|
||
|
||
%allfreqs = simulateAllFreqs(noalle); % Simuloidaan alleelifrekvenssisetti
|
||
%allfreqs = computeAllFreqs2(noalle); % Koitetaan t<>llaista.
|
||
%refData = simulateIndividuals(nrefIndsInPop,rowsFromInd,allfreqs);
|
||
|
||
exp_cliq_freqs = ...
|
||
computeExpectedFreqs(cliq_counts, noalle, component_mat, gene_lengths, linkage_model);
|
||
[ref_cliq_data, ref_sep_data] = ...
|
||
simulateLinkageIndividuals(nrefIndsInPop, rowsFromInd, exp_cliq_freqs, ...
|
||
gene_lengths, noalle, component_mat, linkage_model);
|
||
nrefInds = npops*nrefIndsInPop;
|
||
|
||
disp(['Analysing the reference individuals in ' num2str(iterationCountRef) ' iterations.']);
|
||
refProportions = zeros(nrefInds,npops);
|
||
for iter = 1:iterationCountRef
|
||
disp(['Iter: ' num2str(iter)]);
|
||
%allfreqs = simulateAllFreqs(noalle);
|
||
[cliq_freqs, sep_freqs] = simulateCliqFreqs(cliq_counts, noalle, component_mat, gene_lengths, linkage_model);
|
||
for ind = 1:nrefInds
|
||
%omaFreqs = computePersonalAllFreqs(ind, refData, allfreqs, rowsFromInd);
|
||
[ownCliqFreqs, ownSepFreqs] = computePersonalCliqueFreqs(ind, ...
|
||
ref_cliq_data, cliq_freqs, ref_sep_data, sep_freqs, rowsFromInd, gene_lengths, linkage_model);
|
||
osuusTaulu = zeros(1,npops);
|
||
pop = ceil(ind/nrefIndsInPop);
|
||
osuusTaulu(pop)=1;
|
||
%logml = computeIndLogml(omaFreqs, osuusTaulu);
|
||
logml = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, osuusTaulu);
|
||
for osuus = [0.5 0.25 0.05 0.01]
|
||
%[osuusTaulu, logml] = searchBest(osuus, osuusTaulu, omaFreqs, logml);
|
||
[osuusTaulu, logml] = searchBest(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml);
|
||
end
|
||
refProportions(ind,:) = refProportions(ind,:).*(iter-1) + osuusTaulu;
|
||
refProportions(ind,:) = refProportions(ind,:)./iter;
|
||
end
|
||
end
|
||
refTaulu = zeros(npops,100);
|
||
for ind = 1:nrefInds
|
||
pop = ceil(ind/nrefIndsInPop);
|
||
omanOsuus = refProportions(ind,pop);
|
||
if round(omanOsuus*100)==0
|
||
omanOsuus = 0.01;
|
||
end
|
||
if abs(omanOsuus)<1e-5
|
||
omanOsuus = 0.01;
|
||
end
|
||
refTaulu(pop, round(omanOsuus*100)) = refTaulu(pop, round(omanOsuus*100))+1;
|
||
end
|
||
|
||
% Rounding:
|
||
proportionsIt = proportionsIt.*100; proportionsIt = round(proportionsIt);
|
||
proportionsIt = proportionsIt./100;
|
||
for ind = 1:ninds
|
||
% if sum not equal to one, fix the largest part.
|
||
if (PARTITION(ind)>0) && (sum(proportionsIt(ind,:)) ~= 1)
|
||
[isoin,indeksi] = max(proportionsIt(ind,:));
|
||
erotus = sum(proportionsIt(ind,:))-1;
|
||
proportionsIt(ind,indeksi) = isoin-erotus;
|
||
end
|
||
end
|
||
|
||
% "p-value" for admixture
|
||
uskottavuus = zeros(ninds,1);
|
||
for ind = 1:ninds
|
||
pop = PARTITION(ind);
|
||
if pop==0 % an outlier
|
||
uskottavuus(ind)=1;
|
||
else
|
||
omanOsuus = proportionsIt(ind,pop);
|
||
if abs(omanOsuus)<1e-5
|
||
omanOsuus = 0.01;
|
||
end
|
||
if round(omanOsuus*100)==0
|
||
omanOsuus = 0.01;
|
||
end
|
||
refPienempia = sum(refTaulu(pop, 1:round(100*omanOsuus)));
|
||
uskottavuus(ind) = refPienempia / nrefIndsInPop;
|
||
end
|
||
end
|
||
|
||
tulostaAdmixtureTiedot(proportionsIt, uskottavuus, alaRaja, iterationCount);
|
||
|
||
%viewPartition(proportionsIt, popnames);
|
||
|
||
[filename, pathname] = uiputfile('*.mat','Save admixture results as');
|
||
if (filename == 0) & (pathname == 0)
|
||
% Cancel was pressed
|
||
return
|
||
end
|
||
|
||
% copy 'baps4_output.baps' into the text file with the same name.
|
||
if exist('baps4_output.baps','file')
|
||
copyfile('baps4_output.baps',[pathname filename '.txt'])
|
||
delete('baps4_output.baps')
|
||
end
|
||
|
||
tietue.proportionsIt = proportionsIt;
|
||
tietue.pvalue = uskottavuus; % Added by Jing
|
||
tietue.admixnpops = npops;
|
||
tietue.mixtureType = 'admix'; % added by jing on 09.09.2008
|
||
% save([pathname filename], 'tietue');
|
||
save([pathname filename], 'tietue','-v7.3'); % added by Lu Cheng, 08.06.2012
|
||
|
||
|
||
%----------------------------------------------------------------------------
|
||
|
||
|
||
function [npops] = poistaLiianPienet(npops, rowsFromInd, alaraja)
|
||
% Muokkaa tulokset muotoon, jossa outlier yksil<69>t on
|
||
% poistettu. Tarkalleen ottaen poistaa ne populaatiot,
|
||
% joissa on v<>hemm<6D>n kuin 'alaraja':n verran yksil<69>it?
|
||
|
||
global PARTITION;
|
||
global COUNTS;
|
||
global SUMCOUNTS;
|
||
|
||
popSize=zeros(1,npops);
|
||
for i=1:npops
|
||
popSize(i)=length(find(PARTITION==i));
|
||
end
|
||
miniPops = find(popSize<alaraja);
|
||
|
||
if length(miniPops)==0
|
||
return;
|
||
end
|
||
|
||
outliers = [];
|
||
for pop = miniPops
|
||
inds = find(PARTITION==pop);
|
||
disp('Removed individuals: ');
|
||
disp(num2str(inds));
|
||
outliers = [outliers; inds];
|
||
end
|
||
|
||
ninds = length(PARTITION);
|
||
PARTITION(outliers) = 0;
|
||
korit = unique(PARTITION(find(PARTITION>0)));
|
||
for n=1:length(korit)
|
||
kori = korit(n);
|
||
yksilot = find(PARTITION==kori);
|
||
PARTITION(yksilot) = n;
|
||
end
|
||
COUNTS(:,:,miniPops) = [];
|
||
SUMCOUNTS(miniPops,:) = [];
|
||
|
||
npops = npops-length(miniPops);
|
||
|
||
%------------------------------------------------------------------------
|
||
|
||
function clearGlobalVars
|
||
|
||
global COUNTS; COUNTS = [];
|
||
global SUMCOUNTS; SUMCOUNTS = [];
|
||
global PARTITION; PARTITION = [];
|
||
global POP_LOGML; POP_LOGML = [];
|
||
|
||
%--------------------------------------------------------
|
||
|
||
|
||
function allFreqs = computeAllFreqs2(noalle)
|
||
% Lis<69><73> a priori jokaista alleelia
|
||
% joka populaation joka lokukseen j 1/noalle(j) verran.
|
||
|
||
global COUNTS;
|
||
global SUMCOUNTS;
|
||
|
||
max_noalle = size(COUNTS,1);
|
||
nloci = size(COUNTS,2);
|
||
npops = size(COUNTS,3);
|
||
|
||
sumCounts = SUMCOUNTS+ones(size(SUMCOUNTS));
|
||
sumCounts = reshape(sumCounts', [1, nloci, npops]);
|
||
sumCounts = repmat(sumCounts, [max_noalle, 1 1]);
|
||
|
||
prioriAlleelit = zeros(max_noalle,nloci);
|
||
for j=1:nloci
|
||
prioriAlleelit(1:noalle(j),j) = 1/noalle(j);
|
||
end
|
||
prioriAlleelit = repmat(prioriAlleelit, [1,1,npops]);
|
||
counts = COUNTS + prioriAlleelit;
|
||
allFreqs = counts./sumCounts;
|
||
|
||
%--------------------------------------------------------------------------
|
||
|
||
|
||
function refData = simulateIndividuals(n,rowsFromInd,allfreqs)
|
||
% simuloidaan n yksil<69><6C> jokaisesta populaatiosta. (
|
||
|
||
npops = size(allfreqs,3);
|
||
nloci = size(allfreqs,2);
|
||
ninds = n*npops;
|
||
|
||
refData = zeros(ninds*rowsFromInd,nloci);
|
||
counter = 1; % Pit<69><74> kirjaa mille riville seuraavaksi simuloidaan.
|
||
|
||
for ind = 1:ninds
|
||
pop = ceil(ind/n);
|
||
for loc = 1:nloci
|
||
for k=0:rowsFromInd-1
|
||
refData(counter+k,loc) = simuloiAlleeli(allfreqs,pop,loc);
|
||
end
|
||
end
|
||
counter = counter+rowsFromInd;
|
||
end
|
||
|
||
function all = simuloiAlleeli(allfreqs,pop,loc)
|
||
% Simuloi populaation pop lokukseen loc alleelin.
|
||
freqs = allfreqs(:,loc,pop);
|
||
cumsumma = cumsum(freqs);
|
||
arvo = rand;
|
||
isommat = find(cumsumma>arvo);
|
||
all = min(isommat);
|
||
|
||
%---------------------------------------------------------------------------
|
||
|
||
|
||
function loggis = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, proportions)
|
||
% Calculates the likelihood when the origins are defined by "proportions".
|
||
|
||
aux = proportions * ownCliqFreqs;
|
||
aux = log(aux);
|
||
loggis = sum(aux);
|
||
|
||
clear aux;
|
||
|
||
aux2 = proportions * ownSepFreqs;
|
||
aux2 = log(aux2);
|
||
loggis = loggis - sum(aux2);
|
||
|
||
|
||
%--------------------------------------------------------------------------
|
||
|
||
|
||
function osuusTaulu = suoritaMuutos(osuusTaulu, osuus, indeksi)
|
||
% P<>ivitt<74><74> osuusTaulun muutoksen j<>lkeen.
|
||
|
||
global COUNTS;
|
||
npops = size(COUNTS,3);
|
||
|
||
i1 = rem(indeksi,npops);
|
||
if i1==0, i1 = npops; end;
|
||
i2 = ceil(indeksi / npops);
|
||
|
||
osuusTaulu(i1) = osuusTaulu(i1)-osuus;
|
||
osuusTaulu(i2) = osuusTaulu(i2)+osuus;
|
||
|
||
|
||
%-------------------------------------------------------------------------
|
||
|
||
|
||
function [osuusTaulu, logml] = searchBest(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml)
|
||
|
||
ready = 0;
|
||
while ready ~= 1
|
||
muutokset = calcChanges(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml);
|
||
[maxMuutos, indeksi] = max(muutokset(1:end));
|
||
if maxMuutos>0
|
||
osuusTaulu = suoritaMuutos(osuusTaulu, osuus, indeksi);
|
||
logml = logml + maxMuutos;
|
||
else
|
||
ready = 1;
|
||
end
|
||
end
|
||
|
||
|
||
|
||
%---------------------------------------------------------------------------
|
||
|
||
|
||
function muutokset = calcChanges(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml)
|
||
% Palauttaa npops*npops taulun, jonka alkio (i,j) kertoo, mik?on
|
||
% muutos logml:ss? mik<69>li populaatiosta i siirret<65><74>n osuuden verran
|
||
% todenn<6E>k<EFBFBD>isyysmassaa populaatioon j. Mik<69>li populaatiossa i ei ole
|
||
% mit<69><74>n siirrett<74>v<EFBFBD><76>, on vastaavassa kohdassa rivi nollia.
|
||
|
||
global COUNTS;
|
||
npops = size(COUNTS,3);
|
||
|
||
notEmpty = find(osuusTaulu>0.005);
|
||
muutokset = zeros(npops);
|
||
empties = ~notEmpty;
|
||
|
||
for i1=notEmpty
|
||
|
||
osuusTaulu(i1) = osuusTaulu(i1)-osuus;
|
||
|
||
for i2 = [1:i1-1 i1+1:npops]
|
||
osuusTaulu(i2) = osuusTaulu(i2)+osuus;
|
||
loggis = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, osuusTaulu);
|
||
muutokset(i1,i2) = loggis-logml;
|
||
osuusTaulu(i2) = osuusTaulu(i2)-osuus;
|
||
end
|
||
|
||
osuusTaulu(i1) = osuusTaulu(i1)+osuus;
|
||
end
|
||
|
||
|
||
%---------------------------------------------------------------
|
||
|
||
|
||
function dispLine
|
||
disp('---------------------------------------------------');
|
||
|
||
|
||
%--------------------------------------------------------------------------
|
||
|
||
|
||
function tulostaAdmixtureTiedot(proportions, uskottavuus, alaRaja, niter)
|
||
h0 = findobj('Tag','filename1_text');
|
||
inputf = get(h0,'String');
|
||
h0 = findobj('Tag','filename2_text');
|
||
outf = get(h0,'String'); clear h0;
|
||
|
||
if length(outf)>0
|
||
fid = fopen(outf,'a');
|
||
else
|
||
fid = -1;
|
||
diary('baps4_output.baps'); % save in text anyway.
|
||
end
|
||
|
||
ninds = length(uskottavuus);
|
||
npops = size(proportions,2);
|
||
disp(' ');
|
||
dispLine;
|
||
disp('RESULTS OF ADMIXTURE ANALYSIS BASED');
|
||
disp('ON MIXTURE CLUSTERING OF INDIVIDUALS');
|
||
disp(['Data file: ' inputf]);
|
||
disp(['Number of individuals: ' num2str(ninds)]);
|
||
disp(['Results based on ' num2str(niter) ' simulations from posterior allele frequencies.']);
|
||
disp(' ');
|
||
if fid ~= -1
|
||
fprintf(fid, '\n');
|
||
fprintf(fid,'%s \n', ['--------------------------------------------']); fprintf(fid, '\n');
|
||
fprintf(fid,'%s \n', ['RESULTS OF ADMIXTURE ANALYSIS BASED']); fprintf(fid, '\n');
|
||
fprintf(fid,'%s \n', ['ON MIXTURE CLUSTERING OF INDIVIDUALS']); fprintf(fid, '\n');
|
||
fprintf(fid,'%s \n', ['Data file: ' inputf]); fprintf(fid, '\n');
|
||
fprintf(fid,'%s \n', ['Number of individuals: ' num2str(ninds)]); fprintf(fid, '\n');
|
||
fprintf(fid,'%s \n', ['Results based on ' num2str(niter) ' simulations from posterior allele frequencies.']); fprintf(fid, '\n');
|
||
fprintf(fid, '\n');
|
||
end
|
||
|
||
ekaRivi = blanks(6);
|
||
for pop = 1:npops
|
||
ekaRivi = [ekaRivi blanks(3-floor(log10(pop))) num2str(pop) blanks(2)];
|
||
end
|
||
ekaRivi = [ekaRivi blanks(1) 'p']; % Added on 29.08.06
|
||
disp(ekaRivi);
|
||
for ind = 1:ninds
|
||
rivi = [num2str(ind) ':' blanks(4-floor(log10(ind)))];
|
||
if any(proportions(ind,:)>0)
|
||
for pop = 1:npops-1
|
||
rivi = [rivi proportion2str(proportions(ind,pop)) blanks(2)];
|
||
end
|
||
rivi = [rivi proportion2str(proportions(ind,npops)) ': '];
|
||
rivi = [rivi ownNum2Str(uskottavuus(ind))];
|
||
end
|
||
disp(rivi);
|
||
if fid ~= -1
|
||
fprintf(fid,'%s \n',[rivi]); fprintf(fid,'\n');
|
||
end
|
||
end
|
||
if fid ~= -1
|
||
fclose(fid);
|
||
else
|
||
diary off
|
||
end
|
||
|
||
%------------------------------------------------------
|
||
|
||
function str = proportion2str(prob)
|
||
%prob belongs to [0.00, 0.01, ... ,1].
|
||
%str is a 4-mark presentation of proportion.
|
||
|
||
if abs(prob)<1e-3
|
||
str = '0.00';
|
||
elseif abs(prob-1) < 1e-3;
|
||
str = '1.00';
|
||
else
|
||
prob = round(100*prob);
|
||
if prob<10
|
||
str = ['0.0' num2str(prob)];
|
||
else
|
||
str = ['0.' num2str(prob)];
|
||
end;
|
||
end;
|
||
|
||
%-------------------------------------------------------
|
||
|
||
function g=randga(a,b)
|
||
flag = 0;
|
||
if a>1
|
||
c1 = a-1; c2 = (a-(1/(6*a)))/c1; c3 = 2/c1; c4 = c3+2; c5 = 1/sqrt(a);
|
||
U1=-1;
|
||
while flag == 0,
|
||
if a<=2.5,
|
||
U1=rand;U2=rand;
|
||
else
|
||
while ~(U1>0 & U1<1),
|
||
U1=rand;U2=rand;
|
||
U1 = U2 + c5*(1-1.86*U1);
|
||
end %while
|
||
end %if
|
||
W = c2*U2/U1;
|
||
if c3*U1+W+(1/W)<=c4,
|
||
flag = 1;
|
||
g = c1*W/b;
|
||
elseif c3*log(U1)-log(W)+W<1,
|
||
flag = 1;
|
||
g = c1*W/b;
|
||
else
|
||
U1=-1;
|
||
end %if
|
||
end %while flag
|
||
elseif a==1
|
||
g=sum(-(1/b)*log(rand(a,1)));
|
||
else
|
||
while flag == 0,
|
||
U = rand(2,1);
|
||
if U(1)>exp(1)/(a+exp(1)),
|
||
g = -log(((a+exp(1))*(1-U(1)))/(a*exp(1)));
|
||
if U(2)<=g^(a-1),
|
||
flag = 1;
|
||
end %if
|
||
else
|
||
g = ((a+exp(1))*U(1)/((exp(1))^(1/a)));
|
||
if U(2)<=exp(-g),
|
||
flag = 1;
|
||
end %if
|
||
end %if
|
||
end %while flag
|
||
g=g/b;
|
||
end %if;
|
||
|
||
|
||
%-------------------------------------------------
|
||
|
||
function svar=randdir(counts,nc)
|
||
% K<>ytt<74>esim randdir([10;30;60],3)
|
||
|
||
svar=zeros(nc,1);
|
||
for i=1:nc
|
||
svar(i,1)=randga(counts(i,1),1);
|
||
end
|
||
svar=svar/sum(svar);
|
||
|
||
%--------------------------------------------------
|
||
|
||
function waitALittle
|
||
A = rand(500);
|
||
gammaln(A);
|
||
|
||
|
||
%--------------------------------------------------
|
||
|
||
function [cliq_data, sep_data, cliq_counts, component_mat] = ...
|
||
createCliqData(data, gene_lengths, noalle, linkage_model, ...
|
||
rowsFromInd)
|
||
% cliq_data: cell array, each cell corresponds to one gene. Element (i,j)
|
||
% in cell k is the code of the allele combination in the j:th clique in
|
||
% gene k, for individual i.
|
||
|
||
% sep_data: like cliq_data. i:th separator separates cliques i and i+1.
|
||
|
||
% cliq_counts: cell array, each cell corresponds to one gene. Each cell is
|
||
% a 3-dimensional array, where the element (i,j,k) is the observed count of
|
||
% allele combination i, in clique j, in population k.
|
||
|
||
% The coding of the allele combinations: If a clique of 3 sites has
|
||
% noalle values 3,2,4, then the allele combinations are given numbers in
|
||
% lexicographic order: 111, 112, 113, 114, 121, 122, ..., 324.
|
||
|
||
%------------------------------------------------------------------------
|
||
|
||
% cliq_data on cell-array, jossa kukin solu vastaa yht?geeni? Alkio (i,j)
|
||
% solussa k merkitsee sen alleelikombinaation koodia, joka yksil<69>ll?i
|
||
% havaitaan geenin k klikiss?numero j.
|
||
|
||
% cliq_counts on cell-array, jossa my<6D>s kukin solu vastaa yht?geeni?
|
||
% Kukin solu on kolmiulotteinen taulukko, jonka alkio (i,j,k) on
|
||
% populaatiossa k, ko geenin j:nness?klikiss?havaitun alleelikombinaation
|
||
% i lukum<75><6D>r?
|
||
|
||
% Alleelikombinaatioiden koodaus: Jos kolmen position klikiss?on (koko
|
||
% datassa) noalle:t 3,2,4, (eli ekassa positiossa alleelit 1-3, tokassa 1-2
|
||
% ja kolmannessa alleelit 1-4), niin alleelikombinaatiot numeroidaan
|
||
% leksikograafisessa j<>rjestyksess? 111, 112, 113, 114, 121, 122, ...,
|
||
% 324.
|
||
|
||
%-----------------------------------------------------------------------
|
||
|
||
global PARTITION;
|
||
|
||
if sum(gene_lengths) ~= size(data,2)
|
||
disp('Error 155');
|
||
end
|
||
if ~isa(data,'double')
|
||
data = double(data); % Required in matlab 6
|
||
end
|
||
|
||
ninds = size(data,1);
|
||
n_genes = length(gene_lengths);
|
||
if strcmp(linkage_model,'codon')
|
||
n_cliques = gene_lengths-2; % Number of cliques in each gene
|
||
else
|
||
n_cliques = gene_lengths-1; % Use linear model
|
||
end
|
||
max_noalle = zeros(n_genes,1); % Maximum "clique noalle" in each gene.
|
||
|
||
component_mat = zeros(n_genes, max(gene_lengths));
|
||
cum_length = cumsum(gene_lengths);
|
||
component_mat(1,1:gene_lengths(1))=1:gene_lengths(1);
|
||
for i = 2:n_genes
|
||
component_mat(i,1:gene_lengths(i)) = cum_length(i-1)+1:cum_length(i);
|
||
end
|
||
|
||
for i = 1:n_genes
|
||
% What is the largest number of different values that are observed for
|
||
% some clique in this gene:
|
||
number = 0;
|
||
if n_cliques(i)<1
|
||
% the gene is shorter than a normal clique.
|
||
if gene_lengths(i)==2
|
||
% linkage_model must be 'codon' to end up here..
|
||
positions = component_mat(i, [1 2]);
|
||
number = prod(noalle(positions));
|
||
else
|
||
% gene_lengths(i) == 1
|
||
positions = component_mat(i,1);
|
||
number = noalle(positions);
|
||
end
|
||
else
|
||
for j = 1:n_cliques(i)
|
||
if strcmp(linkage_model,'codon'), positions = component_mat(i , j:j+2);
|
||
else positions = component_mat(i, j:j+1);
|
||
end
|
||
|
||
cand = prod(noalle(positions));
|
||
if cand>number
|
||
number=cand;
|
||
end
|
||
end
|
||
end
|
||
max_noalle(i) = number;
|
||
end
|
||
|
||
cliq_data = cell(n_genes,1); % An array for each gene.
|
||
% (i,j) is the combination which individual i has in clique j (in haploid case..).
|
||
|
||
for i = 1:n_genes
|
||
if n_cliques(i)<1
|
||
cliq_data{i} = zeros(ninds, 1);
|
||
if gene_lengths(i)==2
|
||
% linkage_model must be 'codon' to end up here..
|
||
positions = component_mat(i, [1 2]);
|
||
rows = data(:,positions);
|
||
observations = (rows(:,1)-1) * noalle(positions(2)) + rows(:,2);
|
||
else
|
||
positions = component_mat(i,1);
|
||
rows = data(:,positions);
|
||
observations = rows;
|
||
end
|
||
cliq_data{i}(:,1) = observations;
|
||
else
|
||
cliq_data{i} = zeros(ninds, n_cliques(i));
|
||
for j = 1:n_cliques(i)
|
||
if strcmp(linkage_model,'codon')
|
||
positions = component_mat(i,j:j+2);
|
||
rows = data(:, positions);
|
||
observations = (rows(:,1)-1) * prod(noalle(positions(2:3))) + ...
|
||
(rows(:,2)-1) * noalle(positions(3)) + rows(:,3);
|
||
else
|
||
positions = component_mat(i,j:j+1);
|
||
rows = data(:,positions);
|
||
observations = (rows(:,1)-1) * noalle(positions(2)) + rows(:,2);
|
||
end
|
||
cliq_data{i}(:,j) = observations;
|
||
end
|
||
end
|
||
end
|
||
|
||
cliq_counts = cell(n_genes,1);
|
||
% (i,j,k) is the count of combination i, in clique j, in population k.
|
||
|
||
npops = length(unique(PARTITION));
|
||
for i = 1:n_genes
|
||
cliq_counts{i} = zeros(max_noalle(i), max(1,n_cliques(i)), npops);
|
||
for j = 1:npops
|
||
partition = repmat(PARTITION', [rowsFromInd 1]);
|
||
partition = partition(:); % Partition for rows in the data (instead of individuals).
|
||
inds_now = find(partition==j);
|
||
ninds_now = length(inds_now);
|
||
data_now = cliq_data{i}(inds_now,:);
|
||
|
||
for k = 1:max(n_cliques(i),1)
|
||
apu = zeros(ninds_now, max_noalle(i));
|
||
apu(sub2ind([ninds_now max_noalle(i)],...
|
||
(1:ninds_now)', data_now(:,k)))=1;
|
||
cliq_counts{i}(:, k, j) = (sum(apu,1))';
|
||
end
|
||
end
|
||
end
|
||
|
||
sep_data = cell(n_genes,1);
|
||
n_separators = n_cliques-1;
|
||
for i = 1:n_genes
|
||
sep_data{i} = zeros(ninds, n_separators(i));
|
||
for j = 1:n_separators(i)
|
||
if strcmp(linkage_model, 'codon')
|
||
positions = component_mat(i,j+1:j+2);
|
||
rows = data(:, positions);
|
||
observations = (rows(:,1)-1) * noalle(positions(2)) + rows(:,2);
|
||
else
|
||
positions = component_mat(i,j+1);
|
||
rows = data(:,positions);
|
||
observations = rows;
|
||
end
|
||
sep_data{i}(:,j) = observations;
|
||
end
|
||
end
|
||
|
||
|
||
%------------------------------------------------------
|
||
|
||
|
||
function [cliq_freqs, sep_freqs] = simulateCliqFreqs(cliq_counts, noalle, component_mat, ...
|
||
gene_lengths, linkage_model)
|
||
|
||
% cliq_freqs: cell-array. Element (i,j,k) in cell m is the frequence of
|
||
% combination i, in clique j of the m:th gene, in population k.
|
||
|
||
% sep_freqs: like cliq_freqs, but for the separators.
|
||
|
||
%------------------------------------------------------------------------
|
||
|
||
% cliq_freqs: cell-array, jossa on vastaavat dimensiot kuin cliq_counts:issa.
|
||
% solun m alkio (i,j,k) on geenin m, klikin j, kombinaation i, frekvenssi
|
||
% populaatiossa k.
|
||
|
||
% sep_freqs: cell-array, kuten cl_freqs, mutta separaattoreille.
|
||
|
||
%-------------------------------------------------------------------------
|
||
|
||
global PARTITION;
|
||
|
||
n_genes = length(cliq_counts);
|
||
if strcmp(linkage_model,'codon')
|
||
n_cliques = gene_lengths-2; % Number of cliques in each gene
|
||
else
|
||
n_cliques = gene_lengths-1; % Use linear model
|
||
end
|
||
npops = length(unique(PARTITION));
|
||
|
||
cliq_freqs = cell(n_genes,1);
|
||
sep_freqs = cell(n_genes,1);
|
||
|
||
for i=1:n_genes
|
||
|
||
cliq_freqs{i} = zeros(size(cliq_counts{i}));
|
||
|
||
positions = component_mat(i,1:gene_lengths(i));
|
||
|
||
if n_cliques(i)<1
|
||
% the gene is shorter than a normal clique.
|
||
if gene_lengths(i)==2
|
||
% linkage_model must be 'codon' to end up here..
|
||
cliq_noalle = noalle(positions(1)) .* noalle(positions(2));
|
||
sep_noalle = [];
|
||
else
|
||
% gene_lengths(i) == 1
|
||
cliq_noalle = noalle(positions(1));
|
||
sep_noalle = [];
|
||
end
|
||
sep_freqs{i} = [];
|
||
else
|
||
if strcmp(linkage_model, 'codon')
|
||
cliq_noalle = noalle(positions(1:end-2)) .* noalle(positions(2:end-1)) .* ...
|
||
noalle(positions(3:end));
|
||
sep_noalle = noalle(positions(2:end-2)) .* noalle(positions(3:end-1));
|
||
else
|
||
cliq_noalle = noalle(positions(1:end-1)) .* noalle(positions(2:end));
|
||
sep_noalle = noalle(positions(2:end-1));
|
||
end
|
||
sep_freqs{i} = zeros(max(sep_noalle), n_cliques(i)-1, npops);
|
||
end
|
||
|
||
% First clique:
|
||
prior = (1 / cliq_noalle(1)) * ones(cliq_noalle(1),1);
|
||
counts_now = repmat(prior, [1 1 npops]) + cliq_counts{i}(1:cliq_noalle(1),1,:);
|
||
for k = 1:npops
|
||
simul = randdir(counts_now(:,1,k), cliq_noalle(1));
|
||
cliq_freqs{i}(1:cliq_noalle(1),1,k) = simul;
|
||
end
|
||
|
||
for j=2:n_cliques(i)
|
||
% Obtain freqs for j-1:th separator by marginalization from j-1:th
|
||
% clique, and draw values for the frequencies of the j:th clique:
|
||
|
||
aux = cliq_freqs{i}(1:cliq_noalle(j-1), j-1, :); % Freqs of the previous clique
|
||
aux = reshape(aux, [sep_noalle(j-1), noalle(positions(j-1)), npops]);
|
||
|
||
% Freqs for separator by marginalization from the previous clique:
|
||
sep_freqs{i}(1:sep_noalle(j-1),j-1,:) = sum(aux,2);
|
||
|
||
prior = (1 / cliq_noalle(j)) * ones(cliq_noalle(j),1);
|
||
counts_now = repmat(prior, [1 1 npops]) + cliq_counts{i}(1:cliq_noalle(j),j,:);
|
||
for k = 1:npops
|
||
% Simulate conditional frequencies:
|
||
for m = 1:sep_noalle(j-1)
|
||
if strcmp(linkage_model, 'codon')
|
||
values = (m-1)*noalle(positions(j+2))+1:m*noalle(positions(j+2));
|
||
else
|
||
values = (m-1)*noalle(positions(j+1))+1:m*noalle(positions(j+1));
|
||
end
|
||
simul = randdir(counts_now(values,1,k), length(values));
|
||
cliq_freqs{i}(values,j,k) = simul * sep_freqs{i}(m,j-1,k); % MIETI TARKKAAN!
|
||
end
|
||
end
|
||
end
|
||
end
|
||
|
||
|
||
%--------------------------------------------------------------------
|
||
|
||
|
||
function [ownCliqFreqs, ownSepFreqs] = computePersonalCliqueFreqs(...
|
||
ind, cl_data, cl_freqs, sep_data, sep_freqs, rowsFromInd, ...
|
||
gene_lengths, linkage_model)
|
||
|
||
% ownCliqFreqs is (npops * (n_cliques*rowsFromInd)) table, where each column
|
||
% contains the frequencies of the corresponding clique_combination, in
|
||
% different populations.
|
||
|
||
% ownSepFreqs is (npops * (n_seps*rowsFromInd)) table, like ownCliqFreqs.
|
||
|
||
n_genes = length(gene_lengths);
|
||
if strcmp(linkage_model,'codon')
|
||
n_cliques = gene_lengths-2;
|
||
n_cliques = max([n_cliques ones(n_genes,1)], [], 2); % for genes shorter than clique
|
||
else
|
||
n_cliques = gene_lengths-1; % Use linear model.
|
||
n_cliques = max([n_cliques ones(n_genes,1)], [], 2);
|
||
end
|
||
|
||
total_n_cliques = sum(n_cliques);
|
||
npops = size(cl_freqs{1},3);
|
||
|
||
ownCliqFreqsXX = zeros(1, total_n_cliques*rowsFromInd, npops);
|
||
|
||
pointer = 1;
|
||
for i = 1:n_genes
|
||
ind_data = cl_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd , :);
|
||
for j = 1:n_cliques(i) % MUUTA!
|
||
for k = 1:rowsFromInd
|
||
code = ind_data(k,j);
|
||
ownCliqFreqsXX(1,pointer,:) = cl_freqs{i}(code,j,:);
|
||
pointer = pointer+1;
|
||
end
|
||
end
|
||
end
|
||
|
||
ownCliqFreqs = (squeeze(ownCliqFreqsXX))';
|
||
|
||
n_separators = n_cliques-1;
|
||
total_n_separators = sum(n_separators);
|
||
|
||
ownSepFreqsXX = zeros(1, total_n_separators*rowsFromInd, npops);
|
||
|
||
pointer = 1;
|
||
for i = 1:n_genes
|
||
ind_data = sep_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd , :);
|
||
for j = 1:n_separators(i)
|
||
for k = 1:rowsFromInd
|
||
code = ind_data(k,j);
|
||
ownSepFreqsXX(1,pointer,:) = sep_freqs{i}(code,j,:);
|
||
pointer = pointer+1;
|
||
end
|
||
end
|
||
end
|
||
|
||
if (total_n_separators*rowsFromInd)==1
|
||
ownSepFreqs = (squeeze(ownSepFreqsXX));
|
||
else
|
||
ownSepFreqs = (squeeze(ownSepFreqsXX))';
|
||
end
|
||
|
||
|
||
%-------------------------------------------------------------------------
|
||
|
||
|
||
function exp_cliq_freqs = computeExpectedFreqs(cliq_counts, ...
|
||
noalle, component_mat, gene_lengths, linkage_model)
|
||
|
||
% Returns the expected values for the clique and separator frequencies in
|
||
% different populations.
|
||
|
||
% exp_cliq_freqs: cell-array. Element (i,j,k) in cell m is the expected
|
||
% frequence of combination i, in clique j of the m:th gene, in
|
||
% population k.
|
||
|
||
n_genes = length(gene_lengths);
|
||
|
||
if strcmp(linkage_model, 'codon')
|
||
n_cliques = gene_lengths-2;
|
||
else
|
||
n_cliques = gene_lengths-1; % Linear model
|
||
end
|
||
|
||
npops = size(cliq_counts{1},3);
|
||
exp_cliq_freqs = cell(n_genes,1);
|
||
|
||
for i = 1:n_genes
|
||
|
||
exp_cliq_freqs{i} = zeros(size(cliq_counts{i}));
|
||
positions = component_mat(i,1:gene_lengths(i));
|
||
|
||
if n_cliques(i)<1
|
||
% the gene is shorter than a normal clique.
|
||
if gene_lengths(i)==2
|
||
% linkage_model must be 'codon' to end up here..
|
||
cliq_noalle = noalle(positions(1)) .* noalle(positions(2));
|
||
else
|
||
% gene_lengths(i) == 1
|
||
cliq_noalle = noalle(positions(1));
|
||
end
|
||
else
|
||
if strcmp(linkage_model, 'codon')
|
||
cliq_noalle = noalle(positions(1:end-2)) .* noalle(positions(2:end-1)) .* ...
|
||
noalle(positions(3:end));
|
||
else
|
||
cliq_noalle = noalle(positions(1:end-1)) .* noalle(positions(2:end));
|
||
end
|
||
end
|
||
|
||
for j = 1:max(1, n_cliques(i))
|
||
prior = (1 / cliq_noalle(j)) * ones(cliq_noalle(j),1);
|
||
counts_now = repmat(prior, [1 1 npops]) + cliq_counts{i}(1:cliq_noalle(j),j,:);
|
||
exp_cliq_freqs{i}(1:cliq_noalle(j),j,:) = ...
|
||
counts_now ./ repmat(sum(counts_now,1), [cliq_noalle(j) 1 1]);
|
||
end
|
||
end
|
||
|
||
|
||
%----------------------------------------------------------
|
||
|
||
|
||
function [ref_cliq_data, ref_sep_data] = ...
|
||
simulateLinkageIndividuals(n, rowsFromInd, exp_cliq_freqs, gene_lengths, ...
|
||
noalle, component_mat, linkage_model)
|
||
|
||
% Simulates n individuals from each population using expected frequencies
|
||
% for cliques and separators.
|
||
|
||
% ref_cliq_data: cell array, each cell corresponds to one gene. Elements
|
||
% ((i-1)*rowsFromInd+1:i*rowsFromInd, j) in cell k are the codes of the allele
|
||
% combinations in the j:th clique in gene k, for individual i.
|
||
|
||
% ref_sep_data: like ref_cliq_data. i:th separator separates cliques i and i+1.
|
||
|
||
n_genes = length(gene_lengths);
|
||
if strcmp(linkage_model,'codon')
|
||
n_cliques = gene_lengths-2;
|
||
else
|
||
n_cliques = gene_lengths-1; % Linear model
|
||
end
|
||
npops = size(exp_cliq_freqs{1},3);
|
||
ninds = n*npops;
|
||
|
||
ref_cliq_data = cell(n_genes,1);
|
||
ref_sep_data = cell(n_genes,1);
|
||
|
||
for i = 1:n_genes
|
||
ref_cliq_data{i} = zeros(ninds*rowsFromInd, max(n_cliques(i),1)); % Added: rowsFromInd
|
||
|
||
positions = component_mat(i,1:gene_lengths(i));
|
||
|
||
if strcmp(linkage_model,'codon')
|
||
sep_noalle = noalle(positions(2:end-2)) .* noalle(positions(3:end-1));
|
||
else
|
||
sep_noalle = noalle(positions(2:end-1));
|
||
end
|
||
ref_sep_data{i} = zeros(ninds*rowsFromInd, n_cliques(i)-1); % Added: rowsFromInd
|
||
|
||
for ind = 1:ninds
|
||
pop = ceil(ind/n);
|
||
|
||
% First clique:
|
||
freqs = exp_cliq_freqs{i}(:,1,pop);
|
||
freqs = repmat(freqs, [1 rowsFromInd]);
|
||
codes = simulateCodes(freqs);
|
||
ref_cliq_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd, 1) = codes;
|
||
|
||
for j = 2:n_cliques(i)
|
||
previous_cliq = ref_cliq_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd, j-1);
|
||
|
||
% Value for j-1:th separator:
|
||
sep_codes = rem(previous_cliq, sep_noalle(j-1));
|
||
sep_codes(find(sep_codes==0)) = sep_noalle(j-1);
|
||
ref_sep_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd, j-1) = sep_codes;
|
||
|
||
% Value for j:th clique:
|
||
if strcmp(linkage_model,'codon')
|
||
freqs = zeros(noalle(positions(j+2)),rowsFromInd);
|
||
values = zeros(noalle(positions(j+2)),rowsFromInd);
|
||
for k = 1:rowsFromInd
|
||
values(:,k) = ((sep_codes(k)-1)*noalle(positions(j+2))+1 : sep_codes(k)*noalle(positions(j+2)))';
|
||
freqs(:,k) = exp_cliq_freqs{i}(values(:,k), j, pop);
|
||
end
|
||
freqs = freqs ./ repmat(sum(freqs,1), [noalle(positions(j+2)) 1]);
|
||
else
|
||
freqs = zeros(noalle(positions(j+1)),rowsFromInd);
|
||
values = zeros(noalle(positions(j+1)),rowsFromInd);
|
||
for k = 1:rowsFromInd
|
||
values(:,k) = ((sep_codes(k)-1)*noalle(positions(j+1))+1 : sep_codes(k)*noalle(positions(j+1)))';
|
||
freqs(:,k) = exp_cliq_freqs{i}(values(:,k), j, pop);
|
||
end
|
||
freqs = freqs ./ repmat(sum(freqs,1), [noalle(positions(j+1)) 1]);
|
||
end
|
||
codes = simulateCodes(freqs);
|
||
codes = values(sub2ind(size(values),codes,(1:rowsFromInd)'));
|
||
ref_cliq_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd, j) = codes;
|
||
end
|
||
end
|
||
end
|
||
|
||
function codes = simulateCodes(freqs)
|
||
% Freqs is a table where each column is a distribution. The
|
||
% number of columns in freqs must be equal to rowsFromInd.
|
||
% A value is drawn from each distribution in different columns. The values
|
||
% are saved in codes, which is (rowsFromInd*1) table.
|
||
|
||
[nrows, rowsFromInd] = size(freqs);
|
||
codes = nrows+1-sum(cumsum(freqs)>repmat(rand(1,rowsFromInd),[nrows 1]),1);
|
||
codes = codes'; |