71 lines
2.3 KiB
Mathematica
71 lines
2.3 KiB
Mathematica
|
|
function [data, totalmissing] = makecomplete(data)
|
||
|
|
%MAKECOMPLETE ESTIMATING missing alleles in a loci.
|
||
|
|
% Input:
|
||
|
|
% data: a baps data format, e.g. last column is index. Missing values
|
||
|
|
% are denoted as any non-positive integers, commonly as 0 or -999.
|
||
|
|
|
||
|
|
|
||
|
|
[nrows, ncols] = size(data);
|
||
|
|
id = all(data>0);
|
||
|
|
if all(id) % no missing values
|
||
|
|
return
|
||
|
|
else
|
||
|
|
fprintf(1,'Estimating the missing values...');
|
||
|
|
|
||
|
|
% remove the totally missing loci
|
||
|
|
missingloci = find(id==0);
|
||
|
|
totalmissing = find(all(data(:,missingloci)<=0));
|
||
|
|
if ~isempty(totalmissing)
|
||
|
|
disp('Totally missing loci were found.');
|
||
|
|
data(:,missingloci(totalmissing)) = ones(nrows,length(totalmissing));
|
||
|
|
end
|
||
|
|
% goodloci = setdiff([1:ncols],missingloci(totalmissing));
|
||
|
|
% data = data(:,goodloci);
|
||
|
|
% missingloci = find(all(data>0)==0);
|
||
|
|
|
||
|
|
data_in = data(:,[missingloci end]);
|
||
|
|
[counts,noalle,prior,adjprior,rawalleles,data_in] = allfreqsnew3(data_in);
|
||
|
|
data(:,missingloci) = data_in(:,[1:end-1]);
|
||
|
|
|
||
|
|
s_counts = sum(counts,3);
|
||
|
|
% m_counts = size(counts,3)*ones(1, size(counts,2)) - sum(s_counts); % counts of missing values
|
||
|
|
m_counts = nrows*ones(1, size(counts,2)) - sum(s_counts); % counts of missing values
|
||
|
|
m = repmat(m_counts,size(s_counts,1),1);
|
||
|
|
a = s_counts + prior;
|
||
|
|
a = a./repmat(sum(a),size(a,1),1);
|
||
|
|
c = round(m.*a);
|
||
|
|
|
||
|
|
% Be sure that the sum matches
|
||
|
|
reassign = find(sum(c)~=m_counts);
|
||
|
|
for i = reassign
|
||
|
|
dif = sum(c(:,i)) - m_counts(:,i);
|
||
|
|
if dif > 0
|
||
|
|
remove = find(c(:,i)==max(c(:,i)));
|
||
|
|
c(remove(1),i)=c(remove(1),i) - dif; % decrease by the diff
|
||
|
|
else
|
||
|
|
add = find(c(:,i)==min(c(:,i)));
|
||
|
|
c(add(1),i)=c(add(1),i) - dif; % increase by the diff
|
||
|
|
end
|
||
|
|
end
|
||
|
|
try
|
||
|
|
% Fill in the missing values
|
||
|
|
for i = 1:length(missingloci)
|
||
|
|
missingrows = find(data(:,missingloci(i))<=0);
|
||
|
|
p = randperm(m_counts(i));
|
||
|
|
cum = [0 cumsum(c(:,i))'];
|
||
|
|
for j = 1:size(c,1)
|
||
|
|
data(missingrows(p([cum(j)+1:cum(j+1)])),missingloci(i)) = j;
|
||
|
|
end
|
||
|
|
end
|
||
|
|
catch
|
||
|
|
% disp('*** ERROR: in completing the missing data.');
|
||
|
|
data = [];
|
||
|
|
totalmissing = [];
|
||
|
|
return
|
||
|
|
end
|
||
|
|
fprintf(1,'Finished.\n');
|
||
|
|
end
|
||
|
|
|
||
|
|
|
||
|
|
|