ourMELONS/matlab/linkage/transform5.m
2019-12-16 16:47:21 +01:00

265 lines
10 KiB
Matlab

function [data_clique, data_separator, noalle_clique, noalle_separator, alphabet_cq, alphabet_sp, informative_clique, informative_separator] = ...
transform5(data, component_mat, varargin)
% Filename: transform4.m (modified, Lu Cheng, 17.02.2010)
% [data_clique,data_clique,noalle_clique, noalle_separator] = tranform4(data, component_mat, linkage_model)
% DATA SHRINKAGE STEP IN THE END TO SAVE MEMORY USE
% Further shrink the data using uint8 format.
% Make sure that allele numbers smaller than 256.
% Modified by Lu Cheng from transform4.m, 17.02.2010
% New: The alphabet at each locus is stored in alphabet_cq and alphabet_sp
% And here we also assume there is no missing data
linkage_model = 'codon';
if strcmp(linkage_model,'linear')
nindividuals = size(data,1);
nloci = size(data,2)-1;
% if ~all(diag(adj_mat)), adj_mat = adj_mat + diag(ones(nloci,1));, end;
% [p,p,r]=dmperm(adj_mat);
% UNSOLVED --- Need to find a better algorithm to count the numbers of
% separators ,cliques and singletons.
ncomponents = size(component_mat,1);
cardinality = sum(component_mat>0,2)';
ncliques = sum(cardinality-1);
nsingletons = sum(cardinality==1);
nseparators = sum(cardinality(find(cardinality>1))-2);
data_clique = zeros(nindividuals, ncliques+nsingletons,'uint16');
noalle_clique = zeros(ncliques+nsingletons,1,'uint16');
data_separator = zeros(nindividuals, nseparators,'uint16');
noalle_separator = zeros(nseparators,1,'uint16');
% data_separator = []; % used for vectorization
k = 1;
for i = 1:ncomponents
if (cardinality(i)==1) % singleton
data_clique(:,k) = data(:,component_mat(i,1));
temp = unique(data(:,component_mat(i,1)));
if temp(1) == 0 % missing data found
noalle_clique(k) = size(temp,1) -1;
else
noalle_clique(k) = size(temp,1);
end
k = k+1;
else
for j = 1:cardinality(i)-1
% dealing with missing data
% nonmissing = find(all(data(:,[component_mat(i,j) component_mat(i,j+1)]),2));
% [b,m,n] = unique(data(nonmissing,[component_mat(i,j) component_mat(i,j+1)]),'rows');
% data_clique(nonmissing,k) = n;
% noalle_clique(k) = size(unique(data(nonmissing,component_mat(i,j))),1)*...
% size(unique(data(nonmissing,component_mat(i,j+1))),1);
[b,m,n] = unique(data(:,[component_mat(i,j) component_mat(i,j+1)]),'rows');
data_clique(:,k) = n;
noalle_clique(k) = size(unique(data(:,component_mat(i,j))),1)*...
size(unique(data(:,component_mat(i,j+1))),1);
k = k+1;
end
end
end
k = 1;
for i = 1:ncomponents
if (cardinality(i)>2)
for j = 2:cardinality(i)-1
data_separator(:,k) = data(:,component_mat(i,j));
k = k+1;
end
end
end
if (k-1)~=nseparators
error('ERROR in transform.m');
return
end
for i = 1:nseparators
% nonmissing = find(all(data_separator(:,i),2));
% noalle_separator(i) = length(unique(data_separator(nonmissing,i)));
noalle_separator(i) = length(unique(data_separator(:,i)));
end
elseif strcmp(linkage_model,'codon')
nindividuals = size(data,1);
nloci = size(data,2)-1;
ncomponents = size(component_mat,1);
cardinality = sum(component_mat>0,2)'; % length of each gene
ncliques = sum(cardinality(cardinality>2)-2);
nsingletons = sum(cardinality==1 | cardinality ==2);
nseparators = sum(cardinality(cardinality>2)-3);
data_clique = zeros(nindividuals,ncliques+nsingletons,'uint16');
noalle_clique = zeros(ncliques+nsingletons,1,'uint16');
data_separator = zeros(nindividuals,nseparators,'uint16');
noalle_separator = zeros(nseparators,1,'uint16');
alphabet_cq = cell(1,ncliques+nsingletons);
alphabet_sp = cell(1,nseparators);
k = 1;
for i = 1:ncomponents
if (cardinality(i)==1) % singleton
[b, m, n] = unique(data(:,component_mat(i,1)));
alphabet_cq{k} = b;
data_clique(:,k) = unit16(n);
% temp = unique(data(:,component_mat(i,1)));
% if temp(1) == 0 % missing data found
% noalle_clique(k) = size(temp,1) -1;
% else
% noalle_clique(k) = size(temp,1);
% end
k = k+1;
elseif (cardinality(i)==2) % transform to linear linkage
% dealing with missing data
% nonmissing = find(all(data(:,[component_mat(i,1) component_mat(i,2)]),2));
% [b,m,n] = unique(data(nonmissing,[component_mat(i,1) component_mat(i,2)]),'rows');
[b,m,n] = unique(data(:,[component_mat(i,1) component_mat(i,2)]),'rows');
data_clique(:,k) = uint16(n);
alphabet_cq{k} = b;
% data_clique(nonmissing,k) = n;
% noalle_clique(k) = size(unique(data(nonmissing,component_mat(i,1))),1)*...
% size(unique(data(nonmissing,component_mat(i,2))),1);
noalle_clique(k) = size(unique(data(:,component_mat(i,1))),1)*...
size(unique(data(:,component_mat(i,2))),1);
k = k+1;
else
for j = 1:cardinality(i)-2
% dealing with missing data
% nonmissing = find(all(data(:,[component_mat(i,j) component_mat(i,j+1) component_mat(i,j+2)]),2));
% [b,m,n] = unique(data(nonmissing,[component_mat(i,j) component_mat(i,j+1) component_mat(i,j+2)]),'rows');
% data_clique(nonmissing,k) = n;
[b,m,n] = unique(data(:,component_mat(i,j:j+2)),'rows');
data_clique(:,k) = uint16(n);
alphabet_cq{k} = b;
% noalle_clique(k) = size(unique(data(nonmissing,component_mat(i,j))),1)*...
% size(unique(data(nonmissing,component_mat(i,j+1))),1)*size(unique(data(nonmissing,component_mat(i,j+2))),1);
noalle_clique(k) = size(unique(data(:,component_mat(i,j))),1)*...
size(unique(data(:,component_mat(i,j+1))),1)*size(unique(data(:,component_mat(i,j+2))),1);
k = k+1;
end
end
end
k = 1;
for i = 1:ncomponents
if (cardinality(i)>2)
for j = 2:cardinality(i)-2
% dealing with missing data
% nonmissing = find(all(data(:,[component_mat(i,j) component_mat(i,j+1)]),2));
% [b,m,n] = unique(data(nonmissing,[component_mat(i,j) component_mat(i,j+1)]),'rows');
[b,m,n] = unique(data(:,[component_mat(i,j) component_mat(i,j+1)]),'rows');
% data_separator(nonmissing,k) = n;
data_separator(:,k) = uint16(n);
alphabet_sp{k} = b;
% noalle_separator(k) = size(unique(data(nonmissing,component_mat(i,j))),1)*...
% size(unique(data(nonmissing,component_mat(i,j+1))),1);
noalle_separator(k) = size(unique(data(:,component_mat(i,j))),1)*...
size(unique(data(:,component_mat(i,j+1))),1);
k = k+1;
end
end
end
elseif strcmpi(linkage_model,'Independent')
data_clique = data(:,[1:end-1]);
data_separator = [];
noalle_clique = max(data_clique); % Be sure the data is normalized.
noalle_separator = [];
elseif strcmp(linkage_model,'random')
adj_mat = mk_adjmat(component_mat);
nloci = size(adj_mat,1);
ns = 4*ones(1,nloci);
porder = [];
stages = {[1:nloci]};
clusters = {};
[jtree,root2,cliques,B,w]=graph_to_jtree(adj_mat,ns,porder,stages,clusters);
C = length(cliques);
[is,js] = find(jtree > 0);
separator = cell(C,C);
for k=1:length(is)
i = is(k); j = js(k);
separator{i,j} = find(B(i,:) & B(j,:)); % intersect(cliques{i}, cliques{j});
end
nindividuals = size(data,1);
if nloci ~= size(data,2)-1 error('*** ERROR: in transform3.m');
return
end
ncliques = length(cliques);
[i,j] = find(~cellfun('isempty',triu(separator)));
ij = [i j];
nseparators = length(i);
data_clique = zeros(nindividuals, ncliques);
noalle_clique = zeros(ncliques,1);
data_separator = zeros(nindividuals, nseparators);
noalle_separator = zeros(nseparators,1);
k = 1;
for i = 1:ncliques
[b,m,n] = unique(data(:,cliques{i}),'rows');
data_clique(:,k) = n;
% for j = 1:length(cliques{i})
% sz(j) = size(unique(data(:,cliques{i}(j))),1);
% end
% noalle_clique(k) = prod(sz);
noalle_clique(k) = 4^length(cliques{i});
k = k+1;
% clear sz;
end
k = 1;
for i = 1:nseparators
[b,m,n] = unique(data(:,separator{ij(i,1),ij(i,2)}),'rows');
data_separator(:,k) = n;
% for j = 1:length(separator{ij(i,1),ij(i,2)})
% sz(j) = size(unique(data(:,separator{ij(i,1),ij(i,2)}(j))),1);
% end
% noalle_separator(k) = prod(sz);
noalle_separator(k) = 4^length(separator{ij(i,1),ij(i,2)});
k = k+1;
% clear sz;
end
end
% Data shrinkage step
if ~isempty(varargin)
informative_clique = varargin{1};
informative_separator = varargin{2};
else
informative_clique = find(noalle_clique~=1);
informative_separator = find(noalle_separator~=1);
end
data_clique = data_clique(:,informative_clique);
data_separator = data_separator(:,informative_separator);
alphabet_cq = alphabet_cq(informative_clique);
alphabet_sp = alphabet_sp(informative_separator);
noalle_clique = noalle_clique(informative_clique);
noalle_separator = noalle_separator(informative_separator);
clear data;