Added source Matlab code for reference
This commit is contained in:
parent
b8af977117
commit
b5d99903d2
186 changed files with 61405 additions and 1 deletions
1788
matlab/independent/greedyMix.m
Normal file
1788
matlab/independent/greedyMix.m
Normal file
File diff suppressed because it is too large
Load diff
1685
matlab/independent/greedyPopMix.m
Normal file
1685
matlab/independent/greedyPopMix.m
Normal file
File diff suppressed because it is too large
Load diff
1234
matlab/independent/indMix.m
Normal file
1234
matlab/independent/indMix.m
Normal file
File diff suppressed because it is too large
Load diff
1337
matlab/independent/indMix_fixK.m
Normal file
1337
matlab/independent/indMix_fixK.m
Normal file
File diff suppressed because it is too large
Load diff
49
matlab/independent/myxlsread.m
Normal file
49
matlab/independent/myxlsread.m
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
function [A, B] = myxlsread(file)
% MYXLSREAD Read a tab ('\t') separated text file.
%
%   [A, B] = myxlsread(file)
%
% Input file structure:
%   first row:          titles
%   second to last row: first column, sample ID
%                       second column, cluster label
%                       other columns, gene sequences
%
% Outputs:
%   A - numeric matrix, one row per non-blank data line and one column per
%       numeric column. A column is treated as numeric when its field on the
%       first non-blank data line converts with str2double.
%   B - cell array of strings with every field of every non-blank line, the
%       title row first.
%
% An error is raised when the file does not exist or contains no lines.
%
% Lu Cheng, 26.06.2010
% There can be multiple numeric columns in the input file (Lu Cheng, 25.11.2010)

delimiter = '\t';

if exist(file,'file')~=2
    error('The input file %s does not exist!', file);
end

lines = textread(file,'%s','delimiter','\n');
if isempty(lines)
    error('The input file %s is empty!', file);
end

header = strread(lines{1},'%s','delimiter',delimiter);
nRow = length(lines);
nCol = length(header);

% Determine the numeric columns from the first data line that is not blank.
numCols = [];
for i = 2:nRow
    if ~isempty(lines{i})
        fields = strread(lines{i},'%s','delimiter',delimiter);
        for j = 1:length(fields)
            if ~isnan(str2double(fields{j}))
                numCols(end+1) = j; %#ok<AGROW>
            end
        end
        break;
    end
end

A = cell(nRow-1, length(numCols));
B = cell(nRow, nCol);
B(1,:) = header;

% Blank lines are only marked here and removed after the loop. Deleting rows
% while iterating would shift the remaining rows against the loop index.
keep = true(nRow,1);
for i = 2:nRow
    if isempty(lines{i})
        keep(i) = false;
    else
        B(i,:) = strread(lines{i},'%s','delimiter',delimiter);
        A(i-1,:) = B(i,numCols);
    end
end

B = B(keep,:);
A = A(keep(2:end),:);

A = cellfun(@str2double,A);
|
||||
19
matlab/independent/myxlswrite.m
Normal file
19
matlab/independent/myxlswrite.m
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
function myxlswrite(file, A)
% MYXLSWRITE Write a cell matrix to a tab-separated text file.
%
%   myxlswrite(file, A)
%
% A is a cell matrix; each element is a string or a numeric value. Numeric
% values are converted with num2str in every column, including the last one.
% Each row of A becomes one line; fields are separated by tabs.
%
% An error is raised when the file cannot be opened for writing.
%
% Lu Cheng, 25.11.2010

h = fopen(file,'w+');
if h == -1
    error('Cannot open file %s for writing!', file);
end

[nRow, nCol] = size(A);

try
    for i=1:nRow
        for j=1:nCol
            value = A{i,j};
            if isnumeric(value)
                value = num2str(value);
            end
            if j < nCol
                fprintf(h,'%s\t',value);
            else
                fprintf(h,'%s\n',value);   % last field ends the line
            end
        end
    end
catch err
    fclose(h);      % do not leak the handle when a cell cannot be printed
    rethrow(err);
end

fclose(h);
|
||||
177
matlab/independent/preprocessXLS.m
Normal file
177
matlab/independent/preprocessXLS.m
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
function c = preprocessXLS(xlsfile,varargin)
% PREPROCESSXLS Preprocess the input xls file.
%
%   c = preprocessXLS(xlsfile)
%   c = preprocessXLS(xlsfile, c_train)
%
% File structure: first line - title
%                 else - first column, name of individuals
%                      - second to end column, sequences of the given genes
%
% When c_train (the structure produced for the training data) is given, its
% fields info_cq_loci and info_sp_loci are passed on to transform5, and its
% component_mat must equal the one of this file.
%
% Output c is a structure with the fields data, rowsFromInd, alleleCodes,
% noalle, adjprior, component_mat, popnames, counts_cq, counts_sp,
% adjprior_cq, adjprior_sp, codes_cq, codes_sp, info_cq_loci, info_sp_loci.
% c is [] when the file is missing, has the wrong suffix, or processxls
% returns no data. When the gene lengths differ from the training data, c is
% returned with only the fields filled so far (up to popnames).
%
% Lu Cheng, 16.02.2010

c = [];   % defined up front so that every early return yields a valid output

%% check file names

file_suff = '.xls'; % endings of the input file
if ~(exist(xlsfile,'file')==2)
    fprintf('Input file %s does not exists, quit!\n',xlsfile);
    return;
end

% The length test prevents an indexing error for names shorter than the suffix.
if length(xlsfile) < length(file_suff) || ...
        ~strcmp(xlsfile(end-length(file_suff)+1:end),file_suff)
    fprintf('Input file %s does not end with %s, quit!\n',xlsfile,file_suff);
    return;
end

%% process the xls file

% Here we assume there is no missing values, if so, the missing values are
% indicated by 0
[data, component_mat, popnames] = processxls(xlsfile);
if isempty(data)
    % processxls returns empty outputs after reporting a format problem
    return;
end

% missing data 0 is transformed to |alphabet|+1, in the [ACGT] case, '-' is 5
[data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); %#ok<NASGU>

c.data = data;                c.rowsFromInd = rowsFromInd;
c.alleleCodes = alleleCodes;  c.noalle = noalle;
c.adjprior = adjprior;
% priorTerm is computed by handleData but not stored.

c.component_mat = component_mat;
c.popnames = popnames;

%% count the cliques and separators

index = data(:,end);

if isempty(varargin)
    [data_clique, data_separator, noalle_clique, noalle_separator, codes_cq, codes_sp, info_cq_loci, info_sp_loci] = ...
        transform5(data, component_mat);
else
    c_train = varargin{1};

    % Compare sizes first: an element-wise == on different sizes would error.
    if ~isequal(size(c_train.component_mat), size(component_mat)) || ...
            ~all(all(c_train.component_mat == component_mat))
        disp('The gene lengths are different between the training data and the test data!');
        return;
    end

    [data_clique, data_separator, noalle_clique, noalle_separator, codes_cq, codes_sp, info_cq_loci, info_sp_loci] = ...
        transform5(data, component_mat, c_train.info_cq_loci,c_train.info_sp_loci);
end
data_clique = [data_clique index];
data_separator = [data_separator index];

% Count the data, note that the order of the alphabets keeps the same
[counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq] ...
    = allfreqsnew2(data_clique, double(noalle_clique)); %#ok<ASGLU>
[counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp] ...
    = allfreqsnew2(data_separator, double(noalle_separator)); %#ok<ASGLU>

clear prior_cq prior_sp nalleles_cq nalleles_sp genotypes_cq genotypes_sp;

c.counts_cq = uint16(counts_cq);
c.counts_sp = uint16(counts_sp);

c.adjprior_cq = adjprior_cq;
c.adjprior_sp = adjprior_sp;

c.codes_cq = codes_cq;
c.codes_sp = codes_sp;

c.info_cq_loci = info_cq_loci;
c.info_sp_loci = info_sp_loci;
|
||||
|
||||
%--------------------------------------------------------------------------
|
||||
function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(raw_data)
% The last column of the original data tells which individual each row comes
% from. The function first determines the maximum number of rows coming from
% a single individual, which tells whether the data is haploid, diploid,
% etc. After that it adds empty rows for the individuals that have fewer
% rows than this maximum.
% If the code of some allele is 0, the function changes it to the smallest
% code that is larger than any code in use.
% (The original version then recoded the alleles of locus j to the values
% 1,...,noalle(j); that step is disabled here, see the note below.)
%
% English comments added, small modifications have been added
% Lu Cheng, 17.02.2010

% Last column holds the sample indexes; raw_data is supposed to be of
% uint16 type, 0 indicates a missing value.
data = raw_data;
nloci = size(raw_data,2) - 1;

% Replace missing values by |alphabet|+1, thus 0 is replaced by 5 for a
% DNA dataset.
lociData = data(:,1:nloci);
isMissing = (lociData == 0);
if any(isMissing(:))
    largestAllele = max(max(lociData));
    lociData(isMissing) = largestAllele+1;
    data(:,1:nloci) = lociData;
end

% Collect the distinct alleles of every locus and build the allele code matrix.
noalle = zeros(1,nloci);
allelesAtLocus = cell(nloci,1);
for i = 1:nloci
    observed = unique(data(:,i));
    allelesAtLocus{i,1} = observed(observed >= 0);
    noalle(i) = length(allelesAtLocus{i,1});
end
alleleCodes = zeros(max(noalle),nloci);
for i = 1:nloci
    observed = allelesAtLocus{i,1};
    nPadding = max(noalle) - length(observed);
    alleleCodes(:,i) = [observed; zeros(nPadding,1)];   % pad short columns with 0
end

%-----------------modified by Lu Cheng 17.02.2010--------------------------%
% NOTE: Here we do not want to change the alphabets, thus the recoding of
% each allele by its index in alleleCodes is left out:
% for loc = 1:nloci
%     for all = 1:noalle(loc)
%         data(logical(data(:,loc)==alleleCodes(all,loc)), loc)=all;
%     end;
% end;
%-----------------modified end.....----------------------------------------%

% handle diploid situation
nind = max(data(:,end));
nrows = size(data,1);
ncols = size(data,2);
rowsFromInd = zeros(nind,1);
for i = 1:nind
    rowsFromInd(i) = nnz(data(:,end) == i);
end
maxRowsFromInd = max(rowsFromInd);
a = -999;
emptyRow = repmat(a, 1, ncols);
lessThanMax = find(rowsFromInd < maxRowsFromInd);
missingRows = maxRowsFromInd*nind - nrows;
data = [data; zeros(missingRows, ncols)];
pointer = 1;
for ind = lessThanMax'      % go through the individuals that lack rows
    miss = maxRowsFromInd - rowsFromInd(ind);   % number of rows missing for this individual
    for j = 1:miss
        rowToBeAdded = emptyRow;
        rowToBeAdded(end) = ind;
        data(nrows+pointer, :) = rowToBeAdded;
        pointer = pointer + 1;
    end
end
data = sortrows(data, ncols);   % sort by individual
newData = data;
rowsFromInd = maxRowsFromInd;

% Calculate the prior for each locus; priorTerm is a constant term of the
% formula that is precalculated to speed up the program.
adjprior = zeros(max(noalle),nloci);
priorTerm = 0;
for j = 1:nloci
    nAllelesJ = noalle(j);
    adjprior(:,j) = [repmat(1/nAllelesJ, [nAllelesJ,1]) ; ones(max(noalle)-nAllelesJ,1)];
    priorTerm = priorTerm + nAllelesJ*gammaln(1/nAllelesJ);
end
|
||||
81
matlab/independent/processxls.m
Normal file
81
matlab/independent/processxls.m
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
function [data, component_mat, popnames] = processxls(filename)
% PROCESSXLS Read a sequence table and encode it as a numeric data matrix.
%
%   [data, component_mat, popnames] = processxls(filename)
%
% The table is read with xlsread on PC and with myxlsread otherwise. It must
% have a title row containing an 'ST' column (exactly one numeric column is
% accepted) and may contain a 'Strain' or 'Isolate' column; every other
% column is treated as a gene sequence and encoded with i_encode_n.
%
% Outputs (all [] after a reported error):
%   data          - uint16 matrix: encoded sequences of all genes side by
%                   side, plus a last column with the row index 1..nstrains
%   component_mat - ngenes-by-max(gene length) matrix; row i lists the
%                   columns of data that belong to gene i, padded with 0
%   popnames      - nstrains-by-2 cell: name (strain column if present,
%                   otherwise ST) and running index
%
% a bug in line 64-68 was fixed
data = [];
component_mat = [];
popnames = [];
try
    if ispc
        [A,B] = xlsread(filename);
    else
        [A,B] = myxlsread(filename);
    end
catch
    display('*** ERROR: Wrong Excel format');
    return
end

if size(A,2)~=1  % more than one columns containing numeric ST values
    display('*** ERROR: multiple columns of numeric values');
    return
end

if size(A,1)~=size(B,1)-1
    display('*** ERROR: Wrong format');
    return
end

B = deblank(B); % remove any trailing blanks
nstrains = size(B,1)-1;
nheader = size(B,2);

% Locate the special columns. Explicit empty defaults replace exist('name'),
% which would also report files or functions of that name on the path.
ix_ST = [];
ix_Strain = [];
for i = 1:nheader
    if strcmpi('ST',B{1,i}), ix_ST = i; end
    if strcmpi('Strain', B{1,i}) || strcmpi('Isolate',B{1,i})
        ix_Strain = i;
    end
end
if isempty(ix_ST)
    display('*** ERROR: ST column needed');
    return
end

if isempty(ix_Strain)
    ix_gene = setdiff(1:nheader,ix_ST);
else
    ix_gene = setdiff(1:nheader,[ix_ST ix_Strain]);
end

ngenes = length(ix_gene);

% Without data rows or gene columns the code below would fail on C(1,:) or
% genesize(1); report the problem instead.
if nstrains < 1 || ngenes < 1
    display('*** ERROR: Wrong format');
    return
end

C = cell(nstrains,ngenes);
if ~isempty(A)
    for i=1:nstrains
        B{i+1,ix_ST}=num2str(A(i));
        for j=1:ngenes
            C{i,j}=uint16(i_encode_n(B{i+1,ix_gene(j)})); % save the memory.
        end
    end
end
genesize=cellfun('size',C(1,:),2);   % gene lengths are taken from the first strain
data=cell2mat(C);
data=[data uint16((1:nstrains)')];

% Row i of component_mat holds the column indexes of gene i within data.
component_mat = zeros(ngenes,max(genesize));
cum = cumsum(genesize);
component_mat(1,1:genesize(1)) = 1:cum(1);
for i=2:ngenes
    component_mat(i,1:genesize(i)) = (cum(i-1)+1):cum(i);
end

if isempty(ix_Strain)
    popnames = num2cell(B(2:end,ix_ST));
else  % store the strain names only
    popnames = num2cell(B(2:end,ix_Strain));
end
popnames(:,2)=num2cell((1:nstrains)');

display('---------------------------------------------------');
display(['# of strains: ', num2str(nstrains)]);
display(['# of genes: ', num2str(ngenes)]);
|
||||
128
matlab/independent/semiReadScript.m
Normal file
128
matlab/independent/semiReadScript.m
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
function paras = semiReadScript(script_file)
% SEMIREADSCRIPT Extract parameter information from a script file.
%
%   paras = semiReadScript(script_file)
%
% Script Command Table
%   datafile('train|test','c:\BAPS5\DATA.xls');   only .xls and .mat input
%                                                 files are supported
%   savePreprocFile('train|test','c:\BAPS5\predata.mat');
%   setK('16 17 18');
%   outputmat('c:\BAPS5\output.mat')
%
% Every line of the form command(arguments) is handed to parseCmd; lines
% without that form, and lines whose first non-blank character is '%', are
% skipped. Blanks around the command name are ignored.
%
% Output paras is a structure with the fields train_file_format,
% train_file_name, save_prepro_train_data, train_prepro_file,
% test_file_format, test_file_name, save_prepro_test_data, test_prepro_file,
% cluster_num_upperbounds, save_results and result_file. The three save_*
% fields default to 'No', all other fields to [].
%
% Lu Cheng, 11.03.2010

paras.train_file_format = [];
paras.train_file_name = [];

paras.save_prepro_train_data = 'No';
paras.train_prepro_file = [];

paras.test_file_format = [];
paras.test_file_name = [];

paras.save_prepro_test_data = 'No';
paras.test_prepro_file = [];

paras.cluster_num_upperbounds = [];

paras.save_results = 'No';
paras.result_file = [];

T = readfile(script_file);

n = length(T);
for i=1:n
    line = T{i};

    % Skip comment lines so that a commented-out command is not executed.
    trimmed = strtrim(line);
    if isempty(trimmed) || trimmed(1) == '%'
        continue;
    end

    [res, toks] = regexp(line,'(.+)\((.+)\)','once','match','tokens');

    if isempty(res)
        continue;
    end

    % Indentation before the command must not make the command unknown.
    paras = parseCmd(strtrim(toks{1}), toks{2}, paras);
end
|
||||
|
||||
% -------------------------------------------------------------------------
|
||||
function prog_paras = parseCmd(cmd, paras, prog_paras)
% PARSECMD Apply one script command to the parameter structure.
%
%   cmd        - the script command: 'datafile', 'savePreprocFile' (the
%                spelling 'savePreproFile' is accepted as well), 'setK' or
%                'outputmat'
%   paras      - the text between the parentheses of the script command
%   prog_paras - structure of the global parameters; returned with the
%                fields belonging to cmd updated
%
% Errors are raised for an unknown command, a missing data file, an option
% other than 'train'/'test', a missing quoted argument, and a file type
% rejected by getFileType or not ending with .mat where required.

switch cmd
    case 'datafile'
        args = parseQuotedArgs(cmd, paras, 2);
        option = args{1};
        filename = args{2};
        if exist(filename,'file')~=2
            error(cat(2,'File not exist! File: ',filename));
        end
        filetype = getFileType(filename);
        if isequal(option,'train')
            prog_paras.train_file_format = filetype;
            prog_paras.train_file_name = filename;
        elseif isequal(option,'test')
            prog_paras.test_file_format = filetype;
            prog_paras.test_file_name = filename;
        else
            error(cat(2,'Unkown option: ',option,'! Expect train or test.'));
        end

    case {'savePreprocFile','savePreproFile'}
        args = parseQuotedArgs(cmd, paras, 2);
        option = args{1};
        filename = args{2};

        filetype = getFileType(filename);
        if ~isequal(filetype,'.mat')
            error(cat(2,'The saved file should end with .mat! ',filename));
        end

        if isequal(option,'train')
            prog_paras.save_prepro_train_data = 'Yes';
            prog_paras.train_prepro_file = filename;
        elseif isequal(option,'test')
            prog_paras.save_prepro_test_data = 'Yes';
            prog_paras.test_prepro_file = filename;
        else
            error(cat(2,'Unkown option: ',option,'! Expect train or test.'));
        end

    case 'setK'
        % Drop the surrounding quotes; blanks around them are ignored first.
        paras = strtrim(paras);
        prog_paras.cluster_num_upperbounds = paras(2:end-1);

    case 'outputmat'
        paras = strtrim(paras);
        filename = paras(2:end-1);
        filetype = getFileType(filename);
        if ~isequal(filetype,'.mat')
            error(cat(2,'The saved file should end with .mat! ',filename));
        end
        prog_paras.save_results = 'Yes';
        prog_paras.result_file = filename;

    otherwise
        error('Can not parse the cmd: %s in the script!', cmd);
end

% -------------------------------------------------------------------------
function args = parseQuotedArgs(cmd, paras, expected)
% Return the first 'expected' single-quoted arguments found in paras.
% Blanks inside the quotes are kept (file names may contain them); only
% leading and trailing blanks of each argument are removed.
toks = regexp(paras,'''([^'']*)''','tokens');
if length(toks) < expected
    error('Command %s expects %d quoted arguments, got: %s', cmd, expected, paras);
end
args = cell(1,expected);
for k = 1:expected
    args{k} = strtrim(toks{k}{1});
end
|
||||
|
||||
% -------------------------------------------------------------------------
|
||||
function filetype = getFileType(filename)
% GETFILETYPE Return the four-character suffix of filename.
% Only '.xls' and '.mat' are accepted; any other name, including one that is
% shorter than four characters, raises an error.
if length(filename) < 4
    % Without this test the indexing below fails with an unrelated message.
    error(cat(2,'Unknown option: ', filename, '! Expect .xls or .mat file'));
end
filetype = filename(end-3:end);
if ~isequal(filetype,'.xls') && ~isequal(filetype,'.mat')
    error(cat(2,'Unknown option: ', filename, '! Expect .xls or .mat file'));
end
|
||||
|
||||
% -------------------------------------------------------------------------
|
||||
function T = readfile(filename)
% READFILE Read a text file into a cell array, one cell per line.
% The line terminators are not included (fgetl). An empty file yields an
% empty cell array. An error is raised when the file cannot be opened.
f = fopen(filename,'r');
if f == -1
    error(cat(2,'*** ERROR: invalid input file: ',filename));
end

T = {};   % defined up front: an empty file must still return a value
try
    while 1
        line = fgetl(f);
        if ~ischar(line), break, end   % fgetl returns -1 at the end of the file
        T{end+1} = line; %#ok<AGROW>
    end
catch err
    fclose(f);
    rethrow(err);
end
fclose(f);
|
||||
1470
matlab/independent/semi_linkageMix.m
Normal file
1470
matlab/independent/semi_linkageMix.m
Normal file
File diff suppressed because it is too large
Load diff
651
matlab/independent/semi_linkageMixture_speed.m
Normal file
651
matlab/independent/semi_linkageMixture_speed.m
Normal file
|
|
@ -0,0 +1,651 @@
|
|||
function semi_linkageMixture_speed(c_train, c_test)
% SEMI_LINKAGEMIXTURE_SPEED Cluster the test data given the training data.
%
%   semi_linkageMixture_speed(c_train, c_test)
%
% The function adjusts the priors of the training data according to the
% test data. Based on the adjusted priors, the test data is clustered with
% semi_linkageMix, the result is written with writeMixtureInfo and
% optionally saved to a .mat file.
%
% c_train, c_test - preprocessed data structures. Fields read here:
%   component_mat, alleleCodes, rowsFromInd, data, counts_cq, counts_sp,
%   codes_cq, codes_sp, popnames (test) and cluster_labels (training).
%
% In script mode (global SCRIPT_MODE true) the cluster number upper bounds,
% the save decision and the result file come from the global PARAMETERS;
% otherwise they are asked for in dialogs.
%
% NOTE(review): handleData, transform4, allfreqsnew2 and semi_linkageMix are
% not defined in this function's file section; they are expected on the path.
%
% modified from linkageMixture_speed.m by Lu Cheng, 16.02.2010
% Update by Lu Cheng, 07.03.2011
%   case of only 1 sample in the test data has been handled

% added by Lu Cheng, 11.03.2010
global SCRIPT_MODE;
global PARAMETERS;
if isempty(SCRIPT_MODE)
    SCRIPT_MODE = false;
end
% -----------------

%% compare the training data and test data, adjust priors

%1% Compare the training data and test data to adjust the prior
% Sizes are compared first: == on matrices of different size would error.
if ~isequal(size(c_train.component_mat), size(c_test.component_mat)) || ...
        ~all(all(c_train.component_mat == c_test.component_mat))
    disp('The gene lengths are different between the training data and the test data!');
    return;
end

flag = false; % whether the trained priors should be adjusted
n_loci = size(c_train.alleleCodes,2);

if c_train.rowsFromInd ~= c_test.rowsFromInd
    error('Inconsistant rows from each individual. Train: %d Test: %d. Quit! \n', ...
        c_train.rowsFromInd, c_test.rowsFromInd);
elseif c_train.rowsFromInd > 1
    error('Data must be haploid. Quit! rowsFromInd: %d.\n', c_train.rowsFromInd);
end

% Stop at the first locus where the test data has an allele code (other
% than the padding value 0) that is absent from the training data.
for i=1:n_loci
    a = setdiff(c_test.alleleCodes(:,i),c_train.alleleCodes(:,i));
    a = a(a~=0);
    if ~isempty(a)
        flag = true;
        fprintf('New alleles are detected in the test data at loci: %d\n',i);
        fprintf('The processing time will be much longer than usual. \n');
        break;
    end
end

%2% reprocess the data for clustering
if flag

    % combing the training data and test data, adjust the priors
    combine_data = [c_train.data; c_test.data];
    n_train = size(c_train.data,1);
    n_samples = size(combine_data,1);
    combine_data(:,end) = (1:n_samples)';
    [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(combine_data); %#ok<ASGLU>

    index = data(:,end);
    [data_clique, data_separator, noalle_clique, noalle_separator] = ...
        transform4(data, c_train.component_mat,'codon');
    data_clique = [data_clique index];
    data_separator = [data_separator index];

    % Count the data
    [counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq]...
        = allfreqsnew2(data_clique, double(noalle_clique)); %#ok<ASGLU>
    [counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp]...
        = allfreqsnew2(data_separator, double(noalle_separator)); %#ok<ASGLU>

    counts_cq = uint16(counts_cq);
    counts_sp = uint16(counts_sp);

    c_train.adjprior = adjprior;

    % The third dimension of the counts is indexed by sample: the first
    % n_train slices are assigned to the training data, the rest to the test data.
    c_train.counts_cq = counts_cq(:,:,1:n_train);
    c_train.counts_sp = counts_sp(:,:,1:n_train);

    c_test.counts_cq = counts_cq(:,:,n_train+1:end);
    c_test.counts_sp = counts_sp(:,:,n_train+1:end);

    c_train.adjprior_cq = adjprior_cq;
    c_train.adjprior_sp = adjprior_sp;

    c_train.alleleCodes = alleleCodes;
    c_train.noalle = noalle;

    clear data rowsFromInd alleleCodes noalle adjprior priorTerm index
    clear data_clique data_separator noalle_clique noalle_separator
    clear counts_cq counts_sp nalleles_cq nalleles_sp prior_cq prior_sp adjprior_cq adjprior_sp

else

    % KEY: adjust the test data to fit the configuration of the training data
    % the 'codes_cq' and 'codes_sp' are directly translated from DNA sequence
    % SEE 'i_encode_n.m' under the linkage folder

    num_cq = size(c_train.counts_cq, 1);      num_sp = size(c_train.counts_sp, 1);
    n_loci_cq = size(c_train.counts_cq, 2);   n_loci_sp = size(c_train.counts_sp, 2);
    n_inds = size(c_test.counts_cq, 3);

    counts_cq = zeros(num_cq, n_loci_cq, n_inds);
    counts_sp = zeros(num_sp, n_loci_sp, n_inds);

    % Map the indexes of cliques and separators of the test data to the
    % indexes of the training data. The mapping depends on the locus only,
    % so it is computed once per locus and applied to all individuals.
    for j = 1:n_loci_cq
        [common, ia, ib] = intersect(c_test.codes_cq{j}, c_train.codes_cq{j},'rows'); %#ok<ASGLU>
        counts_cq(ib,j,:) = c_test.counts_cq(ia,j,:);
    end

    for j = 1:n_loci_sp
        [common, ia, ib] = intersect(c_test.codes_sp{j}, c_train.codes_sp{j},'rows'); %#ok<ASGLU>
        counts_sp(ib,j,:) = c_test.counts_sp(ia,j,:);
    end

    c_test.counts_cq = counts_cq;
    c_test.counts_sp = counts_sp;

    clear common ia ib j i;

end

%% cluster the test data

% case of only 1 sample in the test data, added by Lu Cheng, 07.03.2011
if size(c_test.data,1)~=1
    [Z,dist] = newGetDistances(c_test.data, c_test.rowsFromInd);
    c_test.Z = Z;
    c_test.dist = dist;
end

clear Z dist;

message = cat(2,'There are currently ',num2str(length(unique(c_train.cluster_labels))),' clusters in the training data, please input upper bounds of cluster numbers in the test data.');

% NOTE(review): str2num evaluates its argument as MATLAB code. The text
% comes from the script file or from the user dialog; kept for
% compatibility with inputs such as '16:18'.
if SCRIPT_MODE
    cluster_nums = str2num(PARAMETERS.cluster_num_upperbounds); %#ok<ST2NM>
else
    cluster_nums = inputdlg(message);
    if isempty(cluster_nums)   % dialog cancelled
        return;
    end
    cluster_nums = str2num(cluster_nums{:}); %#ok<ST2NM>
end

% str2num returns [] for text it cannot evaluate; do not start the
% clustering without any upper bound.
if isempty(cluster_nums)
    disp('No valid upper bounds of cluster numbers were given, quit!');
    return;
end

% % Test purpose, Check the input data, there should be 1 allele in each loci (column)
% % Lu Cheng, 25.02.2010
% if ~all(all(squeeze(sum(c_train.counts_cq,1))))
%     disp('Missing cq value of some sample in counts_cq of the training data');
%     return;
% elseif ~all(all(squeeze(sum(c_train.counts_sp,1))))
%     disp('Missing sp value of some sample in counts_sp of the training data');
%     return;
% elseif ~all(all(squeeze(sum(c_test.counts_cq,1))))
%     disp('Missing cq value of some sample in counts_cq of the test data');
%     return;
% elseif ~all(all(squeeze(sum(c_test.counts_sp,1))))
%     disp('Missing sp value of some sample in counts_sp of the test data');
%     return;
% end

tic
semi_res = semi_linkageMix(c_train, c_test, cluster_nums);
toc

semi_res.popnames = c_test.popnames;

writeMixtureInfo(semi_res);

% save the results
if SCRIPT_MODE
    save_results = PARAMETERS.save_results;
else
    save_results = questdlg('Do you wish to save the results?',...
        'Save Results','Yes','No','Yes');
end

if isequal(save_results,'Yes')
    if SCRIPT_MODE
        save(PARAMETERS.result_file,'semi_res','-v7.3');
    else
        [filename, pathname] = uiputfile('*.mat','Save the results as');
        % uiputfile returns 0 for both outputs when the dialog is cancelled
        if ~(isequal(filename,0) || isequal(pathname,0))
            % fullfile keeps trailing blanks of the folder name, strcat drops them
            save(fullfile(pathname,filename),'semi_res','-v7.3');
        end
    end
end
|
||||
|
||||
% -----------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
%--------------------------------------------------------------------------
|
||||
%% The next three functions are for computing the initial partition
|
||||
% according to the distance between the individuals
|
||||
|
||||
function initial_partition=admixture_initialization(nclusters,Z)
% Initial partition: the labels obtained by cutting the linkage tree Z into
% nclusters groups with cluster_own.
initial_partition = cluster_own(Z, nclusters);
|
||||
|
||||
function T = cluster_own(Z,nclust)
% Assign a cluster label to each of the size(Z,1)+1 leaves of the linkage
% tree Z, producing nclust groups.
%   - at least as many clusters requested as leaves: every leaf gets its own label
%   - a single cluster requested: all labels are 1
%   - otherwise the last nclust-1 rows of Z are visited; each child that is
%     a leaf, or a node created before the cut-off, starts a new label

nLeaves = size(Z,1) + 1;

if nLeaves <= nclust
    T = (1:nLeaves)';
    return
end

if nclust == 1
    T = ones(nLeaves,1);
    return
end

T = zeros(nLeaves,1);
label = 1;
cutoff = 2*nLeaves - nclust + 1;   % nodes numbered below this were created before the cut
for row = (nLeaves-nclust+1):(nLeaves-1)
    for side = 1:2                 % 1: left tree, 2: right tree
        node = Z(row,side);
        if node <= nLeaves         % original node, no leafs
            T(node) = label;
            label = label + 1;
        elseif node < cutoff       % created before cutoff, search down the tree
            T = clusternum(Z, T, node-nLeaves, label);
            label = label + 1;
        end
    end
end
|
||||
|
||||
function T = clusternum(X, T, k, c)
% Give label c to every leaf below the node(s) k of the linkage tree X.
% k holds row numbers of X; entries of X(:,1:2) that are at most
% size(X,1)+1 denote leaves, larger ones denote row (entry - leaf count).
nLeaves = size(X,1) + 1;
pending = k;
while ~isempty(pending)
    % children of all nodes on the current level, as one column
    kids = reshape(X(pending,1:2), [], 1);

    % leaves receive the label directly
    isLeaf = (kids <= nLeaves);
    T(kids(isLeaf)) = c;

    % inner nodes form the next level
    pending = kids(~isLeaf) - nLeaves;
end
|
||||
|
||||
%--------------------------------------------------------------------------
|
||||
|
||||
function Z = computeLinkage(Y, method)
% Build a hierarchical cluster tree from the distance row vector Y.
%   Y      - 1-by-(m*(m-1)/2) vector of pairwise distances, in the size
%            produced by PDIST
%   method - optional; only its first two letters are used, lower-cased:
%            'si', 'av', 'co' (default), 'ce' or 'wa'
%   Z      - (m-1)-by-3 matrix; row s holds the labels of the two clusters
%            merged in step s and their distance. The cluster created in
%            step s gets the label m+s.
[nRowsY, nDist] = size(Y);
m = (1+sqrt(1+8*nDist))/2;
if nRowsY ~= 1 || m ~= fix(m)
    error('The first input has to match the output of the PDIST function in size.');
end
if nargin == 1   % default method is 'co'
    method = 'co';
end
method = lower(method(1:2));   % simplify the switch string

nPoints = m;                   % m shrinks in the loop, keep the initial count
Z = zeros(nPoints-1,3);        % allocate the output matrix
N = zeros(1,2*nPoints-1);      % N(label) = number of points in that cluster
N(1:nPoints) = 1;
R = 1:nPoints;                 % R(position) = label of the cluster at that position
for s = 1:(nPoints-1)
    [v, k] = min(Y);
    % convert the linear index k into the pair of positions (i,j), i<j
    i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1)));
    j = k - (i-1)*(m-i/2)+i;
    Z(s,:) = [R(i) R(j) v];

    % positions of the other clusters, split around i and j
    before = 1:(i-1);  between = (i+1):(j-1);  after = (j+1):m;
    others = [before between after];
    rowOffI = i*(m-(i+1)/2)-m;
    rowOffJ = j*(m-(j+1)/2)-m;
    % linear indexes in Y of the distances from the other clusters to i and to j
    I = [before.*(m-(before+1)/2)-m+i rowOffI+between rowOffI+after];
    J = [before.*(m-(before+1)/2)-m+j between.*(m-(between+1)/2)-m+j rowOffJ+after];

    switch method
        case 'si'   % single linkage
            Y(I) = min(Y(I),Y(J));
        case 'av'   % average linkage
            Y(I) = Y(I) + Y(J);
        case 'co'   % complete linkage
            Y(I) = max(Y(I),Y(J));
        case 'ce'   % centroid linkage
            K = N(R(i))+N(R(j));
            Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K;
        case 'wa'
            Y(I) = ((N(R(others))+N(R(i))).*Y(I) + (N(R(others))+N(R(j))).*Y(J) - ...
                N(R(others))*v)./(N(R(i))+N(R(j))+N(R(others)));
    end
    % drop the distances that involve j, including the merged pair itself
    Y([J rowOffI+j]) = [];

    % update m, N, R
    m = m-1;
    N(nPoints+s) = N(R(i)) + N(R(j));
    R(i) = nPoints+s;
    R(j:(nPoints-1)) = R((j+1):nPoints);
end
|
||||
|
||||
%--------------------------------------------------------------------------
|
||||
|
||||
%-------------------------------------------------------------------------
|
||||
|
||||
function [Z, dist] = newGetDistances(data, rowsFromInd)
% NEWGETDISTANCES Pairwise distances between individuals and their linkage tree.
%
%   data        - matrix whose last column is the individual index; the
%                 other columns are loci. Values <= 0 count as missing.
%                 The rows of individual k are expected at positions
%                 (k-1)*rowsFromInd+1 .. k*rowsFromInd.
%   rowsFromInd - number of rows per individual
%
%   dist - column vector with one entry per pair (a,b), a<b, ordered
%          (1,2),(1,3),...,(2,3),...: the share of differing alleles among
%          all comparisons where both alleles are present; 1 when there is
%          no such comparison
%   Z    - linkage tree computed from dist by computeLinkage
%
% Both outputs are empty when there are fewer than two individuals.
%
% The counters and row indexes are uint32: uint16 arithmetic saturates at
% 65535, which distorts the distances once a pair has more comparisons than that.

ninds = double(max(data(:,end)));
nloci = size(data,2)-1;

if isempty(ninds) || ninds < 2
    Z = [];
    dist = [];
    return;
end
nPairs = nchoosek(ninds,2);

data(logical(data<0)) = 0;
data = uint16(data);

% pairTable(p,:) = [a b], the two individuals of pair p
pairTable = zeros(nPairs,2);
aPointer = 1;
for a = 1:ninds-1
    rows = aPointer:aPointer+(ninds-1-a);
    pairTable(rows,1) = a;
    pairTable(rows,2) = (a+1:ninds)';
    aPointer = aPointer + (ninds-a);
end

% Row numbers in data of the first and second individual of every pair,
% one column per row of the individual.
offsets = repmat(rowsFromInd-1 : -1 : 0, [nPairs 1]);
firstRows = uint32(pairTable(:,ones(1,rowsFromInd)) * rowsFromInd - offsets);
secondRows = uint32(pairTable(:,ones(1,rowsFromInd)*2) * rowsFromInd - offsets);

clear pairTable offsets;

mismatches = zeros(nPairs,1,'uint32');    % comparisons with differing alleles
comparisons = zeros(nPairs,1,'uint32');   % comparisons with both alleles present

x = zeros(size(firstRows),'uint16');
y = zeros(size(secondRows),'uint16');
for j = 1:nloci

    for k = 1:rowsFromInd
        x(:,k) = data(firstRows(:,k),j);
        y(:,k) = data(secondRows(:,k),j);
    end

    for a = 1:rowsFromInd
        for b = 1:rowsFromInd
            bothPresent = (x(:,a)>0 & y(:,b)>0);
            comparisons = comparisons + uint32(bothPresent);
            differs = (x(:,a)~=y(:,b) & bothPresent);
            mismatches = mismatches + uint32(differs);
        end
    end
end

clear x y bothPresent differs firstRows secondRows data;

dist = zeros(length(comparisons),1);
dist(logical(comparisons==0)) = 1;
compared = find(comparisons>0);
dist(compared) = double(mismatches(compared))./double(comparisons(compared));
clear mismatches comparisons compared;

Z = computeLinkage(dist');
|
||||
% fprintf(1,'\b\b');
|
||||
% fprintf(1,'%d\n',100);
|
||||
%--------------------------------------------------------------------------
|
||||
|
||||
function writeMixtureInfo(c)
% WRITEMIXTUREINFO  Report the result of the semi-supervised clustering.
%
% Prints the report to the command window and writes it to the text file
% 'baps5_semi_output.txt' (opened relative to the current folder).
% modified by Lu Cheng, 28.03.2010
%
% Input:
%   c - struct; the following fields are read:
%       PARTITION        - cluster label of each individual
%       npops            - number of clusters
%       popnames         - cell array, popnames{i} is a cell holding the
%                          name of individual i
%       logml            - log marginal likelihood of the partition
%       partitionSummary - matrix; column 1 is printed as the partition
%                          size, column 2 as its log(ml) value
%       clusterProbTable - matrix indexed (individual, cluster); only
%                          read when npops differs from 1
%
% The report consists of a header, the members of every non-empty cluster,
% the table of assignment probabilities (skipped when npops == 1) and the
% ten best visited partitions. Only the first 100 rows of the probability
% table are echoed on screen; the file always receives all rows.

outputFile = 'baps5_semi_output.txt';
lineWidth = 58;      % wrap cluster descriptions after this many characters
maxRowsShown = 100;  % probability-table rows echoed to the command window

ninds = length(c.PARTITION);
npops = c.npops;
popnames = c.popnames;
logml = c.logml;
partition = c.PARTITION;
partitionSummary = c.partitionSummary;

if ~isempty(outputFile)
    fid = fopen(outputFile,'w');
    if fid == -1
        warning('writeMixtureInfo:cannotOpenFile', ...
            'Could not open %s for writing; results are only displayed.', outputFile);
    end
else
    fid = -1;
end
toFile = (fid ~= -1);
if toFile
    % Close the file when the function exits, also when an error is thrown
    % half-way through the report.
    closeOutput = onCleanup(@() fclose(fid)); %#ok<NASGU>
end

% ---- header -------------------------------------------------------------
headerLines = { ...
    'RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:', ...
    ['Number of clustered individuals: ' ownNum2Str(ninds)], ...
    ['Number of groups in optimal partition: ' ownNum2Str(npops)], ...
    ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]};

dispLine;
for h = 1:numel(headerLines)
    disp(headerLines{h});
end
disp(' ');
if toFile
    fprintf(fid,'%10s\n', headerLines{1});
    fprintf(fid,'%20s\n', headerLines{2});
    fprintf(fid,'%20s\n', headerLines{3});
    fprintf(fid,'%20s\n\n', headerLines{4});
end

% ---- members of each cluster ---------------------------------------------
disp('Best Partition: ');
if toFile
    fprintf(fid,'%s \n','Best Partition: ');
end
for m = 1:npops
    members = find(partition==m);
    if isempty(members)
        continue;
    end

    % Indentation used for the continuation lines of a wrapped description.
    indentWidth = 11 + floor(log10(m));

    descr = ['Cluster ' num2str(m) ': {' char(popnames{members(1)})];
    for k = 2:length(members)
        descr = [descr ', ' char(popnames{members(k)})]; %#ok<AGROW>
    end
    descr = [descr '}']; %#ok<AGROW>

    % Emit full lines while the remaining description is too long.
    while length(descr) > lineWidth
        piece = takeLine(descr, lineWidth);
        descr = descr(length(piece)+1:end);
        disp(piece);
        if toFile
            fprintf(fid,'%s \n',piece);
        end
        if ~isempty(descr)
            descr = [blanks(indentWidth) descr]; %#ok<AGROW>
        end
    end

    if ~isempty(descr)
        disp(descr);
        if toFile
            fprintf(fid,'%s \n',descr);
        end
    end
end

% ---- posterior assignment probabilities ----------------------------------
if npops ~= 1
    clusterProbTable = c.clusterProbTable;

    disp('');
    disp('Posterior probability of assignment into clusters:');
    if toFile
        fprintf(fid, '%s \n', ' '); fprintf(fid, '\n');
        fprintf(fid, '%s \n', 'Posterior probability of assignment into clusters: '); fprintf(fid, '\n');
    end

    % Column titles: 'ind' followed by the cluster numbers.
    row = sprintf('%10s','ind');
    for jj = 1:npops
        row = [row sprintf('%10s',num2str(jj))]; %#ok<AGROW>
    end
    disp(row);
    if toFile
        fprintf(fid, '%s \n', row);
    end

    for ii = 1:ninds
        row = sprintf('%10s',popnames{ii}{:});
        for jj = 1:npops
            row = [row sprintf('%10s',num2str(clusterProbTable(ii,jj),'%10.6f'))]; %#ok<AGROW>
        end

        % Rows 1..maxRowsShown go to the screen; the first row beyond that
        % is replaced by a pointer to the output file. (The former test
        % 'ii<100' silently dropped row 100 from the screen.)
        if ii <= maxRowsShown
            disp(row);
        elseif ii == maxRowsShown+1
            disp('.......................................');
            disp('..........see output file..............');
        end
        if toFile
            fprintf(fid, '%s \n', row);
        end
    end
end

% ---- ten best visited partitions -----------------------------------------
disp(' ');
disp(' ');
disp('List of sizes of 10 best visited partitions and corresponding log(ml) values');
if toFile
    fprintf(fid, '%s \n\n', ' ');
    fprintf(fid, '%s \n', 'List of sizes of 10 best visited partitions and corresponding log(ml) values'); fprintf(fid, '\n');
end

% Sort by log(ml) in descending order and drop the rows whose log(ml) is
% not above -1e49.
partitionSummary = sortrows(partitionSummary,2);
partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :);
partitionSummary = partitionSummary(partitionSummary(:,2)>-1e49,:);
nListed = min(10, size(partitionSummary,1));
for part = 1:nListed
    entry = [num2str(partitionSummary(part,1),'%20d') ' ' num2str(partitionSummary(part,2),'%20.6f')];
    disp(entry);
    if toFile
        fprintf(fid, '%s \n', entry);
    end
end

if ~toFile
    % Kept from the original implementation: without an output file the
    % diary is switched off at the end of the report.
    diary off
end
|
||||
|
||||
%--------------------------------------------------------------
|
||||
function newline = takeLine(description,width)
% Returns one line taken from the beginning of description.
%
% The line ends at the first whitespace character found at or after
% position width+1 (that character is included), or at the end of
% description when no such character exists. A description that is not
% longer than width (including an empty one) is returned unchanged
% instead of raising an index-out-of-bounds error.

% Start scanning at width+1, but never beyond the last character.
n = min(width+1, length(description));
while n<length(description) && ~isspace(description(n))
    n = n+1;
end
newline = description(1:n);
|
||||
|
||||
|
||||
function dispLine
% Prints a horizontal separator made of dashes to the command window.
separator = '---------------------------------------------------';
disp(separator);
|
||||
|
||||
function num2 = omaRound(num)
% Rounds the number num to a precision of one decimal.
num2 = round(num*10)/10;
|
||||
|
||||
%---------------------------------------------------------
|
||||
|
||||
%-------------------------------------------------------------------------
|
||||
|
||||
function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ...
    handleData(raw_data)
% The last column of the original data tells which individual each row
% comes from; the other columns hold the allele codes of the loci.
%
% If some allele code equals 0, it is replaced by the smallest code that
% is larger than every code in use. The allele codes are then renumbered
% so that the codes of locus j take the values 1,...,noalle(j); negative
% codes are left untouched. Next the function finds the largest number of
% rows that comes from a single individual (this tells whether the data
% are haploid, diploid, etc.) and adds empty rows, filled with -999, for
% the individuals that have fewer rows than this maximum. Finally the rows
% are sorted by individual.
%
% Outputs:
%   newData     - recoded and padded data, sorted by the last column
%   rowsFromInd - number of rows per individual after padding
%   alleleCodes - column j lists the original codes of locus j, padded
%                 with zeros
%   noalle      - number of distinct non-negative codes at each locus
%   adjprior    - column j holds 1/noalle(j) in its first noalle(j)
%                 entries and ones below
%   priorTerm   - sum over loci of noalle(j)*gammaln(1/noalle(j))

nloci = size(raw_data,2)-1;
data = raw_data;

% Give allele code 0 a fresh code: (largest code in the data) + 1.
genotypes = data(:,1:nloci);
isZero = (genotypes==0);
if any(isZero(:))
    genotypes(isZero) = max(genotypes(:))+1;
    data(:,1:nloci) = genotypes;
end

% Collect the distinct non-negative codes of every locus.
noalle = zeros(1,nloci);
codesOfLocus = cell(nloci,1);
for j = 1:nloci
    observed = unique(data(:,j));
    codesOfLocus{j} = observed(observed>=0);
    noalle(j) = length(codesOfLocus{j});
end

maxAlleles = max(noalle);
alleleCodes = zeros(maxAlleles,nloci);
for j = 1:nloci
    alleleCodes(1:noalle(j),j) = codesOfLocus{j};
end

% Renumber the codes of each locus to 1,...,noalle(j), one code at a time
% in ascending order of the original codes.
for j = 1:nloci
    for newCode = 1:noalle(j)
        data(data(:,j)==alleleCodes(newCode,j), j) = newCode;
    end
end

% Count the rows of every individual.
indColumn = data(:,end);
nind = max(indColumn);
nrows = size(data,1);
ncols = size(data,2);
rowCounts = zeros(nind,1);
for i = 1:nind
    rowCounts(i) = sum(indColumn==i);
end
maxRowsFromInd = max(rowCounts);

% Append the missing rows: -999 in every column except the last one,
% which carries the index of the individual.
deficient = find(rowCounts < maxRowsFromInd);
data = [data; zeros(maxRowsFromInd*nind - nrows, ncols)];
nextRow = nrows+1;
for ind = deficient'   % individuals that lack rows
    nMissing = maxRowsFromInd-rowCounts(ind);   % rows missing for this one
    filler = repmat(-999, nMissing, ncols);
    filler(:,end) = ind;
    data(nextRow:nextRow+nMissing-1, :) = filler;
    nextRow = nextRow+nMissing;
end

newData = sortrows(data, ncols);   % sort according to individual
rowsFromInd = maxRowsFromInd;

adjprior = ones(maxAlleles,nloci);
priorTerm = 0;
for j = 1:nloci
    adjprior(1:noalle(j),j) = 1/noalle(j);
    priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j));
end
|
||||
|
||||
%--------------------------------------------------------------------------
|
||||
5
matlab/independent/tmpscript.txt
Normal file
5
matlab/independent/tmpscript.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
datafile('train','C:\BAPS5\burk_test\train_data.mat')
|
||||
datafile('test','C:\BAPS5\burk_test\testdata\testdata_51_5.xls')
|
||||
savePreproFile('test','C:\BAPS5\burk_test\testpreproc\testpreproc_51_5.mat')
|
||||
setK('16')
|
||||
outputmat('C:\BAPS5\burk_test\testres\testres_51_5.mat')
|
||||
2293
matlab/independent/trainedMix.m
Normal file
2293
matlab/independent/trainedMix.m
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue