Added source Matlab code for reference

2019-12-16 16:47:21 +01:00 · 2019-12-16 16:47:21 +01:00 · b5d99903d2
commit b5d99903d2
parent b8af977117
186 changed files with 61405 additions and 1 deletions
--- a/matlab/independent/greedyMix.m
+++ b/matlab/independent/greedyMix.m
--- a/matlab/independent/greedyPopMix.m
+++ b/matlab/independent/greedyPopMix.m
--- a/matlab/independent/indMix.m
+++ b/matlab/independent/indMix.m
--- a/matlab/independent/indMix_fixK.m
+++ b/matlab/independent/indMix_fixK.m
--- a/matlab/independent/myxlsread.m
+++ b/matlab/independent/myxlsread.m
@ -0,0 +1,49 @@
+function [A B] = myxlsread(file)
+% This function read a tab ('\t') seperated txt file
+% input file structure:
+% first row: title
+% sencond to end row: first column, sample ID
+%                     second column, cluster label
+%                     other columns, gene sequences
+% Lu Cheng
+% 26.06.2010
+
+% there can be multiple numeric columns in the input file
+% Lu Cheng, 25.11.2010
+
+delimiter = '\t';
+
+if exist(file,'file')~=2
+    error('The input file %s does not exist!', file);
+end
+
+lines = textread(file,'%s','delimiter','\n');
+
+title = strread(lines{1},'%s','delimiter',delimiter);
+nRow = length(lines);
+nCol = length(title);
+
+% determine numeric Columns
+tmp = strread(lines{2},'%s','delimiter',delimiter);
+numCols = [];
+for i = 1:length(tmp)
+    if ~isnan(str2double(tmp{i}))
+        numCols(end+1) = i;  %#ok<AGROW>
+    end
+end
+
+A = cell(nRow-1, length(numCols));
+B = cell(nRow, nCol);
+
+B(1,:) = title;
+for i=2:nRow
+    if isempty(lines{i})
+        B(i,:) = [];
+        A(i-1,:) = [];
+    else
+        B(i,:) = strread(lines{i},'%s','delimiter',delimiter);
+        A(i-1,:) = B(i,numCols);
+    end
+end
+
+A = cellfun(@str2double,A);
--- a/matlab/independent/myxlswrite.m
+++ b/matlab/independent/myxlswrite.m
@ -0,0 +1,19 @@
+function myxlswrite(file, A)
+% A is a cell matrix, each element is a string
+% Lu Cheng, 25.11.2010
+
+h = fopen(file,'w+');
+[nRow nCol] = size(A);
+
+for i=1:nRow
+    %tmpLine = '';
+    for j=1:nCol-1
+        if isnumeric(A{i,j})
+            A{i,j} = num2str(A{i,j});
+        end
+        fprintf(h,'%s\t',A{i,j});
+    end
+    fprintf(h,'%s\n',A{i,nCol});
+end
+
+fclose(h);
--- a/matlab/independent/preprocessXLS.m
+++ b/matlab/independent/preprocessXLS.m
@ -0,0 +1,177 @@
+function c = preprocessXLS(xlsfile,varargin)
+% This function preprocesses the input xlsfile
+% File structure:  first line - title
+%                  else - first column, name of individuals
+%                       - second to end column, sequences of the given genes
+
+% Lu Cheng, 16.02.2010
+
+%% check file names
+
+file_suff = '.xls'; % endings of the input file
+if ~(exist(xlsfile,'file')==2)
+    fprintf('Input file %s does not exists, quit!\n',xlsfile);
+    return;
+end
+
+if ~strcmp(xlsfile(end-length(file_suff)+1:end),file_suff)
+    fprintf('Input file %s does not end with %s, quit!\n',xlsfile,file_suff);
+    return;
+end
+
+%% process the xls file
+
+% Here we assume there is no missing values, if so, the missing values are
+% indicated by 0
+[data, component_mat, popnames] = processxls(xlsfile);
+
+% missing data 0 is transformed to |alphabet|+1, in the [ACGT] case, '-' is 5
+[data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data);
+
+c.data = data; c.rowsFromInd = rowsFromInd;
+c.alleleCodes = alleleCodes; c.noalle=noalle;
+c.adjprior = adjprior;
+% c.priorTerm = c.priorTerm;
+
+c.component_mat = component_mat;
+c.popnames = popnames;
+
+%% count the cliques and separators
+
+index = data(:,end);
+
+if isempty(varargin)
+    [data_clique, data_separator, noalle_clique, noalle_separator, codes_cq, codes_sp, info_cq_loci, info_sp_loci] = ...
+        transform5(data, component_mat);
+else
+    c_train = varargin{1};
+    
+    if ~all(all(c_train.component_mat == component_mat))
+        disp('The gene lengths are different between the training data and the test data!');
+        return;
+    end
+    
+    [data_clique, data_separator, noalle_clique, noalle_separator, codes_cq, codes_sp, info_cq_loci, info_sp_loci] = ...
+        transform5(data, component_mat, c_train.info_cq_loci,c_train.info_sp_loci);
+end
+data_clique = [data_clique index];
+data_separator = [data_separator index];
+
+% Count the data, note that the order of the alphabets keeps the same
+[counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq] ...
+    = allfreqsnew2(data_clique, double(noalle_clique));
+[counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp] ...
+    = allfreqsnew2(data_separator, double(noalle_separator));
+
+clear prior_cq prior_sp nalleles_cq nalleles_sp genotypes_cq genotypes_sp;
+
+counts_cq = uint16(counts_cq);
+counts_sp = uint16(counts_sp);
+
+c.counts_cq = counts_cq;
+c.counts_sp = counts_sp;
+
+c.adjprior_cq = adjprior_cq;
+c.adjprior_sp = adjprior_sp;
+
+c.codes_cq = codes_cq;
+c.codes_sp = codes_sp;
+
+c.info_cq_loci = info_cq_loci;
+c.info_sp_loci = info_sp_loci;
+
+%--------------------------------------------------------------------------
+function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(raw_data)
+% Alkuperäisen datan viimeinen sarake kertoo, miltä yksilöltä
+% kyseinen rivi on peräisin. Funktio tutkii ensin, että montako
+% riviä maksimissaan on peräisin yhdeltä yksilöltä, jolloin saadaan
+% tietää onko kyseessä haploidi, diploidi jne... Tämän jälkeen funktio
+% lisää tyhjiä rivejä niille yksilöille, joilta on peräisin vähemmän
+% rivejä kuin maksimimäärä.
+%   Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin
+% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytössä oleva koodi.
+% Tämän jälkeen funktio muuttaa alleelikoodit siten, että yhden lokuksen j
+% koodit saavat arvoja välillä 1,...,noalle(j).
+
+% English Comments added 
+% Small modifications have been added
+% Lu Cheng, 17.02.2010
+
+% Last column are the indexes of the samples, the raw_data is supposed to
+% be unit16 type, 0 indicates missing value
+data = raw_data;
+nloci=size(raw_data,2)-1;
+
+% Replace missing value with the |alphabet|+1, thus 0 is replaced by 5 for
+% DNA dataset
+dataApu = data(:,1:nloci);
+nollat = find(dataApu==0);
+if ~isempty(nollat)
+    isoinAlleeli = max(max(dataApu));
+    dataApu(nollat) = isoinAlleeli+1;
+    data(:,1:nloci) = dataApu;
+end
+
+% stores all different alleles at each loci, construct the allle codes matrix
+noalle=zeros(1,nloci);
+alleelitLokuksessa = cell(nloci,1);
+for i=1:nloci
+    alleelitLokuksessaI = unique(data(:,i));
+    alleelitLokuksessa{i,1} = alleelitLokuksessaI(logical(alleelitLokuksessaI>=0));
+    noalle(i) = length(alleelitLokuksessa{i,1});
+end
+alleleCodes = zeros(max(noalle),nloci);
+for i=1:nloci
+    alleelitLokuksessaI = alleelitLokuksessa{i,1};
+    puuttuvia = max(noalle)-length(alleelitLokuksessaI);
+    alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)];
+end
+
+%-----------------modified by Lu Cheng 17.02.2010--------------------------%
+% NOTE: Here we do not want to change the alpahbets, thus the following
+% lines are commented
+
+% replace the index of an allele to replace the allele
+% for loc = 1:nloci
+%     for all = 1:noalle(loc)
+%         data(logical(data(:,loc)==alleleCodes(all,loc)), loc)=all;
+%     end;
+% end;
+%-----------------modified end.....----------------------------------------%
+
+% handle diploid situation
+nind = max(data(:,end));
+nrows = size(data,1);
+ncols = size(data,2);
+rowsFromInd = zeros(nind,1);
+for i=1:nind
+    rowsFromInd(i) = length(find(data(:,end)==i));
+end
+maxRowsFromInd = max(rowsFromInd);
+a = -999;
+emptyRow = repmat(a, 1, ncols);
+lessThanMax = find(rowsFromInd < maxRowsFromInd);
+missingRows = maxRowsFromInd*nind - nrows;
+data = [data; zeros(missingRows, ncols)];
+pointer = 1;
+for ind=lessThanMax'    %Käy läpi ne yksilöt, joilta puuttuu rivejä
+    miss = maxRowsFromInd-rowsFromInd(ind);  % Tältä yksilöltä puuttuvien lkm.
+    for j=1:miss
+        rowToBeAdded = emptyRow;
+        rowToBeAdded(end) = ind;
+        data(nrows+pointer, :) = rowToBeAdded;
+        pointer = pointer+1;
+    end
+end
+data = sortrows(data, ncols);   % Sorttaa yksilöiden mukaisesti
+newData = data;
+rowsFromInd = maxRowsFromInd;
+
+% calculate the prior for each loci, priorTerm is a constant term in the
+% formula, which is precalclulateed for speeding up the program
+adjprior = zeros(max(noalle),nloci);
+priorTerm = 0;
+for j=1:nloci
+    adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)];
+    priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j));
+end
--- a/matlab/independent/processxls.m
+++ b/matlab/independent/processxls.m
@ -0,0 +1,81 @@
+function [data, component_mat, popnames] = processxls(filename)
+%
+% a bug in line 64-68 was fixed
+data = [];
+component_mat = [];
+popnames = [];
+try
+    if ispc
+        [A,B] = xlsread(filename);
+    else
+        [A,B] = myxlsread(filename);
+    end
+catch
+    display('*** ERROR: Wrong Excel format');
+    return
+end
+
+if size(A,2)~=1   % more than one columns containing numeric ST values
+    display('*** ERROR: multiple columns of numeric values');
+    data = []; component_mat = []; popnames = [];
+    return
+end
+
+if size(A,1)~=size(B,1)-1
+    display('*** ERROR: Wrong format');
+    data = []; component_mat = []; popnames = [];   
+    return
+end
+
+B = deblank(B); % remove any trailing blanks
+nstrains = size(B,1)-1;
+nheader = size(B,2);
+for i = 1:nheader
+    if strcmpi('ST',B{1,i}) ix_ST = i; end
+    if strcmpi('Strain', B{1,i}) || strcmpi('Isolate',B{1,i})
+        ix_Strain = i; 
+    end
+end
+if ~exist('ix_ST') 
+    display('*** ERROR: ST column needed');
+    data = []; component_mat = []; popnames = [];
+    return
+end
+
+if ~exist('ix_Strain')
+    ix_gene = setdiff([1:nheader],ix_ST);
+else
+    ix_gene = setdiff([1:nheader],[ix_ST ix_Strain]);
+end
+    
+ngenes = length(ix_gene);
+
+C = cell(nstrains,ngenes);
+if ~isempty(A)
+    for i=1:nstrains
+        B{i+1,ix_ST}=num2str(A(i));
+        for j=1:ngenes
+            C{i,j}=uint16(i_encode_n(B{i+1,ix_gene(j)})); % save the memory.
+        end
+    end
+end
+genesize=cellfun('size',C(1,:),2);
+data=cell2mat(C);
+data=[data uint16([1:nstrains]')]; 
+component_mat = zeros(ngenes,max(genesize));
+cum = cumsum(genesize);
+component_mat(1,[1:genesize(1)]) = [1:cum(1)];
+for i=2:ngenes
+    component_mat(i,[1:genesize(i)]) = [(cum(i-1)+1):cum(i)];
+end
+
+if ~exist('ix_Strain')
+    popnames = num2cell(B([2:end],ix_ST));
+else  % store the strain names only
+    popnames = num2cell(B([2:end],ix_Strain)); 
+end
+popnames(:,2)=num2cell([1:nstrains]');
+
+display('---------------------------------------------------');
+display(['# of strains: ', num2str(nstrains)]);
+display(['# of genes: ', num2str(ngenes)]);
--- a/matlab/independent/semiReadScript.m
+++ b/matlab/independent/semiReadScript.m
@ -0,0 +1,128 @@
+function paras = semiReadScript(script_file)
+% This function extracts parameter information from the script file
+% Script Command Table
+% datafile('train|test','c:\BAPS5\DATA.xls');  Here only .xls and .mat file
+% input file is supported.
+% savePreproFile('train|test','c:\BAPS5\predata.mat'); 
+% setK('16 17 18');
+% outputmat('c:\BAPS5\output.mat')
+% Lu Cheng, 11.03.2010
+
+paras.train_file_format = [];
+paras.train_file_name = [];
+
+paras.save_prepro_train_data = []; paras.save_prepro_train_data = 'No';
+paras.train_prepro_file = [];
+
+paras.test_file_format = [];
+paras.test_file_name = [];
+
+paras.save_prepro_test_data = []; paras.save_prepro_test_data = 'No';
+paras.test_prepro_file = [];
+
+paras.cluster_num_upperbounds = [];
+
+paras.save_results = []; paras.save_results = 'No';
+paras.result_file = [];
+
+T = readfile(script_file);
+
+n = length(T);
+for i=1:n
+    %line = regexprep(T{i},'\s+','');
+    line = T{i};
+    [res toks] = regexp(line,'(.+)\((.+)\)','once','match','tokens');
+    
+    if isempty(res)
+        continue;
+    else
+        %toks
+        paras = parseCmd(toks{1}, toks{2}, paras);
+    end
+end
+
+% -------------------------------------------------------------------------
+function prog_paras = parseCmd(cmd, paras, prog_paras)
+% cmd is the script command
+% paras are the parameters of the script command
+% prog_paras is a stucture of the global parameters
+
+switch cmd
+    case 'datafile'
+        paras = regexprep(paras,'\s+','');
+        toks = regexp(paras,'''([^,]+)''','tokens');
+        option = toks{1}{:};
+        filename = toks{2}{:};
+        if exist(filename,'file')~=2
+            error(cat(2,'File not exist! File: ',filename));
+        end
+        filetype = getFileType(filename);
+        if isequal(option,'train')
+            prog_paras.train_file_format = filetype;
+            prog_paras.train_file_name = filename;
+        elseif isequal(option,'test')
+            prog_paras.test_file_format = filetype;
+            prog_paras.test_file_name = filename;
+        else
+            error(cat(2,'Unkown option: ',option,'! Expect train or test.'));
+        end
+        
+    case 'savePreprocFile'
+        paras = regexprep(paras,'\s+','');
+        toks = regexp(paras,'''([^,]+)''','tokens');
+        option = toks{1}{:};
+        filename = toks{2}{:};
+        
+        filetype = getFileType(filename);
+        if ~isequal(filetype,'.mat')
+            error(cat(2,'The saved file should end with .mat! ',filename));
+        end
+        
+        if isequal(option,'train')
+            prog_paras.save_prepro_train_data = 'Yes';
+            prog_paras.train_prepro_file = filename;
+        elseif isequal(option,'test')
+            prog_paras.save_prepro_test_data = 'Yes';
+            prog_paras.test_prepro_file = filename;
+        else
+            error(cat(2,'Unkown option: ',option,'! Expect train or test.'));
+        end
+    case 'setK'
+        prog_paras.cluster_num_upperbounds = paras(2:end-1);
+    case 'outputmat'
+        filename = paras(2:end-1);
+        filetype = getFileType(filename);
+        if ~isequal(filetype,'.mat')
+            error(cat(2,'The saved file should end with .mat! ',filename));
+        end
+        prog_paras.save_results = 'Yes';
+        prog_paras.result_file = filename;
+    otherwise
+        error('Can not parse the cmd: %s in the script!', cmd);
+end
+
+% -------------------------------------------------------------------------
+function filetype = getFileType(filename)
+filetype = filename(end-3:end);
+if ~isequal(filetype,'.xls') && ~isequal(filetype,'.mat')
+    error(cat(2,'Unknown option: ', filename, '! Expect .xls or .mat file'));
+end
+
+% -------------------------------------------------------------------------
+function T = readfile(filename)
+f = fopen(filename,'r');
+if f == -1 
+    error(cat(2,'*** ERROR: invalid input file: ',filename));
+    T = [];
+    return
+end
+
+i = 1;
+while 1
+  clear line;
+  line = fgetl(f);
+  if ~ischar(line), break, end  
+  T{i} = line;
+  i = i+1;
+end
+fclose(f);
--- a/matlab/independent/semi_linkageMix.m
+++ b/matlab/independent/semi_linkageMix.m
--- a/matlab/independent/semi_linkageMixture_speed.m
+++ b/matlab/independent/semi_linkageMixture_speed.m
@ -0,0 +1,651 @@
+function semi_linkageMixture_speed(c_train, c_test)
+% This function process adjusts the priors of the training data accoring to
+% the test data. Based on the adjusted priors, the test data is clustered. 
+
+% modified from linkageMixture_speed.m by Lu Cheng, 16.02.2010
+
+% Update by Lu Cheng, 07.03.2011
+% case of only 1 sample in the test data has been handled
+
+% added by Lu Cheng, 11.03.2010
+global SCRIPT_MODE;
+global PARAMETERS;
+if isempty(SCRIPT_MODE)
+    SCRIPT_MODE = false;
+end
+% -----------------
+
+
+%% compare the training data and test data, adjust priors
+
+%1% Compare the training data and test data to adjust the prior
+if ~all(all(c_train.component_mat == c_test.component_mat))
+    disp('The gene lengths are different between the training data and the test data!');
+    return;
+end
+
+flag = false; % whether the trained priors should be adjusted
+n_loci = size(c_train.alleleCodes,2);
+
+if c_train.rowsFromInd ~= c_test.rowsFromInd
+    error('Inconsistant rows from each individual. Train: %d Test: %d. Quit! \n', ...
+        c_train.rowsFromInd, c_test.rowsFromInd);
+    return;
+elseif c_train.rowsFromInd > 1
+    error('Data must be haploid. Quit! rowsFromInd: %d.\n', c_train.rowsFromInd);
+    return;
+end
+
+for i=1:n_loci
+    if flag;   break;    end
+    a = setdiff(c_test.alleleCodes(:,i),c_train.alleleCodes(:,i));
+    a = a(a~=0);
+    if ~isempty(a)
+        flag = true;
+        fprintf('New alleles are detected in the test data at loci: %d\n',i);
+        fprintf('The processing time will be much longer than usual. \n');
+    end
+end
+
+%2% reprocess the data for clustering
+if flag
+    
+    % combing the training data and test data, adjust the priors
+    combine_data = [c_train.data; c_test.data];
+    n_train = size(c_train.data,1);
+    n_samples = size(combine_data,1);
+    combine_data(:,end) = (1:n_samples)';
+    [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(combine_data);
+    
+    index = data(:,end);
+    [data_clique, data_separator, noalle_clique, noalle_separator] = ...
+        transform4(data, c_train.component_mat,'codon');
+    data_clique = [data_clique index];
+    data_separator = [data_separator index];
+    
+    % Count the data
+    [counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq]...
+        = allfreqsnew2(data_clique, double(noalle_clique));
+    [counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp]...
+        = allfreqsnew2(data_separator, double(noalle_separator));
+
+    counts_cq = uint16(counts_cq);
+    counts_sp = uint16(counts_sp);
+        
+    c_train.adjprior = adjprior;
+    
+    c_train.counts_cq = counts_cq(:,:,1:n_train);
+    c_train.counts_sp = counts_sp(:,:,1:n_train);
+    
+    c_test.counts_cq = counts_cq(:,:,n_train+1:end);
+    c_test.counts_sp = counts_sp(:,:,n_train+1:end);
+    
+    c_train.adjprior_cq = adjprior_cq;
+    c_train.adjprior_sp = adjprior_sp;
+    
+    c_train.alleleCodes = alleleCodes;
+    c_train.noalle = noalle;
+        
+    clear data rowsFromInd alleleCodes noalle adjprior priorTerm index 
+    clear data_clique data_separator noalle_clique noalle_separator
+    clear counts_cq counts_sp nalleles_cq nalleles_sp prior_cq prior_sp adjprior_cq adjprior_sp
+    
+else
+    
+    % KEY: adjust the test data to fit the configuration of the training data
+    %      the 'codes_cq' and 'codes_sp' are directly translated from DNA sequence
+    %      SEE 'i_encode_n.m' under the linkage folder
+    
+    num_cq  = size(c_train.counts_cq, 1);   num_sp  = size(c_train.counts_sp, 1); 
+    n_loci_cq = size(c_train.counts_cq, 2); n_loci_sp = size(c_train.counts_sp, 2);
+    n_inds = size(c_test.counts_cq, 3);
+    
+    counts_cq = zeros(num_cq, n_loci_cq, n_inds);
+    counts_sp = zeros(num_sp, n_loci_sp, n_inds);
+    
+    % mapping the indexes of cliques and separators of the test data to the
+    % indexes of the training data
+    for k = 1:n_inds
+        for j = 1:n_loci_cq
+            [c, ia, ib] = intersect(c_test.codes_cq{j}, c_train.codes_cq{j},'rows');
+            counts_cq(ib,j,k) = c_test.counts_cq(ia,j,k);
+        end
+        
+        for j = 1:n_loci_sp
+            [c, ia, ib] = intersect(c_test.codes_sp{j}, c_train.codes_sp{j},'rows');
+            counts_sp(ib,j,k) = c_test.counts_sp(ia,j,k);
+        end
+    end
+
+    c_test.counts_cq = counts_cq;
+    c_test.counts_sp = counts_sp;
+
+    clear c ia ib k j i;
+
+end
+
+%% cluster the test data
+
+% case of only 1 sample in the test data, added by Lu Cheng, 07.03.2011
+if size(c_test.data,1)~=1
+    [Z,dist] = newGetDistances(c_test.data, c_test.rowsFromInd);
+    c_test.Z = Z;
+    c_test.dist = dist;
+end
+
+clear Z dist;
+
+message = cat(2,'There are currently ',num2str(length(unique(c_train.cluster_labels))),' clusters in the training data, please input upper bounds of cluster numbers in the test data.');
+
+if SCRIPT_MODE
+    cluster_nums = str2num(PARAMETERS.cluster_num_upperbounds);
+else
+    cluster_nums = inputdlg(message);
+    if isempty(cluster_nums) == 1
+        return;
+    else
+        cluster_nums = str2num(cluster_nums{:});
+    end    
+end
+
+% % Test purpose, Check the input data, there should be 1 allele in each loci (column)
+% % Lu Cheng, 25.02.2010
+% if ~all(all(squeeze(sum(c_train.counts_cq,1))))
+%     disp('Missing cq value of some sample in counts_cq of the training data');
+%     return;
+% elseif ~all(all(squeeze(sum(c_train.counts_sp,1))))
+%     disp('Missing sp value of some sample in counts_sp of the training data');
+%     return;
+% elseif ~all(all(squeeze(sum(c_test.counts_cq,1))))
+%     disp('Missing cq value of some sample in counts_cq of the test data');
+%     return;
+% elseif ~all(all(squeeze(sum(c_test.counts_sp,1))))
+%     disp('Missing sp value of some sample in counts_sp of the test data');
+%     return;
+% end
+
+tic
+semi_res = semi_linkageMix(c_train, c_test, cluster_nums);
+toc
+
+semi_res.popnames = c_test.popnames;
+
+writeMixtureInfo(semi_res);
+
+% save the results 
+if SCRIPT_MODE
+    save_results = PARAMETERS.save_results;
+else
+    save_results = questdlg('Do you wish to save the results?',...
+           'Save Results','Yes','No','Yes');
+end
+
+if isequal(save_results,'Yes')
+    if SCRIPT_MODE
+        save(PARAMETERS.result_file,'semi_res','-v7.3');
+    else
+        [filename, pathname] = uiputfile('*.mat','Save the results as');
+        if (sum(filename)==0) || (sum(pathname)==0)
+            % do nothing
+        else
+            save(strcat(pathname,filename),'semi_res','-v7.3');
+        end
+    end   
+end;
+        
+% -----------------------------------------------------------------------
+
+
+
+%--------------------------------------------------------------------------
+%% The next three functions are for computing the initial partition
+% according to the distance between the individuals
+
+function initial_partition=admixture_initialization(nclusters,Z)
+T=cluster_own(Z,nclusters);
+initial_partition=T;
+
+function T = cluster_own(Z,nclust)
+% true=logical(1);
+% false=logical(0);
+
+maxclust = nclust;
+% Start of algorithm
+m = size(Z,1)+1;
+T = zeros(m,1);
+% maximum number of clusters based on inconsistency
+if m <= maxclust
+    T = (1:m)';
+elseif maxclust==1
+    T = ones(m,1);
+    
+else
+    clsnum = 1;
+    for k = (m-maxclust+1):(m-1)
+        i = Z(k,1); % left tree
+        if i <= m % original node, no leafs
+            T(i) = clsnum;
+            clsnum = clsnum + 1;
+        elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree
+            T = clusternum(Z, T, i-m, clsnum);
+            clsnum = clsnum + 1;
+        end
+        i = Z(k,2); % right tree
+        if i <= m  % original node, no leafs
+            T(i) = clsnum;
+            clsnum = clsnum + 1;
+        elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree
+            T = clusternum(Z, T, i-m, clsnum);
+            clsnum = clsnum + 1;
+        end
+    end
+end
+
+function T = clusternum(X, T, k, c)
+m = size(X,1)+1;
+while(~isempty(k))
+    % Get the children of nodes at this level
+    children = X(k,1:2);
+    children = children(:);
+
+    % Assign this node number to leaf children
+    t = (children<=m);
+    T(children(t)) = c;
+
+    % Move to next level
+    k = children(~t) - m;
+end
+
+%--------------------------------------------------------------------------
+
+function Z = computeLinkage(Y, method)
+[k, n] = size(Y);
+m = (1+sqrt(1+8*n))/2;
+if k ~= 1 || m ~= fix(m)
+    error('The first input has to match the output of the PDIST function in size.');
+end
+if nargin == 1 % set default switch to be 'co'
+    method = 'co';
+end
+method = lower(method(1:2)); % simplify the switch string.
+% monotonic = 1;
+Z = zeros(m-1,3); % allocate the output matrix.
+N = zeros(1,2*m-1);
+N(1:m) = 1;
+n = m; % since m is changing, we need to save m in n.
+R = 1:n;
+for s = 1:(n-1)
+    X = Y;
+    [v, k] = min(X);
+    i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1)));
+    j = k - (i-1)*(m-i/2)+i;
+    Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A
+    I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables.
+    U = [I1 I2 I3];
+    I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3];
+    J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3];
+
+    switch method
+        case 'si' %single linkage
+            Y(I) = min(Y(I),Y(J));
+        case 'av' % average linkage
+            Y(I) = Y(I) + Y(J);
+        case 'co' %complete linkage
+            Y(I) = max(Y(I),Y(J));
+        case 'ce' % centroid linkage
+            K = N(R(i))+N(R(j));
+            Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K;
+        case 'wa'
+            Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ...
+                N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U)));
+    end
+    J = [J i*(m-(i+1)/2)-m+j];
+    Y(J) = []; % no need for the cluster information about j.
+
+    % update m, N, R
+    m = m-1;
+    N(n+s) = N(R(i)) + N(R(j));
+    R(i) = n+s;
+    R(j:(n-1))=R((j+1):n);
+end
+
+%--------------------------------------------------------------------------
+
+%-------------------------------------------------------------------------
+
+function [Z, dist] = newGetDistances(data, rowsFromInd)
+
+ninds = max(data(:,end));
+nloci = size(data,2)-1;
+riviLkm = nchoosek(double(ninds),2);
+
+% empties = find(data<0);
+% data(empties)=0;
+data(logical(data<0)) = 0;
+data = uint16(data);
+
+pariTaulu = zeros(riviLkm,2);
+aPointer=1;
+
+for a=1:ninds-1
+    pariTaulu(aPointer:aPointer+double(ninds-1-a),1) = ones(ninds-a,1,'uint16')*a;
+    pariTaulu(aPointer:aPointer+double(ninds-1-a),2) = uint16((a+1:ninds)');
+    aPointer = aPointer+double(ninds-a);
+end
+
+eka = pariTaulu(:,ones(1,rowsFromInd));
+eka = eka * rowsFromInd;
+miinus = repmat(rowsFromInd-1 : -1 : 0, [riviLkm 1]);
+eka = eka - miinus;
+
+toka = pariTaulu(:,ones(1,rowsFromInd)*2);
+toka = toka * rowsFromInd;
+toka = toka - miinus;
+
+eka = uint16(eka);
+toka = uint16(toka);
+
+clear pariTaulu; clear miinus;
+
+summa = uint16(zeros(riviLkm,1));
+vertailuja = uint16(zeros(riviLkm,1));
+
+x = zeros(size(eka));    x = uint16(x);
+y = zeros(size(toka));   y = uint16(y);
+% fprintf(1,'%%10');
+for j=1:nloci;
+    
+    for k=1:rowsFromInd
+        x(:,k) = data(eka(:,k),j);
+        y(:,k) = data(toka(:,k),j);
+    end
+
+    for a=1:rowsFromInd
+        for b=1:rowsFromInd
+            vertailutNyt = uint16(x(:,a)>0 & y(:,b)>0);
+            vertailuja = vertailuja + vertailutNyt;
+            lisays = (x(:,a)~=y(:,b) & vertailutNyt);
+            summa = summa + uint16(lisays);
+        end
+    end
+    % fprintf(1,'\b\b');
+    % fprintf(1,'%d',floor(10+80*j/nloci));
+end
+
+clear x;    clear y;   clear vertailutNyt;
+clear eka; clear toka; clear data; clear lisays;
+dist = zeros(length(vertailuja),1);
+% nollat = find(vertailuja==0);
+% dist(nollat) = 1;
+dist(logical(vertailuja==0)) = 1;
+muut = find(vertailuja>0);
+dist(muut) = double(summa(muut))./double(vertailuja(muut));
+clear summa; clear vertailuja; clear muut;
+
+Z = computeLinkage(dist');
+% fprintf(1,'\b\b');
+% fprintf(1,'%d\n',100);
+%--------------------------------------------------------------------------
+
+function writeMixtureInfo(c)
+
+outputFile = 'baps5_semi_output.txt';
+
+% output the semi-supervised clustering results to the outputFile
+% modified by Lu Cheng, 28.03.2010
+
+ninds = length(c.PARTITION);
+npops =  c.npops;
+popnames = c.popnames;
+logml = c.logml;
+partition = c.PARTITION;
+partitionSummary = c.partitionSummary;
+
+if ~isempty(outputFile)
+    fid = fopen(outputFile,'w');
+else
+    fid = -1;
+    %diary('baps5_semi_output.baps'); % save in text anyway.
+end
+
+dispLine;
+disp('RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:');
+disp(['Number of clustered individuals: ' ownNum2Str(ninds)]);
+disp(['Number of groups in optimal partition: ' ownNum2Str(npops)]);
+disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]);
+disp(' ');
+if (fid ~= -1)
+    fprintf(fid,'%10s\n', ['RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:']);
+    fprintf(fid,'%20s\n', ['Number of clustered individuals: ' ownNum2Str(ninds)]);
+    fprintf(fid,'%20s\n', ['Number of groups in optimal partition: ' ownNum2Str(npops)]);
+    fprintf(fid,'%20s\n\n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]);
+end
+
+disp('Best Partition: ');
+if (fid ~= -1)
+    fprintf(fid,'%s \n','Best Partition: ');
+end
+for m=1:npops
+    indsInM = find(partition==m);
+    
+    if isempty(indsInM)
+        continue;
+    end
+    
+    length_of_beginning = 11 + floor(log10(m));
+    cluster_size = length(indsInM);
+    
+    text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})];
+    for k = 2:cluster_size
+        text = [text ', ' char(popnames{indsInM(k)})];
+    end;
+    text = [text '}'];
+    
+    while length(text)>58
+        %Take one line and display it.
+        new_line = takeLine(text,58);
+        text = text(length(new_line)+1:end);
+        disp(new_line);
+        if (fid ~= -1)
+            fprintf(fid,'%s \n',new_line);
+        end
+        if length(text)>0
+            text = [blanks(length_of_beginning) text];
+        else
+            text = [];
+        end;
+    end;
+    
+    if ~isempty(text)
+        disp(text);
+        if (fid ~= -1)
+            fprintf(fid,'%s \n',text);
+        end
+    end;
+end
+
+names = true;
+
+clusterProbTable = c.clusterProbTable;
+if npops == 1
+    clusterProbTable = [];
+else
+    disp('');
+    disp('Posterior probability of assignment into clusters:');
+    
+    if (fid ~= -1)
+        fprintf(fid, '%s \n', ' '); fprintf(fid, '\n');
+        fprintf(fid, '%s \n', 'Posterior probability of assignment into clusters: '); fprintf(fid, '\n');
+    end
+
+    text = sprintf('%10s','ind');
+    for ii = 1:npops
+        tmpstr = sprintf('%10s',num2str(ii));
+        text = [text tmpstr];
+    end
+    
+    disp(text);
+    if (fid ~= -1)
+        fprintf(fid, '%s \n', text);
+    end
+        
+    for ii = 1:ninds
+        text = sprintf('%10s',popnames{ii}{:});
+        for jj = 1:npops
+            tmpstr = sprintf('%10s',num2str(clusterProbTable(ii,jj),'%10.6f'));
+            text = [text tmpstr];
+        end
+        
+        if ii<100
+            disp(text);
+        elseif ii==101
+            disp('.......................................');
+            disp('..........see output file..............');
+        end
+        if (fid ~= -1)
+            fprintf(fid, '%s \n', text);
+        end   
+        text = [];
+    end
+end
+
+disp(' ');
+disp(' ');
+disp('List of sizes of 10 best visited partitions and corresponding log(ml) values');
+
+if (fid ~= -1)
+    fprintf(fid, '%s \n\n', ' ');
+    fprintf(fid, '%s \n', 'List of sizes of 10 best visited partitions and corresponding log(ml) values'); fprintf(fid, '\n');
+end
+
+partitionSummary = sortrows(partitionSummary,2);
+partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :);
+partitionSummary = partitionSummary(logical(partitionSummary(:,2)>-1e49),:);
+if size(partitionSummary,1)>10
+    vikaPartitio = 10;
+else
+    vikaPartitio = size(partitionSummary,1);
+end
+for part = 1:vikaPartitio
+    line = [num2str(partitionSummary(part,1),'%20d') '    ' num2str(partitionSummary(part,2),'%20.6f')];
+    disp(line);
+    if (fid ~= -1)
+        fprintf(fid, '%s \n', line);
+    end
+end
+
+if (fid ~= -1)
+    fclose(fid);
+else
+    diary off
+end
+    
+%--------------------------------------------------------------
+function newline = takeLine(description,width)
+%Returns one line from the description: line ends to the first
+%space after width:th mark.
+% newLine = description(1:width);
+n = width+1;
+while ~isspace(description(n)) && n<length(description)
+    n = n+1;
+end;
+newline = description(1:n);
+
+
+function dispLine
+disp('---------------------------------------------------');
+
+function num2 = omaRound(num)
+% Pyï¿½ristï¿½ï¿½ luvun num 1 desimaalin tarkkuuteen
+num = num*10;
+num = round(num);
+num2 = num/10;
+
+%---------------------------------------------------------
+
+%-------------------------------------------------------------------------
+
+function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ...
+    handleData(raw_data)
+% Alkuperï¿½isen datan viimeinen sarake kertoo, miltï¿?yksilï¿½ltï¿?
+% kyseinen rivi on perï¿½isin. Funktio tutkii ensin, ettï¿?montako
+% riviï¿?maksimissaan on perï¿½isin yhdeltï¿?yksilï¿½ltï¿? jolloin saadaan
+% tietï¿½ï¿½ onko kyseessï¿?haploidi, diploidi jne... Tï¿½mï¿½n jï¿½lkeen funktio
+% lisï¿½ï¿½ tyhjiï¿?rivejï¿?niille yksilï¿½ille, joilta on perï¿½isin vï¿½hemmï¿½n
+% rivejï¿?kuin maksimimï¿½ï¿½rï¿?
+%   Mikï¿½li jonkin alleelin koodi on =0, funktio muuttaa tï¿½mï¿½n alleelin
+% koodi pienimmï¿½ksi koodiksi, joka isompi kuin mikï¿½ï¿½n kï¿½ytï¿½ssï¿?oleva koodi.
+% Tï¿½mï¿½n jï¿½lkeen funktio muuttaa alleelikoodit siten, ettï¿?yhden lokuksen j
+% koodit saavat arvoja vï¿½lillï¿?1,...,noalle(j).
+
+data = raw_data;
+nloci=size(raw_data,2)-1;
+
+dataApu = data(:,1:nloci);
+nollat = find(dataApu==0);
+if ~isempty(nollat)
+    isoinAlleeli = max(max(dataApu));
+    dataApu(nollat) = isoinAlleeli+1;
+    data(:,1:nloci) = dataApu;
+end
+% dataApu = []; 
+% nollat = []; 
+% isoinAlleeli = [];
+
+noalle=zeros(1,nloci);
+alleelitLokuksessa = cell(nloci,1);
+for i=1:nloci
+    alleelitLokuksessaI = unique(data(:,i));
+   %alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0));
+    alleelitLokuksessa{i,1} = alleelitLokuksessaI(logical(alleelitLokuksessaI>=0));
+    noalle(i) = length(alleelitLokuksessa{i,1});
+end
+alleleCodes = zeros(max(noalle),nloci);
+for i=1:nloci
+    alleelitLokuksessaI = alleelitLokuksessa{i,1};
+    puuttuvia = max(noalle)-length(alleelitLokuksessaI);
+    alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)];
+end
+
+for loc = 1:nloci
+    for all = 1:noalle(loc)
+        % data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all;
+        data(logical(data(:,loc)==alleleCodes(all,loc)), loc)=all;
+    end;
+end;
+
+nind = max(data(:,end));
+nrows = size(data,1);
+ncols = size(data,2);
+rowsFromInd = zeros(nind,1);
+for i=1:nind
+    rowsFromInd(i) = length(find(data(:,end)==i));
+end
+maxRowsFromInd = max(rowsFromInd);
+a = -999;
+emptyRow = repmat(a, 1, ncols);
+lessThanMax = find(rowsFromInd < maxRowsFromInd);
+missingRows = maxRowsFromInd*nind - nrows;
+data = [data; zeros(missingRows, ncols)];
+pointer = 1;
+for ind=lessThanMax'    %Kï¿½y lï¿½pi ne yksilï¿½t, joilta puuttuu rivejï¿?
+    miss = maxRowsFromInd-rowsFromInd(ind);  % Tï¿½ltï¿?yksilï¿½ltï¿?puuttuvien lkm.
+    for j=1:miss
+        rowToBeAdded = emptyRow;
+        rowToBeAdded(end) = ind;
+        data(nrows+pointer, :) = rowToBeAdded;
+        pointer = pointer+1;
+    end
+end
+data = sortrows(data, ncols);   % Sorttaa yksilï¿½iden mukaisesti
+newData = data;
+rowsFromInd = maxRowsFromInd;
+
+adjprior = zeros(max(noalle),nloci);
+priorTerm = 0;
+for j=1:nloci
+    adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)];
+    priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j));
+end
+
+%--------------------------------------------------------------------------
--- a/matlab/independent/tmpscript.txt
+++ b/matlab/independent/tmpscript.txt
@ -0,0 +1,5 @@
+datafile('train','C:\BAPS5\burk_test\train_data.mat')
+datafile('test','C:\BAPS5\burk_test\testdata\testdata_51_5.xls')
+savePreproFile('test','C:\BAPS5\burk_test\testpreproc\testpreproc_51_5.mat')
+setK('16')
+outputmat('C:\BAPS5\burk_test\testres\testres_51_5.mat')
--- a/matlab/independent/trainedMix.m
+++ b/matlab/independent/trainedMix.m