diff --git a/NAMESPACE b/NAMESPACE index 9c77f97..8b6dded 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(fgetl) export(fopen) export(greedyMix) export(handleData) +export(handlePopData) export(initPopNames) export(laskeMuutokset4) export(learn_partition_modified) diff --git a/R/handlePopData.R b/R/handlePopData.R new file mode 100644 index 0000000..56a7356 --- /dev/null +++ b/R/handlePopData.R @@ -0,0 +1,71 @@ +#' @title Handle Pop data +#' @description The last column of the original data tells you which +#' that line is from. The function changes the allele +#' codes so that the codes for one locus have values between 1 and noalle[j]. +#' Before this change, an allele whose code is zero is changed. +#' @param raw_data raw data +#' @export +handlePopData <- function(raw_data) { + # Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? + # kyseinen rivi on peräisin. Funktio muuttaa alleelikoodit + # siten, ett?yhden lokuksen j koodit saavat arvoja + # välill?1, ,noalle(j). Ennen tät?muutosta alleeli, jonka + # koodi on nolla muutetaan. + + data <- raw_data + nloci <- size(raw_data, 2) - 1 + + dataApu <- data[, 1:nloci] + nollat <- find(dataApu == 0) + if (length(nollat) > 0) { + isoinAlleeli <- max(max(dataApu)$maxs)$maxs + dataApu[nollat] <- isoinAlleeli + 1 + data[, 1:nloci] <- dataApu + } + + noalle <- zeros(1, nloci) + alleelitLokuksessa <- cell(nloci, 1, expandable = TRUE) + for (i in 1:nloci) { + alleelitLokuksessaI <- sort(unique(data[, i])) + alleelitLokuksessa[[i]] <- alleelitLokuksessaI[find(alleelitLokuksessaI >= 0)] + noalle[i] <- length(alleelitLokuksessa[[i]]) + } + alleleCodes <- zeros(unique(max(noalle)$maxs), nloci) + for (i in 1:nloci) { + alleelitLokuksessaI <- alleelitLokuksessa[[i]] + puuttuvia <- unique(max(noalle)$maxs) - length(alleelitLokuksessaI) + alleleCodes[, i] = c(alleelitLokuksessaI, zeros(puuttuvia, 1)) + } + + for (loc in 1:nloci) { + for (all in 1:noalle[loc]) { + data[find(data[ , loc] == alleleCodes[all, loc]), loc] <- all + } + } + + nind <- max(data[, ncol(data)])$maxs + rows <- zeros(nind, 2) + for (i in 1:nind) { + rivit <- t(find(data[, ncol(data)] == i)) + rows[i, 1] <- min(rivit)$mins + rows[i, 2] <- max(rivit)$maxs + } + newData <- data + + adjprior <- zeros(unique(max(noalle)$maxs), nloci) + priorTerm <- 0 + for (j in 1:nloci) { + adjprior[, j] <- c(repmat(1 / noalle[j], c(noalle[j], 1)), ones(unique(max(noalle)$maxs) - noalle[j], 1)) + priorTerm <- priorTerm + noalle[j] * log(gamma(1 / noalle[j])) + } + return( + list( + newData = newData, + rows = rows, + alleleCodes = alleleCodes, + noalle = noalle, + adjprior = adjprior, + priorTerm = priorTerm + ) + ) +} diff --git a/man/handlePopData.Rd b/man/handlePopData.Rd new file mode 100644 index 0000000..9ce3be6 --- /dev/null +++ b/man/handlePopData.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/handlePopData.R +\name{handlePopData} +\alias{handlePopData} +\title{Handle Pop data} +\usage{ +handlePopData(raw_data) +} +\arguments{ +\item{raw_data}{raw data} +} +\description{ +The last column of the original data tells you which? that line is from. The function changes the allele codes so that the codes for one locus have values between 1,, Noah (j). Before this change, an allele whose code is zero is changed. +} diff --git a/matlab/independent/greedyPopMix.m b/matlab/independent/greedyPopMix.m index 4618135..17d63ae 100644 --- a/matlab/independent/greedyPopMix.m +++ b/matlab/independent/greedyPopMix.m @@ -251,100 +251,6 @@ else end end -%-------------------------------------------------------------------------- - - -function [newData, rows, alleleCodes, noalle, adjprior, priorTerm] = handlePopData(raw_data) -% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? -% kyseinen rivi on peräisin. Funktio muuttaa alleelikoodit -% siten, ett?yhden lokuksen j koodit saavat arvoja -% välill?1,...,noalle(j). Ennen tät?muutosta alleeli, jonka -% koodi on nolla muutetaan. - - -data = raw_data; -nloci=size(raw_data,2)-1; - -dataApu = data(:,1:nloci); -nollat = find(dataApu==0); -if ~isempty(nollat) - isoinAlleeli = max(max(dataApu)); - dataApu(nollat) = isoinAlleeli+1; - data(:,1:nloci) = dataApu; -end -dataApu = []; nollat = []; isoinAlleeli = []; - -noalle=zeros(1,nloci); -alleelitLokuksessa = cell(nloci,1); -for i=1:nloci - alleelitLokuksessaI = unique(data(:,i)); - alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); - noalle(i) = length(alleelitLokuksessa{i,1}); -end -alleleCodes = zeros(max(noalle),nloci); -for i=1:nloci - alleelitLokuksessaI = alleelitLokuksessa{i,1}; - puuttuvia = max(noalle)-length(alleelitLokuksessaI); - alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; -end - -for loc = 1:nloci - for all = 1:noalle(loc) - data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; - end; -end; - -nind = max(data(:,end)); -%rows = cell(nind,1); -rows = zeros(nind,2); -for i=1:nind - rivit = find(data(:,end)==i)'; - rows(i,1) = min(rivit); - rows(i,2) = max(rivit); -end -newData = data; - -adjprior = zeros(max(noalle),nloci); -priorTerm = 0; -for j=1:nloci - adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; - priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); -end - - -%-------------------------------------------------------------------- - -function [Z,distances] = getPopDistancesByKL(adjprior) -% Laskee populaatioille etäisyydet -% käyttäen KL-divergenssi? -global COUNTS; -maxnoalle = size(COUNTS,1); -nloci = size(COUNTS,2); -npops = size(COUNTS,3); -distances = zeros(nchoosek(npops,2),1); - -d = zeros(maxnoalle, nloci, npops); -prior = adjprior; -prior(find(prior==1))=0; -nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. -prior(1,nollia)=1; -for pop1 = 1:npops - d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); - %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); -end -pointer = 1; -for pop1 = 1:npops-1 - for pop2 = pop1+1:npops - dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); - div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; - div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; - div = (div12+div21)/2; - distances(pointer) = div; - pointer = pointer+1; - end -end -Z=linkage(distances'); - %--------------------------------------------------------------------