Fixed basic parsing of FASTA files (#25)

2023-09-11 12:15:30 +02:00 · 2023-09-11 12:15:30 +02:00 · 76828387a3
commit 76828387a3
parent a88f31b3a5
8 changed files with 80 additions and 78 deletions
--- a/1
+++ b/1
@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand

 export(greedyMix)
+export(handleData)
 export(load_fasta)
 importFrom(R6,R6Class)
 importFrom(Rsamtools,scanBam)
--- a/R/greedyMix.R
+++ b/R/greedyMix.R
@ -54,7 +54,7 @@ greedyMix <- function(
  # Generating partition summary ===============================================
  ekat <- seq(1L, c[["rowsFromInd"]], ninds * c[["rowsFromInd"]]) # ekat = (1:rowsFromInd:ninds*rowsFromInd)';
  c[["rows"]] <- c(ekat, ekat + c[["rowsFromInd"]] - 1L) # c.rows = [ekat ekat+rowsFromInd-1]
-  logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose);
+  logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose)
  logml <- logml_npops_partitionSummary[["logml"]]
  npops <- logml_npops_partitionSummary[["npops"]]
  partitionSummary <- logml_npops_partitionSummary[["partitionSummary"]]
@ -72,8 +72,8 @@ greedyMix <- function(

  # Writing mixture info =======================================================
  changesInLogml <- writeMixtureInfo(
-    logml, rowsFromInd, data, adjprior, priorTerm, NULL, inp, partitionSummary,
-    popnames, fixedK
+    logml, c[["rowsFromInd"]], c[["data"]], c[["adjprior"]], c[["priorTerm"]],
+    NULL, inp, partitionSummary, popnames, fixedK
  )

  # Updateing results ==========================================================
--- a/R/handleData.R
+++ b/R/handleData.R
@ -9,6 +9,7 @@
 #' code to the smallest code that is larger than any code in use. After this,
 #' the function changes the allele codes so that one locus j
 #' codes get values between? 1, ..., noalle(j).
+#' @export
 handleData <- function(raw_data, format = "Genepop") {
  # Alkuper?isen datan viimeinen sarake kertoo, milt?yksil?lt?
  # kyseinen rivi on per?isin. Funktio tutkii ensin, ett?montako
--- a/R/indMix.R
+++ b/R/indMix.R
@ -68,7 +68,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
  nruns <- length(npopsTaulu)

  initData <- data
-  data <- data[, 1:(ncol(data) - 1)]
+  data <- data[, seq_along(noalle)]  # Original code always dropped last column.

  logmlBest <- -1e50
  partitionSummary <- -1e50 * ones(30, 2) # Tiedot 30 parhaasta partitiosta (npops ja logml)
--- a/R/laskeLoggis.R
+++ b/R/laskeLoggis.R
@ -1,7 +1,7 @@
 laskeLoggis <- function(counts, sumcounts, adjprior) {
  npops <- size(counts, 3)
-
-  sum1 <- sum(sum(sum(lgamma(counts + repmat(adjprior, c(1, 1, npops))))))
+  replicated_adjprior <- array(adjprior, c(nrow(adjprior), ncol(adjprior), npops))
+  sum1 <- sum(sum(sum(lgamma(counts + replicated_adjprior))))
  sum3 <- sum(sum(lgamma(adjprior))) - sum(sum(lgamma(1 + sumcounts)))
  logml2 <- sum1 - npops * sum3
  loggis <- logml2
--- a/R/writeMixtureInfo.R
+++ b/R/writeMixtureInfo.R
@ -31,12 +31,12 @@ writeMixtureInfo <- function(
  }

  dispLine()
-  cat("RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:")
-  cat(c("Data file: ", inputFile))
-  cat("Model: independent")
-  cat(c("Number of clustered individuals: ", ownNum2Str(ninds)))
-  cat(c("Number of groups in optimal partition: ", ownNum2Str(npops)))
-  cat(c("Log(marginal likelihood) of optimal partition: ", ownNum2Str(logml)))
+  cat("RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:\n")
+  cat("Data file: ", inputFile, "\n")
+  cat("Model: independent\n")
+  cat("Number of clustered individuals: ", ownNum2Str(ninds), "\n")
+  cat("Number of groups in optimal partition: ", ownNum2Str(npops), "\n")
+  cat("Log(marginal likelihood) of optimal partition: ", ownNum2Str(logml), "\n")
  cat(" ")
  if (fid != -1) {
    append(fid, "RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:\n")
@ -87,10 +87,10 @@ writeMixtureInfo <- function(
        "Cluster ", as.character(m), ": {", as.character(indsInM[1])
      )
      for (k in 2:cluster_size) {
-        text <- c(text, ", ", as.character(indsInM[k]))
+        text <- c(text, ",", as.character(indsInM[k]))
      }
    }
-    text <- c(text, "}")
+    text <- c(text, "}\n")
    while (length(text) > 58) {
      # Take one line and display it.
      new_line <- takeLine(text, 58)
@ -106,7 +106,7 @@ writeMixtureInfo <- function(
        text <- ""
      }
    }
-    if (text != "") {
+    if (any(text != "")) {
      cat(text)
      if (fid != -1) {
        append(fid, text)
@ -116,11 +116,11 @@ writeMixtureInfo <- function(
  }

  if (npops > 1) {
-    cat(" ")
-    cat(" ")
+    cat("\n")
+    cat("\n")
    cat(
      "Changes in log(marginal likelihood)",
-      " if indvidual i is moved to group j:"
+      " if indvidual i is moved to group j:\n"
    )
    if (fid != -1) {
      append(fid, " ")
@ -131,7 +131,7 @@ writeMixtureInfo <- function(
        fid,
        c(
          "Changes in log(marginal likelihood)",
-          "if indvidual i is moved to group j:"
+          "if indvidual i is moved to group j:\n"
        )
      )
      append(fid, "\n")
@ -167,9 +167,9 @@ writeMixtureInfo <- function(

      if (names) {
        nimi <- as.character(popnames[ind])
-        rivi <- c(blanks(maxSize - length(nimi)), nimi, ":")
+        rivi <- c(blanks(maxSize - length(nimi)), nimi, ":\n")
      } else {
-        rivi <- c(blanks(4 - floor(log10(ind))), ownNum2Str(ind), ":")
+        rivi <- c("\n", blanks(4 - floor(log10(ind))), ownNum2Str(ind), ":\n")
      }
      for (j in 1:npops) {
        rivi <- c(rivi, "  ", logml2String(omaRound(muutokset[j])))
@ -181,9 +181,9 @@ writeMixtureInfo <- function(
      }
    }

-    cat(" ")
-    cat(" ")
-    cat("KL-divergence matrix in PHYLIP format:")
+    cat("\n")
+    cat("\n")
+    cat("KL-divergence matrix in PHYLIP format:\n")

    dist_mat <- zeros(npops, npops)
    if (fid != -1) {
@ -193,6 +193,7 @@ writeMixtureInfo <- function(
      append(fid, "\n")
    }

+    COUNTS <- COUNTS[seq_len(nrow(adjprior)), seq_len(ncol(adjprior)), , drop = FALSE]
    maxnoalle <- size(COUNTS, 1)
    nloci <- size(COUNTS, 2)
    d <- zeros(maxnoalle, nloci, npops)
@ -204,8 +205,8 @@ writeMixtureInfo <- function(

    prior[1, nollia] <- 1
    for (pop1 in 1:npops) {
-      d[, , pop1] <- (squeeze(COUNTS[, , pop1]) + prior) /
-        repmat(sum(squeeze(COUNTS[, , pop1]) + prior), c(maxnoalle, 1))
+      squeezed_COUNTS_prior <- squeeze(COUNTS[, , pop1]) + prior
+      d[, , pop1] <- squeezed_COUNTS_prior / sum(squeezed_COUNTS_prior)
    }
    ekarivi <- as.character(npops)
    cat(ekarivi)
@ -215,14 +216,14 @@ writeMixtureInfo <- function(
    }

    for (pop1 in 1:npops) {
-      for (pop2 in 1:(pop1 - 1)) {
+      for (pop2 in seq_len(pop1 - 1)) {
        dist1 <- d[, , pop1]
        dist2 <- d[, , pop2]
        div12 <- sum(
-          sum(dist1 * log2((dist1 + 10^-10) / (dist2 + 10^-10)))
+          sum(dist1 * base::log2((dist1 + 10^-10) / (dist2 + 10^-10)))
        ) / nloci
        div21 <- sum(
-          sum(dist2 * log2((dist2 + 10^-10) / (dist1 + 10^-10)))
+          sum(dist2 * base::log2((dist2 + 10^-10) / (dist1 + 10^-10)))
        ) / nloci
        div <- (div12 + div21) / 2
        dist_mat[pop1, pop2] <- div
@ -232,9 +233,9 @@ writeMixtureInfo <- function(

    dist_mat <- dist_mat + t(dist_mat) # make it symmetric
    for (pop1 in 1:npops) {
-      rivi <- c("Cluster_", as.character(pop1), " ")
+      rivi <- c("\nCluster_", as.character(pop1), "\n")
      for (pop2 in 1:npops) {
-        rivi <- c(rivi, kldiv2str(dist_mat[pop1, pop2]), " ")
+        rivi <- c(rivi, kldiv2str(dist_mat[pop1, pop2]))
      }
      cat(rivi)
      if (fid != -1) {
@ -244,11 +245,11 @@ writeMixtureInfo <- function(
    }
  }

-  cat(" ")
-  cat(" ")
+  cat("\n")
+  cat("\n")
  cat(
    "List of sizes of 10 best visited partitions",
-    "and corresponding log(ml) values"
+    "and corresponding log(ml) values\n"
  )

  if (fid != -1) {
@ -278,7 +279,7 @@ writeMixtureInfo <- function(
    line <- c(
      as.character(partitionSummary[part, 1]),
      "    ",
-      as.character(partitionSummary(part, 2))
+      as.character(partitionSummary[part, 2])
    )
    cat(line)
    if (fid != -1) {
@ -288,9 +289,9 @@ writeMixtureInfo <- function(
  }

  if (!fixedK) {
-    cat(" ")
-    cat(" ")
-    cat("Probabilities for number of clusters")
+    cat("\n")
+    cat("\n")
+    cat("Probabilities for number of clusters\n")

    if (fid != -1) {
      append(fid, " ")
@ -322,7 +323,7 @@ writeMixtureInfo <- function(
        line <- c(
          as.character(npopsTaulu[i]), "   ", as.character(probs[i])
        )
-        cat(line)
+        cat(line, "\n")
        if (fid != -1) {
          append(fid, line)
          append(fid, "\n")
--- a/man/greedyMix.Rd
+++ b/man/greedyMix.Rd
@ -8,9 +8,8 @@ greedyMix(
  data,
  format,
  partitionCompare = NULL,
-  ninds = NULL,
+  ninds = 1L,
  npops = 1L,
-  priorTerm = NULL,
  counts = NULL,
  sumcounts = NULL,
  max_iter = 100L,
@ -32,8 +31,6 @@ greedyMix(

 \item{npops}{number of populations}

-\item{priorTerm}{prior terms}
-
 \item{counts}{counts}

 \item{sumcounts}{sumcounts}
@ -55,6 +52,8 @@ greedyMix(
 \item{noalle}{number of alleles}

 \item{adjprior}{ajuster prior probabilities}
+
+\item{priorTerm}{prior terms}
 }
 \description{
 Clustering of individuals
--- a/matlab/independent/indMix.m
+++ b/matlab/independent/indMix.m
@ -87,7 +87,7 @@ for run = 1:nruns
        apu = rows(i);
        PARTITION(i) = initialPartition(apu(1));
    end
-    
+
    COUNTS = counts; SUMCOUNTS = sumcounts;
    POP_LOGML = computePopulationLogml(1:npops, adjprior, priorTerm);
    LOGDIFF = repmat(-Inf,ninds,npops);
@ -98,7 +98,7 @@ for run = 1:nruns
    kokeiltu = zeros(nRoundTypes, 1);
    roundTypes = [1 1];  %Ykkösvaiheen sykli kahteen kertaan.
    ready = 0; vaihe = 1;
-    
+
    if dispText
        disp(' ');
        disp(['Mixture analysis started with initial ' num2str(npops) ' populations.']);
@ -106,11 +106,11 @@ for run = 1:nruns

    while ready ~= 1
        muutoksia = 0;
-        
+
        if dispText
            disp(['Performing steps: ' num2str(roundTypes)]);
        end
-        
+
        for n = 1:length(roundTypes)

            round = roundTypes(n);
@ -465,7 +465,7 @@ for run = 1:nruns

    npops = poistaTyhjatPopulaatiot(npops);
    POP_LOGML = computePopulationLogml(1:npops, adjprior, priorTerm);
-    if dispText       
+    if dispText
        disp(['Found partition with ' num2str(npops) ' populations.']);
        disp(['Log(ml) = ' num2str(logml)]);
        disp(' ');
@ -491,7 +491,7 @@ COUNTS = countsBest;
 SUMCOUNTS = sumCountsBest;
 POP_LOGML = pop_logmlBest;
 LOGDIFF = logdiffbest;
-    
+
 %--------------------------------------------------------------------------

 function clearGlobalVars
@ -509,9 +509,9 @@ function Z = linkage(Y, method)
 [k, n] = size(Y);
 m = (1+sqrt(1+8*n))/2;
 if k ~= 1 | m ~= fix(m)
-  error('The first input has to match the output of the PDIST function in size.');   
+  error('The first input has to match the output of the PDIST function in size.');
 end
-if nargin == 1 % set default switch to be 'co' 
+if nargin == 1 % set default switch to be 'co'
   method = 'co';
 end
 method = lower(method(1:2)); % simplify the switch string.
@ -519,19 +519,19 @@ monotonic = 1;
 Z = zeros(m-1,3); % allocate the output matrix.
 N = zeros(1,2*m-1);
 N(1:m) = 1;
-n = m; % since m is changing, we need to save m in n. 
+n = m; % since m is changing, we need to save m in n.
 R = 1:n;
 for s = 1:(n-1)
   X = Y;
   [v, k] = min(X);
   i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1)));
   j = k - (i-1)*(m-i/2)+i;
-   Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A   
+   Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A
   I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables.
   U = [I1 I2 I3];
   I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3];
   J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3];
-   
+
   switch method
   case 'si' %single linkage
      Y(I) = min(Y(I),Y(J));
@ -548,12 +548,12 @@ for s = 1:(n-1)
   end
   J = [J i*(m-(i+1)/2)-m+j];
   Y(J) = []; % no need for the cluster information about j.
-   
+
   % update m, N, R
-   m = m-1; 
+   m = m-1;
   N(n+s) = N(R(i)) + N(R(j));
   R(i) = n+s;
-   R(j:(n-1))=R((j+1):n); 
+   R(j:(n-1))=R((j+1):n);
 end


@ -623,7 +623,7 @@ function [muutokset, diffInCounts] = ...
 %
 % Lisäys 25.9.2007:
 % Otettu käyttöön globaali muuttuja LOGDIFF, johon on tallennettu muutokset
-% logml:ssä siirrettäessä yksilöitä toisiin populaatioihin. 
+% logml:ssä siirrettäessä yksilöitä toisiin populaatioihin.

 global COUNTS;      global SUMCOUNTS;
 global PARTITION;   global POP_LOGML;
@ -647,7 +647,7 @@ COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts;
 SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts;

 i2 = find(muutokset==-Inf);     % Etsitään populaatiot jotka muuttuneet viime kerran jälkeen.
-i2 = setdiff(i2,i1);            
+i2 = setdiff(i2,i1);
 i2_logml = POP_LOGML(i2);

 ni2 = length(i2);
@ -668,19 +668,19 @@ LOGDIFF(ind,:) = muutokset;

 function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data)
 % Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien
-% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n 
+% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n
 % riveillä rows. rows pitää olla vaakavektori.

 diffInCounts = zeros(max_noalle, nloci);
 for i=rows
    row = data(i,:);
    notEmpty = find(row>=0);
-    
+
    if length(notEmpty)>0
        diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ...
            diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1;
    end
-end    
+end

 %------------------------------------------------------------------------

@ -693,8 +693,8 @@ function updateGlobalVariables(ind, i2, diffInCounts, ...
 % Suorittaa globaalien muuttujien muutokset, kun yksilö ind
 % on siirretään koriin i2.

-global PARTITION; 
-global COUNTS; 
+global PARTITION;
+global COUNTS;
 global SUMCOUNTS;
 global POP_LOGML;
 global LOGDIFF;
@ -724,7 +724,7 @@ function [muutokset, diffInCounts] = laskeMuutokset2( ...
    i1, globalRows, data, adjprior, priorTerm);
 % Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mikä olisi
 % muutos logml:ssä, mikäli korin i1 kaikki yksilöt siirretään
-% koriin i. 
+% koriin i.

 global COUNTS;      global SUMCOUNTS;
 global PARTITION;   global POP_LOGML;
@ -839,7 +839,7 @@ for pop2 = 1:npops2

        i2 = [1:i1-1 , i1+1:npops];
        i2_logml = POP_LOGML(i2)';
-    
+
        COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]);
        SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]);
        new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm)';
@ -848,7 +848,7 @@ for pop2 = 1:npops2

        muutokset(pop2,i2) = new_i1_logml - i1_logml ...
            + new_i2_logml - i2_logml;
-    end    
+    end
 end

 %------------------------------------------------------------------------------------
@ -858,7 +858,7 @@ function muutokset = laskeMuutokset5(inds, globalRows, data, adjprior, ...

 % Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mikä olisi
 % muutos logml:ssä, mikäli yksilö i vaihtaisi koria i1:n ja i2:n välillä.
-    
+
 global COUNTS;      global SUMCOUNTS;
 global PARTITION;   global POP_LOGML;

@ -885,14 +885,14 @@ for i = 1:ninds
    SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts;
    COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts;
    SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts;
-    
+
    new_logmls = computePopulationLogml([i1 i2], adjprior, priorTerm);
    muutokset(i) = sum(new_logmls);
-    
+
    COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts;
    SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts;
    COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts;
-    SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; 
+    SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts;
 end

 muutokset = muutokset - i1_logml - i2_logml;
@ -952,7 +952,7 @@ dist2 = dist(apu);


 function npops = poistaTyhjatPopulaatiot(npops)
-% Poistaa tyhjentyneet populaatiot COUNTS:ista ja 
+% Poistaa tyhjentyneet populaatiot COUNTS:ista ja
 % SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in.

 global COUNTS;
@ -1006,7 +1006,7 @@ if abs(logml)<10000
    end
    if logml<0
        mjono(pointer-1) = '-';
-    end 
+    end
 else
    suurinYks = 4;
    while abs(logml)/(10^(suurinYks+1)) >= 1
@ -1035,8 +1035,8 @@ end

 function digit = palautaYks(num,yks)
 % palauttaa luvun num 10^yks termin kertoimen
-% string:inä 
-% yks täytyy olla kokonaisluku, joka on 
+% string:inä
+% yks täytyy olla kokonaisluku, joka on
 % vähintään -1:n suuruinen. Pienemmillä
 % luvuilla tapahtuu jokin pyöristysvirhe.

@ -1063,7 +1063,7 @@ if abs(div)<100
    if arvo>0
        mjono(1) = num2str(arvo);
    end
-    
+
 else
    suurinYks = floor(log10(div));
    mjono(6) = num2str(suurinYks);
@ -1125,7 +1125,7 @@ T = zeros(m,1);
         end
      end
   end
-   
+
 function T = clusternum(X, T, k, c)
 m = size(X,1)+1;
 while(~isempty(k))
@ -1136,7 +1136,7 @@ while(~isempty(k))
   % Assign this node number to leaf children
   t = (children<=m);
   T(children(t)) = c;
-   
+
   % Move to next level
   k = children(~t) - m;
 end