ourMELONS/R/admix1.R

#' @title Admixture analysis
#' @param tietue a named record list
#' @details If the record == -1, the mixture results file is loaded. Otherwise,
#' will the required variables be retrieved from the record fields?
#' `tietue`should contain the following elements: PARTITION, COUNTS, SUMCOUNTS,
#' alleleCodes, adjprior, popnames, rowsFromInd, data, npops, noalle
#' @param tietue tietue
#' @importFrom methods is
#' @export
admix1 <- function(tietue) {
  if (!is.list(tietue)) {
    message("Load mixture result file. These are the files in this directory:")
    print(list.files())
    pathname_filename <- file.choose()
    if (!file.exists(pathname_filename)) {
      stop(
        "File ", pathname_filename,
        " does not exist. Check spelling and location."
      )
    } else {
      cat("---------------------------------------------------\n")
      message("Reading mixture result from: ", pathname_filename, "...")
    }
    # ASK: what is this supposed to do? What do graphic obj have to do here?
    # h0 = findobj('Tag','filename1_text');
    # set(h0,'String',filename); clear h0;

    struct_array <- load(pathname_filename)
    if (isfield(struct_array, "c")) { # Matlab versio
      c <- struct_array$c
      if (!isfield(c, "PARTITION") | !isfield(c, "rowsFromInd")) {
        stop("Incorrect file format")
      }
    } else if (isfield(struct_array, "PARTITION")) { # Mideva versio
      c <- struct_array
      if (!isfield(c, "rowsFromInd")) stop("Incorrect file format")
    } else {
      stop("Incorrect file format")
    }

    if (isfield(c, "gene_lengths") &
      strcmp(c$mixtureType, "linear_mix") |
      strcmp(c$mixtureType, "codon_mix")) {
      # if the mixture is from a linkage model Redirect the call to the linkage
      # admixture function. call function noindex to remove the index column
      c$data <- noIndex(c$data, c$noalle)
      stop("linkage_admix not implemented")
    }
    PARTITION <- c$PARTITION
    COUNTS <- c$COUNTS
    SUMCOUNTS <- c$SUMCOUNTS
    alleleCodes <- c$alleleCodes
    adjprior <- c$adjprior
    popnames <- c$popnames
    rowsFromInd <- c$rowsFromInd
    data <- c$data
    npops <- c$npops
    noalle <- c$noalle
  } else {
    PARTITION <- tietue$PARTITION
    COUNTS <- tietue$COUNTS
    SUMCOUNTS <- tietue$SUMCOUNTS
    alleleCodes <- tietue$alleleCodes
    adjprior <- tietue$adjprior
    popnames <- tietue$popnames
    rowsFromInd <- tietue$rowsFromInd
    data <- as.double(tietue$data)
    npops <- tietue$npops
    noalle <- tietue$noalle
  }

  answers <- inputdlg(
    prompt = paste(
      "Input the minimum size of a population that will",
      "be taken into account when admixture is estimated."
    ),
    definput = 5
  )
  alaRaja <- as.numeric(answers)
  npops <- poistaLiianPienet(npops, rowsFromInd, alaRaja)

  nloci <- size(COUNTS, 2)
  ninds <- size(data, 1) / rowsFromInd

  answers <- inputdlg("Input number of iterations", definput = 50)
  if (isempty(answers)) {
    return()
  }
  iterationCount <- as.numeric(answers[1, 1]) # maybe [[]]?

  answers <- inputdlg(
    prompt = "Input number of reference individuals from each population",
    definput = 50
  )
  if (isempty(answers)) {
    nrefIndsInPop <- 50
  } else {
    nrefIndsInPop <- as.numeric(answers[1, 1])
  }

  answers <- inputdlg(
    prompt = "Input number of iterations for reference individuals",
    definput = 10
  )
  if (isempty(answers)) {
    return()
  }
  iterationCountRef <- as.numeric(answers[1, 1])

  # First calculate log-likelihood ratio for all individuals:
  likelihood <- zeros(ninds, 1)
  allfreqs <- computeAllFreqs2(noalle)
  for (ind in 1:ninds) {
    omaFreqs <- computePersonalAllFreqs(ind, data, allfreqs, rowsFromInd)
    osuusTaulu <- zeros(1, npops)
    if (PARTITION[ind] == 0) {
      # Yksil?on outlier
    } else if (PARTITION[ind] != 0) {
      if (PARTITION[ind] > 0) {
        osuusTaulu[PARTITION[ind]] <- 1
      } else {
        # Yksilöt, joita ei ole sijoitettu mihinkään koriin.
        arvot <- zeros(1, npops)
        for (q in 1:npops) {
          osuusTaulu <- zeros(1, npops)
          osuusTaulu[q] <- 1
          arvot[q] <- computeIndLogml(omaFreqs, osuusTaulu)
        }
        iso_arvo <- base::max(arvot)
        isoimman_indeksi <- match(base::max(arvot), arvot)
        osuusTaulu <- zeros(1, npops)
        osuusTaulu[isoimman_indeksi] <- 1
        PARTITION[ind] <- isoimman_indeksi
      }
      logml <- computeIndLogml(omaFreqs, osuusTaulu)
      logmlAlku <- logml
      for (osuus in c(0.5, 0.25, 0.05, 0.01)) {
        etsiResult <- etsiParas(osuus, osuusTaulu, omaFreqs, logml)
        osuusTaulu <- etsiResult[1]
        logml <- etsiResult[2]
      }
      logmlLoppu <- logml
      likelihood[ind] <- logmlLoppu - logmlAlku
    }
  }

  # Analyze further only individuals who have log-likelihood ratio larger than
  # 3
  to_investigate <- t(matlab2r::find(likelihood > 3))
  cat("Possibly admixed individuals:\n")
  for (i in 1:length(to_investigate)) {
    cat(as.character(to_investigate[i]))
  }
  cat(" ")
  cat("Populations for possibly admixed individuals:\n")
  admix_populaatiot <- unique(PARTITION[to_investigate])
  for (i in 1:length(admix_populaatiot)) {
    cat(as.character(admix_populaatiot[i]))
  }

  # THUS, there are two types of individuals, who will not be analyzed with
  # simulated allele frequencies: those who belonged to a mini-population
  # which was removed, and those who have log-likelihood ratio less than 3.
  # The value in the PARTITION for the first kind of individuals is 0. The
  # second kind of individuals can be identified, because they do not
  # belong to "to_investigate" array. When the results are presented, the
  # first kind of individuals are omitted completely, while the second kind
  # of individuals are completely put to the population, where they ended up
  # in the mixture analysis. These second type of individuals will have a
  # unit p-value.


  # Simulate allele frequencies a given number of times and save the average
  # result to "proportionsIt" array.

  proportionsIt <- zeros(ninds, npops)
  for (iterationNum in 1:iterationCount) {
    cat("Iter:", as.character(iterationNum))
    allfreqs <- simulateAllFreqs(noalle) # Allele frequencies on this iteration.

    for (ind in to_investigate) {
      # disp(num2str(ind));
      omaFreqs <- computePersonalAllFreqs(
        ind, data, allfreqs, rowsFromInd
      )
      osuusTaulu <- zeros(1, npops)
      if (PARTITION[ind] == 0) {
        # Yksil?on outlier
      } else if (PARTITION[ind] != 0) {
        if (PARTITION[ind] > 0) {
          osuusTaulu[PARTITION[ind]] <- 1
        } else {
          # Yksilöt, joita ei ole sijoitettu mihinkään koriin.
          arvot <- zeros(1, npops)
          for (q in 1:npops) {
            osuusTaulu <- zeros(1, npops)
            osuusTaulu[q] <- 1
            arvot[q] <- computeIndLogml(omaFreqs, osuusTaulu)
          }
          iso_arvo <- base::max(arvot)
          isoimman_indeksi <- match(base::max(arvot), arvot)
          osuusTaulu <- zeros(1, npops)
          osuusTaulu[isoimman_indeksi] <- 1
          PARTITION[ind] <- isoimman_indeksi
        }
        logml <- computeIndLogml(omaFreqs, osuusTaulu)

        for (osuus in c(0.5, 0.25, 0.05, 0.01)) {
          etsiResult <- etsiParas(osuus, osuusTaulu, omaFreqs, logml)
          osuusTaulu <- etsiResult[1]
          logml <- etsiResult[2]
        }
      }
      proportionsIt[ind, ] <- proportionsIt[ind, ] * (iterationNum - 1) +
        osuusTaulu
      proportionsIt[ind, ] <- proportionsIt[ind, ] / iterationNum
    }
  }

  # disp(['Creating ' num2str(nrefIndsInPop) ' reference individuals from ']);
  # disp('each population.');

  # allfreqs = simulateAllFreqs(noalle);  # Simuloidaan alleelifrekvenssisetti
  allfreqs <- computeAllFreqs2(noalle) # Koitetaan tällaista.


  # Initialize the data structures, which are required in taking the missing
  # data into account:

  # number of different levels of "missingness" in each pop (max 3).
  n_missing_levels <- zeros(npops, 1)

  # the mean values for different levels.
  missing_levels <- zeros(npops, 3)

  # level of each individual (one of the levels of its population).
  missing_level_partition <- zeros(ninds, 1)
  for (i in 1:npops) {
    inds <- matlab2r::find(PARTITION == i)
    # Proportions of non-missing data for the individuals:
    non_missing_data <- zeros(length(inds), 1)
    for (j in 1:length(inds)) {
      ind <- inds[j]
      non_missing_data[j] <- length(
        matlab2r::find(
          data[(ind - 1) * rowsFromInd + 1:ind * rowsFromInd, ] > 0
        )
      ) / (rowsFromInd * nloci)
    }
    if (all(non_missing_data > 0.9)) {
      n_missing_levels[i] <- 1
      missing_levels[i, 1] <- mean(non_missing_data)
      missing_level_partition[inds] <- 1
    } else {
      ordered <- ordering <- sort(non_missing_data)
      # part = learn_simple_partition(ordered, 0.05);
      part <- learn_partition_modified(ordered)
      aux <- sortrows(cbind(part, ordering), 2)
      part <- aux[, 1]
      missing_level_partition[inds] <- part
      n_levels <- length(unique(part))
      n_missing_levels[i] <- n_levels
      for (j in 1:n_levels) {
        missing_levels[i, j] <- mean(
          non_missing_data[matlab2r::find(part == j)]
        )
      }
    }
  }

  # Create and analyse reference individuals for populations
  # with potentially admixed individuals:
  refTaulu <- zeros(npops, 100, 3)
  for (pop in t(admix_populaatiot)) {
    for (level in 1:n_missing_levels[pop]) {
      potential_inds_in_this_pop_and_level <-
        matlab2r::find(
          PARTITION == pop & missing_level_partition == level &
            likelihood > 3
        ) # Potential admix individuals here.

      if (!isempty(potential_inds_in_this_pop_and_level)) {

        # refData = simulateIndividuals(nrefIndsInPop,rowsFromInd,allfreqs);
        refData <- simulateIndividuals(
          nrefIndsInPop, rowsFromInd, allfreqs, pop,
          missing_levels[pop, level]
        )

        cat(
          "Analysing the reference individuals from pop", pop,
          "(level", level, ")."
        )
        refProportions <- zeros(nrefIndsInPop, npops)
        for (iter in 1:iterationCountRef) {
          # disp(['Iter: ' num2str(iter)]);
          allfreqs <- simulateAllFreqs(noalle)

          for (ind in 1:nrefIndsInPop) {
            omaFreqs <- computePersonalAllFreqs(
              ind, refData, allfreqs, rowsFromInd
            )
            osuusTaulu <- zeros(1, npops)
            osuusTaulu[pop] <- 1
            logml <- computeIndLogml(omaFreqs, osuusTaulu)
            for (osuus in c(0.5, 0.25, 0.05, 0.01)) {
              etsiResult <- etsiParas(
                osuus, osuusTaulu, omaFreqs, logml
              )
              osuusTaulu <- etsiResult[1]
              logml <- etsiResult[2]
            }
            refProportions[ind, ] <-
              refProportions[ind, ] * (iter - 1) + osuusTaulu
            refProportions[ind, ] <- refProportions[ind, ] / iter
          }
        }
        for (ind in 1:nrefIndsInPop) {
          omanOsuus <- refProportions[ind, pop]
          if (round(omanOsuus * 100) == 0) {
            omanOsuus <- 0.01
          }
          if (abs(omanOsuus) < 1e-5) {
            omanOsuus <- 0.01
          }
          refTaulu[pop, round(omanOsuus * 100), level] <-
            refTaulu[pop, round(omanOsuus * 100), level] + 1
        }
      }
    }
  }

  # Rounding of the results:
  proportionsIt <- proportionsIt * 100
  proportionsIt <- round(proportionsIt)
  proportionsIt <- proportionsIt / 100
  for (ind in 1:ninds) {
    if (!any(to_investigate == ind)) {
      if (PARTITION[ind] > 0) {
        proportionsIt[ind, PARTITION[ind]] <- 1
      }
    } else {
      # In case of a rounding error, the sum is made equal to unity by
      # fixing the largest value.
      if ((PARTITION[ind] > 0) & (sum(proportionsIt[ind, ]) != 1)) {
        isoin <- base::max(proportionsIt[ind, ])
        indeksi <- match(isoin, base::max(proportionsIt[ind, ]))
        erotus <- sum(proportionsIt[ind, ]) - 1
        proportionsIt[ind, indeksi] <- isoin - erotus
      }
    }
  }

  # Calculate p-value for each individual:
  uskottavuus <- zeros(ninds, 1)
  for (ind in 1:ninds) {
    pop <- PARTITION[ind]
    if (pop == 0) { # Individual is outlier
      uskottavuus[ind] <- 1
    } else if (isempty(matlab2r::find(to_investigate == ind))) {
      # Individual had log-likelihood ratio<3
      uskottavuus[ind] <- 1
    } else {
      omanOsuus <- proportionsIt[ind, pop]
      if (abs(omanOsuus) < 1e-5) {
        omanOsuus <- 0.01
      }
      if (round(omanOsuus * 100) == 0) {
        omanOsuus <- 0.01
      }
      level <- missing_level_partition[ind]
      refPienempia <- sum(refTaulu[pop, 1:round(100 * omanOsuus), level])
      uskottavuus[ind] <- refPienempia / nrefIndsInPop
    }
  }

  tulostaAdmixtureTiedot(proportionsIt, uskottavuus, alaRaja, iterationCount)
  viewPartition(proportionsIt, popnames)

  talle <- inputdlg("Do you want to save the admixture results? [Y/n]", "y")
  if (talle %in% c("y", "Y", "yes", "Yes")) {
    # waitALittle;
    filename <- inputdlg(
      "Save results as (file name):", "admixture_results.rda"
    )


    if (filename == 0) {
      # Cancel was pressed
      return()
    } else { # copy 'baps4_output.baps' into the text file with the same name.
      if (file.exists("baps4_output.baps")) {
        file.copy("baps4_output.baps", paste0(filename, ".txt"))
        file.remove("baps4_output.baps")
      }
    }

    if (!is(tietue, "list")) {
      c$proportionsIt <- proportionsIt
      c$pvalue <- uskottavuus # Added by Jing
      c$mixtureType <- "admix" # Jing
      c$admixnpops <- npops
      save(c, file = filename)
    } else {
      tietue$proportionsIt <- proportionsIt
      tietue$pvalue <- uskottavuus # Added by Jing
      tietue$mixtureType <- "admix"
      tietue$admixnpops <- npops
      save(tietue, file = filename)
    }
  }
}