# ourMELONS/R/greedyMix.R

#' @title Clustering of individuals
#' @description Imports the data according to \code{format}, optionally
#' evaluates the partitions given in \code{partitionCompare}, runs the mixture
#' clustering through \code{indMixWrapper()} and, unless the resulting
#' \code{logml} equals 1, writes the mixture information through
#' \code{writeMixtureInfo()}.
#' @param data data file
#' @param format Data format (case-insensitive). Format supported: "FASTA",
#' "VCF", "BAM", "GenePop", "BAPS". Defaults to the text following the last
#' dot in \code{data}.
#' @param partitionCompare a list of partitions to compare
#' @param npops number of populations
#' @param counts counts
#' @param sumcounts sumcounts
#' @param max_iter maximum number of iterations
#' @param alleleCodes allele codes
#' @param inp input file
#' @param popnames population names
#' @param fixedK if \code{TRUE}, the number of populations is fixed
#' @param verbose if \code{TRUE}, prints extra output information
#' @return A list with elements \code{alleleCodes}, \code{adjprior},
#' \code{popnames}, \code{rowsFromInd}, \code{data}, \code{npops},
#' \code{noalle}, \code{mixtureType} and \code{logml}. Unless \code{logml}
#' equals 1, the list also carries \code{changesInLogml}.
#' @importFrom utils read.delim
#' @importFrom vcfR read.vcfR
#' @importFrom Rsamtools scanBam
#' @importFrom adegenet read.genepop .readExt
#' @references Samtools: a suite of programs for interacting
#' with high-throughput sequencing data. <http://www.htslib.org/>
#' @export
#' @examples
#' data <- system.file("extdata", "BAPS_clustering_diploid.txt", package = "rBAPS")
#' greedyMix(data, "baps")
greedyMix <- function(
  data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, npops = 3L,
  counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL,
  inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE
) {
  # Importing and handling data ================================================
  # TODO: use format as class and make handling data a generic
  # `format` is evaluated here, before `data` is overwritten, so its default
  # (derived from `data`) still sees the caller-supplied value.
  fmt <- tolower(format)
  if (fmt %in% "fasta") {
    # FASTA input is converted and then processed exactly like BAPS input
    data <- convert_FASTA_to_BAPS(data)
    fmt <- "baps"
  }
  if (fmt %in% "baps") {
    imported <- process_BAPS_data(data, NULL)
    data_field <- "data"
  } else if (fmt %in% "genepop") {
    imported <- process_GenePop_data(data)
    data_field <- "data"
  } else {
    imported <- importFile(data, format, verbose)
    imported <- handleData(imported, fmt)
    # handleData() output is read from "newData" rather than "data"
    data_field <- "newData"
  }
  # Named `mix` (not `c`) so that base::c() is not shadowed inside the function
  mix <- list(
    noalle = imported[["noalle"]],
    data = imported[[data_field]],
    adjprior = imported[["adjprior"]],
    priorTerm = imported[["priorTerm"]],
    rowsFromInd = imported[["rowsFromInd"]],
    Z = imported[["Z"]],
    dist = imported[["dist"]]
  )

  # Comparing partitions =======================================================
  # Number of individuals = number of distinct values in the last data column
  ninds <- length(unique(mix[["data"]][, ncol(mix[["data"]])]))
  if (!is.null(partitionCompare)) {
    # NOTE(review): the value returned by comparePartitions() is not part of
    # the output of greedyMix(); the call is kept in case it has side effects.
    comparePartitions(
      mix[["data"]], nrow(mix[["data"]]), partitionCompare[["partitions"]],
      ninds, mix[["rowsFromInd"]], mix[["noalle"]], mix[["adjprior"]]
    )
  }

  # Generating partition summary ===============================================
  # First and last row index of each individual's block of `rowsFromInd` rows
  first_row <- seq(1L, ninds * mix[["rowsFromInd"]], mix[["rowsFromInd"]])
  mix[["rows"]] <- cbind(first_row, first_row + mix[["rowsFromInd"]] - 1L)
  # FIXME: not working for FASTA
  mix_result <- indMixWrapper(
    mix, npops, counts, sumcounts, max_iter, fixedK, verbose
  )
  logml <- mix_result[["logml"]]
  npops <- mix_result[["npops"]]
  partitionSummary <- mix_result[["partitionSummary"]]

  # Generating output object ===================================================
  out <- list(
    "alleleCodes" = alleleCodes, "adjprior" = mix[["adjprior"]],
    "popnames" = popnames, "rowsFromInd" = mix[["rowsFromInd"]],
    "data" = mix[["data"]], "npops" = npops, "noalle" = mix[["noalle"]],
    "mixtureType" = "mix", "logml" = logml
  )
  # A logml of exactly 1 skips the mixture-info step and returns early
  if (logml == 1) {
    return(out)
  }

  # Writing mixture info =======================================================
  changesInLogml <- writeMixtureInfo(
    logml, mix[["rowsFromInd"]], mix[["data"]], mix[["adjprior"]],
    mix[["priorTerm"]], NULL, inp, partitionSummary, popnames, fixedK, verbose
  )

  # Updating results ===========================================================
  c(out, list("changesInLogml" = changesInLogml))
}
Added bare-bones greedyMix 2020-05-20 15:34:40 +02:00			`#' @title Clustering of individuals`
greedyMix changed to channel data load (close #16) 2021-09-03 08:43:37 +02:00			`#' @param data data file`
Improved handling of data input on greedyMix 2021-09-03 13:08:40 +02:00			`#' @param format Data format. Format supported: "FASTA", "VCF" ,"BAM", "GenePop"`
Added missing documentation for arguments (#25) 2023-08-09 15:27:45 +02:00			`#' @param partitionCompare a list of partitions to compare`
			`#' @param npops number of populations`
			`#' @param counts counts`
			`#' @param sumcounts sumcounts`
			`#' @param max_iter maximum number of iterations`
			`#' @param alleleCodes allele codes`
			`#' @param inp input file`
			`#' @param popnames population names`
			`#' @param fixedK if \code{TRUE}, the number of populations is fixed`
Added greedyMix support for VCF (closes #17) 2021-09-03 11:10:06 +02:00			`#' @param verbose if \code{TRUE}, prints extra output information`
Improved documentation 2020-06-24 11:48:23 +02:00			`#' @importFrom utils read.delim`
Fixed dependencies 2021-09-03 11:17:00 +02:00			`#' @importFrom vcfR read.vcfR`
Dependency fixes 2021-09-03 12:56:00 +02:00			`#' @importFrom Rsamtools scanBam`
Added read.genepop() to greedyMix() (closes #19) 2022-01-27 11:16:32 +01:00			`#' @importFrom adegenet read.genepop .readExt`
Added SAM/BAM support (closes #18) 2021-09-03 12:50:11 +02:00			`#' @references Samtools: a suite of programs for interacting`
			`#' with high-throughput sequencing data. <http://www.htslib.org/>`
Added bare-bones greedyMix 2020-05-20 15:34:40 +02:00			`#' @export`
Moved used datasets to /inst/extdata (#25) 2023-08-09 10:54:48 +02:00			`#' @examples`
Unwrapped `greedyMix()` example (#24) 2024-08-07 11:42:29 +02:00			`#' data <- system.file("extdata", "BAPS_clustering_diploid.txt", package = "rBAPS")`
Updated documentation (#24) 2024-04-08 13:29:03 +02:00			`#' greedyMix(data, "baps")`
Incorporating subfunctions of greedyMix() (#25) 2023-08-09 12:06:09 +02:00			`greedyMix <- function(`
Changed default `npops` to 3 (#24) 2024-04-10 14:25:45 +02:00			`data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, npops = 3L,`
Incorporated handleData() on greedyMix() (#25) 2023-08-11 11:18:03 +02:00			`counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL,`
			`inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE`
Incorporating subfunctions of greedyMix() (#25) 2023-08-09 12:06:09 +02:00			`) {`
			`# Importing and handling data ================================================`
Aligned processing of Genepop with MATLAB code 2024-09-13 13:53:01 +02:00			`# TODO: use format as class and make handling data a generic`
Fails for FASTA input to `greedyMix()` (#24) This is a desirable feature, but was not implemented in the original BAPS, so better leave it out until the original planned features are working. 2024-04-08 13:14:54 +02:00			`if (tolower(format) %in% "fasta") {`
Improved handling of FASTA data (#24) 2024-08-09 14:57:07 +02:00			`data <- convert_FASTA_to_BAPS(data)`
			`format <- "baps"`
Fails for FASTA input to `greedyMix()` (#24) This is a desirable feature, but was not implemented in the original BAPS, so better leave it out until the original planned features are working. 2024-04-08 13:14:54 +02:00			`}`
Improved handling of BAPS format (#24) 2024-03-25 16:05:00 +01:00			`if (tolower(format) %in% "baps") {`
			`data <- process_BAPS_data(data, NULL)`
			`c <- list(`
			`noalle = data[["noalle"]],`
			`data = data[["data"]],`
			`adjprior = data[["adjprior"]],`
			`priorTerm = data[["priorTerm"]],`
			`rowsFromInd = data[["rowsFromInd"]],`
			`Z = data[["Z"]],`
			`dist = data[["dist"]]`
			`)`
Aligned processing of Genepop with MATLAB code 2024-09-13 13:53:01 +02:00			`} else if (tolower(format) %in% "genepop") {`
			`data <- process_GenePop_data(data)`
			`c <- list(`
			`noalle = data[["noalle"]],`
			`data = data[["data"]],`
			`adjprior = data[["adjprior"]],`
			`priorTerm = data[["priorTerm"]],`
			`rowsFromInd = data[["rowsFromInd"]],`
			`Z = data[["Z"]],`
			`dist = data[["dist"]]`
			`)`
Improved handling of BAPS format (#24) 2024-03-25 16:05:00 +01:00			`} else {`
			`data <- importFile(data, format, verbose)`
			`data <- handleData(data, tolower(format))`
			`c <- list(`
			`noalle = data[["noalle"]],`
			`data = data[["newData"]],`
			`adjprior = data[["adjprior"]],`
			`priorTerm = data[["priorTerm"]],`
			`rowsFromInd = data[["rowsFromInd"]],`
			`Z = data[["Z"]],`
			`dist = data[["dist"]]`
			`)`
			`}`
Added MATLAB code from original greedyMix() (#25) 2023-08-09 11:26:29 +02:00
Improved handling of NULL objects (#25) 2023-08-09 14:17:17 +02:00			`# Comparing partitions =======================================================`
Fixed calculation of `ninds` (#24) 2024-04-08 10:11:25 +02:00			`ninds <- length(unique(c[["data"]][, ncol(c[["data"]])]))`
Improved handling of NULL objects (#25) 2023-08-09 14:17:17 +02:00			`if (!is.null(partitionCompare)) {`
Incorporating subfunctions of greedyMix() (#25) 2023-08-09 12:06:09 +02:00			`logmls <- comparePartitions(`
Incorporated handleData() on greedyMix() (#25) 2023-08-11 11:18:03 +02:00			`c[["data"]], nrow(c[["data"]]), partitionCompare[["partitions"]], ninds,`
			`c[["rowsFromInd"]], c[["noalle"]], c[["adjprior"]]`
Incorporating subfunctions of greedyMix() (#25) 2023-08-09 12:06:09 +02:00			`)`
Added MATLAB code from original greedyMix() (#25) 2023-08-09 11:26:29 +02:00			`}`
Improved argument retrieval on greedyMix() (#25) 2023-08-09 13:13:50 +02:00
Incorporating subfunctions of greedyMix() (#25) 2023-08-09 12:06:09 +02:00			`# Generating partition summary ===============================================`
Fixed calculation of `ninds` `ekat` (#24) 2024-03-25 16:04:11 +01:00			`ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]])`
			`c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L)`
Syntax fixes 2024-09-27 06:59:18 +02:00			`logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose) # FIXME: not working for FASTA`
Incorporating subfunctions of greedyMix() (#25) 2023-08-09 12:06:09 +02:00			`logml <- logml_npops_partitionSummary[["logml"]]`
			`npops <- logml_npops_partitionSummary[["npops"]]`
			`partitionSummary <- logml_npops_partitionSummary[["partitionSummary"]]`
Improved handling of NULL objects (#25) 2023-08-09 14:17:17 +02:00
			`# Generating output object ===================================================`
			`out <- list(`
Fixed to indMix (#25) 2023-08-11 14:31:08 +02:00			`"alleleCodes" = alleleCodes, "adjprior" = c[["adjprior"]],`
			`"popnames" = popnames, "rowsFromInd" = c[["rowsFromInd"]],`
			`"data" = c[["data"]], "npops" = npops, "noalle" = c[["noalle"]],`
			`"mixtureType" = "mix", "logml" = logml`
			`)`
Improved handling of NULL objects (#25) 2023-08-09 14:17:17 +02:00			`if (logml == 1) {`
			`return(out)`
			`}`
Added MATLAB code from original greedyMix() (#25) 2023-08-09 11:26:29 +02:00
Incorporating subfunctions of greedyMix() (#25) 2023-08-09 12:06:09 +02:00			`# Writing mixture info =======================================================`
			`changesInLogml <- writeMixtureInfo(`
Fixed basic parsing of FASTA files (#25) 2023-09-11 12:15:30 +02:00			`logml, c[["rowsFromInd"]], c[["data"]], c[["adjprior"]], c[["priorTerm"]],`
Added verbose argument to writeMixtureInfo() 2023-09-11 14:06:45 +02:00			`NULL, inp, partitionSummary, popnames, fixedK, verbose`
Incorporating subfunctions of greedyMix() (#25) 2023-08-09 12:06:09 +02:00			`)`
Added MATLAB code from original greedyMix() (#25) 2023-08-09 11:26:29 +02:00
Improved handling of NULL objects (#25) 2023-08-09 14:17:17 +02:00			`# Updateing results ==========================================================`
Fixed output of greedyMix() 2023-09-14 11:02:06 +02:00			`return(c(out, list("changesInLogml" = changesInLogml)))`
Restyled files Ran through styler::style_dir() in the R and tests directories in preparation for #23. 2021-11-10 14:02:35 +01:00			`}`