2020-05-20 15:34:40 +02:00
|
|
|
#' @title Clustering of individuals
|
2021-09-03 08:43:37 +02:00
|
|
|
#' @param data data file
|
2021-09-03 13:08:40 +02:00
|
|
|
#' @param format Data format. Format supported: "FASTA", "VCF" ,"BAM", "GenePop"
|
2023-08-09 15:27:45 +02:00
|
|
|
#' @param partitionCompare a list of partitions to compare
|
|
|
|
|
#' @param npops number of populations
|
|
|
|
|
#' @param counts counts
|
|
|
|
|
#' @param sumcounts sumcounts
|
|
|
|
|
#' @param max_iter maximum number of iterations
|
|
|
|
|
#' @param alleleCodes allele codes
|
|
|
|
|
#' @param inp input file
|
|
|
|
|
#' @param popnames population names
|
|
|
|
|
#' @param fixedK if \code{TRUE}, the number of populations is fixed
|
2021-09-03 11:10:06 +02:00
|
|
|
#' @param verbose if \code{TRUE}, prints extra output information
|
2020-06-24 11:48:23 +02:00
|
|
|
#' @importFrom utils read.delim
|
2021-09-03 11:17:00 +02:00
|
|
|
#' @importFrom vcfR read.vcfR
|
2021-09-03 12:56:00 +02:00
|
|
|
#' @importFrom Rsamtools scanBam
|
2022-01-27 11:16:32 +01:00
|
|
|
#' @importFrom adegenet read.genepop .readExt
|
2021-09-03 12:50:11 +02:00
|
|
|
#' @references Samtools: a suite of programs for interacting
|
|
|
|
|
#' with high-throughput sequencing data. <http://www.htslib.org/>
|
2020-05-20 15:34:40 +02:00
|
|
|
#' @export
|
2023-08-09 10:54:48 +02:00
|
|
|
#' @examples
|
2024-08-07 11:42:29 +02:00
|
|
|
#' data <- system.file("extdata", "BAPS_clustering_diploid.txt", package = "rBAPS")
|
2024-04-08 13:29:03 +02:00
|
|
|
#' greedyMix(data, "baps")
|
2023-08-09 12:06:09 +02:00
|
|
|
greedyMix <- function(
|
2024-04-10 14:25:45 +02:00
|
|
|
data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, npops = 3L,
|
2023-08-11 11:18:03 +02:00
|
|
|
counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL,
|
|
|
|
|
inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE
|
2023-08-09 12:06:09 +02:00
|
|
|
) {
|
|
|
|
|
# Importing and handling data ================================================
|
2024-09-13 13:53:01 +02:00
|
|
|
# TODO: use format as class and make handling data a generic
|
2024-04-08 13:14:54 +02:00
|
|
|
if (tolower(format) %in% "fasta") {
|
2024-08-09 14:57:07 +02:00
|
|
|
data <- convert_FASTA_to_BAPS(data)
|
|
|
|
|
format <- "baps"
|
2024-04-08 13:14:54 +02:00
|
|
|
}
|
2024-03-25 16:05:00 +01:00
|
|
|
if (tolower(format) %in% "baps") {
|
|
|
|
|
data <- process_BAPS_data(data, NULL)
|
|
|
|
|
c <- list(
|
|
|
|
|
noalle = data[["noalle"]],
|
|
|
|
|
data = data[["data"]],
|
|
|
|
|
adjprior = data[["adjprior"]],
|
|
|
|
|
priorTerm = data[["priorTerm"]],
|
|
|
|
|
rowsFromInd = data[["rowsFromInd"]],
|
|
|
|
|
Z = data[["Z"]],
|
|
|
|
|
dist = data[["dist"]]
|
|
|
|
|
)
|
2024-09-13 13:53:01 +02:00
|
|
|
} else if (tolower(format) %in% "genepop") {
|
|
|
|
|
data <- process_GenePop_data(data)
|
|
|
|
|
c <- list(
|
|
|
|
|
noalle = data[["noalle"]],
|
|
|
|
|
data = data[["data"]],
|
|
|
|
|
adjprior = data[["adjprior"]],
|
|
|
|
|
priorTerm = data[["priorTerm"]],
|
|
|
|
|
rowsFromInd = data[["rowsFromInd"]],
|
|
|
|
|
Z = data[["Z"]],
|
|
|
|
|
dist = data[["dist"]]
|
|
|
|
|
)
|
2024-03-25 16:05:00 +01:00
|
|
|
} else {
|
|
|
|
|
data <- importFile(data, format, verbose)
|
|
|
|
|
data <- handleData(data, tolower(format))
|
|
|
|
|
c <- list(
|
|
|
|
|
noalle = data[["noalle"]],
|
|
|
|
|
data = data[["newData"]],
|
|
|
|
|
adjprior = data[["adjprior"]],
|
|
|
|
|
priorTerm = data[["priorTerm"]],
|
|
|
|
|
rowsFromInd = data[["rowsFromInd"]],
|
|
|
|
|
Z = data[["Z"]],
|
|
|
|
|
dist = data[["dist"]]
|
|
|
|
|
)
|
|
|
|
|
}
|
2023-08-09 11:26:29 +02:00
|
|
|
|
2023-08-09 14:17:17 +02:00
|
|
|
# Comparing partitions =======================================================
|
2024-04-08 10:11:25 +02:00
|
|
|
ninds <- length(unique(c[["data"]][, ncol(c[["data"]])]))
|
2023-08-09 14:17:17 +02:00
|
|
|
if (!is.null(partitionCompare)) {
|
2023-08-09 12:06:09 +02:00
|
|
|
logmls <- comparePartitions(
|
2023-08-11 11:18:03 +02:00
|
|
|
c[["data"]], nrow(c[["data"]]), partitionCompare[["partitions"]], ninds,
|
|
|
|
|
c[["rowsFromInd"]], c[["noalle"]], c[["adjprior"]]
|
2023-08-09 12:06:09 +02:00
|
|
|
)
|
2023-08-09 11:26:29 +02:00
|
|
|
}
|
2023-08-09 13:13:50 +02:00
|
|
|
|
2023-08-09 12:06:09 +02:00
|
|
|
# Generating partition summary ===============================================
|
2024-03-25 16:04:11 +01:00
|
|
|
ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]])
|
|
|
|
|
c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L)
|
2024-09-27 06:59:18 +02:00
|
|
|
logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose) # FIXME: not working for FASTA
|
2023-08-09 12:06:09 +02:00
|
|
|
logml <- logml_npops_partitionSummary[["logml"]]
|
|
|
|
|
npops <- logml_npops_partitionSummary[["npops"]]
|
|
|
|
|
partitionSummary <- logml_npops_partitionSummary[["partitionSummary"]]
|
2023-08-09 14:17:17 +02:00
|
|
|
|
|
|
|
|
# Generating output object ===================================================
|
|
|
|
|
out <- list(
|
2023-08-11 14:31:08 +02:00
|
|
|
"alleleCodes" = alleleCodes, "adjprior" = c[["adjprior"]],
|
|
|
|
|
"popnames" = popnames, "rowsFromInd" = c[["rowsFromInd"]],
|
|
|
|
|
"data" = c[["data"]], "npops" = npops, "noalle" = c[["noalle"]],
|
|
|
|
|
"mixtureType" = "mix", "logml" = logml
|
|
|
|
|
)
|
2023-08-09 14:17:17 +02:00
|
|
|
if (logml == 1) {
|
|
|
|
|
return(out)
|
|
|
|
|
}
|
2023-08-09 11:26:29 +02:00
|
|
|
|
2023-08-09 12:06:09 +02:00
|
|
|
# Writing mixture info =======================================================
|
|
|
|
|
changesInLogml <- writeMixtureInfo(
|
2023-09-11 12:15:30 +02:00
|
|
|
logml, c[["rowsFromInd"]], c[["data"]], c[["adjprior"]], c[["priorTerm"]],
|
2023-09-11 14:06:45 +02:00
|
|
|
NULL, inp, partitionSummary, popnames, fixedK, verbose
|
2023-08-09 12:06:09 +02:00
|
|
|
)
|
2023-08-09 11:26:29 +02:00
|
|
|
|
2023-08-09 14:17:17 +02:00
|
|
|
# Updateing results ==========================================================
|
2023-09-14 11:02:06 +02:00
|
|
|
return(c(out, list("changesInLogml" = changesInLogml)))
|
2021-11-10 14:02:35 +01:00
|
|
|
}
|