Merge branch 'issue-24' into develop

* issue-24:
  Increment version number to 0.0.0.9029
  Disabled FASTA tests on greedyMix() (#24)
  Added synthetic FASTA file (#24)
  Ensure diffInCounts returns as.matrix (#24)
  Adapted tests to FASTA on greedyMix() (#24)
  Removed baps file pointing to fasta
  Improved conversion from FASTA to BAPS (#24)
  Improved handling of FASTA data (#24)
  `process_BAPS_data()` can handle file being an R object
  Added function to convert from FASTA to BAPS (#24)
  Unwrapped `greedyMix() example (#24)
This commit is contained in:
Waldir Leoncio 2024-08-26 06:32:51 +02:00
commit 010f18dd19
11 changed files with 109 additions and 27 deletions

View file

@ -1,6 +1,6 @@
Package: rBAPS Package: rBAPS
Title: Bayesian Analysis of Population Structure Title: Bayesian Analysis of Population Structure
Version: 0.0.0.9028 Version: 0.0.0.9029
Date: 2020-11-09 Date: 2020-11-09
Authors@R: Authors@R:
c( c(
@ -36,7 +36,7 @@ Description: Partial R implementation of the BAPS software
License: GPL-3 License: GPL-3
BugReports: https://github.com/ocbe-uio/rBAPS/issues BugReports: https://github.com/ocbe-uio/rBAPS/issues
Encoding: UTF-8 Encoding: UTF-8
RoxygenNote: 7.3.1 RoxygenNote: 7.3.2
Suggests: Suggests:
testthat (>= 2.1.0) testthat (>= 2.1.0)
Imports: Imports:

View file

@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand # Generated by roxygen2: do not edit by hand
export(convert_FASTA_to_BAPS)
export(greedyMix) export(greedyMix)
export(handleData) export(handleData)
export(importFile) export(importFile)

View file

@ -13,5 +13,5 @@ computeDiffInCounts <- function(rows, max_noalle, nloci, data) {
diffInCounts[element] <- diffInCounts[element] + 1 diffInCounts[element] <- diffInCounts[element] + 1
} }
} }
return(diffInCounts) return(as.matrix(diffInCounts))
} }

16
R/convert_FASTA_to_BAPS.R Normal file
View file

@ -0,0 +1,16 @@
#' @title Convert from FASTA to BAPS
#' @description Converts a file (not an R object) from FASTA to BAPS format
#' @param file filename of FASTA file
#' @return `data` in BAPS format
#' @author Waldir Leoncio
#' @export
#' @examples
#' file <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS")
#' convert_FASTA_to_BAPS(file)
convert_FASTA_to_BAPS <- function(file) {
data <- load_fasta(file) # Processing data
data <- cbind(data, seq_len(nrow(data))) # Add IDs of individuals (sequential)
data[data == 0] <- -9 # Because zeros (missing) in BAPS are coded as -9
colnames(data) <- paste("V", seq_len(ncol(data)), sep = "")
return(data)
}

View file

@ -19,10 +19,8 @@
#' with high-throughput sequencing data. <http://www.htslib.org/> #' with high-throughput sequencing data. <http://www.htslib.org/>
#' @export #' @export
#' @examples #' @examples
#' \dontrun{ # TEMP: unwrap once #24 is resolved #' data <- system.file("extdata", "BAPS_clustering_diploid.txt", package = "rBAPS")
#' data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
#' greedyMix(data, "baps") #' greedyMix(data, "baps")
#' } # TEMP: unwrap once #24 is resolved
greedyMix <- function( greedyMix <- function(
data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, npops = 3L, data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, npops = 3L,
counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL, counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL,
@ -30,7 +28,8 @@ greedyMix <- function(
) { ) {
# Importing and handling data ================================================ # Importing and handling data ================================================
if (tolower(format) %in% "fasta") { if (tolower(format) %in% "fasta") {
stop("FASTA format not yet supported on greedyMix") data <- convert_FASTA_to_BAPS(data)
format <- "baps"
} }
if (tolower(format) %in% "baps") { if (tolower(format) %in% "baps") {
data <- process_BAPS_data(data, NULL) data <- process_BAPS_data(data, NULL)
@ -69,7 +68,7 @@ greedyMix <- function(
# Generating partition summary =============================================== # Generating partition summary ===============================================
ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]]) ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]])
c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L) c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L)
logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose) logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose) # FIXME: not working for FASTA data
logml <- logml_npops_partitionSummary[["logml"]] logml <- logml_npops_partitionSummary[["logml"]]
npops <- logml_npops_partitionSummary[["npops"]] npops <- logml_npops_partitionSummary[["npops"]]
partitionSummary <- logml_npops_partitionSummary[["partitionSummary"]] partitionSummary <- logml_npops_partitionSummary[["partitionSummary"]]

View file

@ -30,6 +30,7 @@ handleData <- function(raw_data, format = "Genepop") {
"bam" = stop("BAM format not supported for processing yet") "bam" = stop("BAM format not supported for processing yet")
) )
data <- as.matrix(raw_data) data <- as.matrix(raw_data)
dataApu <- data[, seq_len(nloci)] dataApu <- data[, seq_len(nloci)]
nollat <- matlab2r::find(dataApu == 0) nollat <- matlab2r::find(dataApu == 0)
if (!isempty(nollat)) { if (!isempty(nollat)) {
@ -54,9 +55,14 @@ handleData <- function(raw_data, format = "Genepop") {
alleleCodes[, i] <- as.matrix(c(alleelitLokuksessaI, zeros(puuttuvia, 1))) alleleCodes[, i] <- as.matrix(c(alleelitLokuksessaI, zeros(puuttuvia, 1)))
} }
for (loc in seq_len(nloci)) { # This is where data gets converted to {1, 2, 3, 4} for {A, C, G, T}
for (all in seq_len(noalle[loc])) { codes <- unique(as.vector(data[, -ncol(data)]))
data[matlab2r::find(data[, loc] == alleleCodes[all, loc]), loc] <- all skip_conversion <- base::min(codes) == -9 && base::max(codes) == 4
if (!skip_conversion) {
for (loc in seq_len(nloci)) {
for (all in seq_len(noalle[loc])) {
data[matlab2r::find(data[, loc] == alleleCodes[all, loc]), loc] <- all
}
} }
} }

View file

@ -2,14 +2,23 @@ process_BAPS_data <- function(file, partitionCompare) {
if (!is.null(partitionCompare)) { if (!is.null(partitionCompare)) {
cat('Data:', file, '\n') cat('Data:', file, '\n')
} }
data <- read.table(file)
ninds <- testaaOnkoKunnollinenBapsData(data) # for testing purposes? # Importing data
if (is.character(file)) {
data <- read.table(file)
} else {
data <- file
}
ninds <- testaaOnkoKunnollinenBapsData(data) # Checks if last column is ID
if (ninds == 0) { if (ninds == 0) {
warning('Incorrect Data-file.') warning('Incorrect Data-file.')
return(NULL) return(NULL)
} }
popnames <- NULL # Dropped specification of population names (from BAPS 6) popnames <- NULL # Dropped specification of population names (from BAPS 6)
# Processing data
result <- handleData(data, format = "BAPS") result <- handleData(data, format = "BAPS")
data <- result$newData data <- result$newData
rowsFromInd <- result$rowsFromInd rowsFromInd <- result$rowsFromInd

View file

@ -0,0 +1,39 @@
>1
AACGAAACGATCGCGTCACCGGAACGTTGTCCGTCTCGAATAGCACTGTGGGAACGTGTTTTACATTCGT
TAGTAACATGGTCAGCTGCTCATCCGTATT
>2
ATCAGCAAACGAGAAGTTGCAGAGGTCTTTGGTTTGAGCATTGCCCCCATACAATCGACTTCTGGCCTGG
AATGCACCACAAACATACCCCACAGGCTCG
>3
GCTTTTACTAAGGCCTATCGGATTCAACGTCACTAAGACTCGGCACTAACAGGCCGTTGTAAGCCGCTCT
GTCTGAGTATGGATGGTGGAGGCGGAGCCG
>4
ACCTGGACCTCTGTATTAACGGCTGTGATTCTGAGGGGGGTATCGCAGCGCACTTTCTAGCTATATCACG
CAAGGATAAAGTTCACCCATCACGTTGACC
>5
ACAATACGTCATCCACACCGCGCCTATGGAAGAATTTGCCCTTTCGGCGACAGCCCATGCTGTCAAGGAG
GTAACATAGCTACCAGGTCCCATTCCAGGA
>6
TCCCCCCAGTGGACACGGCTCGGGTAATGCAGCTTACCTCAACGCTAACGCATTTGACAGTAGTGAATCA
CGGGCAACGCTGGGTGATTGCAAGTTTTGT
>7
GCAACCACTGGTCGCCTGGAGCATTGATCAGGAACATGTCTGCAAGGGGGGCCGTTGCGGGTTTCAGTCA
TCGTATTGCGCTGCAAATCCTCGGAGCCTC
>8
CACCCGTAAAGCACGAGTAGGTTTCACCGCGACTTATATATTCCACCATACGGTTAACAAGGCAACACTT
ATTCGTCGTCCAATGATCGTCCCTCTCCAG
>9
CGAATCCATTCGGGATAAAGTTAATACGTAAGTCGAACGGGGTTTAGGAAGAGCTCTGCTGTTAAGCGCG
CTTATCATCTTATATGTGTCAGTTGTGTAC
>10
CGTTCGCATTTATAGGATATCCCCTAAACTAATTGGTAGTGATGGTATACCAGCGGTGCATTGTCCTCGC
CTGTAGTTTAAGTCAACCTCTGCCTTAATC

View file

@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/convert_FASTA_to_BAPS.R
\name{convert_FASTA_to_BAPS}
\alias{convert_FASTA_to_BAPS}
\title{Convert from FASTA to BAPS}
\usage{
convert_FASTA_to_BAPS(file)
}
\arguments{
\item{file}{filename of FASTA file}
}
\value{
`data` in BAPS format
}
\description{
Converts a file (not an R object) from FASTA to BAPS format
}
\examples{
file <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS")
convert_FASTA_to_BAPS(file)
}
\author{
Waldir Leoncio
}

View file

@ -48,10 +48,8 @@ greedyMix(
Clustering of individuals Clustering of individuals
} }
\examples{ \examples{
\dontrun{ # TEMP: unwrap once #24 is resolved data <- system.file("extdata", "BAPS_clustering_diploid.txt", package = "rBAPS")
data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
greedyMix(data, "baps") greedyMix(data, "baps")
} # TEMP: unwrap once #24 is resolved
} }
\references{ \references{
Samtools: a suite of programs for interacting Samtools: a suite of programs for interacting

View file

@ -46,10 +46,6 @@ raw_bam <- importFile(
data = file.path(path_inst, "bam_example.bam"), data = file.path(path_inst, "bam_example.bam"),
format = "BAM", format = "BAM",
) )
raw_baps <- importFile(
data = file.path(path_inst, "FASTA_clustering_haploid.fasta"),
format = "FASTA"
)
test_that("Files are imported correctly", { test_that("Files are imported correctly", {
expect_equal(dim(raw_fasta), c(5, 99)) expect_equal(dim(raw_fasta), c(5, 99))
@ -61,13 +57,6 @@ test_that("Files are imported correctly", {
) )
) )
expect_equal(length(raw_bam[[1]]), 13) expect_equal(length(raw_bam[[1]]), 13)
expect_error(
greedyMix(
data = file.path(path_inst, "FASTA_clustering_haploid.fasta"),
format = "FASTA"
),
"FASTA format not yet supported on greedyMix"
)
}) })
test_that("greedyMix() fails successfully", { test_that("greedyMix() fails successfully", {
@ -77,6 +66,7 @@ test_that("greedyMix() fails successfully", {
test_that("greedyMix() works when it should", { test_that("greedyMix() works when it should", {
baps_file <- file.path(path_inst, "BAPS_clustering_diploid.txt") baps_file <- file.path(path_inst, "BAPS_clustering_diploid.txt")
fasta_file <- file.path(path_inst, "FASTA_clustering_haploid.fasta")
greedy_baps <- greedyMix(baps_file, "BAPS") greedy_baps <- greedyMix(baps_file, "BAPS")
expect_type(greedy_baps, "list") expect_type(greedy_baps, "list")
expect_length(greedy_baps, 10L) expect_length(greedy_baps, 10L)