Merge branch 'issue-24' into develop

* issue-24:
  Increment version number to 0.0.0.9029
  Disabled FASTA tests on greedyMix() (#24)
  Added synthetic FASTA file (#24)
  Ensure diffInCounts returns as.matrix (#24)
  Adapted tests to FASTA on greedyMix() (#24)
  Removed baps file pointing to fasta
  Improved conversion from FASTA to BAPS (#24)
  Improved handling of FASTA data (#24)
  `process_BAPS_data()` can handle file being an R object
  Added function to convert from FASTA to BAPS (#24)
  Unwrapped `greedyMix() example (#24)
This commit is contained in:
Waldir Leoncio 2024-08-26 06:32:51 +02:00
commit 010f18dd19
11 changed files with 109 additions and 27 deletions

View file

@ -1,6 +1,6 @@
Package: rBAPS
Title: Bayesian Analysis of Population Structure
Version: 0.0.0.9028
Version: 0.0.0.9029
Date: 2020-11-09
Authors@R:
c(
@ -36,7 +36,7 @@ Description: Partial R implementation of the BAPS software
License: GPL-3
BugReports: https://github.com/ocbe-uio/rBAPS/issues
Encoding: UTF-8
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Suggests:
testthat (>= 2.1.0)
Imports:

View file

@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand
export(convert_FASTA_to_BAPS)
export(greedyMix)
export(handleData)
export(importFile)

View file

@ -13,5 +13,5 @@ computeDiffInCounts <- function(rows, max_noalle, nloci, data) {
diffInCounts[element] <- diffInCounts[element] + 1
}
}
return(diffInCounts)
return(as.matrix(diffInCounts))
}

16
R/convert_FASTA_to_BAPS.R Normal file
View file

@ -0,0 +1,16 @@
#' @title Convert from FASTA to BAPS
#' @description Converts a file (not an R object) from FASTA to BAPS format
#' @param file filename of FASTA file
#' @return `data` in BAPS format
#' @author Waldir Leoncio
#' @export
#' @examples
#' file <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS")
#' convert_FASTA_to_BAPS(file)
convert_FASTA_to_BAPS <- function(file) {
data <- load_fasta(file) # Processing data
data <- cbind(data, seq_len(nrow(data))) # Add IDs of individuals (sequential)
data[data == 0] <- -9 # Because zeros (missing) in BAPS are coded as -9
colnames(data) <- paste("V", seq_len(ncol(data)), sep = "")
return(data)
}

View file

@ -19,10 +19,8 @@
#' with high-throughput sequencing data. <http://www.htslib.org/>
#' @export
#' @examples
#' \dontrun{ # TEMP: unwrap once #24 is resolved
#' data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
#' data <- system.file("extdata", "BAPS_clustering_diploid.txt", package = "rBAPS")
#' greedyMix(data, "baps")
#' } # TEMP: unwrap once #24 is resolved
greedyMix <- function(
data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, npops = 3L,
counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL,
@ -30,7 +28,8 @@ greedyMix <- function(
) {
# Importing and handling data ================================================
if (tolower(format) %in% "fasta") {
stop("FASTA format not yet supported on greedyMix")
data <- convert_FASTA_to_BAPS(data)
format <- "baps"
}
if (tolower(format) %in% "baps") {
data <- process_BAPS_data(data, NULL)
@ -69,7 +68,7 @@ greedyMix <- function(
# Generating partition summary ===============================================
ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]])
c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L)
logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose)
logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose) # FIXME: not working for FASTA data
logml <- logml_npops_partitionSummary[["logml"]]
npops <- logml_npops_partitionSummary[["npops"]]
partitionSummary <- logml_npops_partitionSummary[["partitionSummary"]]

View file

@ -30,6 +30,7 @@ handleData <- function(raw_data, format = "Genepop") {
"bam" = stop("BAM format not supported for processing yet")
)
data <- as.matrix(raw_data)
dataApu <- data[, seq_len(nloci)]
nollat <- matlab2r::find(dataApu == 0)
if (!isempty(nollat)) {
@ -54,11 +55,16 @@ handleData <- function(raw_data, format = "Genepop") {
alleleCodes[, i] <- as.matrix(c(alleelitLokuksessaI, zeros(puuttuvia, 1)))
}
# This is where data gets converted to {1, 2, 3, 4} for {A, C, G, T}
codes <- unique(as.vector(data[, -ncol(data)]))
skip_conversion <- base::min(codes) == -9 && base::max(codes) == 4
if (!skip_conversion) {
for (loc in seq_len(nloci)) {
for (all in seq_len(noalle[loc])) {
data[matlab2r::find(data[, loc] == alleleCodes[all, loc]), loc] <- all
}
}
}
nind <- as.integer(base::max(data[, ncol(data)]))
nrows <- size(data, 1)

View file

@ -2,14 +2,23 @@ process_BAPS_data <- function(file, partitionCompare) {
if (!is.null(partitionCompare)) {
cat('Data:', file, '\n')
}
# Importing data
if (is.character(file)) {
data <- read.table(file)
ninds <- testaaOnkoKunnollinenBapsData(data) # for testing purposes?
} else {
data <- file
}
ninds <- testaaOnkoKunnollinenBapsData(data) # Checks if last column is ID
if (ninds == 0) {
warning('Incorrect Data-file.')
return(NULL)
}
popnames <- NULL # Dropped specification of population names (from BAPS 6)
# Processing data
result <- handleData(data, format = "BAPS")
data <- result$newData
rowsFromInd <- result$rowsFromInd

View file

@ -0,0 +1,39 @@
>1
AACGAAACGATCGCGTCACCGGAACGTTGTCCGTCTCGAATAGCACTGTGGGAACGTGTTTTACATTCGT
TAGTAACATGGTCAGCTGCTCATCCGTATT
>2
ATCAGCAAACGAGAAGTTGCAGAGGTCTTTGGTTTGAGCATTGCCCCCATACAATCGACTTCTGGCCTGG
AATGCACCACAAACATACCCCACAGGCTCG
>3
GCTTTTACTAAGGCCTATCGGATTCAACGTCACTAAGACTCGGCACTAACAGGCCGTTGTAAGCCGCTCT
GTCTGAGTATGGATGGTGGAGGCGGAGCCG
>4
ACCTGGACCTCTGTATTAACGGCTGTGATTCTGAGGGGGGTATCGCAGCGCACTTTCTAGCTATATCACG
CAAGGATAAAGTTCACCCATCACGTTGACC
>5
ACAATACGTCATCCACACCGCGCCTATGGAAGAATTTGCCCTTTCGGCGACAGCCCATGCTGTCAAGGAG
GTAACATAGCTACCAGGTCCCATTCCAGGA
>6
TCCCCCCAGTGGACACGGCTCGGGTAATGCAGCTTACCTCAACGCTAACGCATTTGACAGTAGTGAATCA
CGGGCAACGCTGGGTGATTGCAAGTTTTGT
>7
GCAACCACTGGTCGCCTGGAGCATTGATCAGGAACATGTCTGCAAGGGGGGCCGTTGCGGGTTTCAGTCA
TCGTATTGCGCTGCAAATCCTCGGAGCCTC
>8
CACCCGTAAAGCACGAGTAGGTTTCACCGCGACTTATATATTCCACCATACGGTTAACAAGGCAACACTT
ATTCGTCGTCCAATGATCGTCCCTCTCCAG
>9
CGAATCCATTCGGGATAAAGTTAATACGTAAGTCGAACGGGGTTTAGGAAGAGCTCTGCTGTTAAGCGCG
CTTATCATCTTATATGTGTCAGTTGTGTAC
>10
CGTTCGCATTTATAGGATATCCCCTAAACTAATTGGTAGTGATGGTATACCAGCGGTGCATTGTCCTCGC
CTGTAGTTTAAGTCAACCTCTGCCTTAATC

View file

@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/convert_FASTA_to_BAPS.R
\name{convert_FASTA_to_BAPS}
\alias{convert_FASTA_to_BAPS}
\title{Convert from FASTA to BAPS}
\usage{
convert_FASTA_to_BAPS(file)
}
\arguments{
\item{file}{filename of FASTA file}
}
\value{
`data` in BAPS format
}
\description{
Converts a file (not an R object) from FASTA to BAPS format
}
\examples{
file <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS")
convert_FASTA_to_BAPS(file)
}
\author{
Waldir Leoncio
}

View file

@ -48,10 +48,8 @@ greedyMix(
Clustering of individuals
}
\examples{
\dontrun{ # TEMP: unwrap once #24 is resolved
data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
data <- system.file("extdata", "BAPS_clustering_diploid.txt", package = "rBAPS")
greedyMix(data, "baps")
} # TEMP: unwrap once #24 is resolved
}
\references{
Samtools: a suite of programs for interacting

View file

@ -46,10 +46,6 @@ raw_bam <- importFile(
data = file.path(path_inst, "bam_example.bam"),
format = "BAM",
)
raw_baps <- importFile(
data = file.path(path_inst, "FASTA_clustering_haploid.fasta"),
format = "FASTA"
)
test_that("Files are imported correctly", {
expect_equal(dim(raw_fasta), c(5, 99))
@ -61,13 +57,6 @@ test_that("Files are imported correctly", {
)
)
expect_equal(length(raw_bam[[1]]), 13)
expect_error(
greedyMix(
data = file.path(path_inst, "FASTA_clustering_haploid.fasta"),
format = "FASTA"
),
"FASTA format not yet supported on greedyMix"
)
})
test_that("greedyMix() fails successfully", {
@ -77,6 +66,7 @@ test_that("greedyMix() fails successfully", {
test_that("greedyMix() works when it should", {
baps_file <- file.path(path_inst, "BAPS_clustering_diploid.txt")
fasta_file <- file.path(path_inst, "FASTA_clustering_haploid.fasta")
greedy_baps <- greedyMix(baps_file, "BAPS")
expect_type(greedy_baps, "list")
expect_length(greedy_baps, 10L)