diff --git a/R/linkage.R b/R/linkage.R index 80d6b30..ccaf9b5 100644 --- a/R/linkage.R +++ b/R/linkage.R @@ -4,10 +4,14 @@ #' linkage algorithm. The input Y is a distance matrix such as is generated by #' PDIST. Y may also be a more general dissimilarity matrix conforming to the #' output format of PDIST. -#' @param Y data +#' +#' Z = linkage(X) returns a matrix Z that encodes a tree containing hierarchical clusters of the rows of the input data matrix X. +#' @param Y matrix #' @param method either 'si', 'av', 'co' 'ce' or 'wa' +#' @note This is also a base Matlab function. The reason why the source code is also present here is unclear. #' @export linkage <- function(Y, method = 'co') { + #TODO: compare R output with MATLAB output k <- size(Y)[1] n <- size(Y)[2] m <- (1 + sqrt(1 + 8 * n)) / 2 @@ -24,18 +28,33 @@ linkage <- function(Y, method = 'co') { N[1:m] <- 1 n <- m; # since m is changing, we need to save m in n. R <- 1:n - for (s in 1:(n-1)) { - X <- Y - v <- min(X)[1] - k <- min(X)[2] + for (s in 1:(n - 1)) { + X <- as.matrix(as.vector(Y), ncol=1) + + v <- min_MATLAB(X)$mins + k <- min_MATLAB(X)$idx + i <- floor(m + 1 / 2 - sqrt(m ^ 2 - m + 1 / 4 - 2 * (k - 1))) j <- k - (i - 1) * (m - i / 2) + i - Z[s, ] <- c(R[i], R[j], v) # update one more row to the output matrix A - # Temp variables - I1 <- 1:(i - 1) - I2 <- (i + 1):(j - 1) - I3 <- (j + 1):m + Z[s, ] <- c(R[i], R[j], v) # update one more row to the output matrix A + + # Temp variables + if (i > 1) { + I1 <- 1:(i - 1) + } else { + I1 <- NULL + } + if (i + 1 <= j - 1) { + I2 <- (i + 1):(j - 1) + } else { + I2 <- NULL + } + if (j + 1 <= m) { + I3 <- (j + 1):m + } else { + I3 <- NULL + } U <- c(I1, I2, I3) I <- c( I1 * (m - (I1 + 1) / 2) - m + i, @@ -47,11 +66,13 @@ linkage <- function(Y, method = 'co') { I2 * (m - (I2 + 1) / 2) - m + j, j * (m - (j + 1) / 2) - m + I3 ) - + # Workaround in R for negative values in I and J + # I <- I[I > 0 & I <= length(Y)] + # J <- J[J > 0 & J <= length(Y)] switch(method, - 'si' = Y[I] <- min(Y[I], Y[J]), # single linkage + 'si' = Y[I] <- apply(cbind(Y[I], Y[J]), 1, min), # single linkage 'av' = Y[I] <- Y[I] + Y[J], # average linkage - 'co' = Y[I] <- max(Y[I], Y[J]), #complete linkage + 'co' = Y[I] <- apply(cbind(Y[I], Y[J]), 1, max), #complete linkage 'ce' = { K <- N[R[i]] + N[R[j]] # centroid linkage Y[I] <- (N[R[i]] * Y[I] + N[R[j]] * Y[J] - @@ -61,8 +82,7 @@ linkage <- function(Y, method = 'co') { Y[J] - N[R[U]] * v) / (N[R[i]] + N[R[j]] + N[R[U]]) ) J <- c(J, i * (m - (i + 1) / 2) - m + j) - Y[J] <- vector() # no need for the cluster information about j - + Y <- Y[-J] # no need for the cluster information about j # update m, N, R m <- m - 1 N[n + s] <- N[R[i]] + N[R[j]] diff --git a/R/min.R b/R/min.R new file mode 100644 index 0000000..2d8d617 --- /dev/null +++ b/R/min.R @@ -0,0 +1,16 @@ +#' @title Minimum (MATLAB version) +#' @description Finds the minimum value for each column of a matrix, potentially returning the indices instead +#' @param X matrix +#' @param indices return indices? +#' @return Either a list or a vector +#' @author Waldir Leoncio +#' @export +min_MATLAB <- function(X, indices = TRUE) { + mins <- apply(X, 2, min) + idx <- sapply(seq_len(ncol(X)), function(x) match(mins[x], X[, x])) + if (indices) { + return(list(mins = mins, idx = idx)) + } else { + return(mins) + } +} \ No newline at end of file diff --git a/R/newGetDistances.R b/R/newGetDistances.R index 896b1a3..b386afa 100644 --- a/R/newGetDistances.R +++ b/R/newGetDistances.R @@ -5,14 +5,14 @@ newGetDistances <- function(data, rowsFromInd) { empties <- find(data < 0) data[empties] <- 0 - data <- as.integer(data) # max(noalle) oltava <256 + data <- apply(data, 2, as.numeric) # max(noalle) oltava <256 pariTaulu <- zeros(riviLkm, 2) aPointer <- 1 - for (a in (1:ninds) - 1) { - pariTaulu[aPointer:(aPointer + ninds - 1 - a), 1] <- - ones(ninds - a, 1) * a - pariTaulu[aPointer:aPointer + ninds - 1 - a, 2] <- t(a + 1:ninds) + for (a in 1:(ninds - 1)) { + pariTaulu_rows <- aPointer:(aPointer + ninds - 1 - a) + pariTaulu[pariTaulu_rows, 1] <- ones(ninds - a, 1) * a + pariTaulu[pariTaulu_rows, 2] <- t((a + 1):ninds) aPointer <- aPointer + ninds - a } @@ -31,9 +31,9 @@ newGetDistances <- function(data, rowsFromInd) { rm(pariTaulu, miinus) x <- zeros(size(eka)) - x <- as.integer(x) + x <- apply(x, 2, as.integer) y <- zeros(size(toka)) - y <- as.integer(y) + y <- apply(y, 2, as.integer) for (j in 1:nloci) { for (k in 1:rowsFromInd) { @@ -57,7 +57,6 @@ newGetDistances <- function(data, rowsFromInd) { muut <- find(vertailuja > 0) dist[muut] <- summa[muut] / vertailuja[muut] rm(summa, vertailuja) - - Z = linkage(t(dist)) + Z <- linkage(t(dist)) return(list(Z = Z, dist = dist)) } \ No newline at end of file diff --git a/tests/testthat/test-greedyMix.R b/tests/testthat/test-greedyMix.R index 18e72e7..9a1c05a 100644 --- a/tests/testthat/test-greedyMix.R +++ b/tests/testthat/test-greedyMix.R @@ -1,11 +1,21 @@ # library(devtools)#TEMP -# library(testthat)#TEMP +library(testthat)#TEMP # library(rBAPS)#TEMP context("Opening files on greedyMix") -# greedyMix( -# tietue = "data/ExamplesDataFormatting/Example baseline data in GENEPOP format for Trained clustering.txt", -# format = "GenePop", -# savePreProcessed = FALSE -# ) \ No newline at end of file +greedyMix( + tietue = "data/ExamplesDataFormatting/Example baseline data in GENEPOP format for Trained clustering.txt", + format = "GenePop", + savePreProcessed = FALSE +) + +context("Linkage") + +test_that("Linkages are properly calculated", { + Y <- c(0.5, 0.3, 0.6, 0.3, 0.3, 0.2, 0.3, 0.3, 0.3, 0.5) + expect_equal( + object = linkage(Y), + expected = matrix(c(2, 1, 7, 8, 4, 3, 5, 6, .2, .3, .3, .6), ncol=3) + ) +}) \ No newline at end of file