ourMELONS/R/linkage.R

#' @title Linkage
#' @description Create hierarchical cluster tree.
#' @details Z = LINKAGE(Y) creates a hierarchical cluster tree, using the single
#' linkage algorithm.  The input Y is a distance matrix such as is generated by
#' PDIST. Y may also be a more general dissimilarity matrix conforming to the
#' output format of PDIST.
#'
#' Z = linkage(X) returns a matrix Z that encodes a tree containing hierarchical clusters of the rows of the input data matrix X.
#' @param Y matrix
#' @param method either 'si', 'av', 'co' 'ce' or 'wa'
#' @note This is also a base MATLAB function. The reason why the BAPS
#' source code also contains a LINKAGE function is unclear. One could speculate
#' that BAPS should use this function instead of the base one, so this is why
#' this function is part of this package (instead of a MATLAB-replicating
#' package such as matlab2r)
linkage <- function(Y, method = "co") {
  k <- size(Y)[1]
  n <- size(Y)[2]
  m <- (1 + sqrt(1 + 8 * n)) / 2
  if ((k != 1) | (m != trunc(m))) {
    stop(
      "The first input has to match the output of the PDIST function in size."
    )
  }
  method <- tolower(substr(method, 1, 2)) # simplify the switch string.
  monotonic <- 1
  Z <- zeros(m - 1, 3) # allocate the output matrix.
  N <- zeros(1, 2 * m - 1)
  N[seq_len(m)] <- 1
  n <- m # since m is changing, we need to save m in n.
  R <- seq_len(n)
  for (s in 1:(n - 1)) {
    X <- as.matrix(as.vector(Y), nrow = 1)
    v <- matlab2r::min(X)$mins
    k <- matlab2r::min(X)$idx

    i <- floor(m + 1 / 2 - sqrt(m^2 - m + 1 / 4 - 2 * (k - 1)))
    j <- k - (i - 1) * (m - i / 2) + i

    Z[s, ] <- c(R[i], R[j], v) # update one more row to the output matrix A

    # Temp variables
    if (i > 1) {
      I1 <- 1:(i - 1)
    } else {
      I1 <- NULL
    }
    if (i + 1 <= j - 1) {
      I2 <- (i + 1):(j - 1)
    } else {
      I2 <- NULL
    }
    if (j + 1 <= m) {
      I3 <- (j + 1):m
    } else {
      I3 <- NULL
    }
    U <- c(I1, I2, I3)
    I <- c(
      I1 * (m - (I1 + 1) / 2) - m + i,
      i * (m - (i + 1) / 2) - m + I2,
      i * (m - (i + 1) / 2) - m + I3
    )
    J <- c(
      I1 * (m - (I1 + 1) / 2) - m + j,
      I2 * (m - (I2 + 1) / 2) - m + j,
      j * (m - (j + 1) / 2) - m + I3
    )
    # Workaround in R for negative values in I and J
    # I <- I[I > 0 & I <= length(Y)]
    # J <- J[J > 0 & J <= length(Y)]
    switch(method,
      "si" = Y[I] <- apply(cbind(Y[I], Y[J]), 1, base::min), # single linkage
      "av" = Y[I] <- Y[I] + Y[J], # average linkage
      "co" = Y[I] <- apply(cbind(Y[I], Y[J]), 1, base::max), # complete linkage
      "ce" = {
        K <- N[R[i]] + N[R[j]] # centroid linkage
        Y[I] <- (N[R[i]] * Y[I] + N[R[j]] * Y[J] -
          (N[R[i]] * N[R[j]] * v^2) / K) / K
      },
      "wa" = Y[I] <- ((N[R[U]] + N[R[i]]) * Y[I] + (N[R[U]] + N[R[j]]) *
        Y[J] - N[R[U]] * v) / (N[R[i]] + N[R[j]] + N[R[U]])
    )
    J <- c(J, i * (m - (i + 1) / 2) - m + j)
    Y <- Y[-J] # no need for the cluster information about j

    # update m, N, R
    m <- m - 1
    N[n + s] <- N[R[i]] + N[R[j]]
    R[i] <- n + s
    if (j < n) {
      R[j:(n - 1)] <- R[(j + 1):n]
    }
  }
  return(Z)
}