diff --git a/DESCRIPTION b/DESCRIPTION index 557b643..36476ef 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,8 +36,8 @@ Description: Partial R implementation of the BAPS software License: GPL-3 BugReports: https://github.com/ocbe-uio/rBAPS/issues Encoding: UTF-8 -RoxygenNote: 7.2.1 +RoxygenNote: 7.2.3 Suggests: testthat (>= 2.1.0) Imports: - methods, ape, vcfR, Rsamtools, adegenet, matlab2r, R6 + methods, ape, vcfR, Rsamtools, adegenet, matlab2r, R6, zeallot diff --git a/NAMESPACE b/NAMESPACE index 3b8b822..f0dc387 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -74,3 +74,4 @@ importFrom(stats,runif) importFrom(stats,sd) importFrom(utils,read.delim) importFrom(vcfR,read.vcfR) +importFrom(zeallot,"%<-%") diff --git a/R/argmin.R b/R/argmin.R new file mode 100644 index 0000000..4d50d1c --- /dev/null +++ b/R/argmin.R @@ -0,0 +1,12 @@ +argmin <- function(v) { + # ARGMIN Return as a subscript vector the location of the smallest element of a multidimensional array v. + # indices <- argmin(v) + # Returns the first minimum in the case of ties. + # Example: + # X = [2 8 4 7 3 9] + # argmin[X] <- [1 1], i.e., row 1 column 1 + m <- i <- NA + c(m, i) %<-% matlab2r::min(v) + indices <- ind2subv(mysize(v), i) + return(indices) +} diff --git a/R/cliques_to_jtree.R b/R/cliques_to_jtree.R new file mode 100644 index 0000000..68d57a5 --- /dev/null +++ b/R/cliques_to_jtree.R @@ -0,0 +1,49 @@ +cliques_to_jtree <- function(cliques, ns) { + # MK_JTREE Make an optimal junction tree. + # [jtree, root, B, w] = mk_jtree(cliques, ns) + # A junction tree is a tree that satisfies the jtree property, which says: + # for each pair of cliques U, V with intersection S, all cliques on the path between U and V + # contain S. (This ensures that local propagation leads to # global consistency.) + # We can create a junction tree by computing the maximal spanning tree of the junction graph. + # (The junction graph connects all cliques, and the weight of an edge (i, j) is + # |C(i) intersect C(j)|, where C(i) is the i'th clique.) + # The best jtree is the maximal spanning tree which minimizes the sum of the costs on each edge, + # where cost[i, j] <- w(C(i)) + w(C(j)), and w(C) is the weight of clique C, + # which is the total number of values C can take on. + # For details, see + # - Jensen and Jensen, "Optimal Junction Trees", UAI 94. + # Input: + # cliques{i} = nodes in clique i + # ns[i] <- number of values node i can take on + # Output: + # jtree[i, j] <- 1 iff cliques i and j aer connected + # root <- the clique that should be used as root + # B[i, j] <- 1 iff node j occurs in clique i + # w[i] <- weight of clique i + + num_cliques <- length(cliques) + w <- zeros(num_cliques, 1) + B <- zeros(num_cliques, 1) + for (i in 1:num_cliques) { + B[i, cliques[[i]]] <- 1 + w[i] <- prod(ns(cliques[[i]])) + } + + # C1[i, j] <- length(intersect(cliques{i}, cliques{j})) + # The length of the intersection of two sets is the dot product of their bit vector representation. + C1 <- B %*% t(B) + C1 <- setdiag(C1, 0) + + # C2[i, j] <- w(i) + w(j) + num_cliques <- length(w) + W <- repmat(w, c(1, num_cliques)) + C2 <- W + t(W) + C2 <- setdiag(C2, 0) + + jtree <- zeros(minimum_spanning_tree(-C1, C2))# Using - C1 gives * maximum * spanning tree + + # The root is arbitrary, but since the first pass is towards the root, + # we would like this to correspond to going forward in time in a DBN. + root <- num_cliques + return(list("jtree" = jtree, "root" = root, "B" = B, "w" = w)) +} diff --git a/R/elim_order.R b/R/elim_order.R new file mode 100644 index 0000000..3fdb547 --- /dev/null +++ b/R/elim_order.R @@ -0,0 +1,67 @@ +elim_order <- function(G, node_sizes) { + # BEST_FIRST_ELIM_ORDER Greedily search for an optimal elimination order. + # order <- best_first_elim_order(moral_graph, node_sizes) + # Find an order in which to eliminate nodes from the graph in such a way as to try and minimize the + # weight of the resulting triangulated graph. The weight of a graph is the sum of the weights of each + # of its cliques the weight of a clique is the product of the weights of each of its members the + # weight of a node is the number of values it can take on. + # Since this is an NP - hard problem, we use the following greedy heuristic: + # at each step, eliminate that node which will result in the addition of the least + # number of fill - in edges, breaking ties by choosing the node that induces the lighest clique. + # For details, see + # - Kjaerulff, "Triangulation of graphs - - algorithms giving small total state space", + # Univ. Aalborg tech report, 1990 (www.cs.auc.dk/!uk) + # - C. Huang and A. Darwiche, "Inference in Belief Networks: A procedural guide", + # Intl. J. Approx. Reasoning, 11, 1994 + + # Warning: This code is pretty old and could probably be made faster. + + n <- length(G) + # if (nargin < 3, stage = { 1:n } end# no constraints) { + + # For long DBNs, it may be useful to eliminate all the nodes in slice t before slice t + 1. + # This will ensure that the jtree has a repeating structure (at least away from both edges). + # This is why we have stages. + # See the discussion of splicing jtrees on p68 of + # Geoff Zweig's PhD thesis, Dept. Comp. Sci., UC Berkeley, 1998. + # This constraint can increase the clique size significantly. + + MG <- G# copy the original graph + uneliminated <- ones(1, n) + order <- zeros(1, n) + # t <- 1 # Counts which time slice we are on + for (i in 1:n) { + U <- find(uneliminated) + # valid <- myintersect(U, stage{t}) + valid <- U + # Choose the best node from the set of valid candidates + min_fill <- zeros(1, length(valid)) + min_weight <- zeros(1, length(valid)) + for (j in 1:length(valid)) { + k <- valid(j) + nbrs <- myintersect(neighbors(G, k), U) + l <- length(nbrs) + M <- MG[nbrs, nbrs] + min_fill[j] <- l^2 - sum(M)# num. added edges + min_weight[j] <- prod(node_sizes[k, nbrs])# weight of clique + } + lightest_nbrs <- find(min_weight == min(min_weight)) + # break ties using min - fill heuristic + best_nbr_ndx <- argmin(min_fill[lightest_nbrs]) + j <- lightest_nbrs[best_nbr_ndx] # we will eliminate the j'th element of valid + # j1s <- find(score1 == min(score1)) + # j <- j1s(argmin(score2(j1s))) + k <- valid(j) + uneliminated[k] <- 0 + order[i] <- k + ns <- myintersect(neighbors(G, k), U) + if (!is.null(ns)) { + G[ns, ns] <- 1 + G <- setdiag(G, 0) + } + # if (!any(as.logical(uneliminated(stage{t})))# are we allowed to the next slice?) { + # t <- t + 1 + # } + } + return(order) +} diff --git a/R/findCliques.R b/R/findCliques.R new file mode 100644 index 0000000..f413e64 --- /dev/null +++ b/R/findCliques.R @@ -0,0 +1,48 @@ +findCliques <- function(M) { + # Muuttaa graafin M kolmioituvaksi ja laskee siitה klikit ja + # separaattorit. + # Hyצdynnetההn Kevin Murphyn algoritmeja Graph Theory toolboxista. + # Pהivitetty 12.8.2005 + order <- elim_order(M, ones(length(M))) + G <- cliques <- root <- NULL + c(G, cliques) %<-% triangulate(M, order) + c(jtree, root) %<-% cliques_to_jtree(cliques, ones(length(M))) + ncliq <- length(cliques) + separators <- cell(ncliq - 1, 1) # n - solmuisessa puussa n - 1 viivaa + + jono <- zeros(length(ncliq)) + jono[1] <- root + i <- 1 + pointer <- 2 # Seuraava tyhjה paikka + + while (!is.null(find(jono != 0))) { # Puun leveyssuuntainen lהpikהynti) { + lapset <- find(jtree[jono[i], ] != 0) + jtree[, jono[i]] <- 0 # Klikki kהsitelty + jono[pointer:(pointer + length(lapset) - 1)] <- lapset + for (j in 1:length(lapset)) { + ehdokas <- myintersect(cliques[[jono[i]]], cliques[[lapset[j]]]) + kelpaa <- 1 + for (k in 1:(pointer + j - 3)) { + # Tutkitaan, ettה separaattoriehdokasta ei vielה kהsitelty + if (ehdokas == separators[[k]]) { + kelpaa <- 0 + } + } + if (kelpaa) { + separators[[pointer + j - 2]] <- ehdokas + } + } + jono[i] <- 0 + pointer <- pointer + length(lapset) + i <- i + 1 + } + notEmpty <- zeros(ncliq - 1, 1) + for (i in 1:(ncliq - 1)) { + if (!is.null(separators[[i]])) { + notEmpty[i] <- 1 + } + } + notEmpty <- find(notEmpty == 1) + separators <- separators(notEmpty) + return(list("cliques" = cliques, "separators" = separators, "G" = G)) +} diff --git a/R/ind2subv.R b/R/ind2subv.R new file mode 100644 index 0000000..6fef574 --- /dev/null +++ b/R/ind2subv.R @@ -0,0 +1,38 @@ +ind2subv <- function(siz, ndx) stop("Needs translation") +# function sub = ind2subv(siz, ndx) +# % IND2SUBV Like the built-in ind2sub, but returns the answer as a row vector. +# % sub = ind2subv(siz, ndx) +# % +# % siz and ndx can be row or column vectors. +# % sub will be of size length(ndx) * length(siz). +# % +# % Example +# % ind2subv([2 2 2], 1:8) returns +# % [1 1 1 +# % 2 1 1 +# % ... +# % 2 2 2] +# % That is, the leftmost digit toggle fastest. +# % +# % See also SUBV2IND + +# n = length(siz); + +# if n==0 +# sub = ndx; +# return; +# end + +# if all(siz==2) +# sub = dec2bitv(ndx-1, n); +# sub = sub(:,n:-1:1)+1; +# return; +# end + +# cp = [1 cumprod(siz(:)')]; +# ndx = ndx(:) - 1; +# sub = zeros(length(ndx), n); +# for i = n:-1:1 % i'th digit +# sub(:,i) = floor(ndx/cp(i))+1; +# ndx = rem(ndx,cp(i)); +# end diff --git a/R/laskeKlitik.R b/R/laskeKlitik.R new file mode 100644 index 0000000..7e09d52 --- /dev/null +++ b/R/laskeKlitik.R @@ -0,0 +1,28 @@ +laskeKlikit <- function(M, maxCliqSize, maxSepSize) { + # Laskee samankokoisten klikkien mההrהn verkosta M + # ncliques(i)=kokoa i olevien klikkien mההr? + # nseparators vastaavasti + ncliques <- zeros(1, maxCliqSize) + nseparators <- zeros(1, maxSepSize) + if (M == c()) { + return() + } + cliques_separators <- findCliques(M) + cliques <- cliques_separators$cliques + separators <- cliques_separators$separators + rm(cliques_separators) + for (i in 1:length(cliques)) { + ncliques[length[cliques[[i]]]] <- ncliques[length(cliques[[i]])] + 1 + } + # cliqmax=max(find(ncliques!=0)) + # ncliques=ncliques(1:cliqmax) + for (i in 1:length(separators)) { + nseparators[length[separators[[i]]]] <- nseparators[length(separators[[i]])] + 1 + } + return( + list( + ncliques = ncliques, nseparators = nseparators, cliques = cliques, + separators = separators + ) + ) +} diff --git a/R/minimum_spanning_tree.R b/R/minimum_spanning_tree.R new file mode 100644 index 0000000..a9e7288 --- /dev/null +++ b/R/minimum_spanning_tree.R @@ -0,0 +1,51 @@ +minimum_spanning_tree <- function(C1, C2) stop("needs translation") +# function A = minimum_spanning_tree(C1, C2) +# % +# % Find the minimum spanning tree using Prim's algorithm. +# % C1(i,j) is the primary cost of connecting i to j. +# % C2(i,j) is the (optional) secondary cost of connecting i to j, used to break ties. +# % We assume that absent edges have 0 cost. +# % To find the maximum spanning tree, used -1*C. +# % See Aho, Hopcroft & Ullman 1983, "Data structures and algorithms", p 237. + +# % Prim's is O(V^2). Kruskal's algorithm is O(E log E) and hence is more efficient +# % for sparse graphs, but is implemented in terms of a priority queue. + +# % We partition the nodes into those in U and those not in U. +# % closest(i) is the vertex in U that is closest to i in V-U. +# % lowcost(i) is the cost of the edge (i, closest(i)), or infinity is i has been used. +# % In Aho, they say C(i,j) should be "some appropriate large value" if the edge is missing. +# % We set it to infinity. +# % However, since lowcost is initialized from C, we must distinguish absent edges from used nodes. + +# n = length(C1); +# if nargin==1, C2 = zeros(n); end +# A = zeros(n); + +# closest = ones(1,n); +# used = zeros(1,n); % contains the members of U +# used(1) = 1; % start with node 1 +# C1(find(C1==0))=inf; +# C2(find(C2==0))=inf; +# lowcost1 = C1(1,:); +# lowcost2 = C2(1,:); + +# for i=2:n +# ks = find(lowcost1==min(lowcost1)); +# k = ks(argmin(lowcost2(ks))); +# A(k, closest(k)) = 1; +# A(closest(k), k) = 1; +# lowcost1(k) = inf; +# lowcost2(k) = inf; +# used(k) = 1; +# NU = find(used==0); +# for ji=1:length(NU) +# for j=NU(ji) +# if C1(k,j) < lowcost1(j) +# lowcost1(j) = C1(k,j); +# lowcost2(j) = C2(k,j); +# closest(j) = k; +# end +# end +# end +# end diff --git a/R/myintersect.R b/R/myintersect.R new file mode 100644 index 0000000..e2e8f26 --- /dev/null +++ b/R/myintersect.R @@ -0,0 +1,29 @@ +myintersect <- function(A, B) { + # MYINTERSECT Intersection of two sets of positive integers (much faster than built - in intersect) + # C <- myintersect(A, B) + + A <- t(A) + B <- t(B) + + if (is.null(A)) { + ma <- 0 + } else { + ma <- max(A) + } + + if (is.null(B)) { + mb <- 0 + } else { + mb <- max(B) + } + + if (ma == 0 | mb == 0) { + C <- vector() + } else { + # bits <- sparse(1, max(ma, mb)) + bits <- zeros(1, max(ma, mb)) + bits[A] <- 1 + C <- B[as.logical(bits[B])] + } + return(C) +} diff --git a/R/myisvector.R b/R/myisvector.R new file mode 100644 index 0000000..498e97b --- /dev/null +++ b/R/myisvector.R @@ -0,0 +1,7 @@ +myisvector <- function(V) { + # Kuten isvector(V) + + A <- size(V) + r <- (length(A) == 2) & (min(A) == 1) + return(r) +} diff --git a/R/mysize.R b/R/mysize.R new file mode 100644 index 0000000..a888d67 --- /dev/null +++ b/R/mysize.R @@ -0,0 +1,17 @@ +mysize <- function(M) { + # MYSIZE Like the built - in size, except it returns n if (M is a vector of length n, and 1 if M is a scalar.) { + # sz <- mysize(M) + # The behavior is best explained by examples + # - M <- rand(1, 1), mysize[M] <- 1, size(M) = [1 1] + # - M <- rand(2, 1), mysize[M] <- 2, size(M) = [2 1] + # - M <- rand(1, 2), mysize[M] <- 2, size(M) = [1 2] + # - M <- rand(2, 2,1), mysize[M] <- [2 2], size(M) = [2 2] + # - M <- rand(1, 2,1), mysize[M] <- 2, size(M) = [1 2] + + if (myisvector(M)) { + sz <- length(M) + } else { + sz <- size(M) + } + return(sz) +} diff --git a/R/mysubset.R b/R/mysubset.R new file mode 100644 index 0000000..1e74c86 --- /dev/null +++ b/R/mysubset.R @@ -0,0 +1,13 @@ +mysubset <- function(small, large) { + # MYSUBSET Is the small set of + ve integers a subset of the large set? + # p <- mysubset(small, large) + + # Surprisingly, this is not built - in. + + if (is.null(small)) { + p <- 1# is.null(large) + } else { + p <- length(myintersect(small, large)) == length(small) + } + return(p) +} diff --git a/R/myunion.R b/R/myunion.R new file mode 100644 index 0000000..9d64098 --- /dev/null +++ b/R/myunion.R @@ -0,0 +1,31 @@ +myunion <- function(A, B) { + # MYUNION Union of two sets of positive integers (much faster than built - in union) + # C <- myunion(A, B) + + if (is.null(A)) { + ma <- 0 + } else { + ma <- max(A) + } + + if (is.null(B)) { + mb <- 0 + } else { + mb <- max(B) + } + + if (ma == 0 & mb == 0) { + C <- vector() + } else if (ma == 0 & mb > 0) { + C <- B + } else if (ma > 0 & mb == 0) { + C <- A + } else { + # bits <- sparse(1, max(ma, mb)) + bits <- zeros(1, max(c(ma, mb))) + bits[A] <- 1 + bits[B] <- 1 + C <- find(bits) + } + return(C) +} diff --git a/R/neighbors.R b/R/neighbors.R new file mode 100644 index 0000000..9192827 --- /dev/null +++ b/R/neighbors.R @@ -0,0 +1,8 @@ +neighbors <- function(adj_mat, i) { + # NEIGHBORS Find the parents and children of a node in a graph. + # ns <- neighbors(adj_mat, i) + + # ns <- myunion(children(adj_mat, i), parents(adj_mat, i)) + ns <- find(adj_mat[i, ]) + return(ns) +} diff --git a/R/rBAPS-package.R b/R/rBAPS-package.R index 3867ba3..9def117 100644 --- a/R/rBAPS-package.R +++ b/R/rBAPS-package.R @@ -9,4 +9,5 @@ #' isempty isfield isspace max min ones questdlg rand repmat reshape #' size sortrows squeeze strcmp times zeros disp #' @importFrom stats runif +#' @importFrom zeallot %<-% NULL diff --git a/R/setdiag.R b/R/setdiag.R new file mode 100644 index 0000000..df57346 --- /dev/null +++ b/R/setdiag.R @@ -0,0 +1,19 @@ +setdiag <- function(M, v) { + # SETDIAG Set the diagonal of a matrix to a specified scalar/vector. + # M <- set_diag(M, v) + + n <- length(M) + if (length(v) == 1) { + v <- repmat(v, c(1, n)) + } + + # e.g., for 3x3 matrix, elements are numbered + # 1 4 7 + # 2 5 8 + # 3 6 9 + # so diagnoal = [1 5 9] + + J <- seq(1, n ^ 2, n + 1) + M[J] <- v + return(M) +} diff --git a/R/triangulate.R b/R/triangulate.R new file mode 100644 index 0000000..47bb409 --- /dev/null +++ b/R/triangulate.R @@ -0,0 +1,39 @@ +triangulate <- function(G, order) { + # TRIANGULATE Ensure G is triangulated (chordal), i.e., every cycle of length > 3 has a chord. + # [G, cliques, fill_ins, cliques_containing_node] = triangulate(G, order) + # cliques{i} is the i'th maximal complete subgraph of the triangulated graph. + # fill_ins[i, j] <- 1 iff we add a fill - in arc between i and j. + # To find the maximal cliques, we save each induced cluster (created by adding connecting + # neighbors) that is not a subset of any previously saved cluster. (A cluster is a complete, + # but not necessarily maximal, set of nodes.) + + MG <- G + n <- length(G) + eliminated <- zeros(1, n) + cliques = list() + for (i in 1:n) { + u <- order[i] + U <- find(!eliminated)# uneliminated + nodes <- myintersect(neighbors(G, u), U)# look up neighbors in the partially filled - in graph + nodes <- myunion(nodes, u)# the clique will always contain at least u + G[nodes, nodes] <- 1# make them all connected to each other + G <- setdiag(G, 0) + eliminated[u] <- 1 + + exclude <- 0 + for (c in 1:length(cliques)) { + if (mysubset(nodes, cliques[[c]])) { # not maximal) + exclude <- 1 + break + } + } + if (!exclude) { + cnum <- length(cliques) + 1 + cliques[[cnum]] <- nodes + } + } + + # fill_ins <- sparse(triu(max(0, G - MG), 1)) + fill_ins <- 1 + return(list("G" = G, "cliques" = cliques, "fill_ins" = fill_ins)) +} diff --git a/tests/testthat/test-spatialMixture.R b/tests/testthat/test-spatialMixture.R index 9203d56..1f61c4c 100644 --- a/tests/testthat/test-spatialMixture.R +++ b/tests/testthat/test-spatialMixture.R @@ -33,3 +33,21 @@ test_that("testaaKoordinaatit works as expected", { ) ) }) + +test_that("lakseKlitik() and subfunctions produce expected output", { + # TODO: test elim_order() + # TODO: test triangulate() + # TODO: test neighbors() + # TODO: test myintersect() + # TODO: test mysubset() + # TODO: test findCliques() + # TODO: test cliques_to_jtree() + # TODO: test minimum_spanning_tree() + # TODO: test myunion() + # TODO: test argmin() + # TODO: test mysize() + # TODO: test ind2subv() + # TODO: test myisvector() + # TODO: ... and anythin left from findCliques.m + # TODO: test lakseKlitik() +})