Merge branch 'issue-3' into develop

This commit is contained in:
Waldir Leoncio 2022-12-22 15:06:13 +01:00
commit ffca7ea3c4
19 changed files with 478 additions and 2 deletions

View file

@ -36,8 +36,8 @@ Description: Partial R implementation of the BAPS software
License: GPL-3 License: GPL-3
BugReports: https://github.com/ocbe-uio/rBAPS/issues BugReports: https://github.com/ocbe-uio/rBAPS/issues
Encoding: UTF-8 Encoding: UTF-8
RoxygenNote: 7.2.1 RoxygenNote: 7.2.3
Suggests: Suggests:
testthat (>= 2.1.0) testthat (>= 2.1.0)
Imports: Imports:
methods, ape, vcfR, Rsamtools, adegenet, matlab2r, R6 methods, ape, vcfR, Rsamtools, adegenet, matlab2r, R6, zeallot

View file

@ -74,3 +74,4 @@ importFrom(stats,runif)
importFrom(stats,sd) importFrom(stats,sd)
importFrom(utils,read.delim) importFrom(utils,read.delim)
importFrom(vcfR,read.vcfR) importFrom(vcfR,read.vcfR)
importFrom(zeallot,"%<-%")

12
R/argmin.R Normal file
View file

@ -0,0 +1,12 @@
argmin <- function(v) {
# ARGMIN Return as a subscript vector the location of the smallest element of a multidimensional array v.
# indices <- argmin(v)
# Returns the first minimum in the case of ties.
# Example:
# X = [2 8 4 7 3 9]
# argmin[X] <- [1 1], i.e., row 1 column 1
m <- i <- NA
c(m, i) %<-% matlab2r::min(v)
indices <- ind2subv(mysize(v), i)
return(indices)
}

49
R/cliques_to_jtree.R Normal file
View file

@ -0,0 +1,49 @@
cliques_to_jtree <- function(cliques, ns) {
# MK_JTREE Make an optimal junction tree.
# [jtree, root, B, w] = mk_jtree(cliques, ns)
# A junction tree is a tree that satisfies the jtree property, which says:
# for each pair of cliques U, V with intersection S, all cliques on the path between U and V
# contain S. (This ensures that local propagation leads to # global consistency.)
# We can create a junction tree by computing the maximal spanning tree of the junction graph.
# (The junction graph connects all cliques, and the weight of an edge (i, j) is
# |C(i) intersect C(j)|, where C(i) is the i'th clique.)
# The best jtree is the maximal spanning tree which minimizes the sum of the costs on each edge,
# where cost[i, j] <- w(C(i)) + w(C(j)), and w(C) is the weight of clique C,
# which is the total number of values C can take on.
# For details, see
# - Jensen and Jensen, "Optimal Junction Trees", UAI 94.
# Input:
# cliques{i} = nodes in clique i
# ns[i] <- number of values node i can take on
# Output:
# jtree[i, j] <- 1 iff cliques i and j aer connected
# root <- the clique that should be used as root
# B[i, j] <- 1 iff node j occurs in clique i
# w[i] <- weight of clique i
num_cliques <- length(cliques)
w <- zeros(num_cliques, 1)
B <- zeros(num_cliques, 1)
for (i in 1:num_cliques) {
B[i, cliques[[i]]] <- 1
w[i] <- prod(ns(cliques[[i]]))
}
# C1[i, j] <- length(intersect(cliques{i}, cliques{j}))
# The length of the intersection of two sets is the dot product of their bit vector representation.
C1 <- B %*% t(B)
C1 <- setdiag(C1, 0)
# C2[i, j] <- w(i) + w(j)
num_cliques <- length(w)
W <- repmat(w, c(1, num_cliques))
C2 <- W + t(W)
C2 <- setdiag(C2, 0)
jtree <- zeros(minimum_spanning_tree(-C1, C2))# Using - C1 gives * maximum * spanning tree
# The root is arbitrary, but since the first pass is towards the root,
# we would like this to correspond to going forward in time in a DBN.
root <- num_cliques
return(list("jtree" = jtree, "root" = root, "B" = B, "w" = w))
}

67
R/elim_order.R Normal file
View file

@ -0,0 +1,67 @@
elim_order <- function(G, node_sizes) {
# BEST_FIRST_ELIM_ORDER Greedily search for an optimal elimination order.
# order <- best_first_elim_order(moral_graph, node_sizes)
# Find an order in which to eliminate nodes from the graph in such a way as to try and minimize the
# weight of the resulting triangulated graph. The weight of a graph is the sum of the weights of each
# of its cliques the weight of a clique is the product of the weights of each of its members the
# weight of a node is the number of values it can take on.
# Since this is an NP - hard problem, we use the following greedy heuristic:
# at each step, eliminate that node which will result in the addition of the least
# number of fill - in edges, breaking ties by choosing the node that induces the lighest clique.
# For details, see
# - Kjaerulff, "Triangulation of graphs - - algorithms giving small total state space",
# Univ. Aalborg tech report, 1990 (www.cs.auc.dk/!uk)
# - C. Huang and A. Darwiche, "Inference in Belief Networks: A procedural guide",
# Intl. J. Approx. Reasoning, 11, 1994
# Warning: This code is pretty old and could probably be made faster.
n <- length(G)
# if (nargin < 3, stage = { 1:n } end# no constraints) {
# For long DBNs, it may be useful to eliminate all the nodes in slice t before slice t + 1.
# This will ensure that the jtree has a repeating structure (at least away from both edges).
# This is why we have stages.
# See the discussion of splicing jtrees on p68 of
# Geoff Zweig's PhD thesis, Dept. Comp. Sci., UC Berkeley, 1998.
# This constraint can increase the clique size significantly.
MG <- G# copy the original graph
uneliminated <- ones(1, n)
order <- zeros(1, n)
# t <- 1 # Counts which time slice we are on
for (i in 1:n) {
U <- find(uneliminated)
# valid <- myintersect(U, stage{t})
valid <- U
# Choose the best node from the set of valid candidates
min_fill <- zeros(1, length(valid))
min_weight <- zeros(1, length(valid))
for (j in 1:length(valid)) {
k <- valid(j)
nbrs <- myintersect(neighbors(G, k), U)
l <- length(nbrs)
M <- MG[nbrs, nbrs]
min_fill[j] <- l^2 - sum(M)# num. added edges
min_weight[j] <- prod(node_sizes[k, nbrs])# weight of clique
}
lightest_nbrs <- find(min_weight == min(min_weight))
# break ties using min - fill heuristic
best_nbr_ndx <- argmin(min_fill[lightest_nbrs])
j <- lightest_nbrs[best_nbr_ndx] # we will eliminate the j'th element of valid
# j1s <- find(score1 == min(score1))
# j <- j1s(argmin(score2(j1s)))
k <- valid(j)
uneliminated[k] <- 0
order[i] <- k
ns <- myintersect(neighbors(G, k), U)
if (!is.null(ns)) {
G[ns, ns] <- 1
G <- setdiag(G, 0)
}
# if (!any(as.logical(uneliminated(stage{t})))# are we allowed to the next slice?) {
# t <- t + 1
# }
}
return(order)
}

48
R/findCliques.R Normal file
View file

@ -0,0 +1,48 @@
findCliques <- function(M) {
# Muuttaa graafin M kolmioituvaksi ja laskee siitה klikit ja
# separaattorit.
# Hyצdynnetההn Kevin Murphyn algoritmeja Graph Theory toolboxista.
# Pהivitetty 12.8.2005
order <- elim_order(M, ones(length(M)))
G <- cliques <- root <- NULL
c(G, cliques) %<-% triangulate(M, order)
c(jtree, root) %<-% cliques_to_jtree(cliques, ones(length(M)))
ncliq <- length(cliques)
separators <- cell(ncliq - 1, 1) # n - solmuisessa puussa n - 1 viivaa
jono <- zeros(length(ncliq))
jono[1] <- root
i <- 1
pointer <- 2 # Seuraava tyhjה paikka
while (!is.null(find(jono != 0))) { # Puun leveyssuuntainen lהpikהynti) {
lapset <- find(jtree[jono[i], ] != 0)
jtree[, jono[i]] <- 0 # Klikki kהsitelty
jono[pointer:(pointer + length(lapset) - 1)] <- lapset
for (j in 1:length(lapset)) {
ehdokas <- myintersect(cliques[[jono[i]]], cliques[[lapset[j]]])
kelpaa <- 1
for (k in 1:(pointer + j - 3)) {
# Tutkitaan, ettה separaattoriehdokasta ei vielה kהsitelty
if (ehdokas == separators[[k]]) {
kelpaa <- 0
}
}
if (kelpaa) {
separators[[pointer + j - 2]] <- ehdokas
}
}
jono[i] <- 0
pointer <- pointer + length(lapset)
i <- i + 1
}
notEmpty <- zeros(ncliq - 1, 1)
for (i in 1:(ncliq - 1)) {
if (!is.null(separators[[i]])) {
notEmpty[i] <- 1
}
}
notEmpty <- find(notEmpty == 1)
separators <- separators(notEmpty)
return(list("cliques" = cliques, "separators" = separators, "G" = G))
}

38
R/ind2subv.R Normal file
View file

@ -0,0 +1,38 @@
ind2subv <- function(siz, ndx) stop("Needs translation")
# function sub = ind2subv(siz, ndx)
# % IND2SUBV Like the built-in ind2sub, but returns the answer as a row vector.
# % sub = ind2subv(siz, ndx)
# %
# % siz and ndx can be row or column vectors.
# % sub will be of size length(ndx) * length(siz).
# %
# % Example
# % ind2subv([2 2 2], 1:8) returns
# % [1 1 1
# % 2 1 1
# % ...
# % 2 2 2]
# % That is, the leftmost digit toggle fastest.
# %
# % See also SUBV2IND
# n = length(siz);
# if n==0
# sub = ndx;
# return;
# end
# if all(siz==2)
# sub = dec2bitv(ndx-1, n);
# sub = sub(:,n:-1:1)+1;
# return;
# end
# cp = [1 cumprod(siz(:)')];
# ndx = ndx(:) - 1;
# sub = zeros(length(ndx), n);
# for i = n:-1:1 % i'th digit
# sub(:,i) = floor(ndx/cp(i))+1;
# ndx = rem(ndx,cp(i));
# end

28
R/laskeKlitik.R Normal file
View file

@ -0,0 +1,28 @@
laskeKlikit <- function(M, maxCliqSize, maxSepSize) {
# Laskee samankokoisten klikkien mההrהn verkosta M
# ncliques(i)=kokoa i olevien klikkien mההr?
# nseparators vastaavasti
ncliques <- zeros(1, maxCliqSize)
nseparators <- zeros(1, maxSepSize)
if (M == c()) {
return()
}
cliques_separators <- findCliques(M)
cliques <- cliques_separators$cliques
separators <- cliques_separators$separators
rm(cliques_separators)
for (i in 1:length(cliques)) {
ncliques[length[cliques[[i]]]] <- ncliques[length(cliques[[i]])] + 1
}
# cliqmax=max(find(ncliques!=0))
# ncliques=ncliques(1:cliqmax)
for (i in 1:length(separators)) {
nseparators[length[separators[[i]]]] <- nseparators[length(separators[[i]])] + 1
}
return(
list(
ncliques = ncliques, nseparators = nseparators, cliques = cliques,
separators = separators
)
)
}

51
R/minimum_spanning_tree.R Normal file
View file

@ -0,0 +1,51 @@
minimum_spanning_tree <- function(C1, C2) stop("needs translation")
# function A = minimum_spanning_tree(C1, C2)
# %
# % Find the minimum spanning tree using Prim's algorithm.
# % C1(i,j) is the primary cost of connecting i to j.
# % C2(i,j) is the (optional) secondary cost of connecting i to j, used to break ties.
# % We assume that absent edges have 0 cost.
# % To find the maximum spanning tree, used -1*C.
# % See Aho, Hopcroft & Ullman 1983, "Data structures and algorithms", p 237.
# % Prim's is O(V^2). Kruskal's algorithm is O(E log E) and hence is more efficient
# % for sparse graphs, but is implemented in terms of a priority queue.
# % We partition the nodes into those in U and those not in U.
# % closest(i) is the vertex in U that is closest to i in V-U.
# % lowcost(i) is the cost of the edge (i, closest(i)), or infinity is i has been used.
# % In Aho, they say C(i,j) should be "some appropriate large value" if the edge is missing.
# % We set it to infinity.
# % However, since lowcost is initialized from C, we must distinguish absent edges from used nodes.
# n = length(C1);
# if nargin==1, C2 = zeros(n); end
# A = zeros(n);
# closest = ones(1,n);
# used = zeros(1,n); % contains the members of U
# used(1) = 1; % start with node 1
# C1(find(C1==0))=inf;
# C2(find(C2==0))=inf;
# lowcost1 = C1(1,:);
# lowcost2 = C2(1,:);
# for i=2:n
# ks = find(lowcost1==min(lowcost1));
# k = ks(argmin(lowcost2(ks)));
# A(k, closest(k)) = 1;
# A(closest(k), k) = 1;
# lowcost1(k) = inf;
# lowcost2(k) = inf;
# used(k) = 1;
# NU = find(used==0);
# for ji=1:length(NU)
# for j=NU(ji)
# if C1(k,j) < lowcost1(j)
# lowcost1(j) = C1(k,j);
# lowcost2(j) = C2(k,j);
# closest(j) = k;
# end
# end
# end
# end

29
R/myintersect.R Normal file
View file

@ -0,0 +1,29 @@
myintersect <- function(A, B) {
# MYINTERSECT Intersection of two sets of positive integers (much faster than built - in intersect)
# C <- myintersect(A, B)
A <- t(A)
B <- t(B)
if (is.null(A)) {
ma <- 0
} else {
ma <- max(A)
}
if (is.null(B)) {
mb <- 0
} else {
mb <- max(B)
}
if (ma == 0 | mb == 0) {
C <- vector()
} else {
# bits <- sparse(1, max(ma, mb))
bits <- zeros(1, max(ma, mb))
bits[A] <- 1
C <- B[as.logical(bits[B])]
}
return(C)
}

7
R/myisvector.R Normal file
View file

@ -0,0 +1,7 @@
myisvector <- function(V) {
# Kuten isvector(V)
A <- size(V)
r <- (length(A) == 2) & (min(A) == 1)
return(r)
}

17
R/mysize.R Normal file
View file

@ -0,0 +1,17 @@
mysize <- function(M) {
# MYSIZE Like the built - in size, except it returns n if (M is a vector of length n, and 1 if M is a scalar.) {
# sz <- mysize(M)
# The behavior is best explained by examples
# - M <- rand(1, 1), mysize[M] <- 1, size(M) = [1 1]
# - M <- rand(2, 1), mysize[M] <- 2, size(M) = [2 1]
# - M <- rand(1, 2), mysize[M] <- 2, size(M) = [1 2]
# - M <- rand(2, 2,1), mysize[M] <- [2 2], size(M) = [2 2]
# - M <- rand(1, 2,1), mysize[M] <- 2, size(M) = [1 2]
if (myisvector(M)) {
sz <- length(M)
} else {
sz <- size(M)
}
return(sz)
}

13
R/mysubset.R Normal file
View file

@ -0,0 +1,13 @@
mysubset <- function(small, large) {
# MYSUBSET Is the small set of + ve integers a subset of the large set?
# p <- mysubset(small, large)
# Surprisingly, this is not built - in.
if (is.null(small)) {
p <- 1# is.null(large)
} else {
p <- length(myintersect(small, large)) == length(small)
}
return(p)
}

31
R/myunion.R Normal file
View file

@ -0,0 +1,31 @@
myunion <- function(A, B) {
# MYUNION Union of two sets of positive integers (much faster than built - in union)
# C <- myunion(A, B)
if (is.null(A)) {
ma <- 0
} else {
ma <- max(A)
}
if (is.null(B)) {
mb <- 0
} else {
mb <- max(B)
}
if (ma == 0 & mb == 0) {
C <- vector()
} else if (ma == 0 & mb > 0) {
C <- B
} else if (ma > 0 & mb == 0) {
C <- A
} else {
# bits <- sparse(1, max(ma, mb))
bits <- zeros(1, max(c(ma, mb)))
bits[A] <- 1
bits[B] <- 1
C <- find(bits)
}
return(C)
}

8
R/neighbors.R Normal file
View file

@ -0,0 +1,8 @@
neighbors <- function(adj_mat, i) {
# NEIGHBORS Find the parents and children of a node in a graph.
# ns <- neighbors(adj_mat, i)
# ns <- myunion(children(adj_mat, i), parents(adj_mat, i))
ns <- find(adj_mat[i, ])
return(ns)
}

View file

@ -9,4 +9,5 @@
#' isempty isfield isspace max min ones questdlg rand repmat reshape #' isempty isfield isspace max min ones questdlg rand repmat reshape
#' size sortrows squeeze strcmp times zeros disp #' size sortrows squeeze strcmp times zeros disp
#' @importFrom stats runif #' @importFrom stats runif
#' @importFrom zeallot %<-%
NULL NULL

19
R/setdiag.R Normal file
View file

@ -0,0 +1,19 @@
setdiag <- function(M, v) {
# SETDIAG Set the diagonal of a matrix to a specified scalar/vector.
# M <- set_diag(M, v)
n <- length(M)
if (length(v) == 1) {
v <- repmat(v, c(1, n))
}
# e.g., for 3x3 matrix, elements are numbered
# 1 4 7
# 2 5 8
# 3 6 9
# so diagnoal = [1 5 9]
J <- seq(1, n ^ 2, n + 1)
M[J] <- v
return(M)
}

39
R/triangulate.R Normal file
View file

@ -0,0 +1,39 @@
triangulate <- function(G, order) {
# TRIANGULATE Ensure G is triangulated (chordal), i.e., every cycle of length > 3 has a chord.
# [G, cliques, fill_ins, cliques_containing_node] = triangulate(G, order)
# cliques{i} is the i'th maximal complete subgraph of the triangulated graph.
# fill_ins[i, j] <- 1 iff we add a fill - in arc between i and j.
# To find the maximal cliques, we save each induced cluster (created by adding connecting
# neighbors) that is not a subset of any previously saved cluster. (A cluster is a complete,
# but not necessarily maximal, set of nodes.)
MG <- G
n <- length(G)
eliminated <- zeros(1, n)
cliques = list()
for (i in 1:n) {
u <- order[i]
U <- find(!eliminated)# uneliminated
nodes <- myintersect(neighbors(G, u), U)# look up neighbors in the partially filled - in graph
nodes <- myunion(nodes, u)# the clique will always contain at least u
G[nodes, nodes] <- 1# make them all connected to each other
G <- setdiag(G, 0)
eliminated[u] <- 1
exclude <- 0
for (c in 1:length(cliques)) {
if (mysubset(nodes, cliques[[c]])) { # not maximal)
exclude <- 1
break
}
}
if (!exclude) {
cnum <- length(cliques) + 1
cliques[[cnum]] <- nodes
}
}
# fill_ins <- sparse(triu(max(0, G - MG), 1))
fill_ins <- 1
return(list("G" = G, "cliques" = cliques, "fill_ins" = fill_ins))
}

View file

@ -33,3 +33,21 @@ test_that("testaaKoordinaatit works as expected", {
) )
) )
}) })
test_that("lakseKlitik() and subfunctions produce expected output", {
# TODO: test elim_order()
# TODO: test triangulate()
# TODO: test neighbors()
# TODO: test myintersect()
# TODO: test mysubset()
# TODO: test findCliques()
# TODO: test cliques_to_jtree()
# TODO: test minimum_spanning_tree()
# TODO: test myunion()
# TODO: test argmin()
# TODO: test mysize()
# TODO: test ind2subv()
# TODO: test myisvector()
# TODO: ... and anythin left from findCliques.m
# TODO: test lakseKlitik()
})