-
-
Notifications
You must be signed in to change notification settings - Fork 200
/
embedding.R
356 lines (340 loc) · 15.1 KB
/
embedding.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
## -----------------------------------------------------------------------
##
## IGraph R package
## Copyright (C) 2015 Gabor Csardi <csardi.gabor@gmail.com>
## 334 Harvard street, Cambridge, MA 02139 USA
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
## 02110-1301 USA
##
## -----------------------------------------------------------------------
#' Spectral Embedding of Adjacency Matrices
#'
#' Spectral decomposition of the adjacency matrices of graphs.
#'
#' This function computes a `no`-dimensional Euclidean representation of
#' the graph based on its adjacency matrix, \eqn{A}. This representation is
#' computed via the singular value decomposition of the adjacency matrix,
#' \eqn{A=UDV^T}.In the case, where the graph is a random dot product graph
#' generated using latent position vectors in \eqn{R^{no}} for each vertex, the
#' embedding will provide an estimate of these latent vectors.
#'
#' For undirected graphs the latent positions are calculated as
#' \eqn{X=U^{no}D^{1/2}}{U[no] sqrt(D[no])}, where \eqn{U^{no}}{U[no]} equals
#' to the first `no` columns of \eqn{U}, and \eqn{D^{1/2}}{sqrt(D[no])} is
#' a diagonal matrix containing the top `no` singular values on the
#' diagonal.
#'
#' For directed graphs the embedding is defined as the pair
#' \eqn{X=U^{no}D^{1/2}}{U[no] sqrt(D[no])} and \eqn{Y=V^{no}D^{1/2}}{V[no]
#' sqrt(D[no])}. (For undirected graphs \eqn{U=V}, so it is enough to keep one
#' of them.)
#'
#' @param graph The input graph, directed or undirected.
#' @param no An integer scalar. This value is the embedding dimension of the
#' spectral embedding. Should be smaller than the number of vertices. The
#' largest `no`-dimensional non-zero singular values are used for the
#' spectral embedding.
#' @param weights Optional positive weight vector for calculating a weighted
#' embedding. If the graph has a `weight` edge attribute, then this is
#' used by default. In a weighted embedding, the edge weights are used instead
#' of the binary adjacencny matrix.
#' @param which Which eigenvalues (or singular values, for directed graphs) to
#' use. \sQuote{lm} means the ones with the largest magnitude, \sQuote{la} is
#' the ones (algebraic) largest, and \sQuote{sa} is the (algebraic) smallest
#' eigenvalues. The default is \sQuote{lm}. Note that for directed graphs
#' \sQuote{la} and \sQuote{lm} are the equivalent, because the singular values
#' are used for the ordering.
#' @param scaled Logical scalar, if `FALSE`, then \eqn{U} and \eqn{V} are
#' returned instead of \eqn{X} and \eqn{Y}.
#' @param cvec A numeric vector, its length is the number vertices in the
#' graph. This vector is added to the diagonal of the adjacency matrix.
#' @param options A named list containing the parameters for the SVD
#' computation algorithm in ARPACK. By default, the list of values is assigned
#' the values given by [arpack_defaults()].
#' @return A list containing with entries: \item{X}{Estimated latent positions,
#' an `n` times `no` matrix, `n` is the number of vertices.}
#' \item{Y}{`NULL` for undirected graphs, the second half of the latent
#' positions for directed graphs, an `n` times `no` matrix, `n`
#' is the number of vertices.} \item{D}{The eigenvalues (for undirected graphs)
#' or the singular values (for directed graphs) calculated by the algorithm.}
#' \item{options}{A named list, information about the underlying ARPACK
#' computation. See [arpack()] for the details.}
#' @seealso [sample_dot_product()]
#' @references Sussman, D.L., Tang, M., Fishkind, D.E., Priebe, C.E. A
#' Consistent Adjacency Spectral Embedding for Stochastic Blockmodel Graphs,
#' *Journal of the American Statistical Association*, Vol. 107(499), 2012
#' @keywords graphs
#' @examples
#'
#' ## A small graph
#' lpvs <- matrix(rnorm(200), 20, 10)
#' lpvs <- apply(lpvs, 2, function(x) {
#' return(abs(x) / sqrt(sum(x^2)))
#' })
#' RDP <- sample_dot_product(lpvs)
#' embed <- embed_adjacency_matrix(RDP, 5)
#' @family embedding
#' @export
#' @cdocs igraph_adjacency_spectral_embedding
embed_adjacency_matrix <- adjacency_spectral_embedding_impl
#' Dimensionality selection for singular values using profile likelihood.
#'
#' Select the number of significant singular values, by finding the
#' \sQuote{elbow} of the scree plot, in a principled way.
#'
#' The input of the function is a numeric vector which contains the measure of
#' \sQuote{importance} for each dimension.
#'
#' For spectral embedding, these are the singular values of the adjacency
#' matrix. The singular values are assumed to be generated from a Gaussian
#' mixture distribution with two components that have different means and same
#' variance. The dimensionality \eqn{d} is chosen to maximize the likelihood
#' when the \eqn{d} largest singular values are assigned to one component of
#' the mixture and the rest of the singular values assigned to the other
#' component.
#'
#' This function can also be used for the general separation problem, where we
#' assume that the left and the right of the vector are coming from two Normal
#' distributions, with different means, and we want to know their border. See
#' examples below.
#'
#' @param sv A numeric vector, the ordered singular values.
#' @return A numeric scalar, the estimate of \eqn{d}.
#' @author Gabor Csardi \email{csardi.gabor@@gmail.com}
#' @seealso [embed_adjacency_matrix()]
#' @references M. Zhu, and A. Ghodsi (2006). Automatic dimensionality selection
#' from the scree plot via the use of profile likelihood. *Computational
#' Statistics and Data Analysis*, Vol. 51, 918--930.
#' @keywords graphs
#' @examples
#'
#' # Generate the two groups of singular values with
#' # Gaussian mixture of two components that have different means
#' sing.vals <- c(rnorm(10, mean = 1, sd = 1), rnorm(10, mean = 3, sd = 1))
#' dim.chosen <- dim_select(sing.vals)
#' dim.chosen
#'
#' # Sample random vectors with multivariate normal distribution
#' # and normalize to unit length
#' lpvs <- matrix(rnorm(200), 10, 20)
#' lpvs <- apply(lpvs, 2, function(x) {
#' (abs(x) / sqrt(sum(x^2)))
#' })
#' RDP.graph <- sample_dot_product(lpvs)
#' dim_select(embed_adjacency_matrix(RDP.graph, 10)$D)
#'
#' # Sample random vectors with the Dirichlet distribution
#' lpvs.dir <- sample_dirichlet(n = 20, rep(1, 10))
#' RDP.graph.2 <- sample_dot_product(lpvs.dir)
#' dim_select(embed_adjacency_matrix(RDP.graph.2, 10)$D)
#'
#' # Sample random vectors from hypersphere with radius 1.
#' lpvs.sph <- sample_sphere_surface(dim = 10, n = 20, radius = 1)
#' RDP.graph.3 <- sample_dot_product(lpvs.sph)
#' dim_select(embed_adjacency_matrix(RDP.graph.3, 10)$D)
#'
#' @family embedding
#' @export
#' @cdocs igraph_dim_select
dim_select <- dim_select_impl
#' Spectral Embedding of the Laplacian of a Graph
#'
#' Spectral decomposition of Laplacian matrices of graphs.
#'
#' This function computes a `no`-dimensional Euclidean representation of
#' the graph based on its Laplacian matrix, \eqn{L}. This representation is
#' computed via the singular value decomposition of the Laplacian matrix.
#'
#' They are essentially doing the same as [embed_adjacency_matrix()],
#' but work on the Laplacian matrix, instead of the adjacency matrix.
#'
#' @param graph The input graph, directed or undirected.
#' @param no An integer scalar. This value is the embedding dimension of the
#' spectral embedding. Should be smaller than the number of vertices. The
#' largest `no`-dimensional non-zero singular values are used for the
#' spectral embedding.
#' @param weights Optional positive weight vector for calculating a weighted
#' embedding. If the graph has a `weight` edge attribute, then this is
#' used by default. For weighted embedding, edge weights are used instead
#' of the binary adjacency matrix, and vertex strength (see
#' [strength()]) is used instead of the degrees.
#' @param which Which eigenvalues (or singular values, for directed graphs) to
#' use. \sQuote{lm} means the ones with the largest magnitude, \sQuote{la} is
#' the ones (algebraic) largest, and \sQuote{sa} is the (algebraic) smallest
#' eigenvalues. The default is \sQuote{lm}. Note that for directed graphs
#' \sQuote{la} and \sQuote{lm} are the equivalent, because the singular values
#' are used for the ordering.
#' @param type The type of the Laplacian to use. Various definitions exist for
#' the Laplacian of a graph, and one can choose between them with this
#' argument.
#'
#' Possible values: `D-A` means \eqn{D-A} where \eqn{D} is the degree
#' matrix and \eqn{A} is the adjacency matrix; `DAD` means
#' \eqn{D^{1/2}}{D^1/2} times \eqn{A} times \eqn{D^{1/2}{D^1/2}},
#' \eqn{D^{1/2}}{D^1/2} is the inverse of the square root of the degree matrix;
#' `I-DAD` means \eqn{I-D^{1/2}}{I-D^1/2}, where \eqn{I} is the identity
#' matrix. `OAP` is \eqn{O^{1/2}AP^{1/2}}{O^1/2 A P^1/2}, where
#' \eqn{O^{1/2}}{O^1/2} is the inverse of the square root of the out-degree
#' matrix and \eqn{P^{1/2}}{P^1/2} is the same for the in-degree matrix.
#'
#' `OAP` is not defined for undirected graphs, and is the only defined type
#' for directed graphs.
#'
#' The default (i.e. type `default`) is to use `D-A` for undirected
#' graphs and `OAP` for directed graphs.
#' @param scaled Logical scalar, if `FALSE`, then \eqn{U} and \eqn{V} are
#' returned instead of \eqn{X} and \eqn{Y}.
#' @param options A named list containing the parameters for the SVD
#' computation algorithm in ARPACK. By default, the list of values is assigned
#' the values given by [arpack_defaults()].
#' @return A list containing with entries: \item{X}{Estimated latent positions,
#' an `n` times `no` matrix, `n` is the number of vertices.}
#' \item{Y}{`NULL` for undirected graphs, the second half of the latent
#' positions for directed graphs, an `n` times `no` matrix, `n`
#' is the number of vertices.} \item{D}{The eigenvalues (for undirected graphs)
#' or the singular values (for directed graphs) calculated by the algorithm.}
#' \item{options}{A named list, information about the underlying ARPACK
#' computation. See [arpack()] for the details.}
#' @author Gabor Csardi \email{csardi.gabor@@gmail.com}
#' @seealso [embed_adjacency_matrix()],
#' [sample_dot_product()]
#' @references Sussman, D.L., Tang, M., Fishkind, D.E., Priebe, C.E. A
#' Consistent Adjacency Spectral Embedding for Stochastic Blockmodel Graphs,
#' *Journal of the American Statistical Association*, Vol. 107(499), 2012
#' @keywords graphs
#' @export
#' @examples
#'
#' ## A small graph
#' lpvs <- matrix(rnorm(200), 20, 10)
#' lpvs <- apply(lpvs, 2, function(x) {
#' return(abs(x) / sqrt(sum(x^2)))
#' })
#' RDP <- sample_dot_product(lpvs)
#' embed <- embed_laplacian_matrix(RDP, 5)
#' @family embedding
#' @cdocs igraph_laplacian_spectral_embedding
embed_laplacian_matrix <- laplacian_spectral_embedding_impl
#' Sample vectors uniformly from the surface of a sphere
#'
#' Sample finite-dimensional vectors to use as latent position vectors in
#' random dot product graphs
#'
#' `sample_sphere_surface()` generates uniform samples from \eqn{S^{dim-1}}
#' (the `(dim-1)`-sphere) with radius `radius`, i.e. the Euclidean
#' norm of the samples equal `radius`.
#'
#' @param dim Integer scalar, the dimension of the random vectors.
#' @param n Integer scalar, the sample size.
#' @param radius Numeric scalar, the radius of the sphere to sample.
#' @param positive Logical scalar, whether to sample from the positive orthant
#' of the sphere.
#' @return A `dim` (length of the `alpha` vector for
#' `sample_dirichlet()`) times `n` matrix, whose columns are the sample
#' vectors.
#'
#' @family latent position vector samplers
#'
#' @export
#' @examples
#' lpvs.sph <- sample_sphere_surface(dim = 10, n = 20, radius = 1)
#' RDP.graph.3 <- sample_dot_product(lpvs.sph)
#' vec.norm <- apply(lpvs.sph, 2, function(x) {
#' sum(x^2)
#' })
#' vec.norm
sample_sphere_surface <- function(dim, n = 1, radius = 1, positive = TRUE) {
# Argument checks
dim <- as.numeric(dim)
n <- as.numeric(n)
radius <- as.numeric(radius)
positive <- as.logical(positive)
on.exit(.Call(R_igraph_finalizer))
# Function call
res <- .Call(R_igraph_sample_sphere_surface, dim, n, radius, positive)
res
}
#' Sample vectors uniformly from the volume of a sphere
#'
#' Sample finite-dimensional vectors to use as latent position vectors in
#' random dot product graphs
#'
#' `sample_sphere_volume()` generates uniform samples from \eqn{S^{dim-1}}
#' (the `(dim-1)`-sphere) i.e. the Euclidean norm of the samples is
#' smaller or equal to `radius`.
#'
#' @param dim Integer scalar, the dimension of the random vectors.
#' @param n Integer scalar, the sample size.
#' @param radius Numeric scalar, the radius of the sphere to sample.
#' @param positive Logical scalar, whether to sample from the positive orthant
#' of the sphere.
#' @return A `dim` (length of the `alpha` vector for
#' `sample_dirichlet()`) times `n` matrix, whose columns are the sample
#' vectors.
#'
#' @family latent position vector samplers
#'
#' @export
#' @examples
#' lpvs.sph.vol <- sample_sphere_volume(dim = 10, n = 20, radius = 1)
#' RDP.graph.4 <- sample_dot_product(lpvs.sph.vol)
#' vec.norm <- apply(lpvs.sph.vol, 2, function(x) {
#' sum(x^2)
#' })
#' vec.norm
sample_sphere_volume <- function(dim, n = 1, radius = 1, positive = TRUE) {
# Argument checks
dim <- as.numeric(dim)
n <- as.numeric(n)
radius <- as.numeric(radius)
positive <- as.logical(positive)
on.exit(.Call(R_igraph_finalizer))
# Function call
res <- .Call(R_igraph_sample_sphere_volume, dim, n, radius, positive)
res
}
#' Sample from a Dirichlet distribution
#'
#' Sample finite-dimensional vectors to use as latent position vectors in
#' random dot product graphs
#'
#' `sample_dirichlet()` generates samples from the Dirichlet distribution
#' with given \eqn{\alpha}{alpha} parameter. The sample is drawn from
#' `length(alpha)-1`-simplex.
#'
#' @param n Integer scalar, the sample size.
#' @param alpha Numeric vector, the vector of \eqn{\alpha}{alpha} parameter for
#' the Dirichlet distribution.
#' @return A `dim` (length of the `alpha` vector for
#' `sample_dirichlet()`) times `n` matrix, whose columns are the sample
#' vectors.
#'
#' @family latent position vector samplers
#'
#' @export
#' @examples
#' lpvs.dir <- sample_dirichlet(n = 20, alpha = rep(1, 10))
#' RDP.graph.2 <- sample_dot_product(lpvs.dir)
#' colSums(lpvs.dir)
sample_dirichlet <- function(n, alpha) {
# Argument checks
n <- as.numeric(n)
alpha <- as.numeric(alpha)
on.exit(.Call(R_igraph_finalizer))
# Function call
res <- .Call(R_igraph_sample_dirichlet, n, alpha)
res
}