forked from yihui/knitr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcache.R
328 lines (308 loc) · 13.9 KB
/
cache.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
## ideas borrowed from cacheSweave but not as sophisticated
## closure adapted from Hadley's decumar: https://github.com/hadley/decumar
## but it is using .rdb and .rdx as 'hard cache' (instead of cache in memory)
new_cache = function() {
cache_path = function(hash) {
d = dirname(hash)
if (!file.exists(d)) dir.create(d, showWarnings = FALSE, recursive = TRUE)
file.path(d, basename(hash))
}
cache_purge = function(hash) {
for (h in hash) unlink(paste(cache_path(h), c('rdb', 'rdx', 'RData'), sep = '.'))
}
cache_save = function(keys, outname, hash, lazy = TRUE) {
meta_name = cache_meta_name(hash)
if (exists(meta_name, envir = knit_global())) outname = c(outname, meta_name)
out0 = outname
on.exit(rm(list = out0, envir = knit_global()), add = TRUE)
# keys are new variables created; outname is the text output of a chunk
path = cache_path(hash)
# add random seed to cache if exists
if (exists('.Random.seed', envir = globalenv(), inherits = FALSE)) {
copy_env(globalenv(), knit_global(), '.Random.seed')
outname = c('.Random.seed', outname)
}
if (!lazy) outname = c(keys, outname)
save(list = outname, file = paste(path, 'RData', sep = '.'), envir = knit_global())
if (!lazy) return() # everything has been saved; no need to make lazy db
# random seed is always load()ed
keys = as.character(setdiff(keys, '.Random.seed'))
getFromNamespace('makeLazyLoadDB', 'tools')(knit_global(), path, variables = keys)
}
save_objects = function(objs, label, path) {
if (length(objs) == 0L) objs = ''
# save object names
x = paste(c(label, objs), collapse = '\t')
if (file.exists(path)) {
lines = read_utf8(path)
lines = lines[lines != label] # knitr < 1.5 may have lines == label
idx = substr(lines, 1L, nchar(label) + 1L) == paste0(label, '\t')
if (any(idx)) {
lines[idx] = x # update old objects
} else lines = c(lines, x)
} else lines = x
write_utf8(lines, path)
}
cache_objects = function(keys, globals, label, path) {
save_objects(keys, label, valid_path(path, '__objects'))
save_objects(globals, label, valid_path(path, '__globals'))
}
cache_load = function(hash, lazy = TRUE) {
path = cache_path(hash)
if (!is_abs_path(path)) path = file.path(getwd(), path)
if (lazy) lazyLoad(path, envir = knit_global())
# load output from last run if exists
if (file.exists(path2 <- paste(path, 'RData', sep = '.'))) {
load(path2, envir = knit_global())
if (exists('.Random.seed', envir = knit_global(), inherits = FALSE))
copy_env(knit_global(), globalenv(), '.Random.seed')
name = cache_meta_name(hash)
if (exists(name, envir = knit_global())) {
.knitEnv$meta = c(
.knitEnv$meta, get(name, envir = knit_global(), inherits = FALSE)
)
rm(list = name, envir = knit_global())
}
}
}
cache_library = function(path, save = TRUE) {
# save or load R packages
path = valid_path(path, '__packages')
if (save) {
x = rev(.packages())
if (file.exists(path)) x = setdiff(c(read_utf8(path), x), xfun::base_pkgs())
write_utf8(x, path)
} else {
if (!file.exists(path)) return()
for (p in read_utf8(path))
suppressPackageStartupMessages(library(p, character.only = TRUE))
}
}
cache_exists = function(hash, lazy = TRUE) {
is.character(hash) &&
all(file.exists(paste(
cache_path(hash), if (lazy) c('rdb', 'rdx') else 'RData', sep = '.'
)))
}
# when cache=3, code output is stored in .[hash], so cache=TRUE won't lose
# output as cacheSweave does; for cache=1,2, output is the evaluate() list
cache_output = function(hash, mode = 'character') {
name = cache_output_name(hash)
res = get(name, envir = knit_global(), mode = mode, inherits = FALSE)
# clean up this hidden variable after we obtain its value
if (mode == mode(res)) rm(list = name, envir = knit_global())
res
}
list(purge = cache_purge, save = cache_save, load = cache_load, objects = cache_objects,
exists = cache_exists, output = cache_output, library = cache_library)
}
# analyze code and find out all possible variables (not necessarily global variables)
find_symbols = function(code) {
if (is.null(code) || length(p <- parse(text = code, keep.source = TRUE)) == 0) return()
p = getParseData(p)
p = p[p$terminal & p$token %in% c('SYMBOL', 'SYMBOL_FUNCTION_CALL', 'SPECIAL'), ]
unique(p$text)
}
# a variable name to store the metadata object from code chunks
cache_meta_name = function(hash) sprintf('.%s_meta', hash)
# a variable name to store the text output of code chunks
cache_output_name = function(hash) sprintf('.%s', hash)
cache = new_cache()
# a regex for cache files
cache_rx = '_[abcdef0123456789]{32}[.](rdb|rdx|RData)$'
#' Build automatic dependencies among chunks
#'
#' When the chunk option \code{autodep = TRUE}, all names of objects created in
#' a chunk will be saved in a file named \file{__objects} and all global objects
#' used in a chunk will be saved to \file{__globals}. This function can analyze
#' object names in these files to automatically build cache dependencies, which
#' is similar to the effect of the \code{dependson} option. It is supposed to be
#' used in the first chunk of a document and this chunk must not be cached.
#' @param path Path to the dependency file.
#' @param labels A vector of labels of chunks for which the dependencies will be
#' built. By default, dependencies for all chunks will be built.
#' @return \code{NULL}. The dependencies are built as a side effect.
#' @note Be cautious about \code{path}: because this function is used in a
#' chunk, the working directory when the chunk is evaluated is the directory
#' of the input document in \code{\link{knit}}, and if that directory differs
#' from the working directory before calling \code{knit()}, you need to adjust
#' the \code{path} argument here to make sure this function can find the cache
#' files \file{__objects} and \file{__globals}.
#' @export
#' @seealso \code{\link{dep_prev}}
#' @references \url{https://yihui.org/knitr/demo/cache/}
dep_auto = function(path = opts_chunk$get('cache.path'), labels = all_labels()) {
# this function should be evaluated in the original working directory
owd = setwd(opts_knit$get('output.dir')); on.exit(setwd(owd))
paths = valid_path(path, c('__objects', '__globals'))
locals = parse_objects(paths[1L]); globals = parse_objects(paths[2L])
if (is.null(locals) || is.null(globals)) return(invisible(NULL))
if (!identical(names(locals), names(globals))) {
warning('corrupt dependency files? \ntry remove ', paste(paths, collapse = '; '))
return(invisible(NULL))
}
nms = intersect(all_labels(), names(locals)) # guarantee correct order
for (i in match(labels, nms)) {
# ignore first chunk (i < 2); locals may contain old chunk names (i will be NA)
if (is.na(i) || i < 2 || length(g <- globals[[nms[i]]]) == 0) next
for (j in 1:(i - 1L)) {
# check if current globals are in old locals
if (any(g %in% locals[[nms[j]]]))
dep_list$set(setNames(list(unique(c(dep_list$get(nms[j]), nms[i]))), nms[j]))
}
}
}
# parse objects in dependency files
parse_objects = function(path) {
if (!file.exists(path)) {
warning('file ', path, ' not found'); return()
}
lines = strsplit(read_utf8(path), '\t')
if (length(lines) < 2L) return() # impossible for dependson
objs = lapply(lines, `[`, -1L)
names(objs) = lapply(lines, `[`, 1L)
objs
}
#' Load the cache database of a code chunk
#'
#' If a code chunk has turned on the chunk option \code{cache = TRUE}, a cache
#' database will be established after the document is compiled. You can use this
#' function to manually load the database anywhere in the document (even before
#' the code chunk). This makes it possible to use objects created later in the
#' document earlier, e.g. in an inline R expression before the cached code
#' chunk, which is normally not possible because \pkg{knitr} compiles the
#' document in a linear fashion, and objects created later cannot be used before
#' they are created.
#' @param label The chunk label of the code chunk that has a cache database.
#' @param object The name of the object to be fetched from the database. If it
#' is missing, \code{NULL} is returned).
#' @param notfound A value to use when the \code{object} cannot be found.
#' @param path Path of the cache database (normally set in the global chunk
#' option \code{cache.path}).
#' @param dir Path to use as the working directory. Defaults to the output
#' directory if run inside a \pkg{knitr} context and to the current working
#' directory otherwise. Any relative \code{path} is defined from \code{dir}.
#' @param envir Environment to use for cache loading, into which all objects in
#' the cache for the specified chunk (not just that in \code{object}) will be
#' loaded. Defaults to the value in \code{\link{knit_global}}.
#' @param lazy Whether to \code{\link{lazyLoad}} the cache database (depending
#' on the chunk option \code{cache.lazy = TRUE} or \code{FALSE} of that code
#' chunk).
#' @note Apparently this function loads the value of the object from the
#' \emph{previous} run of the document, which may be problematic when the
#' value of the object becomes different the next time the document is
#' compiled. Normally you must compile the document twice to make sure the
#' cache database is created, and the object can be read from it. Please use
#' this function with caution.
#' @references See the example #114 at
#' \url{https://github.com/yihui/knitr-examples}.
#' @return Invisible \code{NULL} when \code{object} is not specified (the cache
#' database will be loaded as a side effect), otherwise the value of the
#' object if found.
#' @export
load_cache = function(
label, object, notfound = 'NOT AVAILABLE', path = opts_chunk$get('cache.path'),
dir = opts_knit$get('output.dir'), envir = NULL, lazy = TRUE
) {
if (is.null(dir)) dir = "."
owd = setwd(dir); on.exit(setwd(owd), add = TRUE)
if (!is.null(envir)) {
oldenv = .knitEnv$knit_global
on.exit(.knitEnv$knit_global <- oldenv, add = TRUE)
.knitEnv$knit_global = envir
}
path = valid_path(path, label)
p0 = dirname(path); p1 = basename(path)
p2 = list.files(p0, cache_rx)
if (length(p2) == 0) return(notfound)
p2 = p2[substr(p2, 1, nchar(p1)) == p1]
if (length(p2) == 0) return(notfound)
if (length(p2) > 3) stop(
'Wrong cache databases for the chunk ', label,
'. You need to remove redundant cache files. Found ', paste(p2, collapse = ', ')
)
p2 = unique(gsub('[.](rdb|rdx|RData)$', '', p2))
if (length(p2) != 1) stop('Cannot identify the cache database for chunk ', label)
cache$load(file.path(p0, p2), lazy)
if (missing(object)) return(invisible(NULL))
if (exists(object, envir = knit_global(), inherits = FALSE)) {
get(object, envir = knit_global(), inherits = FALSE)
} else notfound
}
#' Make later chunks depend on previous chunks
#'
#' This function can be used to build dependencies among chunks so that all
#' later chunks depend on previous chunks, i.e. whenever the cache of a previous
#' chunk is updated, the cache of all its later chunks will be updated.
#' @return \code{NULL}; the internal dependency structure is updated as a side
#' effect.
#' @export
#' @seealso \code{\link{dep_auto}}
#' @references \url{https://yihui.org/knitr/demo/cache/}
dep_prev = function() {
labs = names(knit_code$get())
if ((n <- length(labs)) < 2L) return() # one chunk or less; no sense of deps
opts_knit$set(warn.uncached.dep = FALSE)
for (i in 1L:(n - 1L)) {
dep_list$set(setNames(list(labs[(i + 1L):n]), labs[i]))
}
}
#' An unevaluated expression to return .Random.seed if exists
#'
#' This expression returns \code{.Random.seed} when \code{eval(rand_seed)} and
#' \code{NULL} otherwise.
#'
#' It is designed to work with \code{opts_chunk$set(cache.extra = rand_seed)}
#' for reproducibility of chunks that involve with random number generation. See
#' references.
#' @export
#' @format NULL
#' @references \url{https://yihui.org/knitr/demo/cache/}
#' @examples eval(rand_seed)
#' rnorm(1) # .Random.seed is created (or modified)
#' eval(rand_seed)
rand_seed = quote({
.GlobalEnv$.Random.seed
})
#' Clean cache files that are probably no longer needed
#'
#' If you remove or rename some cached code chunks, their original cache files
#' will not be automatically cleaned. You can use this function to identify
#' these possible files, and clean them if you are sure they are no longer
#' needed.
#' @param clean Boolean; whether to remove the files.
#' @param path Path to the cache.
#' @note The identification is not guaranteed to be correct, especially when
#' multiple documents share the same cache directory. You are recommended to
#' call \code{clean_cache(FALSE)} and carefully check the list of files (if
#' any) before you really delete them (\code{clean_cache(TRUE)}).
#'
#' This function must be called within a code chunk in a source document,
#' since it needs to know all chunk labels of the current document to
#' determine which labels are no longer present, and delete cache
#' corresponding to these labels.
#' @export
clean_cache = function(clean = FALSE, path = opts_chunk$get('cache.path')) {
odir = opts_knit$get('output.dir')
if (is.null(odir)) {
warning('This function must be called inside a source document')
return()
}
owd = setwd(odir); on.exit(setwd(owd))
if (file_test('-d', path)) {
p0 = path; p1 = ''
} else {
p0 = dirname(path); p1 = basename(path)
}
files = list.files(p0, cache_rx, full.names = TRUE)
if (length(files) == 0) return()
base = basename(files)
labs = .knitEnv$labels
if (length(labs) == 0) return()
i = !(sub(cache_rx, '', base) %in% paste0(p1, labs))
if (p1 != '') i = i & (substr(base, 1, nchar(p1)) == p1)
if (!any(i)) return()
if (clean) unlink(files[i]) else message(
'Clean these cache files?\n\n', one_string(files[i]), '\n'
)
}