forked from yihui/knitr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpandoc.R
143 lines (139 loc) · 6.27 KB
/
pandoc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#' A Pandoc wrapper to convert documents to other formats
#'
#' This function calls Pandoc to convert documents to other formats such as
#' HTML, LaTeX/PDF and Word, etc, (optionally) based on a configuration file or
#' in-file configurations which specify the options to use for Pandoc.
#'
#' There are two ways to input the Pandoc configurations -- through a config
#' file, or embed the configurations in the input file as special comments
#' between \verb{<!--pandoc} and \verb{-->}.
#'
#' The configuration file is a DCF file (see \code{\link{read.dcf}}). This file
#' must contain a field named \code{t} which means the output format. The
#' configurations are written in the form of \code{tag:value} and passed to
#' Pandoc (if no value is needed, just leave it empty, e.g. the option
#' \code{standalone} or \code{s} for short). If there are multiple output
#' formats, write each format and relevant configurations in a block, and
#' separate blocks with blank lines.
#'
#' If there are multiple records of the \code{t} field in the configuration, the
#' input markdown file will be converted to all these formats by default, unless
#' the \code{format} argument is specified as one single format.
#' @param input A character vector of Markdown filenames (must be encoded in
#' UTF-8).
#' @param format Name of the output format (see References). This can be a
#' character vector of multiple formats; by default, it is obtained from the
#' \code{t} field in the configuration. If the configuration is empty or the
#' \code{t} field is not found, the default output format will be
#' \code{'html'}.
#' @param config Path to the Pandoc configuration file. If missing, it is
#' assumed to be a file with the same base name as the \code{input} file and
#' an extension \code{.pandoc} (e.g. for \file{foo.md} it looks for
#' \file{foo.pandoc})
#' @param ext Filename extensions. By default, the extension is inferred from
#' the \code{format}, e.g. \code{latex} creates \code{pdf}, \code{dzslides}
#' creates \code{html}, and so on
#' @return The output filename(s) (or an error if the conversion failed).
#' @references Pandoc: \url{https://pandoc.org}; Examples and rules of the
#' configurations: \url{https://yihui.org/knitr/demo/pandoc/}
#'
#' Also see R Markdown (v2) at \url{https://rmarkdown.rstudio.com}. The
#' \pkg{rmarkdown} package has several convenience functions and templates
#' that make it very easy to use Pandoc. The RStudio IDE also has
#' comprehensive support for it, so I'd recommend users who are not familiar
#' with command-line tools to use the \pkg{rmarkdown} package instead.
#' @seealso \code{\link{read.dcf}}
#' @export
#' @examples system('pandoc -h') # see possible output formats
pandoc = function(input, format, config = getOption('config.pandoc'), ext = NA) {
exec = tryCatch(rmarkdown::pandoc_exec(), error = function(e) Sys.which('pandoc'))
if (length(exec) != 1 || exec == '')
stop('Please install either RStudio or Pandoc (https://pandoc.org)')
cfg = if (is.null(config)) with_ext(input[1L], 'pandoc') else config
txt = pandoc_cfg(read_utf8(input[1]))
if (file.exists(cfg)) txt = c(txt, '', read_utf8(cfg))
con = textConnection(txt, encoding = 'UTF-8'); on.exit(close(con), add = TRUE)
cfg = read.dcf(con)
nms = colnames(cfg)
if (length(nms) && 'format' %in% nms) {
warning('the "format" field in the configuration must be renamed to "t"')
colnames(cfg)[nms == 'format'] = 't' # for backward compatibility
}
if (missing(format)) format = pandoc_fmt(cfg)
mapply(
pandoc_one, input, format, ext, exec, MoreArgs = list(cfg = cfg),
USE.NAMES = FALSE
)
}
# format is a scalar
pandoc_one = function(input, format, ext, exec, cfg) {
cmn = NULL # common arguments
if (nrow(cfg) == 0L) cfg = character(0) else if (nrow(cfg) == 1L) {
if ('t' %in% colnames(cfg)) {
cfg = if (cfg[1L, 't'] == format) drop(cfg) else NA
} else {cmn = drop(cfg); cfg = NA}
} else {
if (!('t' %in% colnames(cfg)))
stop('for a config file with multiple output formats, there must be a field named "t"')
if (sum(idx <- is.na(cfg[, 't'])) > 1L)
stop('at most one "t" field can be NA')
if (sum(idx) == 1L) cmn = cfg[idx, ]
cfg = cfg[!idx, , drop = FALSE]
cfg = cfg[cfg[, 't'] == format, ]
if (!is.null(dim(cfg))) {
if (nrow(cfg) > 1) stop('the output format is not unique in config')
cfg = character(0) # nrow(cfg) == 0; format not found in cfg
}
}
out = unname(if (!is.na(cfg['o'])) cfg['o'] else {
if (!is.na(cfg['output'])) cfg['output'] else {
with_ext(input, if (is.na(ext)) pandoc_ext(format) else ext)
}
})
cfg = cfg[setdiff(names(cfg), c('o', 'output', 't'))]
cmd = c(
pandoc_arg(cfg), pandoc_arg(cmn), '-t', format, '-o', shQuote(out), shQuote(input)
)
message('Executing: pandoc ', paste(cmd, collapse = ' '))
if (system2(exec, cmd) == 0L) out else stop('conversion failed')
}
# detect output format from config
pandoc_fmt = function(config) {
fields = colnames(config)
if (prod(dim(config)) == 0 || !('t' %in% fields)) return('html')
na.omit(config[, 't'])
}
# infer output extension from format
pandoc_ext = function(format) {
if (grepl('^html', format)) return('html')
if (grepl('^latex|beamer|context|texinfo', format)) return('pdf')
if (format %in% c('s5', 'slidy', 'slideous', 'dzslides')) return('html')
if (grepl('^rst', format)) return('rst')
if (format == 'opendocument') return('xml')
format
}
# give me a vector of arguments, I turn them into command-line
pandoc_arg = function(x) {
if (length(x) == 0L || all(is.na(x))) return()
x = x[!is.na(x)] # options not provided
nms = names(x)
if (any(grepl('\n', x))) {
# one argument used multiple times, e.g. --bibliography
x = str_split(x, '\n')
nms = rep(nms, sapply(x, length))
x = unlist(x)
}
a1 = nchar(nms) == 1L
paste0(ifelse(a1, '-', '--'), nms, ifelse(x == '', '', ifelse(a1, ' ', '=')), x)
}
# identify pandoc config in markdown comments
pandoc_cfg = function(x) {
if (length(i1 <- grep('^<!--pandoc', x)) == 0L ||
length(i2 <- grep('-->\\s*$', x)) == 0L) return(character(0))
i1 = i1[1L]; if (all(i2 < i1)) return(character(0))
i2 = i2[i2 >= i1][1L]
cfg = x[i1:i2]
cfg[1L] = gsub('^<!--pandoc\\s*', '', cfg[1L])
cfg[length(cfg)] = gsub('-->\\s*$', '', cfg[length(cfg)])
cfg
}