-
Notifications
You must be signed in to change notification settings - Fork 0
/
translate.R
167 lines (112 loc) · 5.83 KB
/
translate.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
library(xml2)
library(readr)
options(warn=1)
# setwd("~/prosjekter/R/MMPI")
#xml_replace (
#
#(xml_find_all(xmlmail,".//w:t[text()='" . fra . "']"))[[1]],
#
#read_xml(paste0(docspec,"<w:t>" . til . "</w:t>","</document>"))
#
#)
### main doc
.translate_doc <- function (.doc, .dict, .xmlfile="document.xml", .docspec=c('<document space="preserve" xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">', '</document>')) {
library(xml2)
library(readr)
library(tools)
print(1)
target <- tempdir()
if ( is.character(.dict) && file.exists(.dict) ) {
dict <- read_csv(.dict,col_names=F)
} else if (is.vector(.dict)) {
.dict = tibble(matrix(.dict,ncol=2))
} else {
stop ("dict must be cvs or vector")
}
if ( is.character(.doc) && file.exists(.doc) ) {
unzip (.doc, exdir=target)
} else {
stop ("doc must be path to document")
}
destination <- paste0(file_path_sans_ext(.doc),"-EN.docx")
sapply(.xmlfile,function(.xmlfile){
body <- read_xml(paste0(target,"/word/",.xmlfile))
tabs <- xml_find_all(body,'.//w:ind[contains(@w:right,1974)]')
if (length(tabs)>0) {
fra <- tabs[[1]]
til <- xml_child(read_xml(paste0(.docspec[1],'<w:ind w:left="6514" w:right="1372" w:firstLine="0"/>',.docspec[2])))
xml_replace(fra, til)
}
tabs <- xml_find_all(body,'.//w:ind[contains(@w:right,2366)]')
if (length(tabs)>0) {
fra <- tabs[[1]]
til <- xml_child(read_xml(paste0(.docspec[1],'<w:ind w:left="6514" w:right="1372" w:firstLine="0"/>',.docspec[2])))
xml_replace(fra, til)
}
res <- apply(
data.frame(dict),
1,
function(X){
needle <- X[[1]]
haystack <- X[[2]]
if ( nchar(needle) > 1 && substring(needle,1,1) == "~" ) {
needle <- substring(needle,2)
fra <- xml_find_all(body,paste0(".//w:t[contains(text(),'", needle, "')]"))
haystack <- gsub(needle,haystack,xml_text(fra))
} else {
fra <- (xml_find_all(body,paste0(".//w:t[text()='", needle, "']")))
}
til <- xml_child(read_xml(paste0(.docspec[1],"<w:t>", haystack, "</w:t>",.docspec[2])))
for (f in fra) {
xml_replace(f, til)
}
bm <- xml_find_all(body,paste0('.//w:bookmarkStart[@w:name="',needle,'"]'))
if (length(bm)>0){
print(bm)
print(needle)
bm.id <- xml_attrs(bm)[[1]]["id"]
bm.name <- xml_attrs(bm)[[1]]["name"]
if ( needle == bm.name ) {
bm.til <- xml_child(read_xml(paste0(
.docspec[1],
'<w:bookmarkStart w:name="', haystack ,'" w:id="', bm.id ,'"/>',
.docspec[2])))
xml_replace(bm[[1]],bm.til)
}
}
}
)
write_xml(body,paste0(target,"/word/",.xmlfile))
})
currdir <- getwd()
if ( (images <- regexpr("MMPI|NEO-PI",.doc))[1] > 0 ) {
thefilesdir <- paste0("./media/", substr(.doc,images[1],images[1]+attr(images,"match.length")-1))
thefiles <- list.files(thefilesdir,
full.names=TRUE)
print(thefilesdir)
mediadir <- paste0(target,"/word/media/")
sapply(thefiles,function(X){
dest <- paste0(mediadir,basename(X))
print("Copy to mediadir")
print(X)
print(dest)
file.copy(X,
dest,overwrite=TRUE)
print("Copied to mediadir")
})
}
setwd(target)
zip(paste0(destination),files=".",flags="-r9X")
setwd(currdir)
print(target)
}
translate_doc <- function (.doc, .dict, .xmlfile="document.xml", .docspec=c('<document space="preserve" xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">', '</document>')) {
print(.doc)
sapply(.doc,function(X){
print(X)
.translate_doc(X,.dict,.xmlfile,.docspec)})
}
mmpidocs <- list.files("~/prosjekter/adopsjonsforum/rapporter/2020/01",pattern="MMPI.*docx",full.names=TRUE)
neopidocs <- list.files("~/prosjekter/adopsjonsforum/rapporter/2020/01",pattern="NEO-PI.*docx",full.names=TRUE)
translate_doc(neopidocs,"neo-pidict.csv",c("document.xml","header3.xml","footer1.xml"))
translate_doc(mmpidocs,"mmpidict.csv",c("document.xml","header3.xml","footer1.xml"))