forked from tleyden/open-ocr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert-pdf.go
77 lines (63 loc) · 1.62 KB
/
convert-pdf.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
package ocrworker
/* Use this module if you want to call tesseract over
pdfsandwich with an image as input file.
Useful with big documents.
Use cases:
engine: tesseract with file_type: pdf and preprocessor: convert-pdf
engine: sandwich with file_type: [tif, png, jpg] and preprocessor: convert-pdf
*/
import (
"fmt"
"github.com/couchbaselabs/logg"
"io/ioutil"
"os"
"os/exec"
)
// ConvertPdf is the "convert-pdf" preprocessor. It carries no state; its
// preprocess method rewrites an OCR request's PDF payload as a TIFF image
// by running Ghostscript.
type ConvertPdf struct{}
// preprocess converts the PDF held in ocrRequest.ImgBytes into a TIFF by
// shelling out to Ghostscript (`gs`, device tiffg4) and then replaces
// ocrRequest.ImgBytes with the bytes of the generated file.
//
// The data travels through two temporary files (input .pdf, output .tif);
// removal of both is deferred until the function returns.
//
// An error is returned when a temp file name cannot be created, the input
// cannot be written, gs fails, or the output cannot be read. In every error
// case ocrRequest.ImgBytes is left unmodified.
func (c ConvertPdf) preprocess(ocrRequest *OcrRequest) error {
	// Check the error before decorating the name, so a failed call never
	// yields a bogus ".pdf" path.
	tmpFileNameInput, err := createTempFileName()
	if err != nil {
		return err
	}
	tmpFileNameInput = fmt.Sprintf("%s.pdf", tmpFileNameInput)
	defer os.Remove(tmpFileNameInput)

	tmpFileNameOutput, err := createTempFileName()
	if err != nil {
		return err
	}
	tmpFileNameOutput = fmt.Sprintf("%s.tif", tmpFileNameOutput)
	defer os.Remove(tmpFileNameOutput)

	// Ghostscript reads from a file path, so persist the request payload first.
	err = saveBytesToFileName(ocrRequest.ImgBytes, tmpFileNameInput)
	if err != nil {
		return err
	}

	logg.LogTo(
		"PREPROCESSOR_WORKER",
		"Convert PDF %s -> %s",
		tmpFileNameInput,
		tmpFileNameOutput,
	)

	// -dQUIET/-dNOPAUSE/-dBATCH make gs run non-interactively and exit when
	// done; the input file must come last, after all switches.
	gsArgs := []string{
		"-dQUIET",
		"-dNOPAUSE",
		"-dBATCH",
		"-sOutputFile=" + tmpFileNameOutput,
		"-sDEVICE=tiffg4",
		tmpFileNameInput,
	}
	// Despite the "output" label, this line logs the gs argument list.
	logg.LogTo("PREPROCESSOR_WORKER", "output: %s", gsArgs)

	out, err := exec.Command("gs", gsArgs...).CombinedOutput()
	if err != nil {
		// Report the failure to the caller instead of logging at fatal level:
		// one unconvertible document must not abort the worker, and returning
		// lets the deferred temp-file cleanup run.
		return fmt.Errorf("running gs: %w; output: %s", err, out)
	}
	logg.LogTo("PREPROCESSOR_WORKER", "output: %v", string(out))

	// read bytes from output file
	resultBytes, err := ioutil.ReadFile(tmpFileNameOutput)
	if err != nil {
		return err
	}

	ocrRequest.ImgBytes = resultBytes
	return nil
}