Skip to content

Commit

Permalink
Add a file format validation activity
Browse files Browse the repository at this point in the history
  • Loading branch information
djjuhasz committed Nov 6, 2024
1 parent 2123d67 commit 8eeedb0
Show file tree
Hide file tree
Showing 9 changed files with 313 additions and 13 deletions.
5 changes: 5 additions & 0 deletions cmd/worker/workercmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"

"github.com/artefactual-sdps/temporal-activities/bagcreate"
"github.com/artefactual-sdps/temporal-activities/ffvalidate"
"github.com/go-logr/logr"
"go.artefactual.dev/tools/temporal"
temporalsdk_activity "go.temporal.io/sdk/activity"
Expand Down Expand Up @@ -58,6 +59,10 @@ func (m *Main) Run(ctx context.Context) error {
temporalsdk_workflow.RegisterOptions{Name: m.cfg.Temporal.WorkflowName},
)

w.RegisterActivityWithOptions(
ffvalidate.New(m.cfg.FileFormat).Execute,
temporalsdk_activity.RegisterOptions{Name: ffvalidate.Name},
)
w.RegisterActivityWithOptions(
bagcreate.New(m.cfg.Bagit).Execute,
temporalsdk_activity.RegisterOptions{Name: bagcreate.Name},
Expand Down
11 changes: 10 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/artefactual-sdps/preprocessing-demo
go 1.23.2

require (
github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4
github.com/artefactual-sdps/temporal-activities v0.0.0-20241105002718-bc4a9d85ce42
github.com/go-logr/logr v1.4.2
github.com/spf13/pflag v1.0.5
github.com/spf13/viper v1.18.2
Expand Down Expand Up @@ -32,7 +32,15 @@ require (
github.com/pborman/uuid v1.2.1 // indirect
github.com/pelletier/go-toml/v2 v2.1.0 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/richardlehane/characterize v1.0.0 // indirect
github.com/richardlehane/match v1.0.5 // indirect
github.com/richardlehane/mscfb v1.0.4 // indirect
github.com/richardlehane/msoleps v1.0.3 // indirect
github.com/richardlehane/siegfried v1.11.1 // indirect
github.com/richardlehane/xmldetect v1.0.2 // indirect
github.com/robfig/cron v1.2.0 // indirect
github.com/ross-spencer/spargo v0.4.1 // indirect
github.com/ross-spencer/wikiprov v0.2.0 // indirect
github.com/sagikazarmark/locafero v0.4.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
github.com/sourcegraph/conc v0.3.0 // indirect
Expand All @@ -44,6 +52,7 @@ require (
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/exp v0.0.0-20231219180239-dc181d75b848 // indirect
golang.org/x/image v0.17.0 // indirect
golang.org/x/net v0.28.0 // indirect
golang.org/x/sync v0.8.0 // indirect
golang.org/x/sys v0.24.0 // indirect
Expand Down
21 changes: 21 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4 h1:WF95IOkZRVSCST/26SAqPYsUrtUuJpavBht6lvdeKl0=
github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4/go.mod h1:FVh79rCGNlUU1QnioAU+lrSjLqrA1PJFYKIhWPsmyug=
github.com/artefactual-sdps/temporal-activities v0.0.0-20241105002718-bc4a9d85ce42 h1:0Ymucvkou8aiZkQrVgZsTODGeGoQHVNV414IFOFRxX0=
github.com/artefactual-sdps/temporal-activities v0.0.0-20241105002718-bc4a9d85ce42/go.mod h1:hV4rUdJ8FYqlEkvT0OqWuHj8DNc1v9j5/Dv8VsrYXLU=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
Expand Down Expand Up @@ -80,10 +82,27 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/richardlehane/characterize v1.0.0 h1:2MMnKFqYd+hsKpQrPkc5JjbcIzVBIfvSoaMd563GOj0=
github.com/richardlehane/characterize v1.0.0/go.mod h1:9mhxzxtWkXoLQpkg+gt7ioK6//+3hrsv3VHkbj8kbuQ=
github.com/richardlehane/match v1.0.5 h1:+tuXp28xaIPsvKbhHyuivce9qMEfE8nP9d0wSxJef9o=
github.com/richardlehane/match v1.0.5/go.mod h1:Vz0T28BYeZrU9h54iHnyjDfhVlKvn7XB7smnVripJME=
github.com/richardlehane/mscfb v1.0.4 h1:WULscsljNPConisD5hR0+OyZjwK46Pfyr6mPu5ZawpM=
github.com/richardlehane/mscfb v1.0.4/go.mod h1:YzVpcZg9czvAuhk9T+a3avCpcFPMUWm7gK3DypaEsUk=
github.com/richardlehane/msoleps v1.0.1/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg=
github.com/richardlehane/msoleps v1.0.3 h1:aznSZzrwYRl3rLKRT3gUk9am7T/mLNSnJINvN0AQoVM=
github.com/richardlehane/msoleps v1.0.3/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg=
github.com/richardlehane/siegfried v1.11.1 h1:Ke0IKqeQE3Y6ptSWFwaeX4kVI/OEokWgXOKR49otMbw=
github.com/richardlehane/siegfried v1.11.1/go.mod h1:63eYvfRqpjzB/TKa8uNAeQsl/+h9JuIezbt7Rc9sqMo=
github.com/richardlehane/xmldetect v1.0.2 h1:/3ooFuJwtgpMMe14/7m8a/JIvECMx6SpsPcDRiNyR8o=
github.com/richardlehane/xmldetect v1.0.2/go.mod h1:Zp1lhTLRJa2p2QKA4jOruVQYc0NFQDO0YUz3k/k6JcE=
github.com/robfig/cron v1.2.0 h1:ZjScXvvxeQ63Dbyxy76Fj3AT3Ut0aKsyd2/tl3DTMuQ=
github.com/robfig/cron v1.2.0/go.mod h1:JGuDeoQd7Z6yL4zQhZ3OPEVHB7fL6Ka6skscFHfmt2k=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/ross-spencer/spargo v0.4.1 h1:+a570tI+az8j/s0+06mntNqwsJ7DXuq7PESUIXlKie8=
github.com/ross-spencer/spargo v0.4.1/go.mod h1:szEHC5cu+q6g0RD7otV7xvYGb+fQVYj1/SkiVTr4IC4=
github.com/ross-spencer/wikiprov v0.2.0 h1:I0RAdlgVW5z2sMk/vAPS5cXTbIsMNAnYEIAS+CZ4urE=
github.com/ross-spencer/wikiprov v0.2.0/go.mod h1:a7GkJgwKK3D2DlrGindbHR2VciEbHHCl6fFAKaiRhVI=
github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6keLGt6kNQ=
github.com/sagikazarmark/locafero v0.4.0/go.mod h1:Pe1W6UlPYUk/+wc/6KFhbORCfqzgYEpgQ3O5fPuL3H4=
github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE=
Expand Down Expand Up @@ -141,6 +160,8 @@ golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPh
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20231219180239-dc181d75b848 h1:+iq7lrkxmFNBM7xx+Rae2W6uyPfhPeDWD+n+JgppptE=
golang.org/x/exp v0.0.0-20231219180239-dc181d75b848/go.mod h1:iRJReGqOEeBhDZGkGbynYwcHlctCvnjTYIamk7uXpHI=
golang.org/x/image v0.17.0 h1:nTRVVdajgB8zCMZVsViyzhnMKPwYeroEERRC64JuLco=
golang.org/x/image v0.17.0/go.mod h1:4yyo5vMFQjVjUcVk4jEQcU9MGy/rulF5WvUILseCM2E=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
Expand Down
38 changes: 38 additions & 0 deletions hack/kube/overlays/dev/preprocessing-secret.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,41 @@ stringData:
[bagit]
checksumAlgorithm = "sha512"
[fileformat]
allowlistPath = "/home/preprocessing/.config/allowed_file_formats.csv"
allowed_file_formats.csv: |
Format name,PRONOM PUID
text,x-fmt/16
text,x-fmt/21
text,x-fmt/22
text,x-fmt/62
text,x-fmt/111
text,x-fmt/282
text,x-fmt/283
PDF/A,fmt/95
PDF/A,fmt/354
PDF/A,fmt/476
PDF/A,fmt/477
PDF/A,fmt/478
CSV,x-fmt/18
SIARD,fmt/161
SIARD,fmt/1196
SIARD,fmt/1777
TIFF,fmt/353
JPEG 2000,x-fmt/392
WAVE,fmt/1
WAVE,fmt/2
WAVE,fmt/6
WAVE,fmt/141
FFV1,fmt/569
MPEG-4,fmt/199
XML/XSD,fmt/101
XML/XSD,x-fmt/280
INTERLIS,fmt/1014
INTERLIS,fmt/1012
INTERLIS,fmt/654
INTERLIS,fmt/1013
INTERLIS,fmt/1011
INTERLIS,fmt/653
38 changes: 38 additions & 0 deletions hack/kube/overlays/enduro/preprocessing-secret.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,41 @@ stringData:
[bagit]
checksumAlgorithm = "sha512"
[fileformat]
allowlistPath = "/home/preprocessing/.config/allowed_file_formats.csv"
allowed_file_formats.csv: |
Format name,PRONOM PUID
text,x-fmt/16
text,x-fmt/21
text,x-fmt/22
text,x-fmt/62
text,x-fmt/111
text,x-fmt/282
text,x-fmt/283
PDF/A,fmt/95
PDF/A,fmt/354
PDF/A,fmt/476
PDF/A,fmt/477
PDF/A,fmt/478
CSV,x-fmt/18
SIARD,fmt/161
SIARD,fmt/1196
SIARD,fmt/1777
TIFF,fmt/353
JPEG 2000,x-fmt/392
WAVE,fmt/1
WAVE,fmt/2
WAVE,fmt/6
WAVE,fmt/141
FFV1,fmt/569
MPEG-4,fmt/199
XML/XSD,fmt/101
XML/XSD,x-fmt/280
INTERLIS,fmt/1014
INTERLIS,fmt/1012
INTERLIS,fmt/654
INTERLIS,fmt/1013
INTERLIS,fmt/1011
INTERLIS,fmt/653
5 changes: 4 additions & 1 deletion internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"strings"

"github.com/artefactual-sdps/temporal-activities/bagcreate"
"github.com/artefactual-sdps/temporal-activities/ffvalidate"
"github.com/spf13/viper"
)

Expand All @@ -33,7 +34,9 @@ type Configuration struct {

Temporal Temporal
Worker WorkerConfig
Bagit bagcreate.Config

Bagit bagcreate.Config
FileFormat ffvalidate.Config
}

type Temporal struct {
Expand Down
59 changes: 52 additions & 7 deletions internal/workflow/preprocessing.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ package workflow
import (
"fmt"
"path/filepath"
"strings"
"time"

"github.com/artefactual-sdps/temporal-activities/bagcreate"
"github.com/artefactual-sdps/temporal-activities/ffvalidate"
"go.artefactual.dev/tools/temporal"
temporalsdk_temporal "go.temporal.io/sdk/temporal"
temporalsdk_workflow "go.temporal.io/sdk/workflow"
Expand Down Expand Up @@ -39,6 +41,24 @@ func (r *PreprocessingWorkflowResult) newEvent(ctx temporalsdk_workflow.Context,
return ev
}

// validationError marks the workflow result as a content error and closes
// the given event as a validation failure.
//
// It sets r.Outcome to OutcomeContentError, then completes ev with
// enums.EventOutcomeValidationFailure, the time reported by
// temporalsdk_workflow.Now(ctx), and a message built from msg followed by
// the entries of failures joined with newlines. The receiver is returned.
func (r *PreprocessingWorkflowResult) validationError(
	ctx temporalsdk_workflow.Context,
	ev *eventlog.Event,
	msg string,
	failures []string,
) *PreprocessingWorkflowResult {
	r.Outcome = OutcomeContentError

	completedAt := temporalsdk_workflow.Now(ctx)
	// One failure per line, listed underneath the summary message.
	details := strings.Join(failures, "\n")
	ev.Complete(completedAt, enums.EventOutcomeValidationFailure, "Content error: %s:\n%s", msg, details)

	return r
}

func (r *PreprocessingWorkflowResult) systemError(
ctx temporalsdk_workflow.Context,
err error,
Expand All @@ -48,7 +68,6 @@ func (r *PreprocessingWorkflowResult) systemError(
logger := temporalsdk_workflow.GetLogger(ctx)
logger.Error("System error", "message", err.Error())

// Complete last preservation task event.
ev.Complete(
temporalsdk_workflow.Now(ctx),
enums.EventOutcomeSystemFailure,
Expand All @@ -74,10 +93,8 @@ func (w *PreprocessingWorkflow) Execute(
ctx temporalsdk_workflow.Context,
params *PreprocessingWorkflowParams,
) (*PreprocessingWorkflowResult, error) {
var (
result PreprocessingWorkflowResult
e error
)
var e error
result := &PreprocessingWorkflowResult{}

logger := temporalsdk_workflow.GetLogger(ctx)
logger.Debug("PreprocessingWorkflow workflow running!", "params", params)
Expand All @@ -88,8 +105,36 @@ func (w *PreprocessingWorkflow) Execute(
}
result.RelativePath = params.RelativePath

// Validate file formats.
ev := result.newEvent(ctx, "Validate SIP file formats")
var validateFileFormat ffvalidate.Result
e = temporalsdk_workflow.ExecuteActivity(
withLocalActOpts(ctx),
ffvalidate.Name,
&ffvalidate.Params{Path: filepath.Join(w.sharedPath, params.RelativePath)},
).Get(ctx, &validateFileFormat)
if e != nil {
result.systemError(ctx, e, ev, "file format validation has failed")
return result, nil
}
if validateFileFormat.Failures != nil {
result.validationError(
ctx,
ev,
"file format validation has failed. One or more file formats are not allowed",
validateFileFormat.Failures,
)
} else {
ev.Succeed(temporalsdk_workflow.Now(ctx), "No disallowed file formats found")
}

// Stop here if there are validation errors.
if result.Outcome == OutcomeContentError {
return result, nil
}

// Bag the SIP for Enduro processing.
ev := result.newEvent(ctx, "Bag SIP")
ev = result.newEvent(ctx, "Bag SIP")
var createBag bagcreate.Result
e = temporalsdk_workflow.ExecuteActivity(
withLocalActOpts(ctx),
Expand All @@ -103,7 +148,7 @@ func (w *PreprocessingWorkflow) Execute(
}
ev.Succeed(temporalsdk_workflow.Now(ctx), "SIP has been bagged")

return &result, e
return result, nil
}

func withLocalActOpts(ctx temporalsdk_workflow.Context) temporalsdk_workflow.Context {
Expand Down
Loading

0 comments on commit 8eeedb0

Please sign in to comment.