From 03995be0add9b99332f5ff123e330bf202abf20d Mon Sep 17 00:00:00 2001 From: David Juhasz Date: Thu, 7 Nov 2024 15:42:11 -0800 Subject: [PATCH] Add an activity to merge AIS metadata files Fixes #77. Concatenate the Arelda metadata file from the original package and the METS file created by Archivematica into a single "AIS" metadata file. [skip codecov] --- go.mod | 2 + go.sum | 5 + internal/ais/combinemd.go | 135 ++++++++++++++++++++++++++ internal/ais/combinemd_test.go | 151 +++++++++++++++++++++++++++++ internal/ais/workflow.go | 22 ++++- internal/ais/workflow_test.go | 172 +++++++++++++++++++++++++++++++++ 6 files changed, 486 insertions(+), 1 deletion(-) create mode 100644 internal/ais/combinemd.go create mode 100644 internal/ais/combinemd_test.go create mode 100644 internal/ais/workflow_test.go diff --git a/go.mod b/go.mod index 40d8a9c3..5ef7402c 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/artefactual-sdps/preprocessing-sfa go 1.23.2 require ( + github.com/antchfx/xmlquery v1.4.2 github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4 github.com/beevik/etree v1.4.0 github.com/deckarep/golang-set/v2 v2.6.0 @@ -22,6 +23,7 @@ require ( ) require ( + github.com/antchfx/xpath v1.3.2 // indirect github.com/aws/aws-sdk-go v1.55.5 // indirect github.com/aws/aws-sdk-go-v2 v1.30.3 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 // indirect diff --git a/go.sum b/go.sum index 7ab41d85..7f102ac9 100644 --- a/go.sum +++ b/go.sum @@ -13,6 +13,10 @@ cloud.google.com/go/iam v1.1.13/go.mod h1:K8mY0uSXwEXS30KrnVb+j54LB/ntfZu1dr+4zF cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs= cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/antchfx/xmlquery v1.4.2 h1:MZKd9+wblwxfQ1zd1AdrTsqVaMjMCwow3IqkCSe00KA= +github.com/antchfx/xmlquery v1.4.2/go.mod h1:QXhvf5ldTuGqhd1SHNvvtlhhdQLks4dD0awIVhXIDTA= +github.com/antchfx/xpath v1.3.2 h1:LNjzlsSjinu3bQpw9hWMY9ocB80oLOWuQqFvO6xt51U= +github.com/antchfx/xpath v1.3.2/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4 h1:WF95IOkZRVSCST/26SAqPYsUrtUuJpavBht6lvdeKl0= github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4/go.mod h1:FVh79rCGNlUU1QnioAU+lrSjLqrA1PJFYKIhWPsmyug= github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU= @@ -307,6 +311,7 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= diff --git a/internal/ais/combinemd.go b/internal/ais/combinemd.go new file mode 100644 index 00000000..a63a9e2f --- /dev/null +++ b/internal/ais/combinemd.go @@ -0,0 +1,135 @@ +package ais + +import ( + "context" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "github.com/antchfx/xmlquery" + + "github.com/artefactual-sdps/preprocessing-sfa/internal/fsutil" +) + +const CombineMDActivityName = "combine-metadata-files" + +type ( + CombineMDActivity struct{} + CombineMDActivityParams struct { + AreldaPath string + METSPath string + LocalDir string + } + CombineMDActivityResult struct { + Path string + } +) + +func NewCombineMDActivity() *CombineMDActivity { + return &CombineMDActivity{} +} + +func (a *CombineMDActivity) Execute( + ctx context.Context, + params CombineMDActivityParams, +) (*CombineMDActivityResult, error) { + if !fsutil.FileExists(params.AreldaPath) { + return nil, fmt.Errorf("missing Arelda file: %s", params.AreldaPath) + } + if !fsutil.FileExists(params.METSPath) { + return nil, fmt.Errorf("missing METS file: %s", params.METSPath) + } + + name, err := aisFilename(params.AreldaPath) + if err != nil { + return nil, fmt.Errorf("name AIS file: %v", err) + } + + dest := filepath.Join(params.LocalDir, name) + + // Combine metadata files into AIS file. + w, err := os.Create(dest) // #nosec G304 -- generated path. + if err != nil { + return nil, fmt.Errorf("create AIS file: %v", err) + } + defer w.Close() + + if err := w.Chmod(os.FileMode(0o644)); err != nil { + return nil, fmt.Errorf("set AIS file permissions: %v", err) + } + + if err = concat(w, filepath.Join(params.AreldaPath), filepath.Join(params.METSPath)); err != nil { + return nil, fmt.Errorf("concat: %v", err) + } + + // Delete original metadata files. + if err = removePaths(params.AreldaPath, params.METSPath); err != nil { + return nil, fmt.Errorf("removePaths: %v", err) + } + + return &CombineMDActivityResult{Path: dest}, nil +} + +func aisFilename(mdpath string) (string, error) { + id, err := parseAccessionID(mdpath) + if err != nil { + return "", fmt.Errorf("get accession number: %v", err) + } + + id = strings.ReplaceAll(id, "/", "_") + + return fmt.Sprintf("AIS_%s", id), nil +} + +func parseAccessionID(path string) (string, error) { + f, err := os.Open(path) // #nosec G304 -- trusted path. + if err != nil { + return "", fmt.Errorf("open metadata file: %v", err) + } + defer f.Close() + + sp, err := xmlquery.CreateStreamParser(f, "//paket/ablieferung/ablieferungsnummer") + if err != nil { + return "", fmt.Errorf("create XML parser: %v", err) + } + + n, err := sp.Read() + if err == io.EOF { + return "", fmt.Errorf("can't find ablieferungsnummer in %q", filepath.Base(path)) + } + if err != nil { + return "", fmt.Errorf("read XML stream: %v", err) + } + return n.InnerText(), nil +} + +func concat(w io.Writer, paths ...string) error { + for i := range paths { + r, err := os.Open(paths[i]) // #nosec G304 -- trusted path. + if err != nil { + return fmt.Errorf("read: %v", err) + } + defer r.Close() + + if _, err := io.Copy(w, r); err != nil { + return fmt.Errorf("copy: %v", err) + } + _ = r.Close() + } + + return nil +} + +func removePaths(paths ...string) error { + var err error + for i := range paths { + if e := os.Remove(paths[i]); e != nil { + err = errors.Join(err, fmt.Errorf("remove: %v", e)) + } + } + + return err +} diff --git a/internal/ais/combinemd_test.go b/internal/ais/combinemd_test.go new file mode 100644 index 00000000..83e08478 --- /dev/null +++ b/internal/ais/combinemd_test.go @@ -0,0 +1,151 @@ +package ais_test + +import ( + "fmt" + "path/filepath" + "testing" + + temporalsdk_activity "go.temporal.io/sdk/activity" + temporalsdk_testsuite "go.temporal.io/sdk/testsuite" + "gotest.tools/v3/assert" + "gotest.tools/v3/fs" + + "github.com/artefactual-sdps/preprocessing-sfa/internal/ais" +) + +const ( + arelda = ` + + + + FILES + Bundesverwaltung (Bern) + 1000/893_3251903 + + +` + + mets = ` + + +` +) + +type test struct { + dir string + + name string + params ais.CombineMDActivityParams + want ais.CombineMDActivityResult + wantErr string + wantManifest fs.Manifest +} + +type testFunc func(string) test + +func testDir(t *testing.T) *fs.Dir { + return fs.NewDir(t, "ppsfa", + fs.WithFile("arelda.xml", arelda), + fs.WithFile("mets.xml", mets), + ) +} + +func TestExecute(t *testing.T) { + t.Parallel() + + for _, tf := range []testFunc{ + func(dir string) test { + return test{ + dir: dir, + name: "Returns the combined metadata", + params: ais.CombineMDActivityParams{ + AreldaPath: filepath.Join(dir, "arelda.xml"), + METSPath: filepath.Join(dir, "mets.xml"), + LocalDir: dir, + }, + want: ais.CombineMDActivityResult{ + Path: filepath.Join(dir, "AIS_1000_893_3251903"), + }, + wantManifest: fs.Expected(t, + fs.WithFile("AIS_1000_893_3251903", arelda+mets, fs.WithMode(0o644)), + ), + } + }, + func(dir string) test { + return test{ + dir: dir, + name: "Errors if the Arelda file doesn't exist", + params: ais.CombineMDActivityParams{ + AreldaPath: filepath.Join(dir, "missing.xml"), + LocalDir: dir, + }, + wantErr: fmt.Sprintf( + "activity error (type: combine-metadata-files, scheduledEventID: 0, startedEventID: 0, identity: ): missing Arelda file: %s/missing.xml", + dir, + ), + } + }, + func(dir string) test { + return test{ + dir: dir, + name: "Errors if the METS file doesn't exist", + params: ais.CombineMDActivityParams{ + AreldaPath: filepath.Join(dir, "arelda.xml"), + METSPath: filepath.Join(dir, "missing.xml"), + LocalDir: dir, + }, + wantErr: fmt.Sprintf( + "activity error (type: combine-metadata-files, scheduledEventID: 0, startedEventID: 0, identity: ): missing METS file: %s/missing.xml", + dir, + ), + } + }, + func(dir string) test { + return test{ + dir: dir, + name: "Errors when the Arelda file is invalid", + params: ais.CombineMDActivityParams{ + AreldaPath: filepath.Join(dir, "mets.xml"), + METSPath: filepath.Join(dir, "mets.xml"), + LocalDir: dir, + }, + wantErr: "activity error (type: combine-metadata-files, scheduledEventID: 0, startedEventID: 0, identity: ): name AIS file: get accession number: can't find ablieferungsnummer in \"mets.xml\"", + } + }, + } { + tt := tf(testDir(t).Path()) + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + ts := &temporalsdk_testsuite.WorkflowTestSuite{} + env := ts.NewTestActivityEnvironment() + env.RegisterActivityWithOptions( + ais.NewCombineMDActivity().Execute, + temporalsdk_activity.RegisterOptions{Name: ais.CombineMDActivityName}, + ) + + future, err := env.ExecuteActivity(ais.CombineMDActivityName, tt.params) + if tt.wantErr != "" { + if err == nil { + t.Errorf("error is nil, expecting: %q", tt.wantErr) + } else { + assert.ErrorContains(t, err, tt.wantErr) + } + + return + } + assert.NilError(t, err) + + var got ais.CombineMDActivityResult + future.Get(&got) + assert.DeepEqual(t, got, tt.want) + assert.Assert(t, fs.Equal(tt.dir, tt.wantManifest)) + }) + } +} diff --git a/internal/ais/workflow.go b/internal/ais/workflow.go index ebd1bc83..cb8e1a27 100644 --- a/internal/ais/workflow.go +++ b/internal/ais/workflow.go @@ -174,6 +174,8 @@ func (w *Workflow) SessionHandler(ctx temporalsdk_workflow.Context, aipUUID, aip return "", errors.New("UpdatedAreldaMetadata.xml and metadata.xml files not found in METS") } + metadataPath := filepath.Join(localDir, filepath.Base(metadataRelPath)) + var fetchMetadataResult FetchActivityResult e = temporalsdk_workflow.ExecuteActivity( withActivityOptsForLongLivedRequest(ctx), @@ -181,13 +183,27 @@ func (w *Workflow) SessionHandler(ctx temporalsdk_workflow.Context, aipUUID, aip &FetchActivityParams{ AIPUUID: aipUUID, RelativePath: fmt.Sprintf("%s/data/%s", aipDirName, metadataRelPath), - Destination: filepath.Join(localDir, filepath.Base(metadataRelPath)), + Destination: metadataPath, }, ).Get(ctx, &fetchMetadataResult) if e != nil { return "", e } + var combineMDResult CombineMDActivityResult + e = temporalsdk_workflow.ExecuteActivity( + withFilesystemActivityOpts(ctx), + CombineMDActivityName, + &CombineMDActivityParams{ + AreldaPath: metadataPath, + METSPath: metsPath, + LocalDir: localDir, + }, + ).Get(ctx, &combineMDResult) + if e != nil { + return "", e + } + var zipResult archivezip.Result e = temporalsdk_workflow.ExecuteActivity( withFilesystemActivityOpts(ctx), @@ -231,6 +247,10 @@ func RegisterWorkflow(ctx context.Context, tw temporalsdk_worker.Worker, config NewParseActivity().Execute, temporalsdk_activity.RegisterOptions{Name: ParseActivityName}, ) + tw.RegisterActivityWithOptions( + NewCombineMDActivity().Execute, + temporalsdk_activity.RegisterOptions{Name: CombineMDActivityName}, + ) tw.RegisterActivityWithOptions( archivezip.New().Execute, temporalsdk_activity.RegisterOptions{Name: archivezip.Name}, diff --git a/internal/ais/workflow_test.go b/internal/ais/workflow_test.go new file mode 100644 index 00000000..bbbdd584 --- /dev/null +++ b/internal/ais/workflow_test.go @@ -0,0 +1,172 @@ +package ais_test + +import ( + "fmt" + "path/filepath" + "testing" + + "github.com/artefactual-sdps/temporal-activities/archivezip" + "github.com/artefactual-sdps/temporal-activities/bucketupload" + "github.com/artefactual-sdps/temporal-activities/removepaths" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/suite" + temporalsdk_activity "go.temporal.io/sdk/activity" + temporalsdk_testsuite "go.temporal.io/sdk/testsuite" + temporalsdk_worker "go.temporal.io/sdk/worker" + + "github.com/artefactual-sdps/preprocessing-sfa/internal/ais" +) + +type TestSuite struct { + suite.Suite + temporalsdk_testsuite.WorkflowTestSuite + + env *temporalsdk_testsuite.TestWorkflowEnvironment + workflow *ais.Workflow + testDir string +} + +func (s *TestSuite) setup(cfg *ais.Config) { + s.env = s.NewTestWorkflowEnvironment() + s.env.SetWorkerOptions(temporalsdk_worker.Options{EnableSessionWorker: true}) + s.testDir = s.T().TempDir() + cfg.WorkingDir = s.testDir + + s.registerActivities() + + s.workflow = ais.NewWorkflow(*cfg, nil) +} + +func (s *TestSuite) registerActivities() { + s.env.RegisterActivityWithOptions( + ais.NewFetchActivity(nil).Execute, + temporalsdk_activity.RegisterOptions{Name: ais.FetchActivityName}, + ) + s.env.RegisterActivityWithOptions( + ais.NewParseActivity().Execute, + temporalsdk_activity.RegisterOptions{Name: ais.ParseActivityName}, + ) + s.env.RegisterActivityWithOptions( + ais.NewCombineMDActivity().Execute, + temporalsdk_activity.RegisterOptions{Name: ais.CombineMDActivityName}, + ) + s.env.RegisterActivityWithOptions( + archivezip.New().Execute, + temporalsdk_activity.RegisterOptions{Name: archivezip.Name}, + ) + s.env.RegisterActivityWithOptions( + bucketupload.New(nil).Execute, + temporalsdk_activity.RegisterOptions{Name: bucketupload.Name}, + ) + s.env.RegisterActivityWithOptions( + removepaths.New().Execute, + temporalsdk_activity.RegisterOptions{Name: removepaths.Name}, + ) +} + +func TestWorkflow(t *testing.T) { + suite.Run(t, new(TestSuite)) +} + +func (s *TestSuite) TestWorkflowSuccess() { + aipUUID := "9390594f-84c2-457d-bd6a-618f21f7c954" + + s.setup(&ais.Config{}) + s.mockActivitiesSuccess(aipUUID) + + s.env.ExecuteWorkflow( + s.workflow.Execute, + &ais.WorkflowParams{AIPUUID: aipUUID}, + ) + + s.True(s.env.IsWorkflowCompleted()) + s.env.AssertExpectations(s.T()) + + var result ais.WorkflowResult + err := s.env.GetWorkflowResult(&result) + s.NoError(err) + + s.Equal(result, ais.WorkflowResult{Key: "test-" + aipUUID + ".zip"}) +} + +func (s *TestSuite) mockActivitiesSuccess(aipUUID string) { + aipName := "test-" + aipUUID + aipPath := "9390/594f/84c2/457d/bd6a/618f/21f7/c954/test-9390594f-84c2-457d-bd6a-618f21f7c954.zip" + localDir := filepath.Join(s.testDir, fmt.Sprintf("search-md_%s", aipName)) + metsName := fmt.Sprintf("METS.%s.xml", aipUUID) + metsPath := filepath.Join(localDir, metsName) + + // Mock activities. + s.env.OnActivity( + ais.GetAIPPathActivity, + mock.AnythingOfType("*context.valueCtx"), + &ais.GetAIPPathActivityParams{AIPUUID: aipUUID}, + ).Return( + &ais.GetAIPPathActivityResult{Path: aipPath}, nil, + ) + + // Mock session activities. + sessionCtx := mock.AnythingOfType("*context.timerCtx") + s.env.OnActivity( + ais.FetchActivityName, + sessionCtx, + &ais.FetchActivityParams{ + AIPUUID: aipUUID, + RelativePath: fmt.Sprintf("%s/data/%s", aipName, metsName), + Destination: metsPath, + }, + ).Return( + &ais.FetchActivityResult{}, nil, + ) + + mdpath := "objects/header/metadata.xml" + s.env.OnActivity( + ais.ParseActivityName, + sessionCtx, + &ais.ParseActivityParams{METSPath: metsPath}, + ).Return( + &ais.ParseActivityResult{MetadataRelPath: mdpath}, nil, + ) + + areldaPath := filepath.Join(localDir, "metadata.xml") + s.env.OnActivity( + ais.FetchActivityName, + sessionCtx, + &ais.FetchActivityParams{ + AIPUUID: aipUUID, + RelativePath: fmt.Sprintf("%s/data/%s", aipName, mdpath), + Destination: areldaPath, + }, + ).Return( + &ais.FetchActivityResult{}, nil, + ) + + s.env.OnActivity( + ais.CombineMDActivityName, + sessionCtx, + ais.CombineMDActivityParams{ + AreldaPath: areldaPath, + METSPath: metsPath, + LocalDir: localDir, + }, + ).Return( + &ais.CombineMDActivityResult{Path: filepath.Join(localDir, "AIS_1000_893_3251903")}, nil, + ) + + zipPath := filepath.Join(s.testDir, aipName+".zip") + s.env.OnActivity( + archivezip.Name, + sessionCtx, + &archivezip.Params{SourceDir: localDir}, + ).Return( + &archivezip.Result{Path: zipPath}, nil, + ) + + s.env.OnActivity( + bucketupload.Name, + sessionCtx, + &bucketupload.Params{Path: zipPath}, + ).Return( + &bucketupload.Result{Key: aipName + ".zip"}, nil, + ) +}