Skip to content

Commit

Permalink
WIP: Add activity to merge AIS metadata files
Browse files Browse the repository at this point in the history
Fixes #77.
  • Loading branch information
djjuhasz committed Nov 8, 2024
1 parent 7dd47ca commit e13c1b7
Show file tree
Hide file tree
Showing 6 changed files with 1,014 additions and 27 deletions.
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/artefactual-sdps/preprocessing-sfa
go 1.23.2

require (
github.com/antchfx/xmlquery v1.4.2
github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4
github.com/beevik/etree v1.4.0
github.com/deckarep/golang-set/v2 v2.6.0
Expand All @@ -23,6 +24,7 @@ require (
)

require (
github.com/antchfx/xpath v1.3.2 // indirect
github.com/aws/aws-sdk-go v1.55.5 // indirect
github.com/aws/aws-sdk-go-v2 v1.30.3 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 // indirect
Expand Down
5 changes: 5 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ cloud.google.com/go/iam v1.1.13/go.mod h1:K8mY0uSXwEXS30KrnVb+j54LB/ntfZu1dr+4zF
cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs=
cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/antchfx/xmlquery v1.4.2 h1:MZKd9+wblwxfQ1zd1AdrTsqVaMjMCwow3IqkCSe00KA=
github.com/antchfx/xmlquery v1.4.2/go.mod h1:QXhvf5ldTuGqhd1SHNvvtlhhdQLks4dD0awIVhXIDTA=
github.com/antchfx/xpath v1.3.2 h1:LNjzlsSjinu3bQpw9hWMY9ocB80oLOWuQqFvO6xt51U=
github.com/antchfx/xpath v1.3.2/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4 h1:WF95IOkZRVSCST/26SAqPYsUrtUuJpavBht6lvdeKl0=
github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4/go.mod h1:FVh79rCGNlUU1QnioAU+lrSjLqrA1PJFYKIhWPsmyug=
github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU=
Expand Down Expand Up @@ -307,6 +311,7 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY=
Expand Down
120 changes: 120 additions & 0 deletions internal/ais/combinemd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package ais

import (
"context"
"fmt"
"io"
"os"
"path/filepath"
"strings"

"github.com/antchfx/xmlquery"

"github.com/artefactual-sdps/preprocessing-sfa/internal/fsutil"
)

// CombineMDActivityName is the name under which the CombineMDActivity is
// registered and by which it is executed.
const CombineMDActivityName = "combine-metadata-files"

// CombineMDActivity combines the Arelda and METS metadata files of a package
// into a single AIS metadata file.
type CombineMDActivity struct{}

// CombineMDActivityParams holds the input of CombineMDActivity.Execute.
type CombineMDActivityParams struct {
	// AreldaRelPath is the path of the Arelda metadata file, relative to
	// WorkingDir.
	AreldaRelPath string

	// METSRelPath is the path of the METS metadata file, relative to
	// WorkingDir.
	METSRelPath string

	// WorkingDir is the directory the relative paths are resolved against,
	// and the directory in which the combined file is written.
	WorkingDir string
}

// CombineMDActivityResult holds the output of CombineMDActivity.Execute.
type CombineMDActivityResult struct {
	// Path is the location of the combined AIS metadata file.
	Path string
}

// NewCombineMDActivity returns a pointer to a new CombineMDActivity.
func NewCombineMDActivity() *CombineMDActivity {
	return new(CombineMDActivity)
}

// Execute writes a combined AIS metadata file into params.WorkingDir. The
// file is named after the accession number read from the Arelda file (see
// aisFilename) and contains the Arelda file contents followed by the METS
// file contents.
//
// It returns an error if either input file is missing, if the AIS filename
// can't be determined from the Arelda file, or if the combined file can't be
// created or written. On success the result holds the path of the combined
// file. ctx is currently unused.
func (a *CombineMDActivity) Execute(ctx context.Context, params CombineMDActivityParams) (*CombineMDActivityResult, error) {
	areldaPath := filepath.Join(params.WorkingDir, params.AreldaRelPath)
	if !fsutil.FileExists(areldaPath) {
		return nil, fmt.Errorf("missing Arelda file: %s", areldaPath)
	}

	metsPath := filepath.Join(params.WorkingDir, params.METSRelPath)
	if !fsutil.FileExists(metsPath) {
		return nil, fmt.Errorf("missing METS file: %s", metsPath)
	}

	aisName, err := aisFilename(areldaPath)
	if err != nil {
		return nil, fmt.Errorf("name AIS file: %v", err)
	}

	dest := filepath.Join(params.WorkingDir, aisName)

	// Create (or truncate) the destination so that a re-run starts from an
	// empty file; fappend only opens files that already exist.
	f, err := os.Create(dest)
	if err != nil {
		return nil, fmt.Errorf("create AIS file: %v", err)
	}
	if err = f.Close(); err != nil {
		return nil, fmt.Errorf("create AIS file: %v", err)
	}

	// Reuse the paths validated above rather than rebuilding them.
	if err = fappend(dest, areldaPath); err != nil {
		return nil, fmt.Errorf("append Arelda metadata: %v", err)
	}

	if err = fappend(dest, metsPath); err != nil {
		return nil, fmt.Errorf("append METS metadata: %v", err)
	}

	return &CombineMDActivityResult{Path: dest}, nil
}

// aisFilename returns the name of the AIS file for the metadata file at
// mdpath: the prefix "AIS_" followed by the accession number read from the
// file, with every "/" replaced by "_".
func aisFilename(mdpath string) (string, error) {
	accession, err := parseAccessionID(mdpath)
	if err != nil {
		return "", fmt.Errorf("get accession number: %v", err)
	}

	return "AIS_" + strings.ReplaceAll(accession, "/", "_"), nil
}

// parseAccessionID returns the inner text of the first
// paket/ablieferung/ablieferungsnummer element read from the XML file at
// path.
//
// An io.EOF from the stream parser is reported as the element not being
// found; any other open, parser setup or read failure is returned as an
// error.
func parseAccessionID(path string) (string, error) {
	file, err := os.Open(path) // #nosec G304 -- trusted path
	if err != nil {
		return "", fmt.Errorf("open metadata file: %v", err)
	}
	defer file.Close()

	parser, err := xmlquery.CreateStreamParser(file, "//paket/ablieferung/ablieferungsnummer")
	if err != nil {
		return "", fmt.Errorf("create XML parser: %v", err)
	}

	node, err := parser.Read()
	switch {
	case err == io.EOF:
		return "", fmt.Errorf("can't find ablieferungsnummer in %q", filepath.Base(path))
	case err != nil:
		return "", fmt.Errorf("read XML stream: %v", err)
	}

	return node.InnerText(), nil
}

// fappend appends the contents of the file at src to the end of the file at
// dest. The dest file must already exist; it is opened write-only in append
// mode and is not created.
//
// It returns an error if either file can't be opened, if the copy fails, or
// if closing dest fails.
func fappend(dest, src string) (err error) {
	r, err := os.Open(src)
	if err != nil {
		return fmt.Errorf("open src: %v", err)
	}
	defer r.Close()

	w, err := os.OpenFile(dest, os.O_WRONLY|os.O_APPEND, os.FileMode(0o666))
	if err != nil {
		return fmt.Errorf("open dest: %v", err)
	}
	// Close the writer (not the reader a second time) and surface its error,
	// as a failed close on a written file can mean data was not persisted.
	defer func() {
		if cerr := w.Close(); cerr != nil && err == nil {
			err = fmt.Errorf("close dest: %v", cerr)
		}
	}()

	if _, err = io.Copy(w, r); err != nil {
		return fmt.Errorf("copy: %v", err)
	}

	return nil
}
123 changes: 123 additions & 0 deletions internal/ais/combinemd_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package ais_test

import (
"strings"
"testing"

temporalsdk_activity "go.temporal.io/sdk/activity"
temporalsdk_testsuite "go.temporal.io/sdk/testsuite"
"gotest.tools/v3/assert"
"gotest.tools/v3/fs"

"github.com/artefactual-sdps/preprocessing-sfa/internal/ais"
)

const (
	// mets is a minimal, empty METS document used as the METS metadata
	// fixture.
	mets = `<?xml version='1.0' encoding='UTF-8'?>
<mets:mets xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mets="http://www.loc.gov/METS/" xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version1121/mets.xsd">
</mets:mets>
`

	// arelda is a minimal Arelda metadata document whose ablieferungsnummer
	// is "1000/893_3251903". The root element is closed with </paket> so the
	// fixture is well-formed XML.
	arelda = `<?xml version="1.0" encoding="UTF-8"?>
<paket xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xip="http://www.tessella.com/XIP/v4"
xmlns="http://bar.admin.ch/arelda/v4"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:submissionTests="http://bar.admin.ch/submissionTestResult"
xsi:type="paketAIP"
schemaVersion="5.0">
<ablieferung xsi:type="ablieferungFilesAIP">
<ablieferungstyp>FILES</ablieferungstyp>
<ablieferndeStelle>Bundesverwaltung (Bern)</ablieferndeStelle>
<ablieferungsnummer>1000/893_3251903</ablieferungsnummer>
</ablieferung>
</paket>
`
)

// testDir builds a directory with fs.NewDir that contains the "arelda.xml"
// and "mets.xml" fixture files, and returns the directory's path.
func testDir(t *testing.T) string {
	return fs.NewDir(
		t,
		"ppsfa",
		fs.WithFile("arelda.xml", arelda),
		fs.WithFile("mets.xml", mets),
	).Path()
}

// TestExecute runs the CombineMDActivity in a Temporal test activity
// environment and checks the returned result or error for each case.
//
// The "{{wd}}" placeholder in expected values is replaced with the case's
// working directory before comparison.
func TestExecute(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name    string
		params  ais.CombineMDActivityParams
		want    ais.CombineMDActivityResult
		wantErr string
	}{
		{
			name: "Returns the combined metadata",
			params: ais.CombineMDActivityParams{
				AreldaRelPath: "arelda.xml",
				METSRelPath:   "mets.xml",
				WorkingDir:    testDir(t),
			},
			want: ais.CombineMDActivityResult{Path: "{{wd}}/AIS_1000_893_3251903"},
		},
		{
			name: "Errors if the Arelda file doesn't exist",
			params: ais.CombineMDActivityParams{
				AreldaRelPath: "missing.xml",
				WorkingDir:    testDir(t),
			},
			wantErr: "activity error (type: combine-metadata-files, scheduledEventID: 0, startedEventID: 0, identity: ): missing Arelda file: {{wd}}/missing.xml",
		},
		{
			name: "Errors if the METS file doesn't exist",
			params: ais.CombineMDActivityParams{
				AreldaRelPath: "arelda.xml",
				METSRelPath:   "missing.xml",
				WorkingDir:    testDir(t),
			},
			wantErr: "activity error (type: combine-metadata-files, scheduledEventID: 0, startedEventID: 0, identity: ): missing METS file: {{wd}}/missing.xml",
		},
		{
			name: "Errors when the Arelda file is invalid",
			params: ais.CombineMDActivityParams{
				AreldaRelPath: "mets.xml",
				WorkingDir:    testDir(t),
			},
			wantErr: "activity error (type: combine-metadata-files, scheduledEventID: 0, startedEventID: 0, identity: ): name AIS file: get accession number: can't find ablieferungsnummer in \"mets.xml\"",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			ts := &temporalsdk_testsuite.WorkflowTestSuite{}
			env := ts.NewTestActivityEnvironment()
			env.RegisterActivityWithOptions(
				ais.NewCombineMDActivity().Execute,
				temporalsdk_activity.RegisterOptions{Name: ais.CombineMDActivityName},
			)

			tt.want.Path = strings.ReplaceAll(tt.want.Path, "{{wd}}", tt.params.WorkingDir)
			tt.wantErr = strings.ReplaceAll(tt.wantErr, "{{wd}}", tt.params.WorkingDir)

			future, err := env.ExecuteActivity(ais.CombineMDActivityName, tt.params)
			if tt.wantErr != "" {
				// ErrorContains also fails the test when err is nil.
				assert.ErrorContains(t, err, tt.wantErr)
				return
			}
			assert.NilError(t, err)

			// A decode failure must fail the test instead of silently
			// leaving got at its zero value.
			var got ais.CombineMDActivityResult
			assert.NilError(t, future.Get(&got))
			assert.DeepEqual(t, got, tt.want)
		})
	}
}
43 changes: 16 additions & 27 deletions internal/ais/workflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,36 +151,25 @@ func (w *Workflow) SessionHandler(ctx temporalsdk_workflow.Context, params *Work
return "", e
}

var metadataPath string
if parseResult.UpdatedAreldaMetadataRelPath != "" {
var fetchMetadataResult FetchActivityResult
e = temporalsdk_workflow.ExecuteActivity(
withRemoteActOpts(ctx),
FetchActivityName,
&FetchActivityParams{
AIPUUID: params.AIPUUID,
RelativePath: fmt.Sprintf("%s/data/%s", aipDirName, parseResult.UpdatedAreldaMetadataRelPath),
Destination: filepath.Join(localDir, filepath.Base(parseResult.UpdatedAreldaMetadataRelPath)),
},
).Get(ctx, &fetchMetadataResult)
if e != nil {
return "", e
}
metadataPath = parseResult.UpdatedAreldaMetadataRelPath
} else {
metadataPath = parseResult.MetadataRelPath
}

if parseResult.UpdatedAreldaMetadataRelPath == "" && parseResult.MetadataRelPath != "" {
var fetchMetadataResult FetchActivityResult
e = temporalsdk_workflow.ExecuteActivity(
withRemoteActOpts(ctx),
FetchActivityName,
&FetchActivityParams{
AIPUUID: params.AIPUUID,
RelativePath: fmt.Sprintf("%s/data/%s", aipDirName, parseResult.MetadataRelPath),
Destination: filepath.Join(localDir, filepath.Base(parseResult.MetadataRelPath)),
},
).Get(ctx, &fetchMetadataResult)
if e != nil {
return "", e
}
var fetchMetadataResult FetchActivityResult
e = temporalsdk_workflow.ExecuteActivity(
withRemoteActOpts(ctx),
FetchActivityName,
&FetchActivityParams{
AIPUUID: params.AIPUUID,
RelativePath: fmt.Sprintf("%s/data/%s", aipDirName, metadataPath),
Destination: filepath.Join(localDir, filepath.Base(metadataPath)),
},
).Get(ctx, &fetchMetadataResult)
if e != nil {
return "", e
}

var zipResult archivezip.Result
Expand Down
Loading

0 comments on commit e13c1b7

Please sign in to comment.