Skip to content

Commit

Permalink
Merge pull request #65 from nlnwa/repair_block
Browse files Browse the repository at this point in the history
New option to repair syntax errors in warcfields-blocks
  • Loading branch information
johnerikhalse authored Jun 30, 2023
2 parents 25fd350 + 742e670 commit 0054f7c
Show file tree
Hide file tree
Showing 7 changed files with 140 additions and 57 deletions.
17 changes: 8 additions & 9 deletions block_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"io"
"io/ioutil"
"strings"
"testing"
"testing/iotest"
Expand Down Expand Up @@ -181,7 +180,7 @@ func Test_warcfieldsBlock_BlockDigest(t *testing.T) {
require.NoError(t, err)
validation := &Validation{}
o := defaultWarcRecordOptions()
block, err := newWarcFieldsBlock(tt.data, d, validation, &o)
block, err := newWarcFieldsBlock(&o, &WarcFields{}, tt.data, d, validation)
require.NoError(t, err)
require.True(t, validation.Valid(), validation)

Expand Down Expand Up @@ -222,7 +221,7 @@ func Test_warcfieldsBlock_Cache(t *testing.T) {
require.NoError(t, err)
validation := &Validation{}
o := defaultWarcRecordOptions()
block, err := newWarcFieldsBlock(tt.data, d, validation, &o)
block, err := newWarcFieldsBlock(&o, &WarcFields{}, tt.data, d, validation)
require.NoError(t, err)
if tt.wantCacheErr {
require.False(t, validation.Valid(), validation)
Expand Down Expand Up @@ -261,7 +260,7 @@ func Test_warcfieldsBlock_IsCached(t *testing.T) {
require.NoError(t, err)
validation := &Validation{}
o := defaultWarcRecordOptions()
block, err := newWarcFieldsBlock(tt.data, d, validation, &o)
block, err := newWarcFieldsBlock(&o, &WarcFields{}, tt.data, d, validation)
require.NoError(t, err)
require.True(t, validation.Valid(), validation)

Expand Down Expand Up @@ -298,7 +297,7 @@ func Test_warcfieldsBlock_RawBytes(t *testing.T) {
require.NoError(t, err)
validation := &Validation{}
o := defaultWarcRecordOptions()
block, err := newWarcFieldsBlock(tt.data, d, validation, &o)
block, err := newWarcFieldsBlock(&o, &WarcFields{}, tt.data, d, validation)
require.NoError(t, err)
require.True(t, validation.Valid(), validation)

Expand Down Expand Up @@ -657,12 +656,12 @@ func validateCacheTest(t *testing.T, block Block, expectedContent string, expect
// Reading content twice should be ok
got, err := block.RawBytes()
require.NoError(t, err)
content, err := ioutil.ReadAll(got)
content, err := io.ReadAll(got)
require.NoError(t, err)
assert.Equal(t, expectedContent, string(content))
got, err = block.RawBytes()
require.NoError(t, err)
content, err = ioutil.ReadAll(got)
content, err = io.ReadAll(got)
require.NoError(t, err)
assert.Equal(t, expectedContent, string(content))

Expand Down Expand Up @@ -738,7 +737,7 @@ func validateRawBytesTest(t *testing.T, tt rawBytesTest, block Block, expectedCo
require.NoError(t, err)
}

content, err := ioutil.ReadAll(got)
content, err := io.ReadAll(got)
require.NoError(t, err)
assert.Equal(t, expectedContent, string(content))

Expand All @@ -747,7 +746,7 @@ func validateRawBytesTest(t *testing.T, tt rawBytesTest, block Block, expectedCo
got, err := block.RawBytes()
require.NoError(t, err)

content, err := ioutil.ReadAll(got)
content, err := io.ReadAll(got)
require.NoError(t, err)
assert.Equal(t, expectedContent, string(content))
} else {
Expand Down
5 changes: 5 additions & 0 deletions digest.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ func (d *digest) validate() error {
return nil
}

// updateDigest updates the digest-string to the computed value.
func (d *digest) updateDigest() {
d.hash = d.encoding.encode(d)
}

// newDigest creates a new digest from the value of a WARC digest-field or from scratch.
//
// digestString has the format: <algorithm>[:[<digestValue>]] where algorithm is one of md5, sha1, sha256, or sha512.
Expand Down
96 changes: 56 additions & 40 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,22 @@ import (
)

type warcRecordOptions struct {
warcVersion *WarcVersion
errSyntax errorPolicy
errSpec errorPolicy
errUnknownRecordType errorPolicy
skipParseBlock bool
addMissingRecordId bool
recordIdFunc func() (string, error)
addMissingContentLength bool
addMissingDigest bool
fixContentLength bool
fixDigest bool
fixSyntaxErrors bool
defaultDigestAlgorithm string
defaultDigestEncoding digestEncoding
bufferOptions []diskbuffer.Option
warcVersion *WarcVersion
errSyntax errorPolicy
errSpec errorPolicy
errUnknownRecordType errorPolicy
skipParseBlock bool
addMissingRecordId bool
recordIdFunc func() (string, error)
addMissingContentLength bool
addMissingDigest bool
fixContentLength bool
fixDigest bool
fixSyntaxErrors bool
fixWarcFieldsBlockErrors bool
defaultDigestAlgorithm string
defaultDigestEncoding digestEncoding
bufferOptions []diskbuffer.Option
}

// The errorPolicy constants describe how to handle WARC record errors.
Expand All @@ -48,6 +49,7 @@ const (
ErrFail errorPolicy = 2 // Fail on given error.
)

// defaultIdGenerator is the default function used to generate record ids.
var defaultIdGenerator = func() (string, error) {
return uuid.New().URN(), nil
}
Expand Down Expand Up @@ -76,20 +78,21 @@ func newFuncWarcRecordOption(f func(*warcRecordOptions)) *funcWarcRecordOption {
func defaultWarcRecordOptions() warcRecordOptions {
uuid.EnableRandPool()
return warcRecordOptions{
warcVersion: V1_1,
errSyntax: ErrWarn,
errSpec: ErrWarn,
errUnknownRecordType: ErrWarn,
skipParseBlock: false,
addMissingRecordId: true,
recordIdFunc: defaultIdGenerator,
addMissingContentLength: true,
addMissingDigest: true,
defaultDigestAlgorithm: "sha1",
defaultDigestEncoding: Base32,
fixContentLength: true,
fixDigest: true,
fixSyntaxErrors: true,
warcVersion: V1_1,
errSyntax: ErrWarn,
errSpec: ErrWarn,
errUnknownRecordType: ErrWarn,
skipParseBlock: false,
addMissingRecordId: true,
recordIdFunc: defaultIdGenerator,
addMissingContentLength: true,
addMissingDigest: true,
defaultDigestAlgorithm: "sha1",
defaultDigestEncoding: Base32,
fixContentLength: true,
fixDigest: true,
fixSyntaxErrors: true,
fixWarcFieldsBlockErrors: false,
}
}

Expand Down Expand Up @@ -203,7 +206,7 @@ func WithDefaultDigestEncoding(defaultDigestEncoding digestEncoding) WarcRecordO

// WithFixContentLength sets if a ContentLength header with value which do not match the actual content length should be set to the real value.
//
// This will not have any impact if SpecViolationPolicy is ErrIgnore
// # This will not have any impact if SpecViolationPolicy is ErrIgnore
//
// defaults to true
func WithFixContentLength(fixContentLength bool) WarcRecordOption {
Expand All @@ -214,7 +217,7 @@ func WithFixContentLength(fixContentLength bool) WarcRecordOption {

// WithFixDigest sets if a BlockDigest header or a PayloadDigest header with a value which do not match the actual content should be recalculated.
//
// This will not have any impact if SpecViolationPolicy is ErrIgnore
// # This will not have any impact if SpecViolationPolicy is ErrIgnore
//
// defaults to true
func WithFixDigest(fixDigest bool) WarcRecordOption {
Expand All @@ -225,7 +228,7 @@ func WithFixDigest(fixDigest bool) WarcRecordOption {

// WithFixSyntaxErrors sets if an attempt to fix syntax errors should be done when those are detected.
//
// This will not have any impact if SyntaxErrorPolicy is ErrIgnore
// # This will not have any impact if SyntaxErrorPolicy is ErrIgnore
//
// defaults to true
func WithFixSyntaxErrors(fixSyntaxErrors bool) WarcRecordOption {
Expand All @@ -234,6 +237,17 @@ func WithFixSyntaxErrors(fixSyntaxErrors bool) WarcRecordOption {
})
}

// WithFixWarcFieldsBlockErrors sets if an attempt to fix syntax errors in warcfields block should be done when those are detected.
//
// # This will not have any impact if SyntaxErrorPolicy is ErrIgnore
//
// defaults to false
func WithFixWarcFieldsBlockErrors(fixWarcFieldsBlockErrors bool) WarcRecordOption {
return newFuncWarcRecordOption(func(o *warcRecordOptions) {
o.fixWarcFieldsBlockErrors = fixWarcFieldsBlockErrors
})
}

// WithSkipParseBlock sets parser to skip detecting known block types.
//
// This implies that no payload digest can be computed.
Expand All @@ -247,10 +261,11 @@ func WithSkipParseBlock() WarcRecordOption {
//
// This option is for parsing as fast as possible and being as lenient as possible.
// Settings implied by this option are:
// SyntaxErrorPolicy = ErrIgnore
// SpecViolationPolicy = ErrIgnore
// UnknownRecordPolicy = ErrIgnore
// SkipParseBlock = true
//
// SyntaxErrorPolicy = ErrIgnore
// SpecViolationPolicy = ErrIgnore
// UnknownRecordPolicy = ErrIgnore
// SkipParseBlock = true
func WithNoValidation() WarcRecordOption {
return newFuncWarcRecordOption(func(o *warcRecordOptions) {
o.errSyntax = ErrIgnore
Expand All @@ -263,10 +278,11 @@ func WithNoValidation() WarcRecordOption {
// WithStrictValidation sets the parser to fail on first error or violation of WARC specification.
//
// Settings implied by this option are:
// SyntaxErrorPolicy = ErrFail
// SpecViolationPolicy = ErrFail
// UnknownRecordPolicy = ErrFail
// SkipParseBlock = false
//
// SyntaxErrorPolicy = ErrFail
// SpecViolationPolicy = ErrFail
// UnknownRecordPolicy = ErrFail
// SkipParseBlock = false
func WithStrictValidation() WarcRecordOption {
return newFuncWarcRecordOption(func(o *warcRecordOptions) {
o.errSyntax = ErrFail
Expand Down
11 changes: 7 additions & 4 deletions record.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ func (wr *warcRecord) parseBlock(reader io.Reader, validation *Validation) (err
return
}
if strings.HasPrefix(contentType, ApplicationWarcFields) {
wr.block, err = newWarcFieldsBlock(reader, blockDigest, validation, wr.opts)
wr.block, err = newWarcFieldsBlock(wr.opts, wr.headers, reader, blockDigest, validation)
return
}
}
Expand All @@ -414,9 +414,10 @@ func (wr *warcRecord) parseBlock(reader io.Reader, validation *Validation) (err
// If the record is not cached, it might not be possible to read any content from this record after validation.
//
// The result is dependent on the SpecViolationPolicy option:
// ErrIgnore: only fatal errors are returned.
// ErrWarn: all errors found will be added to the Validation.
// ErrFail: the first error is returned and no more validation is done.
//
// ErrIgnore: only fatal errors are returned.
// ErrWarn: all errors found will be added to the Validation.
// ErrFail: the first error is returned and no more validation is done.
func (wr *warcRecord) ValidateDigest(validation *Validation) error {
if wr.opts.errSpec > ErrIgnore {
if err := wr.Block().Cache(); err != nil {
Expand Down Expand Up @@ -471,6 +472,7 @@ func (wr *warcRecord) ValidateDigest(validation *Validation) error {
validation.addError(fmt.Errorf("block: %w", err))
if wr.opts.fixDigest {
// Digest validation failed. But if fixDigest option is set, the calculated digest will be set.
blockDigest.updateDigest()
wr.WarcHeader().Set(WarcBlockDigest, blockDigest.format())
}
case ErrFail:
Expand Down Expand Up @@ -500,6 +502,7 @@ func (wr *warcRecord) ValidateDigest(validation *Validation) error {
validation.addError(fmt.Errorf("payload: %w", err))
// Digest validation failed. But if fixDigest option is set, the calculated digest will be set.
if wr.opts.fixDigest {
payloadDigest.updateDigest()
wr.WarcHeader().Set(WarcPayloadDigest, payloadDigest.format())
}
case ErrFail:
Expand Down
3 changes: 1 addition & 2 deletions unmarshaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"github.com/klauspost/compress/gzip"
"github.com/nlnwa/gowarc/internal/countingreader"
"io"
"io/ioutil"
)

type Unmarshaler interface {
Expand Down Expand Up @@ -147,7 +146,7 @@ func (u *unmarshaler) Unmarshal(b *bufio.Reader) (WarcRecord, int64, *Validation
_ = record.block.Close()
}()

_, err := io.Copy(ioutil.Discard, content)
_, err := io.Copy(io.Discard, content)

// Discarding 4 bytes which makes up the end of record marker (\r\n\r\n)
b, e := r.Peek(4)
Expand Down
54 changes: 53 additions & 1 deletion unmarshaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package gowarc
import (
"bufio"
"bytes"
"fmt"
"github.com/klauspost/compress/gzip"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -589,6 +590,7 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
"WARC-Type: metadata\r\n" +
"Content-Type: application/warc-fields\r\n" +
"Content-Length: 18\r\n" +
"WARC-Block-Digest: sha1:QYG3QQJ4ULYPJGSJL34IS3U7VUAJFSKY\r\n" +
"\r\n" +
"foo: bar\n" +
"food:bar\n" +
Expand All @@ -603,6 +605,7 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
&nameValue{Name: WarcType, Value: "metadata"},
&nameValue{Name: ContentType, Value: "application/warc-fields"},
&nameValue{Name: ContentLength, Value: "18"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:QYG3QQJ4ULYPJGSJL34IS3U7VUAJFSKY"},
},
&warcFieldsBlock{},
"foo: bar\nfood:bar\n",
Expand All @@ -615,6 +618,55 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
0,
false,
},
{
"metadata record missing carriage return in warc-fields block with fix syntax errors",
[]WarcRecordOption{
WithSpecViolationPolicy(ErrWarn),
WithSyntaxErrorPolicy(ErrWarn),
WithAddMissingDigest(true),
WithFixSyntaxErrors(true),
WithFixDigest(true),
WithAddMissingContentLength(false),
WithAddMissingRecordId(false),
WithFixContentLength(true),
WithFixWarcFieldsBlockErrors(true),
},
"WARC/1.0\r\n" +
"WARC-Date: 2017-03-06T04:03:53Z\r\n" +
"WARC-Record-ID: <urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>\r\n" +
"WARC-Type: metadata\r\n" +
"Content-Type: application/warc-fields\r\n" +
"Content-Length: 18\r\n" +
"WARC-Block-Digest: sha1:QYG3QQJ4ULYPJGSJL34IS3U7VUAJFSKY\r\n" +
"\r\n" +
"foo: bar\n" +
"food:bar\n" +
"\r\n" +
"\r\n",
want{
V1_0,
Metadata,
&WarcFields{
&nameValue{Name: WarcDate, Value: "2017-03-06T04:03:53Z"},
&nameValue{Name: WarcRecordID, Value: "<urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>"},
&nameValue{Name: WarcType, Value: "metadata"},
&nameValue{Name: ContentType, Value: "application/warc-fields"},
&nameValue{Name: ContentLength, Value: "21"},
&nameValue{Name: WarcBlockDigest, Value: "sha1:U2AN4MFP7IITXSOLYH2QTIPVDNJOHBFO"},
},
&warcFieldsBlock{},
"Foo: bar\r\nFood: bar\r\n",
&Validation{
newWrappedSyntaxError("error in warc fields block", nil, newSyntaxError("missing carriage return", &position{1})),
newWrappedSyntaxError("error in warc fields block", nil, newSyntaxError("missing carriage return", &position{2})),
fmt.Errorf("content length mismatch. header: 18, actual: 21"),
fmt.Errorf("block: %w", fmt.Errorf("wrong digest: expected sha1:QYG3QQJ4ULYPJGSJL34IS3U7VUAJFSKY, computed: sha1:U2AN4MFP7IITXSOLYH2QTIPVDNJOHBFO")),
},
true,
},
0,
false,
},
{
"short response record",
[]WarcRecordOption{WithSpecViolationPolicy(ErrFail), WithSyntaxErrorPolicy(ErrFail), WithUnknownRecordTypePolicy(ErrIgnore)},
Expand Down Expand Up @@ -821,7 +873,7 @@ func Test_unmarshaler_Unmarshal(t *testing.T) {
require.NoError(err3)
}

assert.Equal(tt.want.validation, validation, "%s", validation.String())
assert.Equal(tt.want.validation, validation, "Want:\n %s\nGot:\n %s", tt.want.validation, validation.String())
})
}
}
Expand Down
Loading

0 comments on commit 0054f7c

Please sign in to comment.