Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Digest enhancement #71

Merged
merged 1 commit into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 14 additions & 14 deletions block_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import (

func Test_genericBlock_BlockDigest(t *testing.T) {
content := "foo"
digest := "sha1:0BEEC7B5EA3F0FDBC95D0DD47F3C5BC275DA8A33"
digest := "sha1:0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33"

tests := []blockDigestTest{
{
Expand Down Expand Up @@ -57,7 +57,7 @@ func Test_genericBlock_BlockDigest(t *testing.T) {

func Test_genericBlock_Cache(t *testing.T) {
content := "foo"
digest := "sha1:0BEEC7B5EA3F0FDBC95D0DD47F3C5BC275DA8A33"
digest := "sha1:0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33"

tests := []cacheTest{
{
Expand Down Expand Up @@ -126,7 +126,7 @@ func Test_genericBlock_IsCached(t *testing.T) {

func Test_genericBlock_RawBytes(t *testing.T) {
content := "foo"
digest := "sha1:0BEEC7B5EA3F0FDBC95D0DD47F3C5BC275DA8A33"
digest := "sha1:0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33"

tests := []rawBytesTest{
{
Expand Down Expand Up @@ -158,7 +158,7 @@ func Test_genericBlock_RawBytes(t *testing.T) {

func Test_warcfieldsBlock_BlockDigest(t *testing.T) {
content := "foo: bar\r\ncontent-type:bb\r\n"
digest := "sha1:A1D43D400C5985BEE035C4E5A2E08F3D57989596"
digest := "sha1:a1d43d400c5985bee035c4e5a2e08f3d57989596"

tests := []blockDigestTest{
{
Expand Down Expand Up @@ -191,7 +191,7 @@ func Test_warcfieldsBlock_BlockDigest(t *testing.T) {

func Test_warcfieldsBlock_Cache(t *testing.T) {
content := "foo: bar\r\ncontent-type:bb\r\n"
digest := "sha1:A1D43D400C5985BEE035C4E5A2E08F3D57989596"
digest := "sha1:a1d43d400c5985bee035c4e5a2e08f3d57989596"

tests := []cacheTest{
{
Expand Down Expand Up @@ -272,7 +272,7 @@ func Test_warcfieldsBlock_IsCached(t *testing.T) {

func Test_warcfieldsBlock_RawBytes(t *testing.T) {
content := "foo: bar\r\ncontent-type:bb\r\n"
digest := "sha1:A1D43D400C5985BEE035C4E5A2E08F3D57989596"
digest := "sha1:a1d43d400c5985bee035c4e5a2e08f3d57989596"

tests := []rawBytesTest{
{
Expand Down Expand Up @@ -313,8 +313,8 @@ func Test_httpRequestBlock_BlockDigest(t *testing.T) {
"Referer: http://example.com/foo.html\n" +
"Connection: close\n" +
"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36\n\n"
digest := "sha1:A3781FF1FC3FB52318F623E22C85D63D74C12932"
payloadDigest := "sha1:DA39A3EE5E6B4B0D3255BFEF95601890AFD80709"
digest := "sha1:a3781ff1fc3fb52318f623e22c85d63d74c12932"
payloadDigest := "sha1:da39a3ee5e6b4b0d3255bfef95601890afd80709"

tests := []blockDigestTest{
{
Expand Down Expand Up @@ -354,7 +354,7 @@ func Test_httpRequestBlock_Cache(t *testing.T) {
"Referer: http://example.com/foo.html\n" +
"Connection: close\n" +
"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36\n\n"
digest := "sha1:A3781FF1FC3FB52318F623E22C85D63D74C12932"
digest := "sha1:a3781ff1fc3fb52318f623e22c85d63d74c12932"

tests := []cacheTest{
{
Expand Down Expand Up @@ -443,7 +443,7 @@ func Test_httpRequestBlock_RawBytes(t *testing.T) {
"Referer: http://example.com/foo.html\n" +
"Connection: close\n" +
"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36\n\n"
digest := "sha1:A3781FF1FC3FB52318F623E22C85D63D74C12932"
digest := "sha1:a3781ff1fc3fb52318f623e22c85d63d74c12932"

tests := []rawBytesTest{
{
Expand Down Expand Up @@ -482,8 +482,8 @@ func Test_httpResponseBlock_BlockDigest(t *testing.T) {
content := "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" +
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content"
digest := "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4"
payloadDigest := "sha1:C37FFB221569C553A2476C22C7DAD429F3492977"
digest := "sha1:b285747ad7cc57aa74bce2e30b453c8d1cb71ba4"
payloadDigest := "sha1:c37ffb221569c553a2476c22c7dad429f3492977"

tests := []blockDigestTest{
{
Expand Down Expand Up @@ -520,7 +520,7 @@ func Test_httpResponseBlock_Cache(t *testing.T) {
content := "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" +
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content"
digest := "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4"
digest := "sha1:b285747ad7cc57aa74bce2e30b453c8d1cb71ba4"

tests := []cacheTest{
{
Expand Down Expand Up @@ -603,7 +603,7 @@ func Test_httpResponseBlock_RawBytes(t *testing.T) {
content := "HTTP/1.1 200 OK\nDate: Tue, 19 Sep 2016 17:18:40 GMT\nServer: Apache/2.0.54 (Ubuntu)\n" +
"Last-Modified: Mon, 16 Jun 2013 22:28:51 GMT\nETag: \"3e45-67e-2ed02ec0\"\nAccept-Ranges: bytes\n" +
"Content-Length: 19\nConnection: close\nContent-Type: text/plain\n\nThis is the content"
digest := "sha1:B285747AD7CC57AA74BCE2E30B453C8D1CB71BA4"
digest := "sha1:b285747ad7cc57aa74bce2e30b453c8d1cb71ba4"

tests := []rawBytesTest{
{
Expand Down
49 changes: 41 additions & 8 deletions digest.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package gowarc

import (
"bytes"
"crypto/md5"
"crypto/sha1"
"crypto/sha256"
Expand All @@ -36,7 +37,7 @@ func (d digestEncoding) encode(digest *digest) string {
dig := digest.Sum(nil)
switch d {
case Base16:
return strings.ToUpper(hex.EncodeToString(dig))
return strings.ToLower(hex.EncodeToString(dig))
case Base32:
return base32.StdEncoding.EncodeToString(dig)
case Base64:
Expand All @@ -46,6 +47,19 @@ func (d digestEncoding) encode(digest *digest) string {
}
}

// decode turns the textual digest value s into raw bytes according to the
// receiver's encoding.
//
// Base16 input is decoded as hexadecimal, Base32 and Base64 input is decoded
// with the respective standard encodings. For any other encoding value the
// bytes of s are returned unchanged with a nil error. Errors reported by the
// underlying decoder are returned as-is.
func (d digestEncoding) decode(s string) ([]byte, error) {
	if d == Base16 {
		return hex.DecodeString(s)
	}
	if d == Base32 {
		return base32.StdEncoding.DecodeString(s)
	}
	if d == Base64 {
		return base64.StdEncoding.DecodeString(s)
	}
	// No recognized encoding: hand the input back as plain bytes.
	return []byte(s), nil
}

const (
unknown digestEncoding = 0
Base16 digestEncoding = 1
Expand Down Expand Up @@ -85,6 +99,22 @@ func detectEncoding(algorithm, digest string, defaultEncoding digestEncoding) di
return defaultEncoding
}

// normalizeAlgorithmName returns the lower-cased form of algorithm, with the
// hyphenated spellings "sha-1", "sha-256" and "sha-512" rewritten to "sha1",
// "sha256" and "sha512". Every other name is returned lower-cased and
// otherwise untouched.
func normalizeAlgorithmName(algorithm string) string {
	const hyphenated = "sha-"

	lowered := strings.ToLower(algorithm)
	if strings.HasPrefix(lowered, hyphenated) {
		// Only these three hyphenated variants are rewritten; anything else
		// falls through and keeps its hyphen.
		switch bits := lowered[len(hyphenated):]; bits {
		case "1", "256", "512":
			return "sha" + bits
		}
	}
	return lowered
}

// digest is a utility for parsing, creation and validation of WARC block and payload digests.
//
// Typical usage is to create a digest from a WARC record's WARC-Block-Digest or WARC-Payload-Digest fields.
Expand Down Expand Up @@ -124,7 +154,11 @@ func (d *digest) format() string {
// digest.
func (d *digest) validate() error {
computed := d.encoding.encode(d)
if d.hash != computed {
dig, err := d.encoding.decode(d.hash)
if err != nil {
return err
}
if !bytes.Equal(dig, d.Sum(nil)) {
return fmt.Errorf("wrong digest: expected %s:%s, computed: %s:%s", d.name, d.hash, d.name, computed)
}
return nil
Expand All @@ -144,17 +178,16 @@ func (d *digest) updateDigest() {
func newDigest(digestString string, defaultEncoding digestEncoding) (*digest, error) {
t := strings.SplitN(digestString, ":", 2)
algorithm := t[0]
algorithm = strings.ToLower(algorithm)
if algorithm == "" {
return nil, fmt.Errorf("missing algorithm")
}
algorithm = normalizeAlgorithmName(algorithm)
var hash string
if len(t) > 1 {
hash = t[1]
}
encoding := detectEncoding(algorithm, hash, defaultEncoding)
if encoding < Base64 {
// base16 and base32 encodings are case insensitive.
switch encoding {
case Base16:
hash = strings.ToLower(hash)
case Base32:
hash = strings.ToUpper(hash)
}

Expand Down
41 changes: 25 additions & 16 deletions digest_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,25 @@ func Test_newDigest(t *testing.T) {
wantDigest string
wantErr bool
}{
{"md5", "md5", "Some content", Base16, "md5", "md5:B53227DA4280F0E18270F21DD77C91D0", false},
{"md5 with base16 digest", "md5:12345", "Some content", Base16, "md5", "md5:B53227DA4280F0E18270F21DD77C91D0", false},
{"md5", "md5", "Some content", Base16, "md5", "md5:b53227da4280f0e18270f21dd77c91d0", false},
{"md5 with base16 digest", "md5:12345", "Some content", Base16, "md5", "md5:b53227da4280f0e18270f21dd77c91d0", false},
{"md5 with base32 digest", "md5:12345", "Some content", Base32, "md5", "md5:WUZCPWSCQDYODATQ6IO5O7ER2A======", false},
{"md5 with base64 digest", "md5:12345", "Some content", Base64, "md5", "md5:tTIn2kKA8OGCcPId13yR0A==", false},
{"sha1", "sha1", "Some content", Base16, "sha1", "sha1:9F1A6ECF74E9F9B1AE52E8EB581D420E63E8453A", false},
{"sha1 with base16 digest", "sha1:12345", "Some content", Base16, "sha1", "sha1:9F1A6ECF74E9F9B1AE52E8EB581D420E63E8453A", false},
{"sha1", "sha1", "Some content", Base16, "sha1", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", false},
{"sha1 with base16 digest", "sha1:12345", "Some content", Base16, "sha1", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", false},
{"sha-1 with base16 digest", "sha-1:12345", "Some content", Base16, "sha1", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", false},
{"sha1 with base32 digest", "sha1:12345", "Some content", Base32, "sha1", "sha1:T4NG5T3U5H43DLSS5DVVQHKCBZR6QRJ2", false},
{"sha1 with base64 digest", "sha1:12345", "Some content", Base64, "sha1", "sha1:nxpuz3Tp+bGuUujrWB1CDmPoRTo=", false},
{"sha256", "sha256", "Some content", Base16, "sha256", "sha256:9C6609FC5111405EA3F5BB3D1F6B5A5EFD19A0CEC53D85893FD96D265439CD5B", false},
{"sha256 with base16 digest", "sha256:12345", "Some content", Base16, "sha256", "sha256:9C6609FC5111405EA3F5BB3D1F6B5A5EFD19A0CEC53D85893FD96D265439CD5B", false},
{"sha256", "sha256", "Some content", Base16, "sha256", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", false},
{"sha-256", "sha256", "Some content", Base16, "sha256", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", false},
{"sha256 with base16 digest", "sha256:12345", "Some content", Base16, "sha256", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", false},
{"sha256 with base32 digest", "sha256:12345", "Some content", Base32, "sha256", "sha256:TRTAT7CRCFAF5I7VXM6R6222L36RTIGOYU6YLCJ73FWSMVBZZVNQ====", false},
{"sha256 with base64 digest", "sha256:12345", "Some content", Base64, "sha256", "sha256:nGYJ/FERQF6j9bs9H2taXv0ZoM7FPYWJP9ltJlQ5zVs=", false},
{"sha512", "sha512", "Some content", Base16, "sha512", "sha512:B20D977718ED67F2BF7620EE2D982FD850C4883EC8D048440FE7B6A86CF6322FD791C47B0C7469DBEEF3E339032E1ABC4BCEBE5EFC104BC19A117BFEF4478605", false},
{"sha512 with base16 digest", "sha512:12345", "Some content", Base16, "sha512", "sha512:B20D977718ED67F2BF7620EE2D982FD850C4883EC8D048440FE7B6A86CF6322FD791C47B0C7469DBEEF3E339032E1ABC4BCEBE5EFC104BC19A117BFEF4478605", false},
{"sha512", "sha512", "Some content", Base16, "sha512", "sha512:b20d977718ed67f2bf7620ee2d982fd850c4883ec8d048440fe7b6a86cf6322fd791c47b0c7469dbeef3e339032e1abc4bcebe5efc104bc19a117bfef4478605", false},
{"sha512 with base16 digest", "sha512:12345", "Some content", Base16, "sha512", "sha512:b20d977718ed67f2bf7620ee2d982fd850c4883ec8d048440fe7b6a86cf6322fd791c47b0c7469dbeef3e339032e1abc4bcebe5efc104bc19a117bfef4478605", false},
{"sha512 with base32 digest", "sha512:12345", "Some content", Base32, "sha512", "sha512:WIGZO5YY5VT7FP3WEDXC3GBP3BIMJCB6ZDIEQRAP463KQ3HWGIX5PEOEPMGHI2O353Z6GOIDFYNLYS6OXZPPYECLYGNBC6766RDYMBI=", false},
{"sha512 with base64 digest", "sha512:12345", "Some content", Base64, "sha512", "sha512:sg2XdxjtZ/K/diDuLZgv2FDEiD7I0EhED+e2qGz2Mi/XkcR7DHRp2+7z4zkDLhq8S86+XvwQS8GaEXv+9EeGBQ==", false},
{"sha-512 with base64 digest", "sha512:12345", "Some content", Base64, "sha512", "sha512:sg2XdxjtZ/K/diDuLZgv2FDEiD7I0EhED+e2qGz2Mi/XkcR7DHRp2+7z4zkDLhq8S86+XvwQS8GaEXv+9EeGBQ==", false},
{"unknown algorithm", "mysecret:12345", "Some content", Base16, "mysecret", "mysecret:123", true},
{"unknown algorithm with digest", "mysecret:12345", "Some content", Base16, "mysecret", "mysecret:123", true},
}
Expand Down Expand Up @@ -81,42 +84,48 @@ func Test_digest_validate(t *testing.T) {
wantValid bool
}{
{"md5", "Some content", "md5", false},
{"md5 with base16 digest", "Some content", "md5:B53227DA4280F0E18270F21DD77C91D0", true},
{"md5 with base16 digest", "Some content", "md5:b53227da4280f0e18270f21dd77c91d0", true},
{"md5 with base32 digest", "Some content", "md5:WUZCPWSCQDYODATQ6IO5O7ER2A======", true},
{"md5 with base64 digest", "Some content", "md5:tTIn2kKA8OGCcPId13yR0A==", true},
{"md5 with wrong digest", "Some content", "md5:123", false},
{"sha1", "Some content", "sha1", false},
{"sha1 with base16 digest", "Some content", "sha1:9F1A6ECF74E9F9B1AE52E8EB581D420E63E8453A", true},
{"sha1 with base16 digest", "Some content", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", true},
{"SHA-1 with base16 digest", "Some content", "SHA-1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", true},
{"sha1 with base32 digest", "Some content", "sha1:T4NG5T3U5H43DLSS5DVVQHKCBZR6QRJ2", true},
{"sha1 with base64 digest", "Some content", "sha1:nxpuz3Tp+bGuUujrWB1CDmPoRTo=", true},
{"sha1 with wrong digest", "Some content", "sha1:123", false},
{"sha256", "Some content", "sha256", false},
{"sha256 with base16 digest", "Some content", "sha256:9C6609FC5111405EA3F5BB3D1F6B5A5EFD19A0CEC53D85893FD96D265439CD5B", true},
{"sha256 with base16 digest", "Some content", "sha256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", true},
{"SHA-256 with base16 digest", "Some content", "SHA-256:9c6609fc5111405ea3f5bb3d1f6b5a5efd19a0cec53d85893fd96d265439cd5b", true},
{"sha256 with base32 digest", "Some content", "sha256:TRTAT7CRCFAF5I7VXM6R6222L36RTIGOYU6YLCJ73FWSMVBZZVNQ====", true},
{"sha256 with base64 digest", "Some content", "sha256:nGYJ/FERQF6j9bs9H2taXv0ZoM7FPYWJP9ltJlQ5zVs=", true},
{"sha256 with wrong digest", "Some content", "sha256:123", false},
{"sha512", "Some content", "sha512", false},
{"sha512 with base16 digest", "Some content", "sha512:B20D977718ED67F2BF7620EE2D982FD850C4883EC8D048440FE7B6A86CF6322FD791C47B0C7469DBEEF3E339032E1ABC4BCEBE5EFC104BC19A117BFEF4478605", true},
{"sha512 with base16 digest", "Some content", "sha512:b20d977718ed67f2bf7620ee2d982fd850c4883ec8d048440fe7b6a86cf6322fd791c47b0c7469dbeef3e339032e1abc4bcebe5efc104bc19a117bfef4478605", true},
{"sha512 with base32 digest", "Some content", "sha512:WIGZO5YY5VT7FP3WEDXC3GBP3BIMJCB6ZDIEQRAP463KQ3HWGIX5PEOEPMGHI2O353Z6GOIDFYNLYS6OXZPPYECLYGNBC6766RDYMBI=", true},
{"sha512 with base64 digest", "Some content", "sha512:sg2XdxjtZ/K/diDuLZgv2FDEiD7I0EhED+e2qGz2Mi/XkcR7DHRp2+7z4zkDLhq8S86+XvwQS8GaEXv+9EeGBQ==", true},
{"sha512 with wrong digest", "Some content", "sha512:123", false},
{"lovercase base16 encoding", "Some content", "sha1:9f1a6ecf74e9f9b1ae52e8eb581d420e63e8453a", true},
{"uppercase base16 encoding", "Some content", "sha1:9F1A6ECF74E9F9B1AE52E8EB581D420E63E8453A", true},
{"lovercase base32 encoding", "Some content", "sha1:t4ng5t3u5h43dlss5dvvqhkcbzr6qrj2", true},
{"lovercase base64 encoding", "Some content", "sha1:nxpuz3tp+bguuujrwb1cdmporto=", false},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
d, _ := newDigest(tt.digestString, unknown)

assert := assert.New(t)
_, err := d.Write([]byte(tt.input))

d, err := newDigest(tt.digestString, unknown)
assert.NoError(err)
assert.NotNil(d)

_, err = d.Write([]byte(tt.input))
assert.NoError(err)

err = d.validate()
if !tt.wantValid {
assert.Error(err)
} else {
assert.NoError(err)
//assert.Equal(tt.digestString, d.format())
}
})
}
Expand Down
4 changes: 2 additions & 2 deletions example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func ExampleUnmarshaler() {
"WARC-Filename: temp-20170306040353.warc.gz\r\n" +
"WARC-Type: warcinfo\r\n" +
"Content-Type: application/warc-fields\r\n" +
"Warc-Block-Digest: sha1:AF4D582B4FFC017D07A947D841E392A821F754F3\r\n" +
"Warc-Block-Digest: sha1:af4d582b4ffc017d07a947d841e392a821f754f3\r\n" +
"Content-Length: 34\r\n" +
"\r\n" +
"format: WARC File Format 1.1\r\n" +
Expand All @@ -68,7 +68,7 @@ func ExampleUnmarshaler() {
// Output: Offset: 2, WARC record: version: WARC/1.1, type: warcinfo, id: urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008
// gowarc: Validation errors:
// 1: gowarc: record was found 2 bytes after expected offset
// 2: block: wrong digest: expected sha1:AF4D582B4FFC017D07A947D841E392A821F754F3, computed: sha1:8A936F9FD60D664CF95B1FFB40F1C4093E65BB40
// 2: block: wrong digest: expected sha1:af4d582b4ffc017d07a947d841e392a821f754f3, computed: sha1:8a936f9fd60d664cf95b1ffb40f1c4093e65bb40
}

func ExampleNewWarcFileWriter() {
Expand Down
Loading
Loading