Skip to content

Commit 5efc372

Browse files
committed
pkg/nixpath/chunker: add
This provides two different implementations to chunk data.
1 parent 60a2ce4 commit 5efc372

File tree

6 files changed

+173
-0
lines changed

6 files changed

+173
-0
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ require (
66
github.com/alecthomas/kong v0.5.0
77
github.com/dgraph-io/badger/v3 v3.2103.2
88
github.com/google/go-cmp v0.5.5
9+
github.com/poolpOrg/go-fastcdc v0.0.0-20211130135149-aa8a1e8a10db
910
github.com/stretchr/testify v1.7.0
1011
)
1112

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
6767
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
6868
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
6969
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
70+
github.com/poolpOrg/go-fastcdc v0.0.0-20211130135149-aa8a1e8a10db h1:PfgdUkbymefXsGoYrYjCpZh1PcmQ3tDHKGt1fSDm1+o=
71+
github.com/poolpOrg/go-fastcdc v0.0.0-20211130135149-aa8a1e8a10db/go.mod h1:9KpZcfiu0ZuQsRGTJ8ggDEjKUndlV6TFf8IMKNhW1qA=
7072
github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
7173
github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
7274
github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
package chunker_test
2+
3+
import (
4+
"bytes"
5+
_ "embed"
6+
"errors"
7+
"io"
8+
"testing"
9+
10+
"github.com/nix-community/go-nix/pkg/nixpath/chunker"
11+
"github.com/stretchr/testify/assert"
12+
)
13+
14+
//go:embed simple.go
15+
var testData []byte
16+
17+
func TestChunkers(t *testing.T) {
18+
fastCDCChunker, err := chunker.NewFastCDCChunker(bytes.NewReader(testData))
19+
if err != nil {
20+
panic(err)
21+
}
22+
23+
chunkers := []struct {
24+
Name string
25+
Chunker chunker.Chunker
26+
}{
27+
{
28+
"Simple",
29+
chunker.NewSimpleChunker(bytes.NewReader(testData)),
30+
},
31+
{
32+
"FastCDC",
33+
fastCDCChunker,
34+
},
35+
}
36+
37+
for _, chunker := range chunkers {
38+
t.Run(chunker.Name, func(t *testing.T) {
39+
// grab data out of the chunker.
40+
// Ensure it matches testData.
41+
42+
var receivedData bytes.Buffer
43+
offset := uint64(0)
44+
45+
for {
46+
chunk, err := chunker.Chunker.Next()
47+
if err != nil {
48+
if errors.Is(err, io.EOF) {
49+
break
50+
}
51+
assert.NoError(t, err, "no other error other than EOF is accepted")
52+
}
53+
54+
// check the offset is sane
55+
assert.Equal(t, offset, chunk.Offset, "recorded offset size doesn't match passed offset size")
56+
57+
offset += uint64(len(chunk.Data))
58+
59+
// write the data into the receivedData buffer
60+
if _, err := receivedData.Write(chunk.Data); err != nil {
61+
panic(err)
62+
}
63+
}
64+
65+
// compare received chunk contents with what was passed into the chunker
66+
assert.Equal(t, testData, receivedData.Bytes())
67+
})
68+
}
69+
}

pkg/nixpath/chunker/fastcdc.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package chunker
2+
3+
import (
4+
"errors"
5+
"fmt"
6+
"io"
7+
8+
fastcdc "github.com/poolpOrg/go-fastcdc"
9+
)
10+
11+
func NewFastCDCChunker(r io.Reader) (Chunker, error) { // nolint:ireturn
12+
fastcdc.NewChunkerOptions()
13+
chunkerOpts := fastcdc.NewChunkerOptions()
14+
15+
// FUTUREWORK: Test with different chunk sizes
16+
chunkerOpts.NormalSize = 64 * 2024
17+
chunkerOpts.MinSize = chunkerOpts.NormalSize / 4
18+
chunkerOpts.MaxSize = chunkerOpts.NormalSize * 4
19+
20+
c, err := fastcdc.NewChunker(r, chunkerOpts)
21+
if err != nil {
22+
return nil, fmt.Errorf("unable to initialize fastcdc: %w", err)
23+
}
24+
25+
return &FastCDCChunker{
26+
c: c,
27+
}, nil
28+
}
29+
30+
type FastCDCChunker struct {
31+
c *fastcdc.Chunker
32+
}
33+
34+
func (f *FastCDCChunker) Next() (*Chunk, error) {
35+
chunk, err := f.c.Next()
36+
if err != nil {
37+
if errors.Is(err, io.EOF) {
38+
return nil, err
39+
}
40+
41+
return nil, fmt.Errorf("error getting next chunk: %w", err)
42+
}
43+
44+
return &Chunk{
45+
Offset: chunk.Offset,
46+
Data: chunk.Data,
47+
}, nil
48+
}

pkg/nixpath/chunker/interface.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
package chunker
2+
3+
type Chunk struct {
4+
Offset uint64
5+
Data []byte
6+
}
7+
8+
// Chunker describes the interface that a given chunker needs to implement.
9+
// Next() is periodically called until io.EOF is encountered.
10+
// In case of no error, Next() returns a new chunk.
11+
12+
type Chunker interface {
13+
Next() (*Chunk, error)
14+
}

pkg/nixpath/chunker/simple.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package chunker
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
"io"
7+
)
8+
9+
func NewSimpleChunker(r io.Reader) Chunker { // nolint:ireturn
10+
return &SimpleChunker{
11+
r: r,
12+
}
13+
}
14+
15+
// SimpleChunker simply returns one chunk for all of the contents.
16+
type SimpleChunker struct {
17+
r io.Reader
18+
done bool
19+
}
20+
21+
func (s *SimpleChunker) Next() (*Chunk, error) {
22+
// if we already read everything, return io.EOF
23+
if s.done {
24+
return nil, io.EOF
25+
}
26+
27+
var buf bytes.Buffer
28+
29+
if _, err := io.Copy(&buf, s.r); err != nil {
30+
return nil, fmt.Errorf("error returning from reader: %w", err)
31+
}
32+
33+
s.done = true
34+
35+
return &Chunk{
36+
Offset: 0,
37+
Data: buf.Bytes(),
38+
}, nil
39+
}

0 commit comments

Comments
 (0)