Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ require (
github.com/alecthomas/kong v0.5.0
github.com/dgraph-io/badger/v3 v3.2103.2
github.com/google/go-cmp v0.5.5
github.com/poolpOrg/go-fastcdc v0.0.0-20211130135149-aa8a1e8a10db
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will you be able to re-use that and be casync-compatible, or is that a new system altogether?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a new chunking mechanism, with a much smaller interface. You pass in a reader to some data and can use some iterator interface to get chunks, while reading through the data.

nix-casync used desync, which provided a lot of functionality that we didn't use (.caidx). Also, the way it was designed required us to first write the data to be chunked to a (temporary) file.

The chunking method used to chunk up data shouldn't matter when it comes to substitution. However, using the same chunking method with similar parameters should yield more block reuse.

github.com/stretchr/testify v1.7.0
)

Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/poolpOrg/go-fastcdc v0.0.0-20211130135149-aa8a1e8a10db h1:PfgdUkbymefXsGoYrYjCpZh1PcmQ3tDHKGt1fSDm1+o=
github.com/poolpOrg/go-fastcdc v0.0.0-20211130135149-aa8a1e8a10db/go.mod h1:9KpZcfiu0ZuQsRGTJ8ggDEjKUndlV6TFf8IMKNhW1qA=
github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
Expand Down
69 changes: 69 additions & 0 deletions pkg/nixpath/chunker/chunker_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package chunker_test

import (
"bytes"
_ "embed"
"errors"
"io"
"testing"

"github.com/nix-community/go-nix/pkg/nixpath/chunker"
"github.com/stretchr/testify/assert"
)

//go:embed simple.go
var testData []byte

// TestChunkers runs the embedded test data through every Chunker
// implementation and verifies that:
//   - each chunk reports an offset equal to the combined length of all
//     chunks returned before it, and
//   - the chunk contents, concatenated in order, reproduce the input.
func TestChunkers(t *testing.T) {
	fastCDCChunker, err := chunker.NewFastCDCChunker(bytes.NewReader(testData))
	if err != nil {
		t.Fatalf("unable to initialize FastCDC chunker: %v", err)
	}

	// The loop variable below is deliberately not called "chunker", so the
	// imported chunker package stays accessible inside the loop.
	testCases := []struct {
		Name    string
		Chunker chunker.Chunker
	}{
		{
			"Simple",
			chunker.NewSimpleChunker(bytes.NewReader(testData)),
		},
		{
			"FastCDC",
			fastCDCChunker,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.Name, func(t *testing.T) {
			// grab data out of the chunker.
			// Ensure it matches testData.
			var receivedData bytes.Buffer

			offset := uint64(0)

			for {
				chunk, err := tc.Chunker.Next()
				if errors.Is(err, io.EOF) {
					break
				}

				// The chunkers return a nil chunk alongside any error, so the
				// subtest has to stop here instead of dereferencing it below.
				if err != nil {
					t.Fatalf("no other error other than EOF is accepted: %v", err)
				}

				// check the offset is sane
				assert.Equal(t, offset, chunk.Offset, "recorded offset size doesn't match passed offset size")

				offset += uint64(len(chunk.Data))

				// write the data into the receivedData buffer
				if _, err := receivedData.Write(chunk.Data); err != nil {
					t.Fatalf("unable to buffer chunk data: %v", err)
				}
			}

			// compare received chunk contents with what was passed into the chunker
			assert.Equal(t, testData, receivedData.Bytes())
		})
	}
}
48 changes: 48 additions & 0 deletions pkg/nixpath/chunker/fastcdc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package chunker

import (
"errors"
"fmt"
"io"

fastcdc "github.com/poolpOrg/go-fastcdc"
)

func NewFastCDCChunker(r io.Reader) (Chunker, error) { // nolint:ireturn
fastcdc.NewChunkerOptions()
chunkerOpts := fastcdc.NewChunkerOptions()

// FUTUREWORK: Test with different chunk sizes
chunkerOpts.NormalSize = 64 * 2024
chunkerOpts.MinSize = chunkerOpts.NormalSize / 4
chunkerOpts.MaxSize = chunkerOpts.NormalSize * 4
Comment on lines +16 to +18
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just used the values here I used in nix-casync initially. This probably should still be refined once we can ingest a bit of data.


c, err := fastcdc.NewChunker(r, chunkerOpts)
if err != nil {
return nil, fmt.Errorf("unable to initialize fastcdc: %w", err)
}

return &FastCDCChunker{
c: c,
}, nil
}

// FastCDCChunker is a Chunker backed by a go-fastcdc chunker.
// Instances are created with NewFastCDCChunker.
type FastCDCChunker struct {
	// c is the wrapped fastcdc chunker that Next delegates to.
	c *fastcdc.Chunker
}

// Next asks the wrapped fastcdc chunker for its next chunk and converts it
// into a *Chunk.
//
// io.EOF is handed back unwrapped so callers can compare against it directly.
// Any other error is wrapped with additional context. In both error cases the
// returned chunk is nil.
func (f *FastCDCChunker) Next() (*Chunk, error) {
	next, err := f.c.Next()

	switch {
	case err == nil:
		result := &Chunk{
			Offset: next.Offset,
			Data:   next.Data,
		}

		return result, nil
	case errors.Is(err, io.EOF):
		// End of input: pass the error through as is.
		return nil, err
	default:
		return nil, fmt.Errorf("error getting next chunk: %w", err)
	}
}
14 changes: 14 additions & 0 deletions pkg/nixpath/chunker/interface.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package chunker

// Chunk is a single piece of data produced by a Chunker.
type Chunk struct {
	// Offset is the position at which Data starts in the chunked input.
	// The package test expects it to equal the combined length of all
	// chunks returned before this one.
	Offset uint64
	// Data holds the bytes of this chunk.
	Data []byte
}

// Chunker describes the interface that a given chunker needs to implement.
// Next() is called repeatedly until it returns io.EOF.
// When there is no error, Next() returns a new chunk.
type Chunker interface {
	// Next returns the next chunk of the input, or io.EOF once no further
	// chunks are available.
	Next() (*Chunk, error)
}
39 changes: 39 additions & 0 deletions pkg/nixpath/chunker/simple.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package chunker

import (
"bytes"
"fmt"
"io"
)

// NewSimpleChunker wraps r in a SimpleChunker and returns it as a Chunker.
// Nothing is read from r until Next is called.
func NewSimpleChunker(r io.Reader) Chunker { // nolint:ireturn
	simple := new(SimpleChunker)
	simple.r = r

	return simple
}

// SimpleChunker simply returns one chunk for all of the contents.
type SimpleChunker struct {
	// r is the reader whose entire contents are returned by the first
	// successful call to Next.
	r io.Reader
	// done is set once the single chunk has been returned; afterwards Next
	// only returns io.EOF.
	done bool
}

// Next returns everything that can be read from the underlying reader as a
// single chunk at offset 0. Once that chunk has been handed out, every
// further call returns io.EOF.
//
// If reading fails, the error is wrapped and returned with a nil chunk; the
// chunker is not marked as done in that case.
func (s *SimpleChunker) Next() (*Chunk, error) {
	// The one and only chunk was already delivered.
	if s.done {
		return nil, io.EOF
	}

	contents := &bytes.Buffer{}

	_, err := io.Copy(contents, s.r)
	if err != nil {
		return nil, fmt.Errorf("error returning from reader: %w", err)
	}

	s.done = true

	whole := &Chunk{
		Offset: 0,
		Data:   contents.Bytes(),
	}

	return whole, nil
}