-
Notifications
You must be signed in to change notification settings - Fork 16
pkg/nixpath/chunker: add #81
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
| package chunker_test | ||
|
|
||
| import ( | ||
| "bytes" | ||
| _ "embed" | ||
| "errors" | ||
| "io" | ||
| "testing" | ||
|
|
||
| "github.com/nix-community/go-nix/pkg/nixpath/chunker" | ||
| "github.com/stretchr/testify/assert" | ||
| ) | ||
|
|
||
// testData holds the contents of simple.go, embedded into the test binary.
// It is the input handed to every chunker under test and the reference the
// reassembled chunks are compared against.
//
//go:embed simple.go
var testData []byte
|
|
||
| func TestChunkers(t *testing.T) { | ||
| fastCDCChunker, err := chunker.NewFastCDCChunker(bytes.NewReader(testData)) | ||
| if err != nil { | ||
| panic(err) | ||
| } | ||
|
|
||
| chunkers := []struct { | ||
| Name string | ||
| Chunker chunker.Chunker | ||
| }{ | ||
| { | ||
| "Simple", | ||
| chunker.NewSimpleChunker(bytes.NewReader(testData)), | ||
| }, | ||
| { | ||
| "FastCDC", | ||
| fastCDCChunker, | ||
| }, | ||
| } | ||
|
|
||
| for _, chunker := range chunkers { | ||
| t.Run(chunker.Name, func(t *testing.T) { | ||
| // grab data out of the chunker. | ||
| // Ensure it matches testData. | ||
|
|
||
| var receivedData bytes.Buffer | ||
| offset := uint64(0) | ||
|
|
||
| for { | ||
| chunk, err := chunker.Chunker.Next() | ||
| if err != nil { | ||
| if errors.Is(err, io.EOF) { | ||
| break | ||
| } | ||
| assert.NoError(t, err, "no other error other than EOF is accepted") | ||
| } | ||
|
|
||
| // check the offset is sane | ||
| assert.Equal(t, offset, chunk.Offset, "recorded offset size doesn't match passed offset size") | ||
|
|
||
| offset += uint64(len(chunk.Data)) | ||
|
|
||
| // write the data into the receivedData buffer | ||
| if _, err := receivedData.Write(chunk.Data); err != nil { | ||
| panic(err) | ||
| } | ||
| } | ||
|
|
||
| // compare received chunk contents with what was passed into the chunker | ||
| assert.Equal(t, testData, receivedData.Bytes()) | ||
| }) | ||
| } | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,48 @@ | ||
| package chunker | ||
|
|
||
| import ( | ||
| "errors" | ||
| "fmt" | ||
| "io" | ||
|
|
||
| fastcdc "github.com/poolpOrg/go-fastcdc" | ||
| ) | ||
|
|
||
| func NewFastCDCChunker(r io.Reader) (Chunker, error) { // nolint:ireturn | ||
| fastcdc.NewChunkerOptions() | ||
| chunkerOpts := fastcdc.NewChunkerOptions() | ||
|
|
||
| // FUTUREWORK: Test with different chunk sizes | ||
| chunkerOpts.NormalSize = 64 * 2024 | ||
| chunkerOpts.MinSize = chunkerOpts.NormalSize / 4 | ||
| chunkerOpts.MaxSize = chunkerOpts.NormalSize * 4 | ||
|
Comment on lines
+16
to
+18
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just used the values here I used in nix-casync initially. This probably should still be refined once we can ingest a bit of data. |
||
|
|
||
| c, err := fastcdc.NewChunker(r, chunkerOpts) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("unable to initialize fastcdc: %w", err) | ||
| } | ||
|
|
||
| return &FastCDCChunker{ | ||
| c: c, | ||
| }, nil | ||
| } | ||
|
|
||
| type FastCDCChunker struct { | ||
| c *fastcdc.Chunker | ||
| } | ||
|
|
||
| func (f *FastCDCChunker) Next() (*Chunk, error) { | ||
| chunk, err := f.c.Next() | ||
| if err != nil { | ||
| if errors.Is(err, io.EOF) { | ||
| return nil, err | ||
| } | ||
|
|
||
| return nil, fmt.Errorf("error getting next chunk: %w", err) | ||
| } | ||
|
|
||
| return &Chunk{ | ||
| Offset: chunk.Offset, | ||
| Data: chunk.Data, | ||
| }, nil | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| package chunker | ||
|
|
||
// Chunk is a single piece of data handed out by a Chunker.
type Chunk struct {
	// Offset is the position at which Data starts within the chunked input.
	Offset uint64
	// Data holds the bytes making up this chunk.
	Data []byte
}

// Chunker describes the interface that a given chunker needs to implement.
// Next() is periodically called until io.EOF is encountered.
// In case of no error, Next() returns a new chunk.
type Chunker interface {
	Next() (*Chunk, error)
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| package chunker | ||
|
|
||
| import ( | ||
| "bytes" | ||
| "fmt" | ||
| "io" | ||
| ) | ||
|
|
||
| func NewSimpleChunker(r io.Reader) Chunker { // nolint:ireturn | ||
| return &SimpleChunker{ | ||
| r: r, | ||
| } | ||
| } | ||
|
|
||
| // SimpleChunker simply returns one chunk for all of the contents. | ||
| type SimpleChunker struct { | ||
| r io.Reader | ||
| done bool | ||
| } | ||
|
|
||
| func (s *SimpleChunker) Next() (*Chunk, error) { | ||
| // if we already read everything, return io.EOF | ||
| if s.done { | ||
| return nil, io.EOF | ||
| } | ||
|
|
||
| var buf bytes.Buffer | ||
|
|
||
| if _, err := io.Copy(&buf, s.r); err != nil { | ||
| return nil, fmt.Errorf("error returning from reader: %w", err) | ||
| } | ||
|
|
||
| s.done = true | ||
|
|
||
| return &Chunk{ | ||
| Offset: 0, | ||
| Data: buf.Bytes(), | ||
| }, nil | ||
| } |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will you be able to re-use that and be casync-compatible, or is that a new system altogether?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's a new chunking mechanism, with a much smaller interface. You pass in a reader to some data and can use some iterator interface to get chunks, while reading through the data.
nix-casync used desync, which provided a lot of functionality that we didn't use (.caidx). Also, the way it was designed required us to first write the data to be chunked to a (temporary) file.
The chunking method used to chunk up data shouldn't matter when it comes to substitution. However, using the same chunking method with similar parameters should yield more block reuse.