Correctly compute UncompressedSize on zstd:chunked pull, don’t set it on estargz #2130

Merged (3 commits) on Oct 15, 2024
4 changes: 2 additions & 2 deletions drivers/driver.go
@@ -189,14 +189,14 @@ type Driver interface {
type DriverWithDifferOutput struct {
Differ Differ
Target string
- Size int64
+ Size int64 // Size of the uncompressed layer, -1 if unknown. Must be known if UncompressedDigest is set.
UIDs []uint32
GIDs []uint32
UncompressedDigest digest.Digest
CompressedDigest digest.Digest
Metadata string
BigData map[string][]byte
- TarSplit []byte
+ TarSplit []byte // nil if not available
TOCDigest digest.Digest
// RootDirMode is the mode of the root directory of the layer, if specified.
RootDirMode *os.FileMode
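The new Size comment states an invariant, not just a description: -1 means unknown, but an unknown size is only permitted while UncompressedDigest is unset. A minimal sanity check makes the rule explicit; this helper is hypothetical and not part of the PR, assuming the graphdriver import alias used elsewhere in this repository:

import (
    "errors"

    graphdriver "github.com/containers/storage/drivers"
)

// validateDifferOutputSize enforces the documented Size invariant:
// Size may be -1 (unknown) only when UncompressedDigest is unset.
func validateDifferOutputSize(out *graphdriver.DriverWithDifferOutput) error {
    if out.UncompressedDigest != "" && out.Size == -1 {
        return errors.New("UncompressedDigest is set but Size is unknown")
    }
    return nil
}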
15 changes: 9 additions & 6 deletions layers.go
@@ -136,9 +136,12 @@ type Layer struct {
TOCDigest digest.Digest `json:"toc-digest,omitempty"`

// UncompressedSize is the length of the blob that was last passed to
- // ApplyDiff() or create(), after we decompressed it. If
- // UncompressedDigest is not set, this should be treated as if it were
- // an uninitialized value.
+ // ApplyDiff() or create(), after we decompressed it.
+ //
+ // - If UncompressedDigest is set, this must be set to a valid value.
+ // - Otherwise, if TOCDigest is set, this is either valid or -1.
+ // - If neither of these digests is set, this should be treated as if it were
+ //   an uninitialized value.
Comment from Contributor on lines +141 to +144:

You need indentation for list items (same as for code blocks, i.e. an extra space).

Suggested change:
- // - If UncompressedDigest is set, this must be set to a valid value.
- // - Otherwise, if TOCDigest is set, this is either valid or -1.
- // - If neither of these digests is set, this should be treated as if it were
- //   an uninitialized value.
+ //  - If UncompressedDigest is set, this must be set to a valid value.
+ //  - Otherwise, if TOCDigest is set, this is either valid or -1.
+ //  - If neither of these digests is set, this should be treated as if it were
+ //    an uninitialized value.

Reply from Collaborator (author):

Yeah, gofmt didn’t want to help with this (and struct field comments are formatted in HTML just as an uninterpreted code block), so I punted.

You’re right, this is the right thing to do. Fixed in #2136.

UncompressedSize int64 `json:"diff-size,omitempty"`

// CompressionType is the type of compression which we detected on the blob
@@ -1214,8 +1217,8 @@ func (r *layerStore) Size(name string) (int64, error) {
// We use the presence of a non-empty digest as an indicator that the size value was intentionally set, and that
// a zero value is not just present because it was never set to anything else (which can happen if the layer was
// created by a version of this library that didn't keep track of digest and size information).
- if layer.TOCDigest != "" || layer.UncompressedDigest != "" {
- return layer.UncompressedSize, nil
+ if layer.UncompressedDigest != "" || layer.TOCDigest != "" {
+ return layer.UncompressedSize, nil // This may return -1 if only TOCDigest is set
}
return -1, nil
}
@@ -2510,7 +2513,7 @@ func (r *layerStore) applyDiffFromStagingDirectory(id string, diffOutput *driver
return err
}

- if len(diffOutput.TarSplit) != 0 {
+ if diffOutput.TarSplit != nil {
tsdata := bytes.Buffer{}
compressor, err := pgzip.NewWriterLevel(&tsdata, pgzip.BestSpeed)
if err != nil {
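The three-way contract above is easier to scan as code. A minimal sketch of how a consumer would interpret the field, assuming it lives alongside the Layer type in package storage; this helper is hypothetical, not part of the c/storage API, and mirrors the layerStore.Size() change above:

// uncompressedSizeOf returns the uncompressed size of layer, or -1 if unknown.
func uncompressedSizeOf(layer *Layer) int64 {
    switch {
    case layer.UncompressedDigest != "":
        return layer.UncompressedSize // invariant: must be a valid size here
    case layer.TOCDigest != "":
        return layer.UncompressedSize // a valid size, or -1 (e.g. a TOC pull without a tar-split)
    default:
        return -1 // never recorded; the stored zero value is meaningless
    }
}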
34 changes: 32 additions & 2 deletions pkg/chunked/compression_linux.go
@@ -139,7 +139,7 @@ func readEstargzChunkedManifest(blobStream ImageSourceSeekable, blobSize int64,
}

// readZstdChunkedManifest reads the zstd:chunked manifest from the seekable stream blobStream.
- // Returns (manifest blob, parsed manifest, tar-split blob, manifest offset).
+ // Returns (manifest blob, parsed manifest, tar-split blob or nil, manifest offset).
func readZstdChunkedManifest(blobStream ImageSourceSeekable, tocDigest digest.Digest, annotations map[string]string) ([]byte, *internal.TOC, []byte, int64, error) {
offsetMetadata := annotations[internal.ManifestInfoKey]
if offsetMetadata == "" {
@@ -214,7 +214,7 @@ func readZstdChunkedManifest(blobStream ImageSourceSeekable, tocDigest digest.Di
return nil, nil, nil, 0, fmt.Errorf("unmarshaling TOC: %w", err)
}

- decodedTarSplit := []byte{}
+ var decodedTarSplit []byte = nil
if toc.TarSplitDigest != "" {
if tarSplitChunk.Offset <= 0 {
return nil, nil, nil, 0, fmt.Errorf("TOC requires a tar-split, but the %s annotation does not describe a position", internal.TarSplitInfoKey)
@@ -288,6 +288,36 @@ func ensureTOCMatchesTarSplit(toc *internal.TOC, tarSplit []byte) error {
return nil
}

+ // tarSizeFromTarSplit computes the total tarball size, using only the tarSplit metadata
+ func tarSizeFromTarSplit(tarSplit []byte) (int64, error) {
+ var res int64 = 0
+
+ unpacker := storage.NewJSONUnpacker(bytes.NewReader(tarSplit))
+ for {
+ entry, err := unpacker.Next()
+ if err != nil {
+ if err == io.EOF {
+ break
+ }
+ return -1, fmt.Errorf("reading tar-split entries: %w", err)
+ }
+ switch entry.Type {
+ case storage.SegmentType:
+ res += int64(len(entry.Payload))
+ case storage.FileType:
+ // entry.Size is the “logical size”, which might not be the physical size for sparse entries;
+ // but the way tar-split/tar/asm.WriteOutputTarStream combines FileType entries and returned files contents,
+ // sparse files are not supported.
Comment from Contributor:

But we don't error out today?

Reply from Collaborator (author):

If we should be erroring out, that’s something that should happen at tar-split construction.

The physical size outright isn’t available in the tar-split format, so here, this consumer is only documenting the impact of that assumption.

(As is, sparse files are barely supported by archive/tar or the tar-split fork: they are indicated either by a special file type, or as a regular file with a special PAX record; and there is no API to expose the data / hole segments. Also, c/storage/pkg/archive does not special-case them at all.)

+ // Also https://github.com/opencontainers/image-spec/blob/main/layer.md says
+ // > Sparse files SHOULD NOT be used because they lack consistent support across tar implementations.
+ res += entry.Size
+ default:
+ return -1, fmt.Errorf("unexpected tar-split entry type %q", entry.Type)
+ }
+ }
+ return res, nil
+ }

// ensureTimePointersMatch ensures that a and b are equal
func ensureTimePointersMatch(a, b *time.Time) error {
// We didn’t always use “timeIfNotZero” when creating the TOC, so treat time.IsZero the same as nil.
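To see why summing SegmentType payload lengths and FileType sizes reproduces the exact tarball length, it helps to dump a tar-split stream. The following standalone sketch is not part of this PR; it reuses the same tar-split APIs as the new test below, packing a one-file tarball and walking the resulting entries:

package main

import (
    "bytes"
    "fmt"
    "io"

    "github.com/vbatts/tar-split/archive/tar"
    "github.com/vbatts/tar-split/tar/asm"
    "github.com/vbatts/tar-split/tar/storage"
)

func check(err error) {
    if err != nil {
        panic(err)
    }
}

func main() {
    // Build a tiny tarball containing one 5-byte file.
    var tarball bytes.Buffer
    tw := tar.NewWriter(&tarball)
    check(tw.WriteHeader(&tar.Header{Name: "hello.txt", Size: 5, Mode: 0o644}))
    _, err := tw.Write([]byte("hello"))
    check(err)
    check(tw.Close())
    fmt.Println("tarball bytes:", tarball.Len())

    // Pack it into tar-split JSON; file payloads are discarded, only metadata is kept.
    var tarSplit bytes.Buffer
    r, err := asm.NewInputTarStream(&tarball, storage.NewJSONPacker(&tarSplit), storage.NewDiscardFilePutter())
    check(err)
    _, err = io.Copy(io.Discard, r)
    check(err)

    // Walk the entries: SegmentType carries raw tar bytes (headers, padding, EOF blocks);
    // FileType carries only the logical payload size. Their sum is the tarball length.
    var total int64
    up := storage.NewJSONUnpacker(bytes.NewReader(tarSplit.Bytes()))
    for {
        e, err := up.Next()
        if err == io.EOF {
            break
        }
        check(err)
        switch e.Type {
        case storage.SegmentType:
            fmt.Printf("segment: %d raw bytes\n", len(e.Payload))
            total += int64(len(e.Payload))
        case storage.FileType:
            fmt.Printf("file %q: %d payload bytes\n", e.Name, e.Size)
            total += e.Size
        }
    }
    fmt.Println("total from tar-split:", total) // matches tarball.Len()
}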
45 changes: 45 additions & 0 deletions pkg/chunked/compression_linux_test.go
@@ -0,0 +1,45 @@
package chunked

import (
"bytes"
"io"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/vbatts/tar-split/archive/tar"
"github.com/vbatts/tar-split/tar/asm"
"github.com/vbatts/tar-split/tar/storage"
)

func TestTarSizeFromTarSplit(t *testing.T) {
var tarball bytes.Buffer
tarWriter := tar.NewWriter(&tarball)
for _, e := range someFiles {
tf, err := typeToTarType(e.Type)
require.NoError(t, err)
err = tarWriter.WriteHeader(&tar.Header{
Typeflag: tf,
Name: e.Name,
Size: e.Size,
Mode: e.Mode,
})
require.NoError(t, err)
data := make([]byte, e.Size)
_, err = tarWriter.Write(data)
require.NoError(t, err)
}
err := tarWriter.Close()
require.NoError(t, err)
expectedTarSize := int64(tarball.Len())

var tarSplit bytes.Buffer
tsReader, err := asm.NewInputTarStream(&tarball, storage.NewJSONPacker(&tarSplit), storage.NewDiscardFilePutter())
require.NoError(t, err)
_, err = io.Copy(io.Discard, tsReader)
require.NoError(t, err)

res, err := tarSizeFromTarSplit(tarSplit.Bytes())
require.NoError(t, err)
assert.Equal(t, expectedTarSize, res)
}
81 changes: 41 additions & 40 deletions pkg/chunked/storage_linux.go
@@ -89,7 +89,8 @@ type chunkedDiffer struct {
// is no TOC referenced by the manifest.
blobDigest digest.Digest

- blobSize int64
+ blobSize            int64
+ uncompressedTarSize int64 // -1 if unknown

pullOptions map[string]string

@@ -216,6 +217,7 @@ func makeConvertFromRawDiffer(store storage.Store, blobDigest digest.Digest, blo
fsVerityDigests: make(map[string]string),
blobDigest: blobDigest,
blobSize: blobSize,
+ uncompressedTarSize: -1, // Will be computed later
convertToZstdChunked: true,
copyBuffer: makeCopyBuffer(),
layersCache: layersCache,
@@ -229,24 +231,33 @@ func makeZstdChunkedDiffer(store storage.Store, blobSize int64, tocDigest digest
if err != nil {
return nil, fmt.Errorf("read zstd:chunked manifest: %w", err)
}
+ var uncompressedTarSize int64 = -1
+ if tarSplit != nil {
+ uncompressedTarSize, err = tarSizeFromTarSplit(tarSplit)
+ if err != nil {
+ return nil, fmt.Errorf("computing size from tar-split")
Comment from Contributor:

nit: errors.New, or fmt.Errorf("computing size from tar-split: %w", err), or just err.

Reply from Collaborator (author):

Thanks! Fixed in #2136.
+ }
+ }

layersCache, err := getLayersCache(store)
if err != nil {
return nil, err
}

return &chunkedDiffer{
- fsVerityDigests: make(map[string]string),
- blobSize:        blobSize,
- tocDigest:       tocDigest,
- copyBuffer:      makeCopyBuffer(),
- fileType:        fileTypeZstdChunked,
- layersCache:     layersCache,
- manifest:        manifest,
- toc:             toc,
- pullOptions:     pullOptions,
- stream:          iss,
- tarSplit:        tarSplit,
- tocOffset:       tocOffset,
+ fsVerityDigests:     make(map[string]string),
+ blobSize:            blobSize,
+ uncompressedTarSize: uncompressedTarSize,
+ tocDigest:           tocDigest,
+ copyBuffer:          makeCopyBuffer(),
+ fileType:            fileTypeZstdChunked,
+ layersCache:         layersCache,
+ manifest:            manifest,
+ toc:                 toc,
+ pullOptions:         pullOptions,
+ stream:              iss,
+ tarSplit:            tarSplit,
+ tocOffset:           tocOffset,
}, nil
}

@@ -261,16 +272,17 @@ func makeEstargzChunkedDiffer(store storage.Store, blobSize int64, tocDigest dig
}

return &chunkedDiffer{
- fsVerityDigests: make(map[string]string),
- blobSize:        blobSize,
- tocDigest:       tocDigest,
- copyBuffer:      makeCopyBuffer(),
- fileType:        fileTypeEstargz,
- layersCache:     layersCache,
- manifest:        manifest,
- pullOptions:     pullOptions,
- stream:          iss,
- tocOffset:       tocOffset,
+ fsVerityDigests:     make(map[string]string),
+ blobSize:            blobSize,
+ uncompressedTarSize: -1, // We would have to read and decompress the whole layer
+ tocDigest:           tocDigest,
+ copyBuffer:          makeCopyBuffer(),
+ fileType:            fileTypeEstargz,
+ layersCache:         layersCache,
+ manifest:            manifest,
+ pullOptions:         pullOptions,
+ stream:              iss,
+ tocOffset:           tocOffset,
}, nil
}

@@ -1153,7 +1165,6 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff

var compressedDigest digest.Digest
var uncompressedDigest digest.Digest
- var convertedBlobSize int64

if c.convertToZstdChunked {
fd, err := unix.Open(dest, unix.O_TMPFILE|unix.O_RDWR|unix.O_CLOEXEC, 0o600)
@@ -1185,7 +1196,7 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff
if err != nil {
return graphdriver.DriverWithDifferOutput{}, err
}
- convertedBlobSize = tarSize
+ c.uncompressedTarSize = tarSize
// fileSource is a O_TMPFILE file descriptor, so we
// need to keep it open until the entire file is processed.
defer fileSource.Close()
@@ -1255,6 +1266,7 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff
TOCDigest: c.tocDigest,
UncompressedDigest: uncompressedDigest,
CompressedDigest: compressedDigest,
+ Size: c.uncompressedTarSize,
}

// When the hard links deduplication is used, file attributes are ignored because setting them
Expand All @@ -1268,19 +1280,12 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff

var missingParts []missingPart

- mergedEntries, totalSizeFromTOC, err := c.mergeTocEntries(c.fileType, toc.Entries)
+ mergedEntries, err := c.mergeTocEntries(c.fileType, toc.Entries)
if err != nil {
return output, err
}

output.UIDs, output.GIDs = collectIDs(mergedEntries)
- if convertedBlobSize > 0 {
- // if the image was converted, store the original tar size, so that
- // it can be recreated correctly.
- output.Size = convertedBlobSize
- } else {
- output.Size = totalSizeFromTOC
- }

if err := maybeDoIDRemap(mergedEntries, options); err != nil {
return output, err
@@ -1597,9 +1602,7 @@ func mustSkipFile(fileType compressedFileType, e internal.FileMetadata) bool {
return false
}

- func (c *chunkedDiffer) mergeTocEntries(fileType compressedFileType, entries []internal.FileMetadata) ([]fileMetadata, int64, error) {
- var totalFilesSize int64
-
+ func (c *chunkedDiffer) mergeTocEntries(fileType compressedFileType, entries []internal.FileMetadata) ([]fileMetadata, error) {
countNextChunks := func(start int) int {
count := 0
for _, e := range entries[start:] {
@@ -1629,10 +1632,8 @@ func (c *chunkedDiffer) mergeTocEntries(fileType compressedFileType, entries []i
continue
}

- totalFilesSize += e.Size
-
if e.Type == TypeChunk {
- return nil, -1, fmt.Errorf("chunk type without a regular file")
+ return nil, fmt.Errorf("chunk type without a regular file")
}

if e.Type == TypeReg {
@@ -1668,7 +1669,7 @@ func (c *chunkedDiffer) mergeTocEntries(fileType compressedFileType, entries []i
lastChunkOffset = mergedEntries[i].chunks[j].Offset
}
}
- return mergedEntries, totalFilesSize, nil
+ return mergedEntries, nil
}

// validateChunkChecksum checks if the file at $root/$path[offset:chunk.ChunkSize] has the
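Taken together, the storage_linux.go changes make the differ carry a single authoritative uncompressedTarSize instead of deriving output.Size from summed TOC entry sizes, which counted only file payloads and missed tar headers and padding. A condensed sketch of where the value now comes from in each mode; the switch below is illustrative only, reusing names from the diff, and is not code from this PR:

// uncompressedTarSizeFor summarizes the construction paths above (illustrative only).
func uncompressedTarSizeFor(fileType compressedFileType, tarSplit []byte) (int64, error) {
    switch fileType {
    case fileTypeZstdChunked:
        if tarSplit != nil {
            return tarSizeFromTarSplit(tarSplit) // exact, from metadata alone
        }
        return -1, nil // no tar-split available: unknown
    case fileTypeEstargz:
        return -1, nil // knowing it would require decompressing the whole layer
    default:
        // The convert-from-raw path also starts at -1; ApplyDiff sets
        // c.uncompressedTarSize = tarSize once conversion has measured the stream.
        return -1, nil
    }
}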
2 changes: 1 addition & 1 deletion store.go
@@ -2201,7 +2201,7 @@ func (s *store) ImageSize(id string) (int64, error) {
}
// The UncompressedSize is only valid if there's a digest to go with it.
n := layer.UncompressedSize
- if layer.UncompressedDigest == "" {
+ if layer.UncompressedDigest == "" || n == -1 {
// Compute the size.
n, err = layerStore.DiffSize("", layer.ID)
if err != nil {