Skip to content

Commit

Permalink
git-annex: create modules/annex (#21)
Browse files Browse the repository at this point in the history
This moves the `annexObjectPath()` helper out of the tests and into a
dedicated sub-package as `annex.ContentLocation()`, and expands it with
`.Pointer()` (which validates using `git annex examinekey`),
`.IsAnnexed()` and `.Content()` to make it a more useful module.

The tests retain their own wrapper version of `ContentLocation()`
because I tried to closely follow the API modules/lfs uses, which is in
terms of abstract `git.Blob` and `git.TreeEntry` objects, not in terms
of `repoPath string`s which are more convenient for the tests.
  • Loading branch information
kousu committed Nov 29, 2023
1 parent fa6acca commit a763664
Show file tree
Hide file tree
Showing 6 changed files with 184 additions and 18 deletions.
154 changes: 154 additions & 0 deletions modules/annex/annex.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

// Like modules/lfs, the functions here take git.Blobs. They rely on Blob.Repo() for a pointer
// to the on-disk repo path, since a blob by itself does not provide one (in fact, if building
// with TAGS=gogit, blobs might exist only in a mock filesystem, living only in process RAM).
// We must have the on-disk path to do anything useful with git-annex because all of its
// interesting data is on-disk under .git/annex/.

package annex

import (
"errors"
"fmt"
"os"
"path"
"strings"

"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"
)

const (
	// blobSizeCutoff is the largest blob, in bytes, that is still considered
	// a candidate git-annex pointer.
	//
	// > The maximum size of a pointer file is 32 kb.
	// - https://git-annex.branchable.com/internals/pointer_file/
	//
	// The spec does not say whether "kb" means kilobytes or kibibytes;
	// kibibytes (32 * 1024) are assumed here.
	blobSizeCutoff = 32 << 10
)

// ErrInvalidPointer is returned when a blob's content does not parse as a git-annex pointer.
// Compare against it with errors.Is rather than by message text.
var ErrInvalidPointer = errors.New("not a git-annex pointer")

// getBlobContent returns the content of the blob as raw text, truncated to at most n bytes.
// (the pre-existing blob.GetBlobContent() has a hardcoded 1024-byte limit)
//
// It returns an error if the blob's data stream cannot be opened or read.
func getBlobContent(b *git.Blob, n int) (string, error) {
	dataRc, err := b.DataAsync()
	if err != nil {
		return "", err
	}
	defer dataRc.Close()

	buf := make([]byte, n)
	// Propagate read failures instead of silently returning truncated content.
	// NOTE(review): assumes util.ReadAtMost reports a short read at EOF as a nil error — confirm.
	read, err := util.ReadAtMost(dataRc, buf)
	if err != nil {
		return "", err
	}
	return string(buf[:read]), nil
}

// Pointer extracts and validates the git-annex key stored in blob.
//
// On success it returns the key, i.e. the final path component of the first line of the blob.
// It returns ErrInvalidPointer if the blob is larger than blobSizeCutoff, if its first line
// does not contain "/annex/", or if `git annex examinekey` rejects the key as a bad key.
// Any other failure (reading the blob, running git-annex) is returned as an ordinary error.
func Pointer(blob *git.Blob) (string, error) {
	// git-annex doesn't seem to fully specify what its pointers are, but
	// the fullest description is here:
	// https://git-annex.branchable.com/internals/pointer_file/

	// a pointer can be:
	// the original format, generated by `git annex add`: a symlink to '.git/annex/objects/$HASHDIR/$HASHDIR2/$KEY/$KEY'
	// the newer, git-lfs influenced, format, generated by `git annex smudge`: a text file containing '/annex/objects/$KEY'
	//
	// in either case we can extract the $KEY the same way, and we need not actually know if it's a symlink or not because
	// git.Blob.DataAsync() works like open() + readlink(), handling both cases in one.

	if blob.Size() > blobSizeCutoff {
		// > The maximum size of a pointer file is 32 kb. If it is any longer, it is not considered to be a valid pointer file.
		// https://git-annex.branchable.com/internals/pointer_file/

		// It's unclear to me whether the same size limit applies to symlink-pointers, but it seems sensible to limit them too.
		return "", ErrInvalidPointer
	}

	pointer, err := getBlobContent(blob, blobSizeCutoff)
	if err != nil {
		return "", fmt.Errorf("error reading %s: %w", blob.Name(), err)
	}

	// the spec says a pointer file can contain multiple lines each with a pointer in them
	// but that makes no sense to me, so I'm just ignoring all but the first.
	// Cutting at the first newline avoids splitting the whole blob just to keep one line.
	if eol := strings.IndexByte(pointer, '\n'); eol >= 0 {
		pointer = pointer[:eol]
	}

	// in both the symlink and pointer-file formats, the pointer must have "/annex/" somewhere in it
	if !strings.Contains(pointer, "/annex/") {
		return "", ErrInvalidPointer
	}

	// extract $KEY
	pointer = path.Base(strings.TrimSpace(pointer))

	// ask git-annex's opinion on $KEY
	// XXX: this is probably a bit slow, especially if this operation gets run often
	// and examinekey is not that strict:
	// - it doesn't enforce that the "BACKEND" tag is one it knows,
	// - it doesn't enforce that the fields and their format fit the "BACKEND" tag
	// so maybe this is a wasteful step
	_, examineStderr, err := git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "examinekey").AddDynamicArguments(pointer).RunStdString(&git.RunOpts{Dir: blob.Repo().Path})
	if err != nil {
		// TODO: make ErrInvalidPointer into a type capable of wrapping err
		if strings.TrimSpace(examineStderr) == "git-annex: bad key" {
			return "", ErrInvalidPointer
		}
		return "", err
	}

	return pointer, nil
}

// ContentLocation resolves the annex pointer stored in blob to the path of the
// annexed content, formed by joining the blob's repository path with the
// location reported by `git annex contentlocation`.
//
// It returns an error if blob is not a valid pointer (see Pointer) or if
// git-annex cannot locate the content in this repo.
func ContentLocation(blob *git.Blob) (string, error) {
	key, err := Pointer(blob)
	if err != nil {
		return "", err
	}

	repoPath := blob.Repo().Path
	cmd := git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "contentlocation").AddDynamicArguments(key)
	stdout, _, err := cmd.RunStdString(&git.RunOpts{Dir: repoPath})
	if err != nil {
		return "", fmt.Errorf("in %s: %s does not seem to be a valid annexed file: %w", repoPath, key, err)
	}

	// Rooting the reported location at "/" before cleaning collapses any ".."
	// components, which prevents directory traversals out of the repo.
	relative := path.Clean("/" + strings.TrimSpace(stdout))[1:]
	return path.Join(repoPath, relative), nil
}

// Content opens the annexed content that blob points to and returns the open
// file; the caller is responsible for closing it.
//
// Errors from ContentLocation and from os.Open are returned unchanged.
func Content(blob *git.Blob) (*os.File, error) {
	location, err := ContentLocation(blob)
	if err == nil {
		return os.Open(location)
	}
	return nil, err
}

// IsAnnexed reports whether blob appears to be a valid annex pointer.
//
// It always reports false when setting.Annex.Enabled is off. It does *not*
// verify that the content is actually present in this repo; for that, use
// ContentLocation().
func IsAnnexed(blob *git.Blob) (bool, error) {
	if !setting.Annex.Enabled {
		return false, nil
	}

	// Pointer() only ever returns well-formed pointers, so its error value
	// alone decides the answer.
	_, err := Pointer(blob)
	switch {
	case err == nil:
		return true, nil
	case errors.Is(err, ErrInvalidPointer):
		// not a pointer: a normal negative answer, not a failure
		return false, nil
	default:
		return false, err
	}
}
4 changes: 4 additions & 0 deletions modules/git/blob.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ import (

// This file contains common functions between the gogit and !gogit variants for git Blobs

// Repo returns the Repository recorded on this blob (its repo field).
func (b *Blob) Repo() *Repository {
return b.repo
}

// Name returns name of the tree entry this blob object was created from (or empty string)
func (b *Blob) Name() string {
return b.name
Expand Down
3 changes: 2 additions & 1 deletion modules/git/blob_gogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ import (

// Blob represents a Git object.
type Blob struct {
ID SHA1
ID SHA1
repo *Repository

gogitEncodedObj plumbing.EncodedObject
name string
Expand Down
1 change: 1 addition & 0 deletions modules/git/repo_blob_gogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ func (repo *Repository) getBlob(id SHA1) (*Blob, error) {

return &Blob{
ID: id,
repo: repo,
gogitEncodedObj: encodedObj,
}, nil
}
1 change: 1 addition & 0 deletions modules/git/tree_entry_gogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ func (te *TreeEntry) Blob() *Blob {

return &Blob{
ID: te.gogitTreeEntry.Hash,
repo: te.ptree.repo,
gogitEncodedObj: encodedObj,
name: te.Name(),
}
Expand Down
39 changes: 22 additions & 17 deletions tests/integration/git_annex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"code.gitea.io/gitea/models/db"
"code.gitea.io/gitea/models/perm"
repo_model "code.gitea.io/gitea/models/repo"
"code.gitea.io/gitea/modules/annex"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
api "code.gitea.io/gitea/modules/structs"
Expand Down Expand Up @@ -788,13 +789,13 @@ func doAnnexDownloadTest(remoteRepoPath, repoPath string) (err error) {
}

// verify the file was downloaded
localObjectPath, err := annexObjectPath(repoPath, "large.bin")
localObjectPath, err := contentLocation(repoPath, "large.bin")
if err != nil {
return err
}
// localObjectPath := path.Join(repoPath, "large.bin") // or, just compare against the checked-out file

remoteObjectPath, err := annexObjectPath(remoteRepoPath, "large.bin")
remoteObjectPath, err := contentLocation(remoteRepoPath, "large.bin")
if err != nil {
return err
}
Expand Down Expand Up @@ -841,13 +842,13 @@ func doAnnexUploadTest(remoteRepoPath, repoPath string) (err error) {
}

// verify the file was uploaded
localObjectPath, err := annexObjectPath(repoPath, "contribution.bin")
localObjectPath, err := contentLocation(repoPath, "contribution.bin")
if err != nil {
return err
}
// localObjectPath := path.Join(repoPath, "contribution.bin") // or, just compare against the checked-out file

remoteObjectPath, err := annexObjectPath(remoteRepoPath, "contribution.bin")
remoteObjectPath, err := contentLocation(remoteRepoPath, "contribution.bin")
if err != nil {
return err
}
Expand Down Expand Up @@ -1001,26 +1002,30 @@ Find the path in .git/annex/objects/ of the contents for a given annexed file.
TODO: pass a parameter to allow examining non-HEAD branches
*/
func annexObjectPath(repoPath, file string) (string, error) {
// NB: `git annex lookupkey` is more reliable, but doesn't work in bare repos.
annexKey, _, err := git.NewCommandContextNoGlobals(git.DefaultContext, "show").AddDynamicArguments("HEAD:" + file).RunStdString(&git.RunOpts{Dir: repoPath})
func contentLocation(repoPath, file string) (path string, err error) {
path = ""

repo, err := git.OpenRepository(git.DefaultContext, repoPath)
if err != nil {
return path, nil
}

commitID, err := repo.GetRefCommitID("HEAD") // NB: to examine a *branch*, prefix with "refs/branch/", or call repo.GetBranchCommitID(); ditto for tags
if err != nil {
return "", fmt.Errorf("in %s: %w", repoPath, err) // the error from git prints the filename but not repo
return path, nil
}

// There are two formats an annexed file pointer might be:
// * a symlink to .git/annex/objects/$HASHDIR/$ANNEX_KEY/$ANNEX_KEY - used by files created with 'git annex add'
// * a text file containing /annex/objects/$ANNEX_KEY - used by files for which 'git add' was configured to run git-annex-smudge
// This recovers $ANNEX_KEY from either case:
annexKey = path.Base(strings.TrimSpace(annexKey))
commit, err := repo.GetCommit(commitID)
if err != nil {
return path, nil
}

contentPath, _, err := git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "contentlocation").AddDynamicArguments(annexKey).RunStdString(&git.RunOpts{Dir: repoPath})
treeEntry, err := commit.GetTreeEntryByPath(file)
if err != nil {
return "", fmt.Errorf("in %s: %s does not seem to be annexed: %w", repoPath, file, err)
return path, nil
}
contentPath = strings.TrimSpace(contentPath)

return path.Join(repoPath, contentPath), nil
return annex.ContentLocation(treeEntry.Blob())
}

/* like withKeyFile(), but automatically sets it the account given in ctx for use by git-annex */
Expand Down

0 comments on commit a763664

Please sign in to comment.