Skip to content

Commit

Permalink
add: use .knowignore in top-level ingestion path if exists (#145)
Browse files Browse the repository at this point in the history
  • Loading branch information
iwilltry42 authored Oct 18, 2024
1 parent 027f53b commit f4306a8
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 10 deletions.
28 changes: 21 additions & 7 deletions pkg/client/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,28 +22,25 @@ import (
func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID string, ingestionFunc func(path string, metadata map[string]any) error, paths ...string) (int, error) {
ingestedFilesCount := 0

var ignorePatterns []gitignore.Pattern
var ignoreFilePatterns []gitignore.Pattern
var err error
if opts.IgnoreFile != "" {
ignorePatterns, err = readIgnoreFile(opts.IgnoreFile)
ignoreFilePatterns, err = readIgnoreFile(opts.IgnoreFile)
if err != nil {
return ingestedFilesCount, fmt.Errorf("failed to read ignore file %q: %w", opts.IgnoreFile, err)
}
}

var ignoreExtensionsPatterns []gitignore.Pattern
if len(opts.IgnoreExtensions) > 0 {
for _, ext := range opts.IgnoreExtensions {
if ext != "" {
p := "*." + strings.TrimPrefix(ext, ".")
ignorePatterns = append(ignorePatterns, gitignore.ParsePattern(p, nil))
ignoreExtensionsPatterns = append(ignoreExtensionsPatterns, gitignore.ParsePattern(p, nil))
}
}
}

ignorePatterns = append(ignorePatterns, DefaultIgnorePatterns...)

ignore := gitignore.NewMatcher(ignorePatterns)

if opts.Concurrency < 1 {
opts.Concurrency = 10
}
Expand All @@ -56,6 +53,23 @@ func ingestPaths(ctx context.Context, c Client, opts *IngestPathsOpts, datasetID

for _, p := range paths {
path := p

// Build ignore matcher using patterns in increasing priority
// 1. Default ignore file
// 2. User-provided ignore file
// 3. User-provided ignore extensions
// 4. Default ignore patterns
var currentIgnorePatterns []gitignore.Pattern
defaultIgnoreFilePatterns, err := useDefaultIgnoreFileIfExists(path)
if err != nil {
return ingestedFilesCount, fmt.Errorf("failed to use default ignore file: %w", err)
}
currentIgnorePatterns = append(defaultIgnoreFilePatterns, ignoreFilePatterns...)
currentIgnorePatterns = append(currentIgnorePatterns, ignoreExtensionsPatterns...)
currentIgnorePatterns = append(currentIgnorePatterns, DefaultIgnorePatterns...)

ignore := gitignore.NewMatcher(currentIgnorePatterns)

var touchedFilePaths []string

if strings.HasPrefix(filepath.Base(filepath.Clean(path)), ".") {
Expand Down
46 changes: 43 additions & 3 deletions pkg/client/ignore.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,56 @@ import (
"github.com/go-git/go-git/v5/plumbing/format/gitignore"
)

// DefaultIgnoreFile is the file name of the ignore file that is looked up in
// the directory of each ingested path (see readDefaultIgnoreFile).
const DefaultIgnoreFile = ".knowignore"

// DefaultIgnorePatterns are gitignore patterns that ingestPaths appends to the
// ignore patterns of every ingested path, after the patterns from the default
// ignore file, the user-provided ignore file and the ignored extensions.
//
// NOTE(review): the entries for MetadataFilename, "~$*" and "$*" appear twice
// below — this looks like removed and added diff lines that were both kept;
// confirm against the real file.
var DefaultIgnorePatterns = []gitignore.Pattern{
gitignore.ParsePattern(MetadataFilename, nil), // Knowledge Metadata file
gitignore.ParsePattern("~$*", nil), // MS Office temp files
gitignore.ParsePattern("$*", nil), // Likely hidden/tempfiles
gitignore.ParsePattern(DefaultIgnoreFile, nil), // The default ignore file itself
gitignore.ParsePattern(MetadataFilename, nil), // Knowledge Metadata file
gitignore.ParsePattern("~$*", nil), // MS Office temp files
gitignore.ParsePattern("$*", nil), // Likely hidden/tempfiles
}

// isIgnored reports whether path is matched by the given ignore matcher.
// The path is split on the OS-specific path separator and handed to the
// matcher as a non-directory entry.
func isIgnored(ignore gitignore.Matcher, path string) bool {
	const isDir = false
	segments := strings.Split(path, string(filepath.Separator))
	return ignore.Match(segments, isDir)
}

// readDefaultIgnoreFile loads the patterns from the DefaultIgnoreFile located
// directly inside dirPath.
//
// It returns (nil, nil) when no such file exists, a wrapped error when the
// existence check itself fails for any other reason, and otherwise whatever
// readIgnoreFile returns for that file.
func readDefaultIgnoreFile(dirPath string) ([]gitignore.Pattern, error) {
	candidate := filepath.Join(dirPath, DefaultIgnoreFile)

	switch _, statErr := os.Stat(candidate); {
	case statErr == nil:
		return readIgnoreFile(candidate)
	case os.IsNotExist(statErr):
		// A missing default ignore file is not an error: there is simply nothing to add.
		return nil, nil
	default:
		return nil, fmt.Errorf("failed to check if ignore file %q exists: %w", candidate, statErr)
	}
}

// useDefaultIgnoreFileIfExists returns the patterns of the default ignore file
// that belongs to path.
//
// The path is first made absolute and must exist. If it is a directory, the
// ignore file is looked up inside it; otherwise it is looked up in the
// directory containing path. The lookup itself is delegated to
// readDefaultIgnoreFile; its patterns are returned unchanged and its error is
// wrapped.
func useDefaultIgnoreFileIfExists(path string) ([]gitignore.Pattern, error) {
	absPath, err := filepath.Abs(path)
	if err != nil {
		return nil, fmt.Errorf("failed to get absolute path: %w", err)
	}

	info, err := os.Stat(absPath)
	if err != nil {
		return nil, fmt.Errorf("failed to check if path %q exists: %w", absPath, err)
	}

	// For a regular file, the ignore file is expected next to it.
	lookupDir := absPath
	if !info.IsDir() {
		lookupDir = filepath.Dir(absPath)
	}

	patterns, err := readDefaultIgnoreFile(lookupDir)
	if err != nil {
		return nil, fmt.Errorf("failed to read default ignore file: %w", err)
	}
	return patterns, nil
}

func readIgnoreFile(path string) ([]gitignore.Pattern, error) {
stat, err := os.Stat(path)
if err != nil {
Expand Down

0 comments on commit f4306a8

Please sign in to comment.