From 86e276027a2e4ac718b19b9ff336339cf1403672 Mon Sep 17 00:00:00 2001 From: edisonguo Date: Wed, 10 Jun 2020 14:35:08 +1000 Subject: [PATCH] added pattern expression for posix crawler (#438) --- crawl/crawl.go | 8 +- crawl/extractor/info_posix.go | 138 ++++++++++++++++++++++++---------- 2 files changed, 104 insertions(+), 42 deletions(-) diff --git a/crawl/crawl.go b/crawl/crawl.go index 1dafce39..47f258fb 100644 --- a/crawl/crawl.go +++ b/crawl/crawl.go @@ -38,7 +38,8 @@ func main() { ncMetadata := false var outputFormat string posix := false - var regexPattern string + var filePattern string + followSymlink := false if len(os.Args) > 2 { @@ -52,7 +53,7 @@ func main() { flagSet.BoolVar(&landsatYaml, "landsat_yaml", false, "Extract landsat metadata from its yaml files") flagSet.StringVar(&outputFormat, "fmt", "raw", "Output format. Valid values include raw and tsv") flagSet.BoolVar(&posix, "posix", false, "Extract POSIX metadata from input directory") - flagSet.StringVar(&regexPattern, "regex", "", "regex pattern for POSIX crawl") + flagSet.StringVar(&filePattern, "pattern", "", "pattern expression for POSIX crawl") flagSet.BoolVar(&followSymlink, "followSymlink", false, "Extract POSIX metadata from input directory") flagSet.Parse(os.Args[2:]) @@ -95,7 +96,8 @@ func main() { concLimit = DefaultPosixCrawlConcLimit } for _, path = range pathList { - extr.ExtractPosix(path, concLimit, regexPattern, followSymlink, outputFormat) + err := extr.ExtractPosix(path, concLimit, filePattern, followSymlink, outputFormat) + ensure(err) } return } diff --git a/crawl/extractor/info_posix.go b/crawl/extractor/info_posix.go index 87af329d..1f43294c 100644 --- a/crawl/extractor/info_posix.go +++ b/crawl/extractor/info_posix.go @@ -7,24 +7,56 @@ import ( "os" "path" "path/filepath" - "regexp" "strings" "sync" "syscall" "time" + + goeval "github.com/edisonguo/govaluate" ) -func ExtractPosix(rootDir string, conc int, pattern string, followSymlink bool, outputFormat string) { +func 
ExtractPosix(rootDir string, conc int, pattern string, followSymlink bool, outputFormat string) error { absRootDir, err := filepath.Abs(rootDir) if err != nil { - os.Stderr.Write([]byte(err.Error() + "\n")) - return + return err } - crawler := NewPosixCrawler(conc, pattern, followSymlink, outputFormat) + + expr, err := parsePatternExpression(pattern) + if err != nil { + return err + } + + crawler := NewPosixCrawler(conc, expr, followSymlink, outputFormat) err = crawler.Crawl(absRootDir) if err != nil { os.Stderr.Write([]byte(err.Error() + "\n")) } + return nil +} + +func parsePatternExpression(pattern string) (*goeval.EvaluableExpression, error) { + if len(strings.TrimSpace(pattern)) == 0 { + return nil, nil + } + + expr, err := goeval.NewEvaluableExpression(pattern) + if err != nil { + return nil, err + } + + validVariables := map[string]struct{}{"path": struct{}{}, "type": struct{}{}} + for _, token := range expr.Tokens() { + if token.Kind == goeval.VARIABLE { + varName, ok := token.Value.(string) + if !ok { + return nil, fmt.Errorf("variable token '%v' failed to cast string", token.Value) + } + if _, found := validVariables[varName]; !found { + return nil, fmt.Errorf("variable %v is not supported. 
Valid variables are %v", varName, validVariables) + } + } + } + return expr, nil } const DefaultMaxPosixErrors = 1000 @@ -34,27 +66,25 @@ type PosixCrawler struct { Outputs chan *PosixInfo Error chan error wg sync.WaitGroup - concLimit chan bool - pattern *regexp.Regexp + concLimit chan struct{} + outputDone chan struct{} + pattern *goeval.EvaluableExpression followSymlink bool outputFormat string } -func NewPosixCrawler(conc int, pattern string, followSymlink bool, outputFormat string) *PosixCrawler { +func NewPosixCrawler(conc int, pattern *goeval.EvaluableExpression, followSymlink bool, outputFormat string) *PosixCrawler { crawler := &PosixCrawler{ SubDirs: make(chan string, 4096), Outputs: make(chan *PosixInfo, 4096), Error: make(chan error, 100), wg: sync.WaitGroup{}, - concLimit: make(chan bool, conc), + concLimit: make(chan struct{}, conc), + outputDone: make(chan struct{}, 1), + pattern: pattern, followSymlink: followSymlink, outputFormat: outputFormat, } - - if len(strings.TrimSpace(pattern)) > 0 { - crawler.pattern = regexp.MustCompile(pattern) - } - return crawler } @@ -62,12 +92,12 @@ func (pc *PosixCrawler) Crawl(currPath string) error { go pc.outputResult() pc.wg.Add(1) - pc.concLimit <- false + pc.concLimit <- struct{}{} pc.crawlDir(currPath) pc.wg.Wait() close(pc.Outputs) - pc.outputResult() + <-pc.outputDone close(pc.Error) var errors []string @@ -105,8 +135,8 @@ func (pc *PosixCrawler) crawlDir(currPath string) { filePath := path.Join(currPath, fileName) fileMode := fi.Mode() - if pc.followSymlink && (fileMode&os.ModeSymlink == os.ModeSymlink) { - newFi, newPath, err := pc.resolveSymlink(currPath, fileName) + if pc.followSymlink && fileMode&os.ModeSymlink == os.ModeSymlink { + newFi, err := pc.resolveSymlink(currPath, fileName) if err != nil { select { case pc.Error <- err: @@ -116,25 +146,35 @@ func (pc *PosixCrawler) crawlDir(currPath string) { } fi = newFi - fileName = fi.Name() - filePath = path.Join(newPath, fileName) fileMode = fi.Mode() } - 
if fileMode.IsDir() { - pc.wg.Add(1) - go func(p string) { - pc.concLimit <- false - pc.crawlDir(p) - }(filePath) + validFileMode := fileMode.IsDir() || fileMode.IsRegular() + if !validFileMode { continue } - if !fileMode.IsRegular() { - continue + if pc.pattern != nil { + result, err := pc.evaluatePatternExpression(filePath, fileMode) + if err != nil { + select { + case pc.Error <- err: + default: + } + continue + } + + if !result { + continue + } } - if pc.pattern != nil && !pc.pattern.MatchString(filePath) { + if fileMode.IsDir() { + pc.wg.Add(1) + go func(p string) { + pc.concLimit <- struct{}{} + pc.crawlDir(p) + }(filePath) continue } @@ -163,12 +203,33 @@ func readDir(path string) ([]os.FileInfo, error) { return list, err } -func (pc *PosixCrawler) resolveSymlink(currPath string, linkName string) (os.FileInfo, string, error) { +func (pc *PosixCrawler) evaluatePatternExpression(filePath string, fileMode os.FileMode) (bool, error) { + var fileType string + if fileMode.IsDir() { + fileType = "d" + } else if fileMode.IsRegular() { + fileType = "f" + } + + parameters := map[string]interface{}{"type": fileType, "path": filePath} + result, err := pc.pattern.Evaluate(parameters) + if err != nil { + return false, fmt.Errorf("pattern expression: %v", err) + } + + val, ok := result.(bool) + if !ok { + return false, fmt.Errorf("pattern expression: result '%v' is not boolean", result) + } + return val, nil +} + +func (pc *PosixCrawler) resolveSymlink(currPath string, linkName string) (os.FileInfo, error) { filePath := currPath linkName = path.Join(filePath, linkName) fileName, err := os.Readlink(linkName) if err != nil { - return nil, "", err + return nil, err } if !path.IsAbs(fileName) { fileName = path.Join(filePath, fileName) @@ -177,36 +238,34 @@ func (pc *PosixCrawler) resolveSymlink(currPath string, linkName string) (os.Fil } isSymlink := true - filesSeen := make(map[string]bool) + filesSeen := make(map[string]struct{}) for { fi, err := os.Lstat(fileName) if err != 
nil { - return nil, "", err + return nil, err } if _, found := filesSeen[fileName]; found { - return nil, "", fmt.Errorf("circular symlink: %v", linkName) + return nil, fmt.Errorf("circular symlink: %v", linkName) } - filesSeen[fileName] = false + filesSeen[fileName] = struct{}{} isSymlink = fi.Mode()&os.ModeSymlink == os.ModeSymlink if isSymlink { fileName, err = os.Readlink(fileName) if err != nil { - return nil, "", err + return nil, err } if !path.IsAbs(fileName) { fileName = path.Join(filePath, fileName) fileName = filepath.Clean(fileName) filePath = filepath.Dir(fileName) } - continue } else { - return fi, filePath, nil + return fi, nil } } - } func (pc *PosixCrawler) outputResult() { @@ -218,4 +277,5 @@ func (pc *PosixCrawler) outputResult() { } fmt.Printf("%s\n", rec) } + pc.outputDone <- struct{}{} }