Skip to content

Commit

Permalink
added pattern expression for posix crawler (#438)
Browse files Browse the repository at this point in the history
  • Loading branch information
edisonguo authored Jun 10, 2020
1 parent e7b2b60 commit 86e2760
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 42 deletions.
8 changes: 5 additions & 3 deletions crawl/crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ func main() {
ncMetadata := false
var outputFormat string
posix := false
var regexPattern string
var filePattern string

followSymlink := false

if len(os.Args) > 2 {
Expand All @@ -52,7 +53,7 @@ func main() {
flagSet.BoolVar(&landsatYaml, "landsat_yaml", false, "Extract landsat metadata from its yaml files")
flagSet.StringVar(&outputFormat, "fmt", "raw", "Output format. Valid values include raw and tsv")
flagSet.BoolVar(&posix, "posix", false, "Extract POSIX metadata from input directory")
flagSet.StringVar(&regexPattern, "regex", "", "regex pattern for POSIX crawl")
flagSet.StringVar(&filePattern, "pattern", "", "pattern expression for POSIX crawl")
flagSet.BoolVar(&followSymlink, "followSymlink", false, "Extract POSIX metadata from input directory")
flagSet.Parse(os.Args[2:])

Expand Down Expand Up @@ -95,7 +96,8 @@ func main() {
concLimit = DefaultPosixCrawlConcLimit
}
for _, path = range pathList {
extr.ExtractPosix(path, concLimit, regexPattern, followSymlink, outputFormat)
err := extr.ExtractPosix(path, concLimit, filePattern, followSymlink, outputFormat)
ensure(err)
}
return
}
Expand Down
138 changes: 99 additions & 39 deletions crawl/extractor/info_posix.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,56 @@ import (
"os"
"path"
"path/filepath"
"regexp"
"strings"
"sync"
"syscall"
"time"

goeval "github.com/edisonguo/govaluate"
)

func ExtractPosix(rootDir string, conc int, pattern string, followSymlink bool, outputFormat string) {
func ExtractPosix(rootDir string, conc int, pattern string, followSymlink bool, outputFormat string) error {
absRootDir, err := filepath.Abs(rootDir)
if err != nil {
os.Stderr.Write([]byte(err.Error() + "\n"))
return
return err
}
crawler := NewPosixCrawler(conc, pattern, followSymlink, outputFormat)

expr, err := parsePatternExpression(pattern)
if err != nil {
return err
}

crawler := NewPosixCrawler(conc, expr, followSymlink, outputFormat)
err = crawler.Crawl(absRootDir)
if err != nil {
os.Stderr.Write([]byte(err.Error() + "\n"))
}
return nil
}

// parsePatternExpression compiles the user-supplied pattern expression and
// verifies that it only references the variables "path" and "type".
//
// A blank (empty or whitespace-only) pattern means that no filtering was
// requested; in that case both the returned expression and the error are nil.
// Otherwise the compiled expression is returned, or an error when the
// expression cannot be parsed or mentions an unsupported variable.
func parsePatternExpression(pattern string) (*goeval.EvaluableExpression, error) {
	if strings.TrimSpace(pattern) == "" {
		return nil, nil
	}

	expr, err := goeval.NewEvaluableExpression(pattern)
	if err != nil {
		return nil, err
	}

	// An ordered slice (rather than a set) so that the error message below can
	// list the names readably; with only two entries a linear scan is fine.
	validVariables := []string{"path", "type"}
	isValid := func(name string) bool {
		for _, v := range validVariables {
			if v == name {
				return true
			}
		}
		return false
	}

	for _, token := range expr.Tokens() {
		if token.Kind != goeval.VARIABLE {
			continue
		}

		varName, ok := token.Value.(string)
		if !ok {
			return nil, fmt.Errorf("variable token '%v' failed to cast string", token.Value)
		}
		if !isValid(varName) {
			// Join the names explicitly: formatting a map[string]struct{} with
			// %v would print "map[path:{} type:{}]" to the user.
			return nil, fmt.Errorf("variable %v is not supported. Valid variables are %v", varName, strings.Join(validVariables, ", "))
		}
	}
	return expr, nil
}

const DefaultMaxPosixErrors = 1000
Expand All @@ -34,40 +66,38 @@ type PosixCrawler struct {
Outputs chan *PosixInfo
Error chan error
wg sync.WaitGroup
concLimit chan bool
pattern *regexp.Regexp
concLimit chan struct{}
outputDone chan struct{}
pattern *goeval.EvaluableExpression
followSymlink bool
outputFormat string
}

func NewPosixCrawler(conc int, pattern string, followSymlink bool, outputFormat string) *PosixCrawler {
func NewPosixCrawler(conc int, pattern *goeval.EvaluableExpression, followSymlink bool, outputFormat string) *PosixCrawler {
crawler := &PosixCrawler{
SubDirs: make(chan string, 4096),
Outputs: make(chan *PosixInfo, 4096),
Error: make(chan error, 100),
wg: sync.WaitGroup{},
concLimit: make(chan bool, conc),
concLimit: make(chan struct{}, conc),
outputDone: make(chan struct{}, 1),
pattern: pattern,
followSymlink: followSymlink,
outputFormat: outputFormat,
}

if len(strings.TrimSpace(pattern)) > 0 {
crawler.pattern = regexp.MustCompile(pattern)
}

return crawler
}

func (pc *PosixCrawler) Crawl(currPath string) error {
go pc.outputResult()

pc.wg.Add(1)
pc.concLimit <- false
pc.concLimit <- struct{}{}
pc.crawlDir(currPath)
pc.wg.Wait()

close(pc.Outputs)
pc.outputResult()
<-pc.outputDone

close(pc.Error)
var errors []string
Expand Down Expand Up @@ -105,8 +135,8 @@ func (pc *PosixCrawler) crawlDir(currPath string) {
filePath := path.Join(currPath, fileName)
fileMode := fi.Mode()

if pc.followSymlink && (fileMode&os.ModeSymlink == os.ModeSymlink) {
newFi, newPath, err := pc.resolveSymlink(currPath, fileName)
if pc.followSymlink && fileMode&os.ModeSymlink == os.ModeSymlink {
newFi, err := pc.resolveSymlink(currPath, fileName)
if err != nil {
select {
case pc.Error <- err:
Expand All @@ -116,25 +146,35 @@ func (pc *PosixCrawler) crawlDir(currPath string) {
}

fi = newFi
fileName = fi.Name()
filePath = path.Join(newPath, fileName)
fileMode = fi.Mode()
}

if fileMode.IsDir() {
pc.wg.Add(1)
go func(p string) {
pc.concLimit <- false
pc.crawlDir(p)
}(filePath)
validFileMode := fileMode.IsDir() || fileMode.IsRegular()
if !validFileMode {
continue
}

if !fileMode.IsRegular() {
continue
if pc.pattern != nil {
result, err := pc.evaluatePatternExpression(filePath, fileMode)
if err != nil {
select {
case pc.Error <- err:
default:
}
continue
}

if !result {
continue
}
}

if pc.pattern != nil && !pc.pattern.MatchString(filePath) {
if fileMode.IsDir() {
pc.wg.Add(1)
go func(p string) {
pc.concLimit <- struct{}{}
pc.crawlDir(p)
}(filePath)
continue
}

Expand Down Expand Up @@ -163,12 +203,33 @@ func readDir(path string) ([]os.FileInfo, error) {
return list, err
}

func (pc *PosixCrawler) resolveSymlink(currPath string, linkName string) (os.FileInfo, string, error) {
// evaluatePatternExpression runs the crawler's pattern expression against a
// single directory entry and reports whether the entry matched.
//
// The expression is given two variables: "path", set to filePath, and "type",
// set to "d" for a directory, "f" for a regular file, and the empty string for
// any other file mode. An error is returned when evaluation fails or when the
// expression produces a value that is not a boolean.
func (pc *PosixCrawler) evaluatePatternExpression(filePath string, fileMode os.FileMode) (bool, error) {
	kind := ""
	switch {
	case fileMode.IsDir():
		kind = "d"
	case fileMode.IsRegular():
		kind = "f"
	}

	vars := map[string]interface{}{
		"type": kind,
		"path": filePath,
	}

	outcome, evalErr := pc.pattern.Evaluate(vars)
	if evalErr != nil {
		return false, fmt.Errorf("pattern expression: %v", evalErr)
	}

	// The expression language can yield numbers or strings as well; only a
	// boolean result is meaningful as a filter decision.
	if matched, isBool := outcome.(bool); isBool {
		return matched, nil
	}
	return false, fmt.Errorf("pattern expression: result '%v' is not boolean", outcome)
}

func (pc *PosixCrawler) resolveSymlink(currPath string, linkName string) (os.FileInfo, error) {
filePath := currPath
linkName = path.Join(filePath, linkName)
fileName, err := os.Readlink(linkName)
if err != nil {
return nil, "", err
return nil, err
}
if !path.IsAbs(fileName) {
fileName = path.Join(filePath, fileName)
Expand All @@ -177,36 +238,34 @@ func (pc *PosixCrawler) resolveSymlink(currPath string, linkName string) (os.Fil
}

isSymlink := true
filesSeen := make(map[string]bool)
filesSeen := make(map[string]struct{})

for {
fi, err := os.Lstat(fileName)
if err != nil {
return nil, "", err
return nil, err
}

if _, found := filesSeen[fileName]; found {
return nil, "", fmt.Errorf("circular symlink: %v", linkName)
return nil, fmt.Errorf("circular symlink: %v", linkName)
}
filesSeen[fileName] = false
filesSeen[fileName] = struct{}{}

isSymlink = fi.Mode()&os.ModeSymlink == os.ModeSymlink
if isSymlink {
fileName, err = os.Readlink(fileName)
if err != nil {
return nil, "", err
return nil, err
}
if !path.IsAbs(fileName) {
fileName = path.Join(filePath, fileName)
fileName = filepath.Clean(fileName)
filePath = filepath.Dir(fileName)
}
continue
} else {
return fi, filePath, nil
return fi, nil
}
}

}

func (pc *PosixCrawler) outputResult() {
Expand All @@ -218,4 +277,5 @@ func (pc *PosixCrawler) outputResult() {
}
fmt.Printf("%s\n", rec)
}
pc.outputDone <- struct{}{}
}

0 comments on commit 86e2760

Please sign in to comment.