diff --git a/Makefile b/Makefile index fe46da0..8a903eb 100644 --- a/Makefile +++ b/Makefile @@ -12,8 +12,8 @@ check: gofmt -w ./.. goimports -w ./.. -build: ./main/main.go - go build -o $(EXEC_NAME) ./main/main.go +build: ./main.go + go build -o $(EXEC_NAME) ./main.go clean: $(EXEC_NAME) rm $(EXEC_NAME) diff --git a/build.bat b/build.bat index 16ad770..08e2c43 100644 --- a/build.bat +++ b/build.bat @@ -24,6 +24,6 @@ echo[ ::build echo Building... -go build -o %EXEC_NAME% ./main/main.go +go build -o %EXEC_NAME% ./main.go if ERRORLEVEL 1 exit /b %ERRORLEVEL% :: fail if error occurred echo Build complete! \ No newline at end of file diff --git a/main/main.go b/main.go similarity index 87% rename from main/main.go rename to main.go index 5e1cf01..bc323af 100644 --- a/main/main.go +++ b/main.go @@ -10,6 +10,7 @@ import ( "github.com/UTDNebula/api-tools/parser" "github.com/UTDNebula/api-tools/scrapers" "github.com/UTDNebula/api-tools/uploader" + "github.com/UTDNebula/api-tools/utils" ) func main() { @@ -47,6 +48,9 @@ func main() { upload := flag.Bool("upload", false, "Puts the tool into upload mode.") replace := flag.Bool("replace", false, "Alongside -upload, specifies that uploaded data should replace existing data rather than being merged.") + // Flags for logging + verbose := flag.Bool("verbose", false, "Enables verbose logging, good for debugging purposes.") + // Parse flags flag.Parse() @@ -66,7 +70,14 @@ func main() { } defer logFile.Close() - log.SetOutput(logFile) + // Set logging output destination to a SplitWriter that writes to both the log file and stdout + log.SetOutput(utils.NewSplitWriter(logFile, os.Stdout)) + // Do verbose logging if verbose flag specified + if *verbose { + log.SetFlags(log.Ltime | log.Lmicroseconds | log.Lshortfile | utils.Lverbose) + } else { + log.SetFlags(log.Ltime) + } // Perform actions based on flags switch { diff --git a/parser/gradeLoader.go b/parser/gradeLoader.go index cc6695a..bc6abdc 100644 --- a/parser/gradeLoader.go +++ 
b/parser/gradeLoader.go @@ -15,7 +15,7 @@ func loadGrades(csvDir string) map[string]map[string][]int { gradeMap := make(map[string]map[string][]int) if csvDir == "" { - log.Print("No grade data CSV directory specified. Grade data will not be included.\n") + log.Print("No grade data CSV directory specified. Grade data will not be included.") return gradeMap } diff --git a/parser/parser.go b/parser/parser.go index a2e3672..58718dd 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -42,7 +42,7 @@ func Parse(inDir string, outDir string, csvPath string, skipValidation bool) { // Load grade data from csv in advance GradeMap = loadGrades(csvPath) if len(GradeMap) != 0 { - log.Printf("Loaded grade distributions for %d semesters.\n\n", len(GradeMap)) + log.Printf("Loaded grade distributions for %d semesters.", len(GradeMap)) } // Try to load any existing profile data @@ -51,9 +51,9 @@ func Parse(inDir string, outDir string, csvPath string, skipValidation bool) { // Find paths of all scraped data paths := utils.GetAllFilesWithExtension(inDir, ".html") if !skipValidation { - log.Printf("Parsing and validating %d files...\n", len(paths)) + log.Printf("Parsing and validating %d files...", len(paths)) } else { - log.Printf("Parsing %d files WITHOUT VALIDATION...\n", len(paths)) + log.Printf("Parsing %d files WITHOUT VALIDATION...", len(paths)) } // Parse all data @@ -61,9 +61,9 @@ func Parse(inDir string, outDir string, csvPath string, skipValidation bool) { parse(path) } - log.Printf("\nParsing complete. Created %d courses, %d sections, and %d professors.\n", len(Courses), len(Sections), len(Professors)) + log.Printf("\nParsing complete. 
Created %d courses, %d sections, and %d professors.", len(Courses), len(Sections), len(Professors)) - log.Print("\nParsing course requisites...\n") + log.Print("\nParsing course requisites...") // Initialize matchers at runtime for requisite parsing; this is necessary to avoid circular reference errors with compile-time initialization initMatchers() @@ -71,12 +71,12 @@ func Parse(inDir string, outDir string, csvPath string, skipValidation bool) { for _, course := range Courses { ReqParsers[course.Id]() } - log.Print("Finished parsing course requisites!\n") + log.Print("Finished parsing course requisites!") if !skipValidation { - log.Print("\nStarting validation stage...\n") + log.Print("\nStarting validation stage...") validate() - log.Print("\nValidation complete!\n") + log.Print("\nValidation complete!") } // Make outDir if it doesn't already exist @@ -93,7 +93,8 @@ func Parse(inDir string, outDir string, csvPath string, skipValidation bool) { // Internal parse function func parse(path string) { - log.Printf("Parsing %s...\n", path) + + utils.VPrintf("Parsing %s...", path) // Open data file for reading fptr, err := os.Open(path) @@ -156,5 +157,5 @@ func parse(path string) { // Try to create the course and section based on collected info courseRef := parseCourse(courseNum, session, rowInfo, classInfo) parseSection(courseRef, classNum, syllabusURI, session, rowInfo, classInfo) - log.Print("Parsed!\n") + utils.VPrint("Parsed!") } diff --git a/parser/profileLoader.go b/parser/profileLoader.go index 9382911..0d16aa7 100644 --- a/parser/profileLoader.go +++ b/parser/profileLoader.go @@ -12,13 +12,13 @@ import ( func loadProfiles(inDir string) { fptr, err := os.Open(fmt.Sprintf("%s/profiles.json", inDir)) if err != nil { - log.Print("Couldn't find/open profiles.json in the input directory. Skipping profile load.\n") + log.Print("Couldn't find/open profiles.json in the input directory. 
Skipping profile load.") return } decoder := json.NewDecoder(fptr) - log.Print("Beginning profile load.\n") + log.Print("Beginning profile load.") // Read open bracket _, err = decoder.Token() @@ -46,6 +46,6 @@ func loadProfiles(inDir string) { panic(err) } - log.Printf("Loaded %d profiles!\n\n", profileCount) + log.Printf("Loaded %d profiles!", profileCount) fptr.Close() } diff --git a/parser/requisiteParser.go b/parser/requisiteParser.go index 8d89630..095f150 100644 --- a/parser/requisiteParser.go +++ b/parser/requisiteParser.go @@ -94,7 +94,7 @@ func ORMatcher(group string, subgroups []string) interface{} { func CourseMinGradeMatcher(group string, subgroups []string) interface{} { icn, err := findICN(subgroups[1], subgroups[2]) if err != nil { - log.Printf("WARN: %s\n", err) + log.Printf("WARN: %s", err) return OtherMatcher(group, subgroups) } return schema.NewCourseRequirement(icn, subgroups[3]) @@ -103,7 +103,7 @@ func CourseMinGradeMatcher(group string, subgroups []string) interface{} { func CourseMatcher(group string, subgroups []string) interface{} { icn, err := findICN(subgroups[1], subgroups[2]) if err != nil { - log.Printf("WARN: %s\n", err) + log.Printf("WARN: %s", err) return OtherMatcher(group, subgroups) } return schema.NewCourseRequirement(icn, "D") @@ -411,7 +411,6 @@ func getReqParser(course *schema.Course, hasEnrollmentReqs bool, enrollmentReqs if len(parsedChunks) > 0 { *reqPtr = schema.NewCollectionRequirement("REQUISITES", len(parsedChunks), parsedChunks) } - log.Printf("\n\n") } } } @@ -469,7 +468,6 @@ func joinAdjacentOthers(reqs []interface{}, joinString string) []interface{} { if temp.Description != "" { joinedReqs = append(joinedReqs, temp) } - //log.Printf("JOINEDREQS ARE: %v\n", joinedReqs) return joinedReqs } @@ -499,22 +497,18 @@ func parseGroup(grp string) interface{} { if matches != nil { // If an applicable matcher has been found, return the result of calling its handler result := matcher.Handler(grp, matches) - log.Printf("'%s' 
-> %T\n", grp, result) + utils.VPrintf("'%s' -> %T", grp, result) return result } } - // Panic if no matcher was able to be found for a given group -- this means we need to add handling for it!!! - //log.Panicf("NO MATCHER FOUND FOR GROUP '%s'\nSTACK IS: %#v\n", grp, requisiteList) - //log.Printf("NO MATCHER FOR: '%s'\n", grp) - log.Printf("'%s' -> parser.OtherRequirement\n", grp) - //var temp string - //fmt.Scanf("%s", temp) + // If the group couldn't be parsed, give up and make it an OtherRequirement + utils.VPrintf("'%s' -> parser.OtherRequirement", grp) return *schema.NewOtherRequirement(ungroupText(grp), "") } // Outermost function for parsing a chunk of requisite text (potentially containing multiple nested text groups) func parseChunk(chunk string) interface{} { - log.Printf("\nPARSING CHUNK: '%s'\n", chunk) + utils.VPrintf("\nPARSING CHUNK: '%s'", chunk) // Extract parenthesized groups from chunk text parseText, parseGroups := groupParens(chunk) // Initialize the requisite list and group list diff --git a/parser/validator.go b/parser/validator.go index ec52c7d..78ae337 100644 --- a/parser/validator.go +++ b/parser/validator.go @@ -14,7 +14,7 @@ func validate() { } }() - log.Printf("\nValidating courses...\n") + log.Printf("\nValidating courses...") courseKeys := utils.GetMapKeys(Courses) for i := 0; i < len(courseKeys)-1; i++ { course1 := Courses[courseKeys[i]] @@ -22,7 +22,7 @@ func validate() { for j := i + 1; j < len(courseKeys); j++ { course2 := Courses[courseKeys[j]] if course2.Catalog_year == course1.Catalog_year && course2.Course_number == course1.Course_number && course2.Subject_prefix == course1.Subject_prefix { - log.Printf("Duplicate course found for %s%s!\n", course1.Subject_prefix, course1.Course_number) + log.Printf("Duplicate course found for %s%s!", course1.Subject_prefix, course1.Course_number) log.Printf("Course 1: %v\n\nCourse 2: %v", course1, course2) log.Panic("Courses failed to validate!") } @@ -31,21 +31,21 @@ func validate() { for _, 
sectionId := range course1.Sections { section, exists := Sections[sectionId] if !exists { - log.Printf("Nonexistent section reference found for %s%s!\n", course1.Subject_prefix, course1.Course_number) - log.Printf("Referenced section ID: %s\nCourse ID: %s\n", sectionId, course1.Id) + log.Printf("Nonexistent section reference found for %s%s!", course1.Subject_prefix, course1.Course_number) + log.Printf("Referenced section ID: %s\nCourse ID: %s", sectionId, course1.Id) log.Panic("Courses failed to validate!") } if section.Course_reference != course1.Id { - log.Printf("Inconsistent section reference found for %s%s! The course references the section, but not vice-versa!\n", course1.Subject_prefix, course1.Course_number) - log.Printf("Referenced section ID: %s\nCourse ID: %s\nSection course reference: %s\n", sectionId, course1.Id, section.Course_reference) + log.Printf("Inconsistent section reference found for %s%s! The course references the section, but not vice-versa!", course1.Subject_prefix, course1.Course_number) + log.Printf("Referenced section ID: %s\nCourse ID: %s\nSection course reference: %s", sectionId, course1.Id, section.Course_reference) log.Panic("Courses failed to validate!") } } } courseKeys = nil - log.Print("No invalid courses!\n\n") + log.Print("No invalid courses!") - log.Print("Validating sections...\n") + log.Print("Validating sections...") sectionKeys := utils.GetMapKeys(Sections) for i := 0; i < len(sectionKeys)-1; i++ { section1 := Sections[sectionKeys[i]] @@ -55,7 +55,7 @@ func validate() { if section2.Section_number == section1.Section_number && section2.Course_reference == section1.Course_reference && section2.Academic_session == section1.Academic_session { - log.Print("Duplicate section found!\n") + log.Print("Duplicate section found!") log.Printf("Section 1: %v\n\nSection 2: %v", section1, section2) log.Panic("Sections failed to validate!") } @@ -64,8 +64,8 @@ func validate() { for _, profId := range section1.Professors { professorKey, 
exists := ProfessorIDMap[profId] if !exists { - log.Printf("Nonexistent professor reference found for section ID %s!\n", section1.Id) - log.Printf("Referenced professor ID: %s\n", profId) + log.Printf("Nonexistent professor reference found for section ID %s!", section1.Id) + log.Printf("Referenced professor ID: %s", profId) log.Panic("Sections failed to validate!") } profRefsSection := false @@ -76,23 +76,23 @@ func validate() { } } if !profRefsSection { - log.Printf("Inconsistent professor reference found for section ID %s! The section references the professor, but not vice-versa!\n", section1.Id) - log.Printf("Referenced professor ID: %s\n", profId) + log.Printf("Inconsistent professor reference found for section ID %s! The section references the professor, but not vice-versa!", section1.Id) + log.Printf("Referenced professor ID: %s", profId) log.Panic("Sections failed to validate!") } } // Make sure section isn't referencing a nonexistant course _, exists := CourseIDMap[section1.Course_reference] if !exists { - log.Printf("Nonexistent course reference found for section ID %s!\n", section1.Id) - log.Printf("Referenced course ID: %s\n", section1.Course_reference) + log.Printf("Nonexistent course reference found for section ID %s!", section1.Id) + log.Printf("Referenced course ID: %s", section1.Course_reference) log.Panic("Sections failed to validate!") } } sectionKeys = nil - log.Printf("No invalid sections!\n\n") + log.Printf("No invalid sections!") - log.Printf("Validating professors...\n") + log.Printf("Validating professors...") profKeys := utils.GetMapKeys(Professors) // Check for duplicate professors by comparing first_name, last_name, and sections as a compound key for i := 0; i < len(profKeys)-1; i++ { @@ -102,11 +102,11 @@ func validate() { if prof2.First_name == prof1.First_name && prof2.Last_name == prof1.Last_name && prof2.Profile_uri == prof1.Profile_uri { - log.Printf("Duplicate professor found!\n") + log.Printf("Duplicate professor found!") 
log.Printf("Professor 1: %v\n\nProfessor 2: %v", prof1, prof2) log.Panic("Professors failed to validate!") } } } - log.Printf("No invalid professors!\n\n") + log.Printf("No invalid professors!") } diff --git a/scrapers/coursebook.go b/scrapers/coursebook.go index 97a0801..f317793 100644 --- a/scrapers/coursebook.go +++ b/scrapers/coursebook.go @@ -1,3 +1,7 @@ +/* + This file contains the code for the coursebook scraper. +*/ + package scrapers import ( @@ -18,10 +22,10 @@ import ( ) func initChromeDp() (chromedpCtx context.Context, cancelFnc context.CancelFunc) { - log.Printf("Initializing chromedp...\n") + log.Printf("Initializing chromedp...") allocCtx, cancelFnc := chromedp.NewExecAllocator(context.Background()) chromedpCtx, _ = chromedp.NewContext(allocCtx) - log.Printf("Initialized chromedp!\n") + log.Printf("Initialized chromedp!") return } @@ -36,7 +40,7 @@ func refreshToken(chromedpCtx context.Context) map[string][]string { log.Panic("LOGIN_PASSWORD is missing from .env!") } - log.Printf("Getting new token...\n") + utils.VPrintf("Getting new token...") _, err := chromedp.RunResponse(chromedpCtx, chromedp.ActionFunc(func(ctx context.Context) error { err := network.ClearBrowserCookies().Do(ctx) @@ -63,7 +67,7 @@ func refreshToken(chromedpCtx context.Context) map[string][]string { for i, cookie := range cookies { cookieStrs[i] = fmt.Sprintf("%s=%s", cookie.Name, cookie.Value) if cookie.Name == "PTGSESSID" { - fmt.Printf("Got new token: PTGSESSID = %s\n", cookie.Value) + utils.VPrintf("Got new token: PTGSESSID = %s", cookie.Value) gotToken = true } } @@ -150,7 +154,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) { // String builder to store accumulated course HTML data for both class levels courseBuilder := strings.Builder{} - log.Printf("Finding sections for course prefix %s...\n", coursePrefix) + log.Printf("Finding sections for course prefix %s...", coursePrefix) // Get courses for term and prefix, split by grad and undergrad to 
avoid 300 section cap for _, clevel := range []string{"clevel_u", "clevel_g"} { @@ -178,7 +182,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) { for _, matchSet := range smatches { sectionIDs = append(sectionIDs, matchSet[1]) } - log.Printf("Found %d sections for course prefix %s\n", len(sectionIDs), coursePrefix) + log.Printf("Found %d sections for course prefix %s", len(sectionIDs), coursePrefix) // Get HTML data for all section IDs sectionsInCoursePrefix := 0 @@ -211,7 +215,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) { fptr.Close() // Report success, refresh token periodically - fmt.Printf("Got section: %s\n", id) + utils.VPrintf("Got section: %s", id) if sectionIndex%30 == 0 && sectionIndex != 0 { // Ratelimit? What ratelimit? coursebookHeaders = refreshToken(chromedpCtx) @@ -220,7 +224,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) { } sectionsInCoursePrefix++ } - log.Printf("\nFinished scraping course prefix %s. Got %d sections.\n", coursePrefix, sectionsInCoursePrefix) + log.Printf("\nFinished scraping course prefix %s. Got %d sections.", coursePrefix, sectionsInCoursePrefix) totalSections += sectionsInCoursePrefix } log.Printf("\nDone scraping term! Scraped a total of %d sections.", totalSections) diff --git a/scrapers/evaluations.go b/scrapers/evaluations.go index 539a59c..80b9457 100644 --- a/scrapers/evaluations.go +++ b/scrapers/evaluations.go @@ -1,3 +1,8 @@ +/* + This file contains the code for the professor evaluation scraper. + NOTE: This scraper is NOT production ready! See https://github.com/UTDNebula/api-tools/issues/6 for details. 
+*/ + package scrapers import ( @@ -37,7 +42,7 @@ func ScrapeEvals(inDir string) { _, fileName := filepath.Split(path) sectionID := fileName[:len(fileName)-5] - log.Printf("Finding eval for %s\n", sectionID) + log.Printf("Finding eval for %s", sectionID) // Get eval info evalURL := fmt.Sprintf("https://coursebook.utdallas.edu/ues-report/%s", sectionID) @@ -57,10 +62,10 @@ func ScrapeEvals(inDir string) { panic(err) } fptr.Close() - log.Print("Eval found and downloaded!\n") + log.Print("Eval found and downloaded!") return err } else { - log.Print("No eval found!\n") + log.Print("No eval found!") return nil } }, chromedp.AtLeast(0)), diff --git a/scrapers/events.go b/scrapers/events.go index 053db2a..28ad666 100644 --- a/scrapers/events.go +++ b/scrapers/events.go @@ -1,3 +1,7 @@ +/* + This file contains the code for the events scraper. +*/ + package scrapers import ( @@ -10,6 +14,7 @@ import ( "regexp" "time" + "github.com/UTDNebula/api-tools/utils" "github.com/UTDNebula/nebula-api/api/schema" "github.com/chromedp/cdproto/cdp" "github.com/chromedp/cdproto/runtime" @@ -33,7 +38,7 @@ func ScrapeEvents(outDir string) { events := []schema.Event{} - log.Printf("Scraping event page links\n") + log.Printf("Scraping event page links") //Grab all links to event pages var pageLinks []string = []string{} _, err = chromedp.RunResponse(chromedpCtx, @@ -55,7 +60,7 @@ func ScrapeEvents(outDir string) { if err != nil { panic(err) } - log.Printf("Scraped event page links!\n") + log.Printf("Scraped event page links!") for _, page := range pageLinks { //Navigate to page and get page summary @@ -75,7 +80,7 @@ func ScrapeEvents(outDir string) { if err != nil { panic(err) } - log.Printf("Navigated to page %s\n", summary) + utils.VPrintf("Navigated to page %s", summary) // Grab date/time of the event var dateTimeStart time.Time @@ -119,7 +124,7 @@ func ScrapeEvents(outDir string) { if err != nil { continue } - log.Printf("Scraped time: %s to %s \n", dateTimeStart, dateTimeEnd) + 
utils.VPrintf("Scraped time: %s to %s ", dateTimeStart, dateTimeEnd) //Grab Location of Event var location string = "" @@ -136,7 +141,7 @@ func ScrapeEvents(outDir string) { if err != nil { continue } - log.Printf("Scraped location: %s, \n", location) + utils.VPrintf("Scraped location: %s, ", location) //Get description of event var description string = "" @@ -153,7 +158,7 @@ func ScrapeEvents(outDir string) { if err != nil { continue } - log.Printf("Scraped description: %s, \n", description) + utils.VPrintf("Scraped description: %s, ", description) //Grab Event Type var eventType []string = []string{} @@ -170,7 +175,7 @@ func ScrapeEvents(outDir string) { if err != nil { panic(err) } - log.Printf("Scraped event type: %s\n", eventType) + utils.VPrintf("Scraped event type: %s", eventType) //Grab Target Audience targetAudience := []string{} @@ -187,7 +192,7 @@ func ScrapeEvents(outDir string) { if err != nil { panic(err) } - log.Printf("Scraped target audience: %s, \n", targetAudience) + utils.VPrintf("Scraped target audience: %s, ", targetAudience) //Grab Topic topic := []string{} @@ -204,7 +209,7 @@ func ScrapeEvents(outDir string) { if err != nil { panic(err) } - log.Printf("Scraped topic: %s, \n", topic) + utils.VPrintf("Scraped topic: %s, ", topic) //Grab Event Tags tags := []string{} @@ -221,7 +226,7 @@ func ScrapeEvents(outDir string) { if err != nil { panic(err) } - log.Printf("Scraped tags: %s, \n", tags) + utils.VPrintf("Scraped tags: %s, ", tags) //Grab Website var eventWebsite string = "" @@ -242,7 +247,7 @@ func ScrapeEvents(outDir string) { if err != nil { continue } - log.Printf("Scraped website: %s, \n", eventWebsite) + utils.VPrintf("Scraped website: %s, ", eventWebsite) //Grab Department var eventDepartment []string = []string{} @@ -259,7 +264,7 @@ func ScrapeEvents(outDir string) { if err != nil { panic(err) } - log.Printf("Scraped department: %s, \n", eventDepartment) + utils.VPrintf("Scraped department: %s, ", eventDepartment) //Grab Contact 
information var contactInformationName string = "" @@ -297,9 +302,9 @@ func ScrapeEvents(outDir string) { if err != nil { panic(err) } - log.Printf("Scraped contact name info: %s\n", contactInformationName) - log.Printf("Scraped contact email info: %s\n", contactInformationEmail) - log.Printf("Scraped contact phone info: %s\n", contactInformationPhone) + utils.VPrintf("Scraped contact name info: %s", contactInformationName) + utils.VPrintf("Scraped contact email info: %s", contactInformationEmail) + utils.VPrintf("Scraped contact phone info: %s", contactInformationPhone) events = append(events, schema.Event{ Id: schema.IdWrapper(primitive.NewObjectID().Hex()), diff --git a/scrapers/organizations.go b/scrapers/organizations.go index 119ae00..7c02cb2 100644 --- a/scrapers/organizations.go +++ b/scrapers/organizations.go @@ -1,3 +1,7 @@ +/* + This file contains the code for the student organization scraper. +*/ + package scrapers import ( @@ -17,6 +21,7 @@ import ( "strings" "time" + "github.com/UTDNebula/api-tools/utils" "github.com/UTDNebula/nebula-api/api/schema" "github.com/chromedp/cdproto/browser" "github.com/chromedp/cdproto/network" @@ -119,7 +124,7 @@ func scrapeData(ctx context.Context, outdir string) error { }) tempDir, _ := filepath.Abs(filepath.Join(outdir, "tmp")) - log.Printf("Downloading CSV to %s ...\n", tempDir) + utils.VPrintf("Downloading CSV to %s ...", tempDir) if err := os.MkdirAll(tempDir, 0755); err != nil { return err } @@ -184,7 +189,7 @@ func processCsv(ctx context.Context, inputPath string, storageFilePath string) e return err } - log.Printf("Processing row %d\n", i) + utils.VPrintf("Processing row %d", i) org, err := parseCsvRecord(ctx, entry) if err != nil { return err @@ -219,7 +224,7 @@ func parseCsvRecord(ctx context.Context, entry []string) (*schema.Organization, imageData, err := retrieveImage(ctx, entry[5]) if err != nil { - log.Printf("Error retrieving image for %s: %v\n", entry[0], err) + utils.VPrintf("Error retrieving image for 
%s: %v", entry[0], err) } return &schema.Organization{ Id: schema.IdWrapper(primitive.NewObjectID().Hex()), @@ -263,7 +268,7 @@ func retrieveImage(ctx context.Context, imageUri string) (string, error) { requestUrl := baseUrlStruct.ResolveReference(urlStruct).String() - //log.Printf("loading image %s\n", requestUrl) + //log.Printf("loading image %s", requestUrl) // method adapted from https://github.com/chromedp/examples/blob/master/download_image/main.go ctx, cancel := context.WithTimeout(ctx, 10*time.Second) @@ -289,20 +294,20 @@ func retrieveImage(ctx context.Context, imageUri string) (string, error) { }) if err := chromedp.Run(ctx, chromedp.Navigate(requestUrl)); err != nil { - log.Printf("Error navigating to %s: %v\n", requestUrl, err) + log.Printf("Error navigating to %s: %v", requestUrl, err) return "", err } // wait for image request to finish <-done - //log.Printf("Done retrieving image from %s\n", requestUrl) + //log.Printf("Done retrieving image from %s", requestUrl) var buf []byte if err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error { var err error buf, err = network.GetResponseBody(requestID).Do(ctx) if err != nil { - log.Printf("Error getting response body for %s: %v\n", requestUrl, err) + log.Printf("Error getting response body for %s: %v", requestUrl, err) } return err })); err != nil { diff --git a/scrapers/profiles.go b/scrapers/profiles.go index fc031f1..8b6b5d5 100644 --- a/scrapers/profiles.go +++ b/scrapers/profiles.go @@ -1,3 +1,7 @@ +/* + This file contains the code for the professor profile scraper. 
+*/ + package scrapers import ( @@ -11,6 +15,7 @@ import ( "strconv" "strings" + "github.com/UTDNebula/api-tools/utils" "github.com/UTDNebula/nebula-api/api/schema" "github.com/chromedp/cdproto/cdp" "github.com/chromedp/cdproto/runtime" @@ -54,11 +59,11 @@ func parseList(list []string) (string, schema.Location) { for _, element := range list { element = strings.Trim(element, " ") - log.Printf("Element is: %s\n", element) + utils.VPrintf("Element is: %s", element) if strings.Contains(element, "-") { phoneNumber = element } else if primaryLocationRegex.MatchString(element) || fallbackLocationRegex.MatchString(element) { - log.Printf("Element match is: %s\n", element) + utils.VPrintf("Element match is: %s", element) office = parseLocation(element) break } @@ -148,16 +153,16 @@ func ScrapeProfiles(outDir string) { var professors []schema.Professor - log.Print("Scraping professor links...\n") + log.Print("Scraping professor links...") professorLinks := scrapeProfessorLinks(chromedpCtx) - log.Print("Scraped professor links!\n\n") + log.Print("Scraped professor links!") for _, link := range professorLinks { // Navigate to the link and get the names var firstName, lastName string - log.Print("Scraping name...\n") + utils.VPrint("Scraping name...") _, err := chromedp.RunResponse(chromedpCtx, chromedp.Navigate(link), @@ -175,7 +180,7 @@ func ScrapeProfiles(outDir string) { // Get the image uri var imageUri string - log.Print("Scraping imageUri...\n") + utils.VPrint("Scraping imageUri...") err = chromedp.Run(chromedpCtx, chromedp.ActionFunc(func(ctx context.Context) error { @@ -215,7 +220,7 @@ func ScrapeProfiles(outDir string) { // Get the titles titles := make([]string, 0, 3) - log.Print("Scraping titles...\n") + utils.VPrint("Scraping titles...") err = chromedp.Run(chromedpCtx, chromedp.QueryAfter("//h6", @@ -237,7 +242,7 @@ func ScrapeProfiles(outDir string) { // Get the email var email string - log.Print("Scraping email...\n") + utils.VPrint("Scraping email...") err = 
chromedp.Run(chromedpCtx, chromedp.Text("//a[contains(@id,'☄️')]", &email, chromedp.AtLeast(0)), @@ -249,14 +254,14 @@ func ScrapeProfiles(outDir string) { // Get the phone number and office location var texts []string - log.Print("Scraping list text...\n") + utils.VPrint("Scraping list text...") err = chromedp.Run(chromedpCtx, chromedp.QueryAfter("div.contact_info > div", func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { var tempText string err := chromedp.Text("div.contact_info > div", &tempText).Do(ctx) - texts = strings.Split(tempText, "\n") + texts = strings.Split(tempText, "") return err }, ), @@ -265,9 +270,9 @@ func ScrapeProfiles(outDir string) { panic(err) } - log.Print("Parsing list...\n") + utils.VPrint("Parsing list...") phoneNumber, office := parseList(texts) - log.Printf("Parsed list! #: %s, Office: %v\n\n", phoneNumber, office) + utils.VPrintf("Parsed list! #: %s, Office: %v", phoneNumber, office) professors = append(professors, schema.Professor{ Id: schema.IdWrapper(primitive.NewObjectID().Hex()), @@ -283,7 +288,7 @@ func ScrapeProfiles(outDir string) { Sections: []schema.IdWrapper{}, }) - log.Printf("Scraped profile for %s %s!\n\n", firstName, lastName) + utils.VPrintf("Scraped profile for %s %s!", firstName, lastName) } // Write professor data to output file diff --git a/uploader/utils.go b/uploader/database.go similarity index 93% rename from uploader/utils.go rename to uploader/database.go index 6f4229f..9029b5e 100644 --- a/uploader/utils.go +++ b/uploader/database.go @@ -1,3 +1,7 @@ +/* + This file is responsible for providing various useful database functions. +*/ + package uploader /* import ( diff --git a/uploader/uploader.go b/uploader/uploader.go index 3f8fad6..6c201cc 100644 --- a/uploader/uploader.go +++ b/uploader/uploader.go @@ -1,3 +1,7 @@ +/* + This file is responsible for handling uploading of parsed data to MongoDB. 
/*
	This file contains a log.Logger wrapper that provides some "verbose-only" variants of built-in logging functions.
	These "verbose-only" functions, all of which start with 'V', will only print if the custom 'verbose' flag (Lverbose)
	is set on the logger being used.

	Additionally, a "SplitWriter" implementation of io.Writer is provided which supports writing to
	several underlying writers at once (for example a log file and stdout).
*/

// SplitWriter is an io.Writer that forwards every write to each of its sub-writers.
type SplitWriter struct {
	writers []io.Writer
}

// NewSplitWriter returns a SplitWriter that duplicates its writes to all of the given writers.
// The argument slice is copied, so later changes made by the caller do not affect the SplitWriter.
func NewSplitWriter(writers ...io.Writer) *SplitWriter {
	return &SplitWriter{writers: append([]io.Writer(nil), writers...)}
}

// Write writes p to every sub-writer, one after another, in the order they were given to NewSplitWriter.
//
// Every sub-writer is attempted even if an earlier one fails, so a single broken sink does not
// silence the others. On success Write returns len(p) and a nil error. If any sub-writer fails
// (or accepts fewer than len(p) bytes, which is reported as io.ErrShortWrite), Write returns the
// first such error together with the smallest byte count reported by a failing sub-writer, so
// n never exceeds len(p) as required by the io.Writer contract.
func (sw *SplitWriter) Write(p []byte) (n int, err error) {
	n = len(p)
	for _, w := range sw.writers {
		written, werr := w.Write(p)
		// A short write without an error is still a failure from the caller's point of view.
		if werr == nil && written < len(p) {
			werr = io.ErrShortWrite
		}
		if werr == nil {
			continue
		}
		// Remember only the first error, but keep going so the remaining writers get the data.
		if err == nil {
			err = werr
		}
		if written < n {
			n = written
		}
	}
	return n, err
}

// Lverbose is a custom logging flag that enables the verbose-only ('V'-prefixed) printing functions.
// It is stored alongside the standard log flags (see log.SetFlags), using a bit above the ones the
// standard log package defines at the time of writing (the highest being log.Lmsgprefix, 1 << 6).
const Lverbose = 1 << 7

// Logger is an extension of log.Logger that supports a verbose logging flag; its verbose-only
// printing methods start with 'V'.
type Logger struct {
	log.Logger
}

// NewLogger creates a Logger that writes to out, with the given line prefix and flags.
// Include Lverbose in flag to enable the verbose-only methods.
func NewLogger(out io.Writer, prefix string, flag int) *Logger {
	// Configure a zero-value logger in place rather than copying the result of log.New:
	// log.Logger contains a mutex and must not be copied by value.
	l := new(Logger)
	l.SetOutput(out)
	l.SetPrefix(prefix)
	l.SetFlags(flag)
	return l
}

// VPrintf is the verbose-only variant of Logger.Printf; it prints only if Lverbose is set.
func (l *Logger) VPrintf(format string, vars ...any) {
	if l.Flags()&Lverbose != 0 {
		// Calldepth 2 makes Lshortfile/Llongfile report VPrintf's caller instead of this file.
		// The write error is discarded, mirroring log.Logger.Printf.
		_ = l.Output(2, fmt.Sprintf(format, vars...))
	}
}

// VPrint is the verbose-only variant of Logger.Print; it prints only if Lverbose is set.
func (l *Logger) VPrint(text string) {
	if l.Flags()&Lverbose != 0 {
		// Calldepth 2: attribute the line to VPrint's caller. Error discarded like log.Logger.Print.
		_ = l.Output(2, text)
	}
}

// VPrintln is the verbose-only variant of Logger.Println; it prints only if Lverbose is set.
func (l *Logger) VPrintln(text string) {
	if l.Flags()&Lverbose != 0 {
		// Println always appends a newline, even if text already ends with one.
		_ = l.Output(2, text+"\n")
	}
}

// VPrintf is the verbose-only variant of log.Printf; it prints only if Lverbose is set on the
// standard logger (see log.SetFlags).
func VPrintf(format string, vars ...any) {
	if log.Flags()&Lverbose != 0 {
		// Calldepth 2 makes Lshortfile/Llongfile report VPrintf's caller instead of this file.
		// The write error is discarded, mirroring log.Printf.
		_ = log.Output(2, fmt.Sprintf(format, vars...))
	}
}

// VPrint is the verbose-only variant of log.Print; it prints only if Lverbose is set on the
// standard logger.
func VPrint(text string) {
	if log.Flags()&Lverbose != 0 {
		// Calldepth 2: attribute the line to VPrint's caller. Error discarded like log.Print.
		_ = log.Output(2, text)
	}
}

// VPrintln is the verbose-only variant of log.Println; it prints only if Lverbose is set on the
// standard logger.
func VPrintln(text string) {
	if log.Flags()&Lverbose != 0 {
		// Println always appends a newline, even if text already ends with one.
		_ = log.Output(2, text+"\n")
	}
}