Skip to content

Commit

Permalink
Move coursebook scraper methods (#30)
Browse files Browse the repository at this point in the history
* Move initChromeDp

* Move RefreshToken

* Update go.sum and .gitignore
  • Loading branch information
democat3457 authored Sep 20, 2024
1 parent c2c45c0 commit 25e11f9
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 97 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
<<<<<<< HEAD
=======
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

# dependencies
Expand Down Expand Up @@ -43,6 +41,7 @@ deploy_log.sh
.idea/
.vscode/
.firebase/
/api-tools

# output data and logs
data/
Expand Down
7 changes: 0 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,8 @@ github.com/bytedance/sonic v1.11.5/go.mod h1:X2PC2giUdj/Cv2lliWFLk6c/DUQok5rViJS
github.com/bytedance/sonic/loader v0.1.0/go.mod h1:UmRT+IRTGKz/DAkzcEGzyVqQFJ7H9BqwBO3pm9H/+HY=
github.com/bytedance/sonic/loader v0.1.1 h1:c+e5Pt1k/cy5wMveRDyk2X4B9hF4g7an8N3zCYjJFNM=
github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/cdproto v0.0.0-20240421230201-ab917191657d h1:x9d0XwRV3aWw1gAZtv0LrI39U+Efjp0mtyXRyikGb9Y=
github.com/chromedp/cdproto v0.0.0-20240421230201-ab917191657d/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 h1:bATMoZLH2QGct1kzDxfmeBUQI/QhQvB0mBrOTct+YlQ=
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/chromedp v0.9.5 h1:viASzruPJOiThk7c5bueOUY91jGLJVximoEMGoH93rg=
github.com/chromedp/chromedp v0.9.5/go.mod h1:D4I2qONslauw/C7INoCir1BJkSwBYMyZgx8X276z3+Y=
github.com/chromedp/chromedp v0.10.0 h1:bRclRYVpMm/UVD76+1HcRW9eV3l58rFfy7AdBvKab1E=
github.com/chromedp/chromedp v0.10.0/go.mod h1:ei/1ncZIqXX1YnAYDkxhD4gzBgavMEUu7JCKvztdomE=
github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic=
Expand Down Expand Up @@ -45,7 +40,6 @@ github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.3.2/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY=
github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
Expand Down Expand Up @@ -147,7 +141,6 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
Expand Down
89 changes: 3 additions & 86 deletions scrapers/coursebook.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,100 +6,17 @@ package scrapers

import (
"bytes"
"context"
"errors"
"fmt"
"log"
"net/http"
"os"
"strconv"
"strings"
"time"

"github.com/UTDNebula/api-tools/utils"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
"github.com/joho/godotenv"
)

// initChromeDp creates the chromedp browser context used by the scrapers.
//
// If the HEADLESS_MODE environment variable is set and parses as true, the
// context is created directly on context.Background(). Otherwise an exec
// allocator with no options is created first and the browser context is
// derived from it (presumably so a visible browser window is launched;
// confirm against the chromedp allocator defaults).
//
// The returned cancel function must be called by the caller; it releases the
// browser context and, when one was created, the exec allocator as well.
func initChromeDp() (chromedpCtx context.Context, cancelFnc context.CancelFunc) {
	log.Printf("Initializing chromedp...")

	headlessEnv, present := os.LookupEnv("HEADLESS_MODE")
	doHeadless, err := strconv.ParseBool(headlessEnv)
	if present && err != nil {
		// An unparsable value falls back to non-headless mode; say so
		// instead of silently ignoring the misconfiguration.
		log.Printf("Ignoring invalid HEADLESS_MODE value %q: %v", headlessEnv, err)
	}

	if present && doHeadless {
		chromedpCtx, cancelFnc = chromedp.NewContext(context.Background())
		log.Printf("Initialized chromedp!")
		return chromedpCtx, cancelFnc
	}

	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background())
	browserCtx, cancelBrowser := chromedp.NewContext(allocCtx)
	log.Printf("Initialized chromedp!")

	// Cancel the browser context first, then the allocator it was derived
	// from, so neither is leaked when the caller invokes the single cancel.
	return browserCtx, func() {
		cancelBrowser()
		cancelAlloc()
	}
}

// refreshToken obtains a fresh auth token through the given chromedp context
// and returns the request headers that carry it.
//
// It clears the browser cookies, submits the login form at
// wat.utdallas.edu/login using the LOGIN_NETID and LOGIN_PASSWORD environment
// variables, then navigates to coursebook.utdallas.edu and collects every
// browser cookie as a "name=value" string for the Cookie header.
//
// The function panics if either environment variable is missing, if any
// browser step fails, or if no PTGSESSID cookie is found.
func refreshToken(chromedpCtx context.Context) map[string][]string {
	netID, present := os.LookupEnv("LOGIN_NETID")
	if !present {
		log.Panic("LOGIN_NETID is missing from .env!")
	}
	password, present := os.LookupEnv("LOGIN_PASSWORD")
	if !present {
		log.Panic("LOGIN_PASSWORD is missing from .env!")
	}

	utils.VPrintf("Getting new token...")
	_, err := chromedp.RunResponse(chromedpCtx,
		// Clear all browser cookies before logging in.
		chromedp.ActionFunc(func(ctx context.Context) error {
			return network.ClearBrowserCookies().Do(ctx)
		}),
		chromedp.Navigate(`https://wat.utdallas.edu/login`),
		chromedp.WaitVisible(`form#login-form`),
		chromedp.SendKeys(`input#netid`, netID),
		chromedp.SendKeys(`input#password`, password),
		chromedp.WaitVisible(`input#login-button`),
		chromedp.Click(`input#login-button`),
		//chromedp.WaitVisible(`body`),
	)
	if err != nil {
		panic(err)
	}

	var cookieStrs []string
	_, err = chromedp.RunResponse(chromedpCtx,
		chromedp.Navigate(`https://coursebook.utdallas.edu/`),
		chromedp.ActionFunc(func(ctx context.Context) error {
			cookies, err := network.GetCookies().Do(ctx)
			if err != nil {
				// Report the real failure rather than letting it be
				// masked by the generic "no token" error below.
				return fmt.Errorf("reading browser cookies: %w", err)
			}

			cookieStrs = make([]string, len(cookies))
			gotToken := false
			for i, cookie := range cookies {
				cookieStrs[i] = fmt.Sprintf("%s=%s", cookie.Name, cookie.Value)
				if cookie.Name == "PTGSESSID" {
					utils.VPrintf("Got new token: PTGSESSID = %s", cookie.Value)
					gotToken = true
				}
			}
			if !gotToken {
				return errors.New("failed to get a new token")
			}
			return nil
		}),
	)
	if err != nil {
		panic(err)
	}

	return map[string][]string{
		"Host":            {"coursebook.utdallas.edu"},
		"User-Agent":      {"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0"},
		"Accept":          {"text/html"},
		"Accept-Language": {"en-US"},
		"Content-Type":    {"application/x-www-form-urlencoded"},
		"Cookie":          cookieStrs,
		"Connection":      {"keep-alive"},
	}
}

func ScrapeCoursebook(term string, startPrefix string, outDir string) {

// Load env vars
Expand All @@ -108,7 +25,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) {
}

// Start chromedp
chromedpCtx, cancel := initChromeDp()
chromedpCtx, cancel := utils.InitChromeDp()
defer cancel()

// Find index of starting prefix, if one has been given
Expand Down Expand Up @@ -156,7 +73,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) {
panic(err)
}
// Get a fresh token at the start of each new prefix because we can lol
coursebookHeaders := refreshToken(chromedpCtx)
coursebookHeaders := utils.RefreshToken(chromedpCtx)
// Give coursebook some time to recognize the new token
time.Sleep(500 * time.Millisecond)
// String builder to store accumulated course HTML data for both class levels
Expand Down Expand Up @@ -226,7 +143,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) {
utils.VPrintf("Got section: %s", id)
if sectionIndex%30 == 0 && sectionIndex != 0 {
// Ratelimit? What ratelimit?
coursebookHeaders = refreshToken(chromedpCtx)
coursebookHeaders = utils.RefreshToken(chromedpCtx)
// Give coursebook some time to recognize the new token
time.Sleep(500 * time.Millisecond)
}
Expand Down
2 changes: 1 addition & 1 deletion scrapers/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ var trailingSpaceRegex *regexp.Regexp = regexp.MustCompile(`(\s{2,}?\s{2,})|(\n)

func ScrapeEvents(outDir string) {

chromedpCtx, cancel := initChromeDp()
chromedpCtx, cancel := utils.InitChromeDp()
defer cancel()

err := os.MkdirAll(outDir, 0777)
Expand Down
2 changes: 1 addition & 1 deletion scrapers/profiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ func scrapeProfessorLinks(chromedpCtx context.Context) []string {

func ScrapeProfiles(outDir string) {

chromedpCtx, cancel := initChromeDp()
chromedpCtx, cancel := utils.InitChromeDp()
defer cancel()

err := os.MkdirAll(outDir, 0777)
Expand Down
86 changes: 86 additions & 0 deletions utils/methods.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,101 @@
package utils

import (
"context"
"encoding/json"
"errors"
"fmt"
"io/fs"
"log"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"

"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
)

// InitChromeDp creates the chromedp browser context used by the scrapers.
//
// If the HEADLESS_MODE environment variable is set and parses as true, the
// context is created directly on context.Background(). Otherwise an exec
// allocator with no options is created first and the browser context is
// derived from it (presumably so a visible browser window is launched;
// confirm against the chromedp allocator defaults).
//
// The returned cancel function must be called by the caller; it releases the
// browser context and, when one was created, the exec allocator as well.
func InitChromeDp() (chromedpCtx context.Context, cancelFnc context.CancelFunc) {
	log.Printf("Initializing chromedp...")

	headlessEnv, present := os.LookupEnv("HEADLESS_MODE")
	doHeadless, err := strconv.ParseBool(headlessEnv)
	if present && err != nil {
		// An unparsable value falls back to non-headless mode; say so
		// instead of silently ignoring the misconfiguration.
		log.Printf("Ignoring invalid HEADLESS_MODE value %q: %v", headlessEnv, err)
	}

	if present && doHeadless {
		chromedpCtx, cancelFnc = chromedp.NewContext(context.Background())
		log.Printf("Initialized chromedp!")
		return chromedpCtx, cancelFnc
	}

	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background())
	browserCtx, cancelBrowser := chromedp.NewContext(allocCtx)
	log.Printf("Initialized chromedp!")

	// Cancel the browser context first, then the allocator it was derived
	// from, so neither is leaked when the caller invokes the single cancel.
	return browserCtx, func() {
		cancelBrowser()
		cancelAlloc()
	}
}

// RefreshToken obtains a fresh auth token through the given chromedp context
// and returns the request headers that carry it.
//
// It clears the browser cookies, submits the login form at
// wat.utdallas.edu/login using the LOGIN_NETID and LOGIN_PASSWORD environment
// variables, then navigates to coursebook.utdallas.edu and collects every
// browser cookie as a "name=value" string for the Cookie header.
//
// The function panics if either environment variable is missing, if any
// browser step fails, or if no PTGSESSID cookie is found.
func RefreshToken(chromedpCtx context.Context) map[string][]string {
	netID, present := os.LookupEnv("LOGIN_NETID")
	if !present {
		log.Panic("LOGIN_NETID is missing from .env!")
	}
	password, present := os.LookupEnv("LOGIN_PASSWORD")
	if !present {
		log.Panic("LOGIN_PASSWORD is missing from .env!")
	}

	VPrintf("Getting new token...")
	_, err := chromedp.RunResponse(chromedpCtx,
		// Clear all browser cookies before logging in.
		chromedp.ActionFunc(func(ctx context.Context) error {
			return network.ClearBrowserCookies().Do(ctx)
		}),
		chromedp.Navigate(`https://wat.utdallas.edu/login`),
		chromedp.WaitVisible(`form#login-form`),
		chromedp.SendKeys(`input#netid`, netID),
		chromedp.SendKeys(`input#password`, password),
		chromedp.WaitVisible(`input#login-button`),
		chromedp.Click(`input#login-button`),
		//chromedp.WaitVisible(`body`),
	)
	if err != nil {
		panic(err)
	}

	var cookieStrs []string
	_, err = chromedp.RunResponse(chromedpCtx,
		chromedp.Navigate(`https://coursebook.utdallas.edu/`),
		chromedp.ActionFunc(func(ctx context.Context) error {
			cookies, err := network.GetCookies().Do(ctx)
			if err != nil {
				// Report the real failure rather than letting it be
				// masked by the generic "no token" error below.
				return fmt.Errorf("reading browser cookies: %w", err)
			}

			cookieStrs = make([]string, len(cookies))
			gotToken := false
			for i, cookie := range cookies {
				cookieStrs[i] = fmt.Sprintf("%s=%s", cookie.Name, cookie.Value)
				if cookie.Name == "PTGSESSID" {
					VPrintf("Got new token: PTGSESSID = %s", cookie.Value)
					gotToken = true
				}
			}
			if !gotToken {
				return errors.New("failed to get a new token")
			}
			return nil
		}),
	)
	if err != nil {
		panic(err)
	}

	return map[string][]string{
		"Host":            {"coursebook.utdallas.edu"},
		"User-Agent":      {"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0"},
		"Accept":          {"text/html"},
		"Accept-Language": {"en-US"},
		"Content-Type":    {"application/x-www-form-urlencoded"},
		"Cookie":          cookieStrs,
		"Connection":      {"keep-alive"},
	}
}

// Encodes and writes the given data as tab-indented JSON to the given filepath.
func WriteJSON(filepath string, data interface{}) error {
fptr, err := os.Create(filepath)
Expand Down

0 comments on commit 25e11f9

Please sign in to comment.