Skip to content

Commit

Permalink
update 哈佛
Browse files Browse the repository at this point in the history
  • Loading branch information
deweizhu committed Aug 23, 2024
1 parent 25ce0e7 commit c87bb94
Show file tree
Hide file tree
Showing 20 changed files with 1,201 additions and 105 deletions.
4 changes: 1 addition & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,4 @@
.fleet/
*.xml
*.7z
*.bak
*.sh
*.txt
*.bak
125 changes: 125 additions & 0 deletions app/Nationaljp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package app

import (
"bookget/config"
"bookget/lib/gohttp"
"bookget/lib/util"
"context"
"fmt"
"log"
"net/http/cookiejar"
"net/url"
"regexp"
)

// Nationaljp holds the state of one download task handled by this file's
// Init/download/do methods.
type Nationaljp struct {
// dt is the download task created and filled in by Init.
dt *DownloadTask
// extId is sent as the DL_TYPE form value by do; Init sets it to "jp2".
extId string
}

// Init creates the download task for sUrl and immediately runs the download.
//
// iTask is stored as the task index and the book id is extracted from the BID
// query parameter of sUrl. The returned msg is a short status string; err is
// the URL parse error or whatever download returns.
func (p *Nationaljp) Init(iTask int, sUrl string) (msg string, err error) {
	p.dt = new(DownloadTask)
	p.dt.UrlParsed, err = url.Parse(sUrl)
	if err != nil {
		// Later steps read p.dt.UrlParsed.Host; stopping here avoids a nil
		// pointer dereference when the URL cannot be parsed.
		return "url.Parse", err
	}
	p.dt.Url = sUrl
	p.dt.Index = iTask
	p.dt.BookId = p.getBookId(p.dt.Url)
	if p.dt.BookId == "" {
		// err is nil here, so only msg signals that no book id was found.
		return "requested URL was not found.", err
	}
	// The error is deliberately discarded: cookiejar.New is not expected to
	// fail when called with nil options.
	p.dt.Jar, _ = cookiejar.New(nil)
	p.extId = "jp2"
	return p.download()
}

// nationaljpBookIDPattern captures the value of the BID query parameter
// (matched case-insensitively). The class is spelled A-Za-z rather than A-z
// because the ASCII range A-z also contains the characters [ \ ] ^ and `.
var nationaljpBookIDPattern = regexp.MustCompile(`(?i)BID=([A-Za-z0-9_-]+)`)

// getBookId returns the BID value found in sUrl, or "" when sUrl carries no
// BID parameter.
func (p *Nationaljp) getBookId(sUrl string) (bookId string) {
	m := nationaljpBookIDPattern.FindStringSubmatch(sUrl)
	if m == nil {
		return ""
	}
	return m[1]
}

// download fetches the volume ids of the current task and requests one zip
// archive per volume, saved as <sorted number>.zip under the task's save path.
//
// Volumes rejected by config.VolumeRange and volumes whose destination file
// already exists are skipped. A failure to list the volumes is returned as
// ("getVolumes", err). A failure of a single volume is logged and does not
// stop the remaining volumes, so the final result is ("", nil).
func (p *Nationaljp) download() (msg string, err error) {
	name := util.GenNumberSorted(p.dt.Index)
	log.Printf("Get %s %s\n", name, p.dt.Url)

	respVolume, err := p.getVolumes(p.dt.Url, p.dt.Jar)
	if err != nil {
		fmt.Println(err)
		return "getVolumes", err
	}
	p.dt.SavePath = CreateDirectory(p.dt.UrlParsed.Host, p.dt.BookId, "")
	total := len(respVolume)
	for i, vol := range respVolume {
		if !config.VolumeRange(i) {
			continue
		}
		dest := p.dt.SavePath + util.GenNumberSorted(i+1) + ".zip"
		if FileExist(dest) {
			continue
		}
		log.Printf(" %d/%d volume, %s\n", i+1, total, p.extId)
		// Report the failure instead of dropping it silently; the other
		// volumes are still attempted.
		if _, doErr := p.do(i+1, vol, dest); doErr != nil {
			log.Printf(" %d/%d volume failed: %s\n", i+1, total, doErr)
		}
	}
	return "", nil
}

// do posts a download request for one volume to the task host's
// /acv/auto_conversion/download endpoint.
//
// The form body carries DL_TYPE=<p.extId> and id_<index>=<id>. dest is passed
// to gohttp as Options.DestFile (presumably the file the response is saved
// to). msg is always ""; err is whatever gohttp.Post reports.
func (p *Nationaljp) do(index int, id, dest string) (msg string, err error) {
	endpoint := "https://" + p.dt.UrlParsed.Host + "/acv/auto_conversion/download"
	form := fmt.Sprintf("DL_TYPE=%s&id_%d=%s", p.extId, index, id)
	headers := map[string]interface{}{
		"User-Agent":   config.Conf.UserAgent,
		"Content-Type": "application/x-www-form-urlencoded",
	}
	_, err = gohttp.Post(context.Background(), endpoint, gohttp.Options{
		DestFile:    dest,
		Overwrite:   false,
		Concurrency: config.Conf.Threads,
		CookieFile:  config.Conf.CookieFile,
		CookieJar:   p.dt.Jar,
		Headers:     headers,
		Body:        []byte(form),
	})
	return "", err
}

// getVolumes loads the listPhoto page for the task's book id and returns the
// value attribute of every volume checkbox found in it.
//
// sUrl and jar are not used: the request URL is built from the task's host
// and book id, and the page is fetched with the package-level getBody (called
// with a nil second argument), not with the p.getBody method. When the page
// contains no matching checkbox the result is (nil, nil).
func (p *Nationaljp) getVolumes(sUrl string, jar *cookiejar.Jar) (volumes []string, err error) {
	listUrl := fmt.Sprintf("https://%s/DAS/meta/listPhoto?LANG=default&BID=%s&ID=&NO=&TYPE=dljpeg&DL_TYPE=jpeg", p.dt.UrlParsed.Host, p.dt.BookId)
	body, err := getBody(listUrl, nil)
	if err != nil {
		return nil, err
	}
	// Sample of the markup being matched:
	// <input type="checkbox" class="check" name="id_2" posi="2" value="M2016092111023960474"
	// Group 1 is the posi attribute, group 2 the volume id.
	checkboxRe := regexp.MustCompile(`<input[^>]+posi=["']([0-9]+)["'][^>]+value=["']([A-Za-z0-9]+)["']`)
	found := checkboxRe.FindAllStringSubmatch(string(body), -1)
	multiple := len(found) > 1
	for _, m := range found {
		posi, value := m[1], m[2]
		// Skip the "select all" checkbox (posi 0) when more than one
		// checkbox is present.
		if multiple && (posi == "0" || value == "") {
			continue
		}
		volumes = append(volumes, value)
	}
	return volumes, nil
}

// getCanvases is an unimplemented placeholder for Nationaljp: it ignores its
// arguments and always panics with "implement me". Nothing in this file
// calls it.
func (p *Nationaljp) getCanvases(sUrl string, jar *cookiejar.Jar) (canvases []string, err error) {
//TODO implement me
panic("implement me")
}

// getBody is an unimplemented placeholder for Nationaljp: it ignores its
// arguments and always panics with "implement me". getVolumes fetches pages
// through the package-level getBody function, not through this method.
func (p *Nationaljp) getBody(sUrl string, jar *cookiejar.Jar) ([]byte, error) {
//TODO implement me
panic("implement me")
}

// postBody is an unimplemented placeholder for Nationaljp: it ignores its
// arguments and always panics with "implement me".
func (p *Nationaljp) postBody(sUrl string, d []byte) ([]byte, error) {
//TODO implement me
panic("implement me")
}
170 changes: 170 additions & 0 deletions app/Onbdigital.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
package app

import (
"bookget/config"
"bookget/lib/gohttp"
"bookget/lib/util"
"context"
"encoding/json"
"errors"
"fmt"
"log"
"net/http/cookiejar"
"net/url"
"regexp"
"sync"
)

// OnbDigital holds the state of one download task handled by this file's
// Init/download/do methods.
type OnbDigital struct {
// dt is the download task created and filled in by Init.
dt *DownloadTask
}

// OnbResponse is the JSON document that getCanvases decodes from the
// /OnbViewer/service/viewer/imageData endpoint.
type OnbResponse struct {
// ImageData has one entry per image in the response.
ImageData []struct {
// ImageID and OrderNumber are decoded but not read by the code in this file.
ImageID string `json:"imageID"`
OrderNumber string `json:"orderNumber"`
// QueryArgs is appended to the /OnbViewer/image? URL by getCanvases.
QueryArgs string `json:"queryArgs"`
} `json:"imageData"`
}

// Init creates the download task for sUrl and immediately runs the download.
//
// iTask is stored as the task index and the book id is extracted from the doc
// query parameter of sUrl. The returned msg is a short status string; err is
// the URL parse error or whatever download returns.
func (p *OnbDigital) Init(iTask int, sUrl string) (msg string, err error) {
	p.dt = new(DownloadTask)
	p.dt.UrlParsed, err = url.Parse(sUrl)
	if err != nil {
		// Later steps read p.dt.UrlParsed.Host; stopping here avoids a nil
		// pointer dereference when the URL cannot be parsed.
		return "url.Parse", err
	}
	p.dt.Url = sUrl
	p.dt.Index = iTask
	p.dt.BookId = p.getBookId(p.dt.Url)
	if p.dt.BookId == "" {
		// err is nil here, so only msg signals that no book id was found.
		return "requested URL was not found.", err
	}
	// The error is deliberately discarded: cookiejar.New is not expected to
	// fail when called with nil options.
	p.dt.Jar, _ = cookiejar.New(nil)
	return p.download()
}

// getBookId returns the value of the first "doc=" parameter in sUrl (all
// characters up to the next '&'), or "" when sUrl has no such parameter.
func (p *OnbDigital) getBookId(sUrl string) (bookId string) {
	docRe := regexp.MustCompile(`doc=([^&]+)`)
	groups := docRe.FindStringSubmatch(sUrl)
	if groups == nil {
		return ""
	}
	return groups[1]
}

// download lists the volumes of the current task, resolves the image URLs of
// each volume and hands them to do.
//
// Volumes rejected by config.VolumeRange are skipped. A failure to list the
// volumes is returned as ("getVolumes", err). A volume whose image list
// cannot be loaded or is empty is reported and skipped, so the final result
// is ("", nil).
func (p *OnbDigital) download() (msg string, err error) {
	name := util.GenNumberSorted(p.dt.Index)
	log.Printf("Get %s %s\n", name, p.dt.Url)
	respVolume, err := p.getVolumes(p.dt.Url, p.dt.Jar)
	if err != nil {
		fmt.Println(err)
		return "getVolumes", err
	}
	p.dt.SavePath = CreateDirectory(p.dt.UrlParsed.Host, p.dt.BookId, "")
	total := len(respVolume)
	for i, vol := range respVolume {
		if !config.VolumeRange(i) {
			continue
		}
		// A separate name keeps the per-volume error from shadowing the
		// function's err result.
		canvases, canvasErr := p.getCanvases(vol, p.dt.Jar)
		if canvasErr != nil {
			fmt.Println(canvasErr)
			continue
		}
		if len(canvases) == 0 {
			// Previously this case printed "<nil>"; say what happened instead.
			log.Printf(" %d/%d volume, no pages found\n", i+1, total)
			continue
		}
		log.Printf(" %d/%d volume, %d pages \n", i+1, total, len(canvases))
		if _, doErr := p.do(canvases); doErr != nil {
			fmt.Println(doErr)
		}
	}
	return "", nil
}

// do downloads every URL in imgUrls into the task's save path, naming each
// file <sorted page number> + config.Conf.FileExt.
//
// Empty URLs, pages rejected by config.PageRange and files that already exist
// are skipped. Downloads are submitted to a queue created with
// config.Conf.Threads, and do waits for all submitted downloads before
// returning. The result of gohttp.FastGet is not inspected, so the returned
// err is always nil.
func (p *OnbDigital) do(imgUrls []string) (msg string, err error) {
	if imgUrls == nil {
		return "", nil
	}
	total := len(imgUrls)
	fmt.Println()
	var pending sync.WaitGroup
	queue := QueueNew(int(config.Conf.Threads))
	for idx, pageUrl := range imgUrls {
		if pageUrl == "" {
			continue
		}
		if !config.PageRange(idx, total) {
			continue
		}
		target := p.dt.SavePath + util.GenNumberSorted(idx+1) + config.Conf.FileExt
		if FileExist(target) {
			continue
		}
		fmt.Println()
		log.Printf("Get %d/%d %s\n", idx+1, total, pageUrl)
		pending.Add(1)
		// Per-iteration copies so each queued job keeps its own values.
		src, dst := pageUrl, target
		queue.Go(func() {
			defer pending.Done()
			gohttp.FastGet(context.Background(), src, gohttp.Options{
				DestFile:    dst,
				Overwrite:   false,
				Concurrency: 1,
				CookieFile:  config.Conf.CookieFile,
				CookieJar:   p.dt.Jar,
				Headers: map[string]interface{}{
					"User-Agent": config.Conf.UserAgent,
				},
			})
			fmt.Println()
		})
	}
	pending.Wait()
	fmt.Println()
	return "", err
}

// getVolumes requests sUrl once with jar to refresh the cookies and then
// returns sUrl itself as the only volume. The response body is discarded; a
// request error is returned with a nil slice.
func (p *OnbDigital) getVolumes(sUrl string, jar *cookiejar.Jar) (volumes []string, err error) {
	// Refresh cookies.
	if _, err = p.getBody(sUrl, jar); err != nil {
		return nil, err
	}
	return []string{sUrl}, nil
}

// getCanvases asks the viewer's imageData service for images 1..3000 of the
// task's book id and returns one image URL per entry in the response.
//
// sUrl is not used; the request is built from the task's host and book id.
// Each URL is /OnbViewer/image?<queryArgs>&w=2400&q=70 on the same host. The
// result stays nil when the response lists no images. A JSON decoding
// failure is logged and returned.
func (p *OnbDigital) getCanvases(sUrl string, jar *cookiejar.Jar) (canvases []string, err error) {
	host := p.dt.UrlParsed.Host
	listUrl := "https://" + host + "/OnbViewer/service/viewer/imageData?doc=" + p.dt.BookId + "&from=1&to=3000"
	raw, err := p.getBody(listUrl, jar)
	if err != nil {
		return nil, err
	}
	var parsed OnbResponse
	if err = json.Unmarshal(raw, &parsed); err != nil {
		log.Printf("json.Unmarshal failed: %s\n", err)
		return nil, err
	}
	prefix := "https://" + host + "/OnbViewer/image?"
	for i := range parsed.ImageData {
		canvases = append(canvases, prefix+parsed.ImageData[i].QueryArgs+"&w=2400&q=70")
	}
	return canvases, nil
}

// getBody performs a GET request for sUrl using the configured cookie file,
// the given cookie jar and the configured User-Agent, and returns the
// response body.
//
// A request error is returned as is. When the body is nil the error text is
// "ErrCode:<status code>, <reason phrase>".
func (p *OnbDigital) getBody(sUrl string, jar *cookiejar.Jar) ([]byte, error) {
	opts := gohttp.Options{
		CookieFile: config.Conf.CookieFile,
		CookieJar:  jar,
		Headers: map[string]interface{}{
			"User-Agent": config.Conf.UserAgent,
		},
	}
	client := gohttp.NewClient(context.Background(), opts)
	resp, err := client.Get(sUrl)
	if err != nil {
		return nil, err
	}
	// The second result of GetBody is ignored; a nil body is reported below
	// using the response status instead.
	body, _ := resp.GetBody()
	if body != nil {
		return body, nil
	}
	return nil, errors.New(fmt.Sprintf("ErrCode:%d, %s", resp.GetStatusCode(), resp.GetReasonPhrase()))
}

// postBody is an unimplemented placeholder for OnbDigital: it ignores its
// arguments and always panics with "implement me".
func (p *OnbDigital) postBody(sUrl string, d []byte) ([]byte, error) {
//TODO implement me
panic("implement me")
}
8 changes: 5 additions & 3 deletions app/cafaedu.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ type CafaEduItem struct {
}

type CafaEdu struct {
dt *DownloadTask
dt *DownloadTask
ServerUrl string
}

func (p *CafaEdu) Init(iTask int, sUrl string) (msg string, err error) {
Expand All @@ -61,6 +62,7 @@ func (p *CafaEdu) Init(iTask int, sUrl string) (msg string, err error) {
return "requested URL was not found.", err
}
p.dt.Jar, _ = cookiejar.New(nil)
p.ServerUrl = "dlibgate.cafa.edu.cn"
return p.download()
}

Expand Down Expand Up @@ -118,7 +120,7 @@ func (p *CafaEdu) getVolumes(sUrl string, jar *cookiejar.Jar) (volumes []string,
if err != nil {
return nil, err
}
jsonUrl := fmt.Sprintf("https://%s/api/viewer/lgiiif?url=/srv/www/limbgallery/medias/%s/&max=%d", p.dt.UrlParsed.Host, iiifId, 10000)
jsonUrl := fmt.Sprintf("https://%s/api/viewer/lgiiif?url=/srv/www/limbgallery/medias/%s/&max=%d", p.ServerUrl, iiifId, 10000)
volumes = append(volumes, jsonUrl)
return volumes, err
}
Expand All @@ -142,7 +144,7 @@ func (p *CafaEdu) getCanvases(apiUrl string, jar *cookiejar.Jar) (canvases []str
} else {
//JPEG URL
//https://dlibgate.cafa.edu.cn/i/?IIIF=/1b/86/7e/68/1b867e68-807a-44e1-b16b-a86775dc0b16/iiif/GJ05685_000001.tif/full/full/0/default.jpg
imgUrl := "https://" + p.dt.UrlParsed.Host + canvase.Id + "/" + config.Conf.Format
imgUrl := "https://" + p.ServerUrl + canvase.Id + "/" + config.Conf.Format
canvases = append(canvases, imgUrl)
}
}
Expand Down
4 changes: 2 additions & 2 deletions app/downloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ func WaitNewCookie() {
if FileExist(config.Conf.CookieFile) {
break
}
time.Sleep(time.Second * 3)
util.PrintSleepTime(10)
}
}()
wg.Wait()
Expand All @@ -206,7 +206,7 @@ func WaitNewCookieWithMsg(uri string) {
if FileExist(config.Conf.CookieFile) {
break
}
time.Sleep(time.Second * 3)
util.PrintSleepTime(8)
}
}()
wg.Wait()
Expand Down
Loading

0 comments on commit c87bb94

Please sign in to comment.