Skip to content

Commit

Permalink
[fixed]京都大學人文科學研究所
Browse files Browse the repository at this point in the history
  • Loading branch information
zhudw committed Dec 6, 2023
1 parent eaee912 commit 3848dba
Show file tree
Hide file tree
Showing 5 changed files with 225 additions and 160 deletions.
219 changes: 219 additions & 0 deletions app/kyotou.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
package app

import (
"bookget/config"
"bookget/lib/gohttp"
"bookget/lib/util"
"context"
"fmt"
"log"
"net/http/cookiejar"
"net/url"
"os"
"regexp"
"strconv"
"strings"
"sync"
)

// Kyotou is the downloader type defined in this file. Its methods share a
// single DownloadTask, which Init allocates and fills before anything else
// runs.
type Kyotou struct {
// dt holds the per-task state (URL, index, cookie jar, book and volume IDs,
// save path) read and written by the methods below.
dt *DownloadTask
}

// Init builds a fresh DownloadTask for sUrl and immediately runs the download.
//
// iTask is stored as the task index (it is used for log output by download).
// sUrl is the page URL to process.
//
// msg is a short status string. err is non-nil when sUrl cannot be parsed,
// when no book ID can be derived from it, or when download reports a failure.
func (p *Kyotou) Init(iTask int, sUrl string) (msg string, err error) {
	parsed, err := url.Parse(sUrl)
	if err != nil {
		// Previously this error was silently carried along and could be
		// overwritten; fail early with context instead.
		return "invalid URL", fmt.Errorf("parsing url %q: %w", sUrl, err)
	}

	p.dt = new(DownloadTask)
	p.dt.UrlParsed = parsed
	p.dt.Url = sUrl
	p.dt.Index = iTask
	// cookiejar.New only fails for invalid options; nil options are always valid.
	p.dt.Jar, _ = cookiejar.New(nil)
	p.dt.BookId = p.getBookId(p.dt.Url)
	if p.dt.BookId == "" {
		// Return a real error so callers checking err see the failure.
		return "requested URL was not found.", fmt.Errorf("no book id found in url %q", sUrl)
	}
	return p.download()
}

// getBookId derives the book ID from sUrl. Only URLs containing "menu" are
// recognised; for those the package-level getBookId helper supplies the ID.
// Every other URL yields the empty string.
func (p *Kyotou) getBookId(sUrl string) (bookId string) {
	if !strings.Contains(sUrl, "menu") {
		return ""
	}
	bookId = getBookId(sUrl)
	return bookId
}

// download resolves the volume pages of the book at p.dt.Url and downloads the
// page images of each selected volume.
//
// For every volume it sets p.dt.VolumeId (the bare book ID when the book has a
// single volume, otherwise "<bookId>_vol.<n>"), creates the save directory and
// hands the image URLs to do. A volume whose page list cannot be built is
// reported and skipped, so one bad volume does not abort the others.
//
// It returns ("getVolumes", err) when the volume list cannot be fetched and
// ("", nil) otherwise.
func (p *Kyotou) download() (msg string, err error) {
	name := util.GenNumberSorted(p.dt.Index)
	log.Printf("Get %s %s\n", name, p.dt.Url)

	volumes, err := p.getVolumes(p.dt.Url, p.dt.Jar)
	if err != nil {
		fmt.Println(err)
		return "getVolumes", err
	}
	total := len(volumes)
	for i, vol := range volumes {
		// A positive config.Conf.Volume restricts the run to that 1-based volume.
		if config.Conf.Volume > 0 && config.Conf.Volume != i+1 {
			continue
		}
		vid := util.GenNumberSorted(i + 1)
		if total == 1 {
			p.dt.VolumeId = p.dt.BookId
		} else {
			p.dt.VolumeId = p.dt.BookId + "_vol." + vid
		}
		p.dt.SavePath = config.CreateDirectory(p.dt.Url, p.dt.VolumeId)

		// Use a distinct name so the named result err is not shadowed.
		canvases, cerr := p.getCanvases(vol, p.dt.Jar)
		if cerr != nil {
			fmt.Println(cerr)
			continue
		}
		if len(canvases) == 0 {
			// Previously this case printed a bare "<nil>"; say what happened.
			fmt.Printf("no pages found for volume %d/%d: %s\n", i+1, total, vol)
			continue
		}
		log.Printf(" %d/%d volume, %d pages \n", i+1, total, len(canvases))
		if _, derr := p.do(canvases); derr != nil {
			fmt.Println(derr)
		}
	}
	return "", nil
}

// do downloads every URL in imgUrls into p.dt.SavePath.
//
// Files are named after the 1-based position of the URL in imgUrls plus
// config.Conf.FileExt. Empty URLs, positions rejected by config.PageRange and
// files that already exist are skipped. Each remaining download is submitted
// to a queue created with config.Conf.Threads, and do waits for all of them
// before returning. The returned error is always nil.
func (p *Kyotou) do(imgUrls []string) (msg string, err error) {
	if imgUrls == nil {
		return "", nil
	}
	total := len(imgUrls)
	fmt.Println()
	var pending sync.WaitGroup
	pool := QueueNew(int(config.Conf.Threads))
	for idx, pageUrl := range imgUrls {
		if pageUrl == "" {
			continue
		}
		if !config.PageRange(idx, total) {
			continue
		}
		target := p.dt.SavePath + string(os.PathSeparator) + util.GenNumberSorted(idx+1) + config.Conf.FileExt
		if FileExist(target) {
			continue
		}
		src := pageUrl
		fmt.Println()
		log.Printf("Get %d/%d %s\n", idx+1, total, src)
		pending.Add(1)
		pool.Go(func() {
			defer pending.Done()
			gohttp.FastGet(context.Background(), src, gohttp.Options{
				DestFile:    target,
				Overwrite:   false,
				Concurrency: 1,
				CookieFile:  config.Conf.CookieFile,
				CookieJar:   p.dt.Jar,
				Headers: map[string]interface{}{
					"User-Agent": config.Conf.UserAgent,
				},
			})
			fmt.Println()
		})
	}
	pending.Wait()
	fmt.Println()
	return "", err
}

// kyotouVolumeLinkRe captures the target of every href that ends in ".html",
// without the extension. Compiled once instead of on every getVolumes call.
var kyotouVolumeLinkRe = regexp.MustCompile(`href=["']?(.+?)\.html["']?`)

// getVolumes fetches the page at sUrl and returns the URL of every volume page
// it links to.
//
// Each ".html" link found in the page is resolved against the directory part
// of sUrl (everything before its last "/"). Links whose target contains "top"
// are skipped. jar is accepted for interface symmetry but is not forwarded to
// getBody.
//
// It returns an error when sUrl has no "/" (there is no directory to resolve
// against) or when the page cannot be fetched. When the page contains no
// matching link, it returns (nil, nil).
func (p *Kyotou) getVolumes(sUrl string, jar *cookiejar.Jar) (volumes []string, err error) {
	pos := strings.LastIndex(sUrl, "/")
	if pos < 0 {
		// Slicing with -1 would panic; report the malformed URL instead.
		return nil, fmt.Errorf("volume index url %q has no path separator", sUrl)
	}
	hostUrl := sUrl[:pos]

	bs, err := getBody(sUrl, nil)
	if err != nil {
		return nil, err
	}
	// Collect the volume links.
	matches := kyotouVolumeLinkRe.FindAllSubmatch(bs, -1)
	if matches == nil {
		return nil, nil
	}
	volumes = make([]string, 0, len(matches))
	for _, m := range matches {
		text := string(m[1])
		if strings.Contains(text, "top") {
			continue
		}
		volumes = append(volumes, fmt.Sprintf("%s/%s.html", hostUrl, text))
	}
	return volumes, nil
}

// getCanvases fetches the volume page at sUrl and returns the image URLs built
// from the values embedded in it.
//
// The page must define volStartPos, volMaxPage and bookNum (read by the
// getVol*/getBookNumber helpers). Image URLs have the form
// "<base>/L/<bookNum><n>.jpg", where <base> is sUrl with its last two path
// segments removed and <n> is util.GenNumberSorted(i) for i in
// [1, volStartPos+volMaxPage). jar is accepted for interface symmetry but is
// not forwarded to getBody.
//
// It returns an error when sUrl has fewer than two "/" separators, when the
// page cannot be fetched, or when one of the three page values is missing.
func (p *Kyotou) getCanvases(sUrl string, jar *cookiejar.Jar) (canvases []string, err error) {
	// Validate the URL shape first: slicing with a -1 index would panic.
	pos := strings.LastIndex(sUrl, "/")
	if pos < 0 {
		return nil, fmt.Errorf("volume url %q has no path separator", sUrl)
	}
	pos1 := strings.LastIndex(sUrl[:pos], "/")
	if pos1 < 0 {
		return nil, fmt.Errorf("volume url %q has no parent directory", sUrl)
	}
	hostUrl := sUrl[:pos1]

	bs, err := getBody(sUrl, nil)
	if err != nil {
		return nil, err
	}
	// Report which value is missing instead of returning (nil, nil).
	startPos, ok := p.getVolStartPos(bs)
	if !ok {
		return nil, fmt.Errorf("volStartPos not found in %q", sUrl)
	}
	maxPage, ok := p.getVolMaxPage(bs)
	if !ok {
		return nil, fmt.Errorf("volMaxPage not found in %q", sUrl)
	}
	bookNumber, ok := p.getBookNumber(bs)
	if !ok {
		return nil, fmt.Errorf("bookNum not found in %q", sUrl)
	}

	maxPos := startPos + maxPage
	if maxPos > 1 {
		// Allocate only when there is something to add, so an empty result
		// stays nil exactly as before.
		canvases = make([]string, 0, maxPos-1)
	}
	// NOTE(review): the loop starts at 1, not at startPos, so pages before the
	// volume's start position are included — confirm this is intended.
	for i := 1; i < maxPos; i++ {
		sortId := util.GenNumberSorted(i)
		canvases = append(canvases, fmt.Sprintf("%s/L/%s%s.jpg", hostUrl, bookNumber, sortId))
	}
	return canvases, nil
}

// getBody is an unimplemented placeholder. The rest of this type fetches pages
// through the package-level getBody function instead.
//
// It always returns a nil body and a "not implemented" error, so an accidental
// call surfaces as an ordinary error rather than crashing the process.
func (p *Kyotou) getBody(sUrl string, jar *cookiejar.Jar) ([]byte, error) {
	// TODO: implement me.
	return nil, fmt.Errorf("kyotou: getBody not implemented (url %q)", sUrl)
}

// postBody is an unimplemented placeholder; nothing in this file calls it.
//
// It always returns a nil body and a "not implemented" error, so an accidental
// call surfaces as an ordinary error rather than crashing the process.
func (p *Kyotou) postBody(sUrl string, d []byte) ([]byte, error) {
	// TODO: implement me.
	return nil, fmt.Errorf("kyotou: postBody not implemented (url %q)", sUrl)
}

// kyotouBookNumRe matches a JavaScript assignment such as
//
//	var bookNum = "A001";
//
// and captures the value without its quotes. The character class is spelled
// out because the range A-z also covers the punctuation between 'Z' and 'a'
// ([ \ ] ^ `); the underscore from that range is kept on purpose. Whitespace
// around '=' is optional, matching the other page-variable parsers.
var kyotouBookNumRe = regexp.MustCompile(`var\s+bookNum\s*=["'\s]*([A-Za-z0-9_]+)["'\s]*;`)

// getBookNumber extracts the value of the bookNum variable from the page
// source bs. ok is false when the page contains no such assignment.
func (p *Kyotou) getBookNumber(bs []byte) (bookNumber string, ok bool) {
	match := kyotouBookNumRe.FindSubmatch(bs)
	if match == nil {
		return "", false
	}
	return string(match[1]), true
}

// kyotouVolStartPosRe matches a JavaScript assignment such as
// "var volStartPos = 12;" and captures the digits. Compiled once instead of on
// every call.
var kyotouVolStartPosRe = regexp.MustCompile(`var[\s]+volStartPos[\s]*=[\s]*([0-9]+)[\s]*;`)

// getVolStartPos extracts the integer value of the volStartPos variable from
// the page source bs. ok is false when the assignment is absent or when the
// number does not fit in an int.
func (p *Kyotou) getVolStartPos(bs []byte) (startPos int, ok bool) {
	match := kyotouVolStartPosRe.FindSubmatch(bs)
	if match == nil {
		return 0, false
	}
	startPos, err := strconv.Atoi(string(match[1]))
	if err != nil {
		// The pattern guarantees digits, so this is an out-of-range value.
		return 0, false
	}
	return startPos, true
}

// kyotouCurPageRe matches a JavaScript assignment such as "var curPage = 3;"
// and captures the digits. Compiled once instead of on every call.
var kyotouCurPageRe = regexp.MustCompile(`var[\s]+curPage[\s]*=[\s]*([0-9]+)[\s]*;`)

// getVolCurPage extracts the integer value of the curPage variable from the
// page source bs. ok is false when the assignment is absent or when the number
// does not fit in an int.
func (p *Kyotou) getVolCurPage(bs []byte) (curPage int, ok bool) {
	match := kyotouCurPageRe.FindSubmatch(bs)
	if match == nil {
		return 0, false
	}
	curPage, err := strconv.Atoi(string(match[1]))
	if err != nil {
		// The pattern guarantees digits, so this is an out-of-range value.
		return 0, false
	}
	return curPage, true
}

// kyotouVolMaxPageRe matches a JavaScript assignment such as
// "var volMaxPage = 40;" and captures the digits. Compiled once instead of on
// every call.
var kyotouVolMaxPageRe = regexp.MustCompile(`var[\s]+volMaxPage[\s]*=[\s]*([0-9]+)[\s]*;`)

// getVolMaxPage extracts the integer value of the volMaxPage variable from the
// page source bs. ok is false when the assignment is absent or when the number
// does not fit in an int.
func (p *Kyotou) getVolMaxPage(bs []byte) (maxPage int, ok bool) {
	match := kyotouVolMaxPageRe.FindSubmatch(bs)
	if match == nil {
		return 0, false
	}
	maxPage, err := strconv.Atoi(string(match[1]))
	if err != nil {
		// The pattern guarantees digits, so this is an out-of-range value.
		return 0, false
	}
	return maxPage, true
}
2 changes: 1 addition & 1 deletion config/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (

var Conf Input

const version = "1.2.5"
const version = "231206"

// initSeq false = 最小值 <= 当前页码 <= 最大值
func initSeq() {
Expand Down
2 changes: 1 addition & 1 deletion router/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ func FactoryRouter(siteID string, sUrl []string) (map[string]interface{}, error)
//[日本]国書数据库(古典籍)
Router["kokusho.nijl.ac.jp"] = new(KokushoNijlAc)
//[日本]京都大学人文科学研究所 东方学数字图书博物馆
Router["kanji.zinbun.kyoto-u.ac.jp"] = new(KanjiZinbunKyotouAc)
Router["kanji.zinbun.kyoto-u.ac.jp"] = new(Kyotou)

//[日本]駒澤大学 电子贵重书库
Router["repo.komazawa-u.ac.jp"] = new(NormalIIIF)
Expand Down
8 changes: 4 additions & 4 deletions router/japan.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"bookget/app"
"bookget/site/Japan/emuseum"
"bookget/site/Japan/gprime"
"bookget/site/Japan/kanjikyoto"
"bookget/site/Japan/kokusho"
"bookget/site/Japan/kyoto"
"bookget/site/Japan/national"
Expand Down Expand Up @@ -87,11 +86,12 @@ func (p KokushoNijlAc) getRouterInit(sUrl []string) (map[string]interface{}, err
return nil, nil
}

type KanjiZinbunKyotouAc struct{}
type Kyotou struct{}

func (p KanjiZinbunKyotouAc) getRouterInit(sUrl []string) (map[string]interface{}, error) {
func (p Kyotou) getRouterInit(sUrl []string) (map[string]interface{}, error) {
for i, s := range sUrl {
kanjikyoto.Init(i+1, s)
var kyotou app.Kyotou
kyotou.Init(i+1, s)
}
return nil, nil
}
Expand Down
Loading

0 comments on commit 3848dba

Please sign in to comment.