From 3848dba24d106a9e34ef51554bc7aafdd0def3c6 Mon Sep 17 00:00:00 2001
From: zhudw
Date: Wed, 6 Dec 2023 18:17:06 +0800
Subject: [PATCH] =?UTF-8?q?[fixed]=E4=BA=AC=E9=83=BD=E5=A4=A7=E5=AD=B8?=
 =?UTF-8?q?=E4=BA=BA=E6=96=87=E7=A7=91=E5=AD=B8=E7=A0=94=E7=A9=B6=E6=89=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/kyotou.go                       | 287 ++++++++++++++++++++++++++++
 config/init.go                      |   2 +-
 router/init.go                      |   2 +-
 router/japan.go                     |   8 +-
 site/Japan/kanjikyoto/kanjikyoto.go | 154 ---------------
 5 files changed, 293 insertions(+), 160 deletions(-)
 create mode 100644 app/kyotou.go
 delete mode 100644 site/Japan/kanjikyoto/kanjikyoto.go

diff --git a/app/kyotou.go b/app/kyotou.go
new file mode 100644
index 0000000..012eac6
--- /dev/null
+++ b/app/kyotou.go
@@ -0,0 +1,287 @@
+package app
+
+import (
+	"bookget/config"
+	"bookget/lib/gohttp"
+	"bookget/lib/util"
+	"context"
+	"fmt"
+	"log"
+	"net/http/cookiejar"
+	"net/url"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+	"sync"
+)
+
+// Patterns used to scrape the menu and volume pages. They are compiled once
+// at package scope rather than on every call.
+var (
+	// kyotouHrefRe captures the path (without ".html") of every linked page.
+	kyotouHrefRe = regexp.MustCompile(`href=["']?(.+?)\.html["']?`)
+	// kyotouBookNumRe captures the value assigned to the script variable
+	// bookNum. The class is spelled out because the range A-z also covers
+	// the ASCII punctuation between 'Z' and 'a'.
+	kyotouBookNumRe = regexp.MustCompile(`var[\s]+bookNum[\s]*=["'\s]*([A-Za-z0-9_]+)["'\s]*;`)
+	// kyotouStartPosRe captures the value assigned to volStartPos.
+	kyotouStartPosRe = regexp.MustCompile(`var[\s]+volStartPos[\s]*=[\s]*([0-9]+)[\s]*;`)
+	// kyotouCurPageRe captures the value assigned to curPage.
+	kyotouCurPageRe = regexp.MustCompile(`var[\s]+curPage[\s]*=[\s]*([0-9]+)[\s]*;`)
+	// kyotouMaxPageRe captures the value assigned to volMaxPage.
+	kyotouMaxPageRe = regexp.MustCompile(`var[\s]+volMaxPage[\s]*=[\s]*([0-9]+)[\s]*;`)
+)
+
+// Kyotou downloads page images from kanji.zinbun.kyoto-u.ac.jp.
+type Kyotou struct {
+	dt *DownloadTask
+}
+
+// Init prepares the download task for sUrl and runs it.
+//
+// iTask is the ordinal of the task; here it is only used in log output. sUrl
+// must contain "menu", otherwise no book id is derived and an error is
+// returned. msg is a short status text; err says why the task did not run.
+func (p *Kyotou) Init(iTask int, sUrl string) (msg string, err error) {
+	p.dt = new(DownloadTask)
+	// Stop on an unparsable URL instead of carrying a nil UrlParsed around.
+	if p.dt.UrlParsed, err = url.Parse(sUrl); err != nil {
+		return "requested URL was not found.", err
+	}
+	p.dt.Url = sUrl
+	p.dt.Index = iTask
+	if p.dt.Jar, err = cookiejar.New(nil); err != nil {
+		return "cookiejar", err
+	}
+	p.dt.BookId = p.getBookId(p.dt.Url)
+	if p.dt.BookId == "" {
+		// err is nil here, so build one; callers must not see a failure
+		// message paired with a nil error.
+		return "requested URL was not found.", fmt.Errorf("no book id in URL %q", sUrl)
+	}
+	return p.download()
+}
+
+// getBookId returns the book id for a menu page URL, or "" when sUrl does
+// not contain "menu". Extraction is delegated to the package-level getBookId.
+func (p *Kyotou) getBookId(sUrl string) (bookId string) {
+	if strings.Contains(sUrl, "menu") {
+		return getBookId(sUrl)
+	}
+	return ""
+}
+
+// download fetches every volume of the current book, or only volume
+// config.Conf.Volume when that setting is positive. A volume that cannot be
+// resolved is reported and skipped so that the remaining ones still download.
+func (p *Kyotou) download() (msg string, err error) {
+	name := util.GenNumberSorted(p.dt.Index)
+	log.Printf("Get %s %s\n", name, p.dt.Url)
+
+	respVolume, err := p.getVolumes(p.dt.Url, p.dt.Jar)
+	if err != nil {
+		fmt.Println(err)
+		return "getVolumes", err
+	}
+	sizeVol := len(respVolume)
+	for i, vol := range respVolume {
+		if config.Conf.Volume > 0 && config.Conf.Volume != i+1 {
+			continue
+		}
+		// A single-volume book is stored under the bare book id.
+		p.dt.VolumeId = p.dt.BookId
+		if sizeVol != 1 {
+			p.dt.VolumeId += "_vol." + util.GenNumberSorted(i+1)
+		}
+		p.dt.SavePath = config.CreateDirectory(p.dt.Url, p.dt.VolumeId)
+		canvases, cErr := p.getCanvases(vol, p.dt.Jar)
+		if cErr != nil {
+			fmt.Println(cErr)
+			continue
+		}
+		if len(canvases) == 0 {
+			log.Printf("no pages found in %s\n", vol)
+			continue
+		}
+		log.Printf(" %d/%d volume, %d pages \n", i+1, sizeVol, len(canvases))
+		p.do(canvases)
+	}
+	return "", nil
+}
+
+// do downloads imgUrls into p.dt.SavePath through a queue created with
+// QueueNew(config.Conf.Threads). Files are named by their 1-based position in
+// imgUrls plus config.Conf.FileExt; entries that are empty, outside
+// config.PageRange, or for which FileExist reports true are skipped.
+func (p *Kyotou) do(imgUrls []string) (msg string, err error) {
+	size := len(imgUrls)
+	if size == 0 {
+		return "", nil
+	}
+	fmt.Println()
+	var wg sync.WaitGroup
+	q := QueueNew(int(config.Conf.Threads))
+	for i, uri := range imgUrls {
+		if uri == "" || !config.PageRange(i, size) {
+			continue
+		}
+		sortId := util.GenNumberSorted(i + 1)
+		filename := sortId + config.Conf.FileExt
+		dest := p.dt.SavePath + string(os.PathSeparator) + filename
+		if FileExist(dest) {
+			continue
+		}
+		// Copy for the closure below; needed where loop variables are shared (pre Go 1.22).
+		imgUrl := uri
+		fmt.Println()
+		log.Printf("Get %d/%d %s\n", i+1, size, imgUrl)
+		wg.Add(1)
+		q.Go(func() {
+			defer wg.Done()
+			ctx := context.Background()
+			opts := gohttp.Options{
+				DestFile:    dest,
+				Overwrite:   false,
+				Concurrency: 1,
+				CookieFile:  config.Conf.CookieFile,
+				CookieJar:   p.dt.Jar,
+				Headers: map[string]interface{}{
+					"User-Agent": config.Conf.UserAgent,
+				},
+			}
+			gohttp.FastGet(ctx, imgUrl, opts)
+			fmt.Println()
+		})
+	}
+	wg.Wait()
+	fmt.Println()
+	return "", nil
+}
+
+// getVolumes returns the URL of every volume page linked from the menu page
+// sUrl, in document order. Links whose path contains "top" are skipped.
+// A page without any matching link yields (nil, nil).
+//
+// NOTE(review): jar is accepted but not forwarded; the page is fetched with
+// getBody(sUrl, nil) exactly as before.
+func (p *Kyotou) getVolumes(sUrl string, jar *cookiejar.Jar) (volumes []string, err error) {
+	bs, err := getBody(sUrl, nil)
+	if err != nil {
+		return nil, err
+	}
+	matches := kyotouHrefRe.FindAllSubmatch(bs, -1)
+	if matches == nil {
+		return nil, nil
+	}
+	// Volume links are resolved against the directory of the menu page.
+	pos := strings.LastIndex(sUrl, "/")
+	if pos < 0 {
+		return nil, fmt.Errorf("menu URL %q has no path separator", sUrl)
+	}
+	hostUrl := sUrl[:pos]
+	volumes = make([]string, 0, len(matches))
+	for _, v := range matches {
+		text := string(v[1])
+		if strings.Contains(text, "top") {
+			continue
+		}
+		volumes = append(volumes, fmt.Sprintf("%s/%s.html", hostUrl, text))
+	}
+	return volumes, nil
+}
+
+// getCanvases returns the image URLs of the volume page sUrl. The page must
+// define the script variables volStartPos, volMaxPage and bookNum; images
+// are addressed as <base>/L/<bookNum><n>.jpg, where <base> is sUrl with its
+// last two path segments removed and <n> is util.GenNumberSorted(i) for i
+// from 1 up to, but excluding, volStartPos+volMaxPage.
+//
+// NOTE(review): numbering starts at 1 rather than at volStartPos, so later
+// volumes presumably repeat the pages of earlier ones; confirm against the
+// site before changing the range. jar is accepted but not forwarded.
+func (p *Kyotou) getCanvases(sUrl string, jar *cookiejar.Jar) (canvases []string, err error) {
+	bs, err := getBody(sUrl, nil)
+	if err != nil {
+		return nil, err
+	}
+	startPos, ok := p.getVolStartPos(bs)
+	if !ok {
+		return nil, fmt.Errorf("volStartPos not found in %s", sUrl)
+	}
+	maxPage, ok := p.getVolMaxPage(bs)
+	if !ok {
+		return nil, fmt.Errorf("volMaxPage not found in %s", sUrl)
+	}
+	bookNumber, ok := p.getBookNumber(bs)
+	if !ok {
+		return nil, fmt.Errorf("bookNum not found in %s", sUrl)
+	}
+	pos := strings.LastIndex(sUrl, "/")
+	if pos < 0 {
+		return nil, fmt.Errorf("unexpected volume URL %q", sUrl)
+	}
+	pos1 := strings.LastIndex(sUrl[:pos], "/")
+	if pos1 < 0 {
+		return nil, fmt.Errorf("unexpected volume URL %q", sUrl)
+	}
+	hostUrl := sUrl[:pos1]
+	maxPos := startPos + maxPage
+	for i := 1; i < maxPos; i++ {
+		sortId := util.GenNumberSorted(i)
+		canvases = append(canvases, fmt.Sprintf("%s/L/%s%s.jpg", hostUrl, bookNumber, sortId))
+	}
+	return canvases, nil
+}
+
+// getBody is an unimplemented placeholder and panics when called. The pages
+// in this file are fetched with the package-level getBody instead.
+func (p *Kyotou) getBody(sUrl string, jar *cookiejar.Jar) ([]byte, error) {
+	//TODO implement me
+	panic("implement me")
+}
+
+// postBody is an unimplemented placeholder and panics when called.
+func (p *Kyotou) postBody(sUrl string, d []byte) ([]byte, error) {
+	//TODO implement me
+	panic("implement me")
+}
+
+// getBookNumber extracts the value of the script variable bookNum from bs.
+// ok is false when the variable is not present.
+func (p *Kyotou) getBookNumber(bs []byte) (bookNumber string, ok bool) {
+	match := kyotouBookNumRe.FindSubmatch(bs)
+	if match == nil {
+		return "", false
+	}
+	return string(match[1]), true
+}
+
+// getVolStartPos extracts the value of the script variable volStartPos.
+func (p *Kyotou) getVolStartPos(bs []byte) (startPos int, ok bool) {
+	return kyotouFindInt(kyotouStartPosRe, bs)
+}
+
+// getVolCurPage extracts the value of the script variable curPage.
+func (p *Kyotou) getVolCurPage(bs []byte) (curPage int, ok bool) {
+	return kyotouFindInt(kyotouCurPageRe, bs)
+}
+
+// getVolMaxPage extracts the value of the script variable volMaxPage.
+func (p *Kyotou) getVolMaxPage(bs []byte) (maxPage int, ok bool) {
+	return kyotouFindInt(kyotouMaxPageRe, bs)
+}
+
+// kyotouFindInt returns the integer captured by the first group of re in bs.
+// ok is false when re does not match or the digits do not fit in an int.
+func kyotouFindInt(re *regexp.Regexp, bs []byte) (n int, ok bool) {
+	match := re.FindSubmatch(bs)
+	if match == nil {
+		return 0, false
+	}
+	n, err := strconv.Atoi(string(match[1]))
+	if err != nil {
+		return 0, false
+	}
+	return n, true
+}
diff --git a/config/init.go b/config/init.go
index b578faa..db4b1d9 100644
--- a/config/init.go
+++ b/config/init.go
@@ -7,7 +7,7 @@ import (
 
 var Conf Input
 
-const version = "1.2.5"
+const version = "231206"
 
 // initSeq false = 最小值 <= 当前页码 <= 最大值
 func initSeq() {
diff --git a/router/init.go b/router/init.go
index 592d348..db7cc69 100644
--- a/router/init.go
+++ b/router/init.go
@@ -112,7 +112,7 @@ func FactoryRouter(siteID string, sUrl []string) (map[string]interface{}, error)
 	//[日本]国書数据库(古典籍)
 	Router["kokusho.nijl.ac.jp"] = new(KokushoNijlAc)
 	//[日本]京都大学人文科学研究所 东方学数字图书博物馆
-	Router["kanji.zinbun.kyoto-u.ac.jp"] = new(KanjiZinbunKyotouAc)
+	Router["kanji.zinbun.kyoto-u.ac.jp"] = new(Kyotou)
 	//[日本]駒澤大学 电子贵重书库
 	Router["repo.komazawa-u.ac.jp"] = new(NormalIIIF)
 
diff --git a/router/japan.go b/router/japan.go
index 89fda2c..79dcb71 100644
--- a/router/japan.go
+++ b/router/japan.go
@@ -4,7 +4,6 @@ import (
 	"bookget/app"
 	"bookget/site/Japan/emuseum"
 	"bookget/site/Japan/gprime"
-	"bookget/site/Japan/kanjikyoto"
 	"bookget/site/Japan/kokusho"
 	"bookget/site/Japan/kyoto"
 	"bookget/site/Japan/national"
@@ -87,11 +86,12 @@ func (p KokushoNijlAc) getRouterInit(sUrl []string) (map[string]interface{}, err
 	return nil, nil
 }
 
-type KanjiZinbunKyotouAc struct{}
+type Kyotou struct{}
 
-func (p KanjiZinbunKyotouAc) getRouterInit(sUrl []string) (map[string]interface{}, error) {
+func (p Kyotou) getRouterInit(sUrl []string) (map[string]interface{}, error) {
 	for i, s := range sUrl {
-		kanjikyoto.Init(i+1, s)
+		var kyotou app.Kyotou
+		kyotou.Init(i+1, s)
 	}
 	return nil, nil
 }
diff --git a/site/Japan/kanjikyoto/kanjikyoto.go b/site/Japan/kanjikyoto/kanjikyoto.go
deleted file mode 100644
index ba2591d..0000000
--- a/site/Japan/kanjikyoto/kanjikyoto.go
+++ /dev/null
@@ -1,154 +0,0 @@
-package kanjikyoto
-
-import (
-	"bookget/config"
-	curl "bookget/lib/curl"
-	util "bookget/lib/util"
-	"fmt"
-	"log"
-	"regexp"
-	"strconv"
-	"strings"
-)
-
-func Init(iTask int, taskUrl string) (msg string, err error) {
-	bookId := ""
-	//m := regexp.MustCompile(`/html/([A-Za-z0-9_-]+).html`).FindStringSubmatch(taskUrl)
-	m := regexp.MustCompile(`/html/([A-Za-z0-9_-]+)menu.html`).FindStringSubmatch(taskUrl)
-	if m != nil {
-		bookId = m[1]
-		config.CreateDirectory(taskUrl, bookId)
-		StartDownload(iTask, taskUrl, bookId)
-	}
-	return "", err
-}
-
-func StartDownload(iTask int, taskUrl, bookId string) {
-	name := util.GenNumberSorted(iTask)
-	log.Printf("Get %s %s\n", name, taskUrl)
-
-	bookUrls, err := getMultiplebooks(taskUrl)
-	if err != nil {
-		return
-	}
-	size := len(bookUrls)
-	imageUrls, e := getImages(bookUrls[size-1])
-	if e != nil {
-		return
-	}
-	size = len(imageUrls)
-	log.Printf(" %d pages.\n", size)
-	for i, uri := range imageUrls {
-		if !config.PageRange(i, size) {
-			continue
-		}
-		if uri == "" {
-			continue
-		}
-		ext := util.FileExt(uri)
-		sortId := util.GenNumberSorted(i + 1)
-		log.Printf("Get %s %s\n", sortId, uri)
-		fileName := sortId + ext
-		dest := config.GetDestPath(taskUrl, bookId, fileName)
-		curl.FastGet(uri, dest, nil, true)
-	}
-	return
-}
-
-func getMultiplebooks(taskUrl string) (bookUrls []string, err error) {
-	bs, err := curl.Get(taskUrl, nil)
-	if err != nil {
-		return
-	}
-	text := string(bs)
-	//取册数
-	matches := regexp.MustCompile(`href=["']?(.+?)\.html["']?`).FindAllStringSubmatch(text, -1)
-	if matches == nil {
-		return
-	}
-	pos := strings.LastIndex(taskUrl, "/")
-	hostUrl := taskUrl[:pos]
-	links := make([]string, 0, len(matches))
-	for _, v := range matches {
-		if strings.Contains(v[1], "top") {
-			continue
-		}
-		s := fmt.Sprintf("%s/%s.html", hostUrl, v[1])
-		links = append(links, s)
-	}
-
-	return links, err
-}
-
-func getImages(volumeUrl string) (imageUrls []string, err error) {
-	bs, err := curl.Get(volumeUrl, nil)
-	if err != nil {
-		return
-	}
-	text := string(bs)
-
-	startPos, ok := getVolStartPos(&text)
-	if !ok {
-		return
-	}
-	maxPage, ok := getVolMaxPage(&text)
-	if !ok {
-		return
-	}
-	bookNumber, ok := getBookNumber(&text)
-	if !ok {
-		return
-	}
-	//curPage, _ := getVolCurPage(&text)
-	//if !ok {
-	//	return
-	//}
-	pos := strings.LastIndex(volumeUrl, "/")
-	pos1 := strings.LastIndex(volumeUrl[:pos], "/")
-	hostUrl := volumeUrl[:pos1]
-	maxPos := startPos + maxPage
-	for i := 1; i < maxPos; i++ {
-		sortId := util.GenNumberSorted(i)
-		imgUrl := fmt.Sprintf("%s/L/%s%s.jpg", hostUrl, bookNumber, sortId)
-		imageUrls = append(imageUrls, imgUrl)
-	}
-	return
-}
-func getBookNumber(text *string) (bookNumber string, ok bool) {
-	//当前开始位置
-	match := regexp.MustCompile(`var[\s]+bookNum[\s]+=["'\s]*([A-z0-9]+)["'\s]*;`).FindStringSubmatch(*text)
-	if match == nil {
-		return "", false
-	}
-	return match[1], true
-}
-
-func getVolStartPos(text *string) (startPos int, ok bool) {
-	//当前开始位置
-	match := regexp.MustCompile(`var[\s]+volStartPos[\s]*=[\s]*([0-9]+)[\s]*;`).FindStringSubmatch(*text)
-	if match == nil {
-		return 0, false
-	}
-	startPos, _ = strconv.Atoi(match[1])
-	return startPos, true
-}
-
-func getVolCurPage(text *string) (curPage int, ok bool) {
-	//当前开始位置
-	match := regexp.MustCompile(`var[\s]+curPage[\s]*=[\s]*([0-9]+)[\s]*;`).FindStringSubmatch(*text)
-	if match == nil {
-		return 0, false
-	}
-	curPage, _ = strconv.Atoi(match[1])
-	return curPage, true
-}
-
-func getVolMaxPage(text *string) (maxPage int, ok bool) {
-	//当前开始位置
-	match := regexp.MustCompile(`var[\s]+volMaxPage[\s]*=[\s]*([0-9]+)[\s]*;`).FindStringSubmatch(*text)
-	if match == nil {
-		return 0, false
-	}
-	maxPage, _ = strconv.Atoi(match[1])
-	return maxPage, true
-}