Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tumblr server error 官方服务端api出现异常 #100

Open
qiuker521 opened this issue Oct 19, 2018 · 3 comments
Open

tumblr server error 官方服务端api出现异常 #100

qiuker521 opened this issue Oct 19, 2018 · 3 comments

Comments

@qiuker521
Copy link

qiuker521 commented Oct 19, 2018

如下链接中的博客(这个id对应的这条博客),tumblr官方会返回一个蛋疼的xml格式。

并且如果你访问start = 14200 且 num = 50的话,前后 50 条都不会展示出来。
但是start=14200 且 num = 1就没问题。
但是start=14202 且 num = 1就没问题。

这种错误导致的异常并不少见。

such as https://xxx.tumblr.com/api/read?num=1&start=14201

The Tumblr server returns malformed XML for this request.

I mean NO.14201 in this blog.
No. 14200 and No. 14202 are returned correctly.

and, it occurs often.

我个人意见是遇到这种解析错误,可以一条一条解析,解析50次。

When I hit this error myself, I fetch posts No. 14200 through No. 14249 one at a time, in a loop of 50 requests.

@dixudx
Copy link
Owner

dixudx commented Dec 1, 2018

@qiuker521 可以提交个PR来fix

@qiuker521
Copy link
Author

@jessefeinman you misunderstand my issue.

I mean,

get the page https://caddy-smashing.tumblr.com/api/read?num=50&start=14200 returns an error,
it is because the tumblr official api returns a wrong xml, what's more, it is because
this post: https://caddy-smashing.tumblr.com/api/read?num=1&start=14237 returns an error,

so the entire 50-post page comes back as malformed XML, and the spider raises an exception.

yes, the page https://caddy-smashing.tumblr.com/api/read?num=37&start=14200 is OK.
image

yes, the page https://caddy-smashing.tumblr.com/api/read?num=38&start=14200 is WRONG.
image

I saw your commit; it adds the EACH_POST_AS_SEPARATE_JSON variable, but it does not actually fix the issue.

I do not want to scan the whole blog one post at a time from the start.

Instead, the spider should catch the exception and re-scan only the failing 50-post page one post at a time.

@qiuker521
Copy link
Author

I actually use Go myself; I rewrote this spider in Go, so here is my fix in Go as a token of respect:
我也是根据这个爬虫用go重写的,所以贴自己的go代码以示尊敬吧:

import (
	"crypto/tls"
	"encoding/json"
	"fmt"
	"github.com/bitly/go-simplejson"
	"github.com/jinzhu/gorm"
	"io/ioutil"
	"log"
	"net/http"
	"strconv"
	"strings"
	"time"
	"web/model"
)

// db is the package-level database handle used by the functions in this file.
// It stays nil until Init is called.
var db *gorm.DB

// Init stores db1 as the package-level database handle.
func Init(db1 *gorm.DB) {
	db = db1
}

var currIds []int

func Do() error {
	var uploaders = []model.Uploader{}

	if db.Dialect().GetName() != "mysql" {
		db.Where(&model.Uploader{Site: model.TUMBLR}).Order("random()").Find(&uploaders)
	} else {
		db.Where(&model.Uploader{Site: model.TUMBLR}).Order("rand()").Find(&uploaders)
	}
	names := []string{}
	for _, v := range uploaders {
		names = append(names, v.Name)
	}
	DoNames(names)
	return nil
}

// DoNames crawls the tumblr blogs listed in names, one after another.
//
// For each blog it looks up the uploader row, asks Count for the blog's total
// number of posts, fetches as many 50-post pages as the difference between
// that total and the number of stored posts requires, and finally, when the
// stored count is still below total minus the uploader's known-missing count,
// rescans the whole blog with AllLoop and saves the recomputed Missing value.
// A blog whose Count call fails is logged and skipped.
func DoNames(names []string) {
	for _, name := range names {
		// Every blog starts with an empty list of seen post ids.
		currIds = []int{}

		log.Println("doing get content of", name)

		var uploader model.Uploader
		db.Where(&model.Uploader{Site: "tumblr", Name: name}).Find(&uploader)

		totalCount, err := Count(uploader.Name)
		if err != nil {
			log.Println(uploader.Name, err)
			continue
		}
		log.Println(uploader.Name, totalCount)

		var stored int
		db.Model(&model.TumblrContent{}).Where(&model.TumblrContent{TumblrName: uploader.Name}).Count(&stored)
		log.Printf("tumblr total %d have %d name %s", totalCount, stored, uploader.Name)

		// The known-missing posts are deliberately not subtracted here: that
		// number is expected to be small.
		pending := totalCount - stored
		for offset := 0; offset < pending; offset += 50 {
			FixPage(&uploader, offset, totalCount, 50)
		}

		db.Model(&model.TumblrContent{}).Where(&model.TumblrContent{TumblrName: uploader.Name}).Count(&stored)

		if expected := totalCount - uploader.Missing; stored < expected {
			log.Printf("start fix %s, total %d, expect %d", uploader.Name, stored, totalCount)
			AllLoop(&uploader, totalCount)
			uploader.Missing = totalCount - len(currIds)
			db.Save(&uploader)
		}

		log.Printf("count check: %d of %d, missing %d of %s", stored, totalCount, uploader.Missing, uploader.Name)
	}
}

// missing counts the single-post requests for which FixPage received an empty
// "[]" payload. AllLoop resets it before a full rescan.
var missing = 0

// AllLoop rescans the whole blog of v in pages of 50 posts, after resetting
// the missing counter and the list of seen post ids.
//
// totalCount is the number of posts the blog reports. Valid start offsets are
// 0..totalCount-1, so the loop stops before start == totalCount; the previous
// "<=" bound issued one extra, out-of-range request whenever totalCount was a
// multiple of 50.
func AllLoop(v *model.Uploader, totalCount int) {
	missing = 0
	currIds = []int{}
	for start := 0; start < totalCount; start += 50 {
		FixPage(v, start, totalCount, 50)
	}
}

// FixPage downloads up to limit posts of uploader's blog, beginning at offset
// start, and stores every post that is not yet in the database.
//
// total is only used for logging. Each call is throttled so that it takes at
// least 500ms. Errors are logged and end the call; nothing is returned.
//
// Workaround for issue #100: when the API answers with an empty "[]" payload
// for a multi-post page, the page is fetched again one post at a time so that
// a single broken post does not hide its neighbours. A single-post request
// that still yields "[]" is counted in missing.
func FixPage(uploader *model.Uploader, start, total, limit int) {
	log.Printf("tumblr count [%d+%d/%d] of %s", start, limit, total, uploader.Name)
	startTime := time.Now()

	resp, err := doHttp(fmt.Sprintf("https://%s.tumblr.com/api/read?num=%d&start=%d&format=json", uploader.Name, limit, start))
	if err != nil {
		log.Println(uploader.Name, err)
		return
	}

	res, err := ioutil.ReadAll(resp.Body)
	resp.Body.Close()
	if err != nil {
		log.Println(uploader.Name, err)
		return
	}

	// Throttle: pad fast requests so that each one takes at least 500ms.
	if wait := 500*time.Millisecond - time.Since(startTime); wait > 0 {
		time.Sleep(wait)
	}

	// The body is a JavaScript assignment; strip the wrapper to get the JSON.
	// TrimPrefix/TrimSuffix remove the exact strings, whereas TrimLeft and
	// TrimRight would treat their argument as a set of characters.
	payload := strings.TrimSpace(string(res))
	payload = strings.TrimPrefix(payload, "var tumblr_api_read = ")
	payload = strings.TrimSpace(strings.TrimSuffix(payload, ";"))

	// HERE FIX THE ISSUE 100
	if payload == "[]" {
		if limit == 1 {
			missing++
			log.Printf("tumblr post %d of %s returned an empty payload, counted as missing", start, uploader.Name)
			return
		}
		for i := 0; i < limit; i++ {
			FixPage(uploader, start+i, total, 1)
		}
		return
	}

	js, err := simplejson.NewJson([]byte(payload))
	if err != nil {
		log.Println(resp.Status)
		log.Println(uploader.Name, err)
		return
	}

	posts := js.Get("posts")
	postsArr, err := posts.Array()
	if err != nil {
		log.Println(uploader.Name, err)
		return
	}

	for i := range postsArr {
		post := posts.GetIndex(i)

		// The id is read as a string first; the Atoi error is ignored on
		// purpose because a zero result falls back to reading it as a number.
		id, _ := strconv.Atoi(post.Get("id").MustString())
		if id == 0 {
			id = post.Get("id").MustInt()
		}

		currIds = append(currIds, id)

		var count uint64
		db.Model(&model.TumblrContent{}).Where(&model.TumblrContent{TumblrName: uploader.Name, TumblrId: uint64(id)}).Count(&count)
		if count >= 1 {
			// Already stored.
			continue
		}

		content, err := json.Marshal(post)
		if err != nil {
			log.Println(uploader.Name, err)
			continue
		}

		// NOTE(review): Build's second result is discarded, as in the
		// original code; its type is not visible here.
		c, _ := model.NewTumblrContentBuilder().
			TumblrId(uint64(id)).
			TumblrName(uploader.Name).
			TumblrType(post.Get("type").MustString()).
			TumblrReblogUrl(post.Get("reblogged-from-url").MustString()).
			Content(string(content)).
			Build()
		db.Create(&c)
		Migrate(db, uploader, id)
	}
}
// Count returns the number of posts the tumblr blog called name reports in the
// "posts-total" field of its read API.
//
// It returns an error when the request fails, the body cannot be read, the
// payload is not valid JSON, or "posts-total" is absent or not a number.
// Reporting the last case as an error (instead of a silent 0) keeps a broken
// response, such as the empty "[]" payload, from looking like an empty blog.
// Errors are returned without being logged here; the caller logs them.
func Count(name string) (int, error) {
	url1 := fmt.Sprintf("https://%s.tumblr.com/api/read?num=50&start=0&format=json", name)
	log.Println(url1)

	resp, err := doHttp(url1)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	res, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return 0, err
	}

	// The body is a JavaScript assignment; strip the wrapper to get the JSON.
	// TrimPrefix/TrimSuffix remove the exact strings, whereas TrimLeft and
	// TrimRight would treat their argument as a set of characters.
	payload := strings.TrimSpace(string(res))
	payload = strings.TrimPrefix(payload, "var tumblr_api_read = ")
	payload = strings.TrimSpace(strings.TrimSuffix(payload, ";"))

	js, err := simplejson.NewJson([]byte(payload))
	if err != nil {
		return 0, fmt.Errorf("parsing tumblr response (%s): %v", resp.Status, err)
	}

	total, err := js.Get("posts-total").Int()
	if err != nil {
		return 0, fmt.Errorf("reading posts-total from tumblr response: %v", err)
	}
	return total, nil
}

// tumblrHTTPClient is shared by all doHttp calls so that connections can be
// reused; building a new Transport per request leaves its idle connections
// behind.
var tumblrHTTPClient = &http.Client{
	Timeout: 100 * time.Second,
	Transport: &http.Transport{
		// A non-nil, empty TLSNextProto map disables HTTP/2, so requests are
		// sent as HTTP/1.1.
		TLSNextProto: make(map[string]func(authority string, c *tls.Conn) http.RoundTripper),
		Proxy:        http.ProxyFromEnvironment,
	},
}

// doHttp sends a GET request for url1 with a browser User-Agent header and
// returns the response. The caller must close the response body.
//
// It returns an error, and a nil response, when url1 cannot be turned into a
// request or when the request itself fails.
func doHttp(url1 string) (*http.Response, error) {
	req, err := http.NewRequest("GET", url1, nil)
	if err != nil {
		return nil, err
	}
	// Proto, ProtoMajor and ProtoMinor are not set: net/http ignores them for
	// client requests.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36")
	return tumblrHTTPClient.Do(req)
}

// DoAsync starts Do on a new goroutine and returns immediately. The error
// returned by Do is discarded.
func DoAsync() {
	go func() {
		_ = Do()
	}()
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants