-
Notifications
You must be signed in to change notification settings - Fork 0
/
jiqizhixin.py
123 lines (101 loc) · 2.84 KB
/
jiqizhixin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
#coding: utf-8
''' Grab the content of the pages linked from the first level (the home
page) of Jiqizhixin (www.jiqizhixin.com).
'''
__all__ = ["grab"]
# Tag path, from the document root downwards, that grab() hands to
# get_matched_link() together with the https://www.jiqizhixin.com home page.
# Every step is a (tag name, attributes) pair; None means no attribute
# constraint is given for that step.
# NOTE(review): presumably a step matches when the element carries the listed
# classes -- confirm against grab_stack.get_matched_link.
_www_jiqizhixin_com = [
    ("html", None),
    ("body", None),
    ("div", {"class": ["home", "u-has-header"]}),
    ("div", {"class": ["u-container"]}),
    ("div", {"class": ["u-clearfix"]}),
    ("div", {"class": ["js-article-container"]}),
    ("div", {"class": ["u-block__body"]}),
    ("div", {"class": ["u-block__item"]}),
    ("article", {"class": ["article-item__container"]}),
    ("main", {"class": ["article-item__right"]}),
    ("section", None),
]
# Tag path to the article body on a single article page; grab() passes it to
# get_page_content(). Observed page structure (classes in braces):
# /html/body/div{article}/div{u-min-height-container,u-container}/div{u-col-8,article__inline}/div{article__content}
# Only one class per step is listed below, even where the page carries two.
_content_pattern = [
    ("html", None),
    ("body", None),
    ("div", {"class": ["article"]}),
    ("div", {"class": ["u-container"]}),
    ("div", {"class": ["u-col-8"]}),
    ("div", {"class": ["article__content"]}),
]
from db import DB
import bs4
from bs4 import BeautifulSoup as bs
import requests
from grab_stack import GrabStack, get_matched_link, get_page_content
# Module-level DB handle, created once when this module is imported. grab()
# uses it to skip URLs that are already stored (has) and to persist the
# extracted page text (save).
_db = DB()
def my_get_urls(tag):
    """Collect the ``href`` value of every ``<a>`` element found under *tag*.

    Anchors without an ``href`` attribute, or with an empty one, are skipped
    rather than raising ``KeyError``, so a single link-less ``<a>`` cannot
    abort the whole crawl.

    Args:
        tag: an object exposing ``find_all("a")`` whose results support
            ``.get("href")`` (a BeautifulSoup ``Tag`` in normal use).

    Returns:
        A list of the non-empty ``href`` values, in document order.
    """
    # .get() yields None for a missing attribute instead of raising.
    hrefs = (link.get("href") for link in tag.find_all("a"))
    return [href for href in hrefs if href]
def my_get_content(tag):
    """Return the text content of an article tag.

    Jiqizhixin article bodies are made up of simple ``<h1>...</h1>`` and
    ``<p>...`` markup, so simply dropping all ``<...>`` markup is enough to
    get the content; the ``text`` attribute of *tag* is returned as is.

    Args:
        tag: an object exposing a ``text`` attribute (a BeautifulSoup ``Tag``
            in normal use).

    Returns:
        ``tag.text``.
    """
    # The hand-written tag stripper that used to be defined here was never
    # called, so it has been removed.
    return tag.text
def grab(logger):
    """Fetch and store every not-yet-saved page linked from the home page.

    Links are collected from https://www.jiqizhixin.com using the
    ``_www_jiqizhixin_com`` pattern. Each link is made absolute when it
    starts with "/", skipped when ``_db`` already has it, and otherwise
    fetched with ``get_page_content`` using ``_content_pattern``. The string
    pieces of the result are concatenated and saved under the URL; pages that
    yield no text are logged as a warning and not saved.

    Args:
        logger: logger used for progress (``info``) and empty-page
            (``warning``) messages; it is also passed to the grab_stack
            helpers.
    """
    base_url = "https://www.jiqizhixin.com"
    urls = get_matched_link(logger, _www_jiqizhixin_com, base_url, my_get_urls)
    logger.info("there are {} pages to grab".format(len(urls)))
    for url in urls:
        if not url:
            # An empty href names no page; indexing url[0] on it would raise
            # IndexError and abort the remaining pages.
            continue
        if url.startswith("/"):
            # Site-relative link: prefix the site root.
            url = base_url + url
        if _db.has(url):
            continue
        content = get_page_content(logger, _content_pattern, url, my_get_content)
        # NOTE(review): `or ()` tolerates a None result -- confirm what
        # grab_stack.get_page_content returns when a page cannot be fetched.
        # Only the string pieces are kept, as before.
        cs = "".join(c for c in (content or ()) if isinstance(c, str))
        if not cs:
            logger.warning("page: {} no data??".format(url))
            continue
        _db.save(url, cs)
        logger.info("page: {} saved {} chars".format(url, len(cs)))