s
- for el in self.reverse_tags(
- node, "table", "ul", "div", "aside", "header", "footer", "section"
- ):
- if el in allowed:
- continue
- weight = self.class_weight(el)
- if el in candidates:
- content_score = candidates[el]["content_score"]
- else:
- content_score = 0
- tag = el.tag
-
- if weight + content_score < 0:
- el.drop_tree()
- elif el.text_content().count(",") + el.text_content().count(",") < 10:
- counts = {}
- for kind in ["p", "img", "li", "a", "embed", "input"]:
- counts[kind] = len(el.findall(".//%s" % kind))
- counts["li"] -= 100
- counts["input"] -= len(el.findall('.//input[@type="hidden"]'))
-
- content_length = text_length(el)
- link_density = self.get_link_density(el)
-
- to_remove = False
- reason = ""
-
- # Modified: skip the remaining checks for a <div> that contains at least one <img>
- if el.tag == "div" and counts["img"] >= 1:
- continue
- if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
- reason = "too many images (%s)" % counts["img"]
- # to_remove = True
- elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
- reason = "more <li>s than <p>s"
- # to_remove = True
- elif counts["input"] > (counts["p"] / 3):
- reason = "less than 3x <p>s than <input>s"
- to_remove = True
- elif content_length < MIN_LEN and counts["img"] == 0:
- reason = (
- "too short content length %s without a single image"
- % content_length
- )
- to_remove = True
- elif content_length < MIN_LEN and counts["img"] > 2:
- reason = (
- "too short content length %s and too many images"
- % content_length
- )
- to_remove = True
- elif weight < 25 and link_density > 0.2:
- if tag in ["div", "ul", "table"]:
- ptest = el.xpath(".//text()[not(ancestor::a)]")
- ptest_len = text_len("".join(ptest))
- if ptest_len >= MIN_LEN and link_density <= 0.3:
- continue
- reason = "too many links %.3f for its weight %s" % (
- link_density,
- weight,
- )
- to_remove = True
- elif weight >= 25 and link_density > 0.5:
- reason = "too many links %.3f for its weight %s" % (
- link_density,
- weight,
- )
- to_remove = True
- elif (counts["embed"] == 1 and content_length < 75) or counts[
- "embed"
- ] > 1:
- reason = (
- "