Skip to content

Commit

Permalink
feat(tests): start moving tests to pytest
Browse files Browse the repository at this point in the history
  • Loading branch information
AndyTheFactory committed Oct 29, 2023
1 parent ec2d474 commit f294a01
Show file tree
Hide file tree
Showing 165 changed files with 159 additions and 10 deletions.
2 changes: 1 addition & 1 deletion newspaper/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def build(self):

def _parse_scheme_file(self, path):
try:
with open(path, "r") as fin:
with open(path, "r", encoding="utf-8") as fin:
return fin.read()
except OSError as e:
self.download_state = ArticleDownloadState.FAILED_RESPONSE
Expand Down
1 change: 1 addition & 0 deletions newspaper/extractors/defines.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
{"attribute": "name", "value": "sailthru.date", "content": "content"},
{"attribute": "name", "value": "PublishDate", "content": "content"},
{"attribute": "pubdate", "value": "pubdate", "content": "datetime"},
{"attribute": "name", "value": "pubdate", "content": "content"},
{"attribute": "name", "value": "publish_date", "content": "content"},
{"attribute": "name", "value": "dc.date", "content": "content"},
{"attribute": "class", "value": "entry-date", "content": "datetime"},
Expand Down
6 changes: 3 additions & 3 deletions newspaper/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@
Parser objects will only contain operations that manipulate
or query an lxml or soup dom object generated from an article's html.
"""
import re
import logging
import string
from copy import deepcopy
import lxml.etree
import lxml.html
import lxml.html.clean
import re
from html import unescape
import string

from bs4 import UnicodeDammit
from copy import deepcopy

from . import text

Expand Down
8 changes: 4 additions & 4 deletions newspaper/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def download_categories(self):
)
else:
log.warning(
("Deleting category %s from source %s due to " "download error"),
("Deleting category %s from source %s due to download error"),
self.categories[index].url,
self.url,
)
Expand All @@ -213,7 +213,7 @@ def download_feeds(self):
self.feeds[index].rss = network.get_html(req.url, response=req.resp)
else:
log.warning(
("Deleting feed %s from source %s due to " "download error"),
("Deleting feed %s from source %s due to download error"),
self.categories[index].url,
self.url,
)
Expand Down Expand Up @@ -340,7 +340,7 @@ def download_articles(self, threads=1):
failed_articles = []

if threads == 1:
for index, article in enumerate(self.articles):
for index, _ in enumerate(self.articles):
url = urls[index]
html = network.get_html(url, config=self.config)
self.articles[index].set_html(html)
Expand Down Expand Up @@ -372,7 +372,7 @@ def download_articles(self, threads=1):

def parse_articles(self):
"""Parse all articles, delete if too small"""
for index, article in enumerate(self.articles):
for article in self.articles:
article.parse()

self.articles = self.purge_articles("body", self.articles)
Expand Down
2 changes: 1 addition & 1 deletion newspaper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def extract_meta_refresh(html):
element = soup.find("meta", attrs={"http-equiv": "refresh"})
if element:
try:
wait_part, url_part = element["content"].split(";")
_, url_part = element["content"].split(";")
except ValueError:
# In case there are not enough values to unpack
# for instance: <meta http-equiv="refresh" content="600" />
Expand Down
17 changes: 17 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Some utility functions for testing
from pathlib import Path


def get_data(filename, resource_type):
    """
    Mock an HTTP request by pulling text from a pre-downloaded fixture file.

    Args:
        filename: base name of the fixture file, without extension
            (e.g. ``"cnn_article"``).
        resource_type: fixture category, either ``"html"`` or ``"txt"``;
            it doubles as both the subdirectory name and the file extension.

    Returns:
        The fixture file's contents as a string (decoded as UTF-8).

    Raises:
        AssertionError: if ``resource_type`` is not a known category.
        OSError: if the fixture file does not exist or cannot be read.
    """
    assert resource_type in ["html", "txt"], f"Invalid resource type {resource_type}"
    # Fixtures live next to this file: tests/data/<type>/<filename>.<type>.
    # Bug fix: the path previously hard-coded the file name instead of
    # using the `filename` argument, so every call hit the same path.
    file = (
        Path(__file__).resolve().parent
        / "data"
        / resource_type
        / f"{filename}.{resource_type}"
    )
    with open(file, "r", encoding="utf-8") as f:
        return f.read()
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
130 changes: 130 additions & 0 deletions tests/test_article.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# pytest file for testing the article class
import pytest
from dateutil.parser import parse as date_parser
import newspaper
from newspaper.article import Article, ArticleDownloadState, ArticleException
from newspaper.configuration import Configuration
import tests.conftest as conftest


@pytest.fixture(scope="module")
def cnn_article():
    """Module-scoped fixture bundling the CNN test article's URL together
    with its pre-downloaded HTML and the expected extracted plain text."""
    article_url = (
        "http://www.cnn.com/2013/11/27/travel/weather-"
        "thanksgiving/index.html?iref=allsearch"
    )
    return {
        "url": article_url,
        "html_content": conftest.get_data("cnn_article", "html"),
        "text_content": conftest.get_data("cnn_article", "txt"),
    }


@pytest.fixture(scope="module")
def meta_refresh():
    """Pairs of (HTML containing a meta-refresh redirect, expected final title)."""
    cases = [
        ("google_meta_refresh", "Example Domain"),
        ("ap_meta_refresh", "News from The Associated Press"),
    ]
    return [(conftest.get_data(name, "html"), title) for name, title in cases]


class TestArticle:
    """End-to-end tests for newspaper.Article, driven by pre-downloaded
    HTML/text fixtures so no network access is needed."""

    def test_article(self, cnn_article):
        """Download (from fixture HTML) and parse the CNN article, then
        check every extracted field against known-good values."""
        article = newspaper.Article(cnn_article["url"])
        # input_html bypasses the network: the fixture HTML is used as the
        # "downloaded" response.
        article.download(input_html=cnn_article["html_content"])
        article.parse()
        assert article.url == cnn_article["url"]
        assert article.download_state == ArticleDownloadState.SUCCESS
        assert article.download_exception_msg is None
        # Exact byte length of the fixture HTML — guards against fixture drift.
        assert len(article.html) == 75404

        assert article.text.strip() == cnn_article["text_content"].strip()
        assert (
            article.title
            == "After storm, forecasters see smooth sailing for Thanksgiving"
        )
        assert article.authors == [
            "Dana A. Ford",
            "James S.A. Corey",
            "Chien-Ming Wang",
            "Tom Watkins",
        ]
        # Compare via day delta to sidestep timezone/representation issues.
        assert (article.publish_date - date_parser("2013-11-27T00:00:00Z")).days == 0
        assert (
            article.top_image
            == "http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg"
        )
        assert article.movies == []
        # keywords is empty because nlp() has not been run here.
        assert article.keywords == []
        assert article.meta_keywords == [
            "winter storm",
            "holiday travel",
            "Thanksgiving storm",
            "Thanksgiving winter storm",
        ]
        assert article.meta_lang == "en"
        assert (
            article.meta_description
            == "A strong storm struck much of the eastern United "
            "States on Wednesday, complicating holiday plans for many "
            "of the 43 million Americans expected to travel."
        )

    def test_call_parse_before_download(self):
        """parse() before download() must raise ArticleException."""
        article = newspaper.Article("http://www.cnn.com")
        with pytest.raises(ArticleException):
            article.parse()

    def test_call_nlp_before_download(self):
        """nlp() before download() must raise ArticleException."""
        article = newspaper.Article("http://www.cnn.com")
        with pytest.raises(ArticleException):
            article.nlp()

    def test_call_nlp_before_parse(self, cnn_article):
        """nlp() after download() but before parse() must still raise."""
        article = newspaper.Article(cnn_article["url"])
        article.download(input_html=cnn_article["html_content"])
        with pytest.raises(ArticleException):
            article.nlp()

    def test_meta_refresh(self, meta_refresh):
        """With follow_meta_refresh enabled, parsing should land on the
        redirect target page (checked via its title)."""
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article("", config=config)
        for html, title in meta_refresh:
            article.download(input_html=html)
            article.parse()
            assert article.title == title

    def test_article_nlp(self, cnn_article):
        """Run the full pipeline including nlp() and check the derived
        keywords and summary against fixture expectations."""
        article = newspaper.Article(cnn_article["url"])
        article.download(input_html=cnn_article["html_content"])
        article.parse()
        article.nlp()

        summary = conftest.get_data("cnn_summary", "txt")
        summary = summary.strip()

        # Keyword extraction order is not guaranteed — compare sorted.
        assert sorted(article.keywords) == sorted(
            [
                "balloons",
                "delays",
                "flight",
                "forecasters",
                "good",
                "sailing",
                "smooth",
                "storm",
                "thanksgiving",
                "travel",
                "weather",
                "winds",
                "york",
            ]
        )
        assert article.summary.strip() == summary
3 changes: 2 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# core module
sys.path.insert(0, PARENT_DIR)

TEXT_FN = os.path.join(TEST_DIR, "data", "text")
TEXT_FN = os.path.join(TEST_DIR, "data", "txt")
HTML_FN = os.path.join(TEST_DIR, "data", "html")
URLS_FILE = os.path.join(TEST_DIR, "data", "fulltext_url_list.txt")

Expand Down Expand Up @@ -153,6 +153,7 @@ def get_filename(url):


class ArticleTestCase(unittest.TestCase):
# done in pytest
def setup_stage(self, stage_name):
stages = OrderedDict(
[
Expand Down

0 comments on commit f294a01

Please sign in to comment.