Skip to content

Commit

Permalink
feat(tests): start moving tests to pytest
Browse files Browse the repository at this point in the history
  • Loading branch information
AndyTheFactory committed Oct 29, 2023
1 parent ec2d474 commit f294a01
Show file tree
Hide file tree
Showing 165 changed files with 159 additions and 10 deletions.
2 changes: 1 addition & 1 deletion newspaper/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def build(self):

def _parse_scheme_file(self, path):
try:
with open(path, "r") as fin:
with open(path, "r", encoding="utf-8") as fin:
return fin.read()
except OSError as e:
self.download_state = ArticleDownloadState.FAILED_RESPONSE
Expand Down
1 change: 1 addition & 0 deletions newspaper/extractors/defines.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
{"attribute": "name", "value": "sailthru.date", "content": "content"},
{"attribute": "name", "value": "PublishDate", "content": "content"},
{"attribute": "pubdate", "value": "pubdate", "content": "datetime"},
{"attribute": "name", "value": "pubdate", "content": "content"},
{"attribute": "name", "value": "publish_date", "content": "content"},
{"attribute": "name", "value": "dc.date", "content": "content"},
{"attribute": "class", "value": "entry-date", "content": "datetime"},
Expand Down
6 changes: 3 additions & 3 deletions newspaper/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@
Parser objects will only contain operations that manipulate
or query an lxml or soup dom object generated from an article's html.
"""
import re
import logging
import string
from copy import deepcopy
import lxml.etree
import lxml.html
import lxml.html.clean
import re
from html import unescape
import string

from bs4 import UnicodeDammit
from copy import deepcopy

from . import text

Expand Down
8 changes: 4 additions & 4 deletions newspaper/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def download_categories(self):
)
else:
log.warning(
("Deleting category %s from source %s due to " "download error"),
("Deleting category %s from source %s due to download error"),
self.categories[index].url,
self.url,
)
Expand All @@ -213,7 +213,7 @@ def download_feeds(self):
self.feeds[index].rss = network.get_html(req.url, response=req.resp)
else:
log.warning(
("Deleting feed %s from source %s due to " "download error"),
("Deleting feed %s from source %s due to download error"),
self.categories[index].url,
self.url,
)
Expand Down Expand Up @@ -340,7 +340,7 @@ def download_articles(self, threads=1):
failed_articles = []

if threads == 1:
for index, article in enumerate(self.articles):
for index, _ in enumerate(self.articles):
url = urls[index]
html = network.get_html(url, config=self.config)
self.articles[index].set_html(html)
Expand Down Expand Up @@ -372,7 +372,7 @@ def download_articles(self, threads=1):

def parse_articles(self):
"""Parse all articles, delete if too small"""
for index, article in enumerate(self.articles):
for article in self.articles:
article.parse()

self.articles = self.purge_articles("body", self.articles)
Expand Down
2 changes: 1 addition & 1 deletion newspaper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def extract_meta_refresh(html):
element = soup.find("meta", attrs={"http-equiv": "refresh"})
if element:
try:
wait_part, url_part = element["content"].split(";")
_, url_part = element["content"].split(";")
except ValueError:
# In case there are not enough values to unpack
# for instance: <meta http-equiv="refresh" content="600" />
Expand Down
17 changes: 17 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Some utility functions for testing
from pathlib import Path


def get_data(filename, resource_type):
    """
    Mock an HTTP request by pulling text from a pre-downloaded fixture file.

    Args:
        filename: base name of the fixture file, without extension
            (e.g. ``"cnn_article"``).
        resource_type: fixture category, either ``"html"`` or ``"txt"``;
            it doubles as both the subdirectory name and the file extension.

    Returns:
        The fixture file's contents as a string (decoded as UTF-8).

    Raises:
        AssertionError: if ``resource_type`` is not a known category.
        OSError: if the fixture file does not exist or cannot be read.
    """
    assert resource_type in ["html", "txt"], f"Invalid resource type {resource_type}"
    # Fixtures live next to this file: tests/data/<type>/<filename>.<type>.
    # Bug fix: the path previously hard-coded the file name instead of
    # using the `filename` argument, so every call hit the same path.
    file = (
        Path(__file__).resolve().parent
        / "data"
        / resource_type
        / f"{filename}.{resource_type}"
    )
    with open(file, "r", encoding="utf-8") as f:
        return f.read()
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
130 changes: 130 additions & 0 deletions tests/test_article.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# pytest file for testing the article class
import pytest
from dateutil.parser import parse as date_parser
import newspaper
from newspaper.article import Article, ArticleDownloadState, ArticleException
from newspaper.configuration import Configuration
import tests.conftest as conftest


@pytest.fixture(scope="module")
def cnn_article():
    """Module-scoped fixture bundling the CNN test article's URL together
    with its pre-downloaded HTML and the expected extracted plain text."""
    article_url = (
        "http://www.cnn.com/2013/11/27/travel/weather-"
        "thanksgiving/index.html?iref=allsearch"
    )
    return {
        "url": article_url,
        "html_content": conftest.get_data("cnn_article", "html"),
        "text_content": conftest.get_data("cnn_article", "txt"),
    }


@pytest.fixture(scope="module")
def meta_refresh():
    """Pairs of (HTML containing a meta-refresh redirect, expected final title)."""
    cases = [
        ("google_meta_refresh", "Example Domain"),
        ("ap_meta_refresh", "News from The Associated Press"),
    ]
    return [(conftest.get_data(name, "html"), title) for name, title in cases]


class TestArticle:
    """End-to-end tests for newspaper.Article, driven by pre-downloaded
    HTML/text fixtures so no network access is needed."""

    def test_article(self, cnn_article):
        """Download (from fixture HTML) and parse the CNN article, then
        check every extracted field against known-good values."""
        article = newspaper.Article(cnn_article["url"])
        # input_html bypasses the network: the fixture HTML is used as the
        # "downloaded" response.
        article.download(input_html=cnn_article["html_content"])
        article.parse()
        assert article.url == cnn_article["url"]
        assert article.download_state == ArticleDownloadState.SUCCESS
        assert article.download_exception_msg is None
        # Exact byte length of the fixture HTML — guards against fixture drift.
        assert len(article.html) == 75404

        assert article.text.strip() == cnn_article["text_content"].strip()
        assert (
            article.title
            == "After storm, forecasters see smooth sailing for Thanksgiving"
        )
        assert article.authors == [
            "Dana A. Ford",
            "James S.A. Corey",
            "Chien-Ming Wang",
            "Tom Watkins",
        ]
        # Compare via day delta to sidestep timezone/representation issues.
        assert (article.publish_date - date_parser("2013-11-27T00:00:00Z")).days == 0
        assert (
            article.top_image
            == "http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg"
        )
        assert article.movies == []
        # keywords is empty because nlp() has not been run here.
        assert article.keywords == []
        assert article.meta_keywords == [
            "winter storm",
            "holiday travel",
            "Thanksgiving storm",
            "Thanksgiving winter storm",
        ]
        assert article.meta_lang == "en"
        assert (
            article.meta_description
            == "A strong storm struck much of the eastern United "
            "States on Wednesday, complicating holiday plans for many "
            "of the 43 million Americans expected to travel."
        )

    def test_call_parse_before_download(self):
        """parse() before download() must raise ArticleException."""
        article = newspaper.Article("http://www.cnn.com")
        with pytest.raises(ArticleException):
            article.parse()

    def test_call_nlp_before_download(self):
        """nlp() before download() must raise ArticleException."""
        article = newspaper.Article("http://www.cnn.com")
        with pytest.raises(ArticleException):
            article.nlp()

    def test_call_nlp_before_parse(self, cnn_article):
        """nlp() after download() but before parse() must still raise."""
        article = newspaper.Article(cnn_article["url"])
        article.download(input_html=cnn_article["html_content"])
        with pytest.raises(ArticleException):
            article.nlp()

    def test_meta_refresh(self, meta_refresh):
        """With follow_meta_refresh enabled, parsing should land on the
        redirect target page (checked via its title)."""
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article("", config=config)
        for html, title in meta_refresh:
            article.download(input_html=html)
            article.parse()
            assert article.title == title

    def test_article_nlp(self, cnn_article):
        """Run the full pipeline including nlp() and check the derived
        keywords and summary against fixture expectations."""
        article = newspaper.Article(cnn_article["url"])
        article.download(input_html=cnn_article["html_content"])
        article.parse()
        article.nlp()

        summary = conftest.get_data("cnn_summary", "txt")
        summary = summary.strip()

        # Keyword extraction order is not guaranteed — compare sorted.
        assert sorted(article.keywords) == sorted(
            [
                "balloons",
                "delays",
                "flight",
                "forecasters",
                "good",
                "sailing",
                "smooth",
                "storm",
                "thanksgiving",
                "travel",
                "weather",
                "winds",
                "york",
            ]
        )
        assert article.summary.strip() == summary
3 changes: 2 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# core module
sys.path.insert(0, PARENT_DIR)

TEXT_FN = os.path.join(TEST_DIR, "data", "text")
TEXT_FN = os.path.join(TEST_DIR, "data", "txt")
HTML_FN = os.path.join(TEST_DIR, "data", "html")
URLS_FILE = os.path.join(TEST_DIR, "data", "fulltext_url_list.txt")

Expand Down Expand Up @@ -153,6 +153,7 @@ def get_filename(url):


class ArticleTestCase(unittest.TestCase):
# done in pytest
def setup_stage(self, stage_name):
stages = OrderedDict(
[
Expand Down

0 comments on commit f294a01

Please sign in to comment.