diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..9f8ccc38 --- /dev/null +++ b/.flake8 @@ -0,0 +1,6 @@ +[flake8] +ignore = + # line too long + E501 +exclude = + build/ diff --git a/.github/workflows/lint_and_test.yml b/.github/workflows/lint_and_test.yml new file mode 100644 index 00000000..d6e13da3 --- /dev/null +++ b/.github/workflows/lint_and_test.yml @@ -0,0 +1,33 @@ +--- +name: python-textile + +on: [push] + +jobs: + lint_and_test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.10"] + image_size: ['true', 'false'] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Python flake8 Lint + uses: py-actions/flake8@v2.3.0 + - name: Install dependencies + run: | + imagesize='' + pip install -U pytest pytest-cov coverage codecov + if [[ ${{ matrix.image_size }} == true ]] ; then imagesize='[imagesize]' ; fi + pip install -e ".${imagesize}" + - name: run tests + run: | + pytest + - name: Codecov + uses: codecov/codecov-action@v4 + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.gitignore b/.gitignore index 2ea03521..7f97eb68 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ develop-eggs .DS_Store *.swp .tox +README.txt diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 3f7d77cf..00000000 --- a/.travis.yml +++ /dev/null @@ -1,23 +0,0 @@ -dist: xenial # required for Python >= 3.7 -language: python -env: - - IMAGESIZE=true - - IMAGESIZE=false -python: - - "3.5" - - "3.6" - - "3.7" - - "3.8" - - "3.9" - # PyPy versions - - "pypy3" -# command to install dependencies -install: - - imagesize='' - - pip install -U pytest pytest-cov coverage codecov - - if [[ $IMAGESIZE == true ]] ; then imagesize='[imagesize]' ; fi - - pip install -e ".${imagesize}" -# command to run tests -script: py.test -after_success: - - codecov diff --git a/CHANGELOG.textile b/CHANGELOG.textile index 43beedca..a3ff7424 100644 --- a/CHANGELOG.textile +++ b/CHANGELOG.textile @@ -1,5 +1,14 @@ h1. Textile Changelog +h2. Version 4.0.3 +* Update supported Python versions to 3.8 - 3.12 ("#83":https://github.com/textile/python-textile/issues/83) +* Replace html5lib with nh3 for html sanitization +* General code cleanup +* Bugfixes: +** Wrong HTML output when "bc.." is the very last in the document ("#81":https://github.com/textile/python-textile/issues/81) +* Other: +** Use github actions instead of travis for automated testing + h2. Version 4.0.2 * Bugfixes: ** Support non-http schemas in url refs ("#75":https://github.com/textile/python-textile/pull/75) diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index 5624ae6a..90d949aa 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -7,4 +7,6 @@ Alex Shiels Jason Samsa Kurt Raschke Dave Brondsema -Dmitry Shachnev \ No newline at end of file +Dmitry Shachnev +Kirill Mavreshko +Brad Schoening \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 5ca56e8d..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,2 +0,0 @@ -include MANIFEST.in -include tests/fixtures/README.txt diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..7570eac3 --- /dev/null +++ b/Makefile @@ -0,0 +1,18 @@ +clean: + $(RM) README.txt + $(RM) -r ./dist ./build + +generate_pypi_README: + ${VIRTUAL_ENV}/bin/pytextile README.textile | sed -e 's/^\t//' > README.txt + +build: generate_pypi_README + python -m build + +upload_to_test: build + twine check ./dist/* + twine upload --repository test_textile ./dist/* + +upload_to_prod: build + twine check ./dist/* + # for now, don't actually upload to prod PyPI, just output the command to do so. + @echo "twine upload --repository textile ./dist/*" diff --git a/README.textile b/README.textile index 98f4fbde..958ea63c 100644 --- a/README.textile +++ b/README.textile @@ -1,15 +1,15 @@ -!https://travis-ci.org/textile/python-textile.svg!:https://travis-ci.org/textile/python-textile !https://codecov.io/github/textile/python-textile/coverage.svg!:https://codecov.io/github/textile/python-textile !https://img.shields.io/pypi/pyversions/textile! !https://img.shields.io/pypi/wheel/textile! +!https://github.com/textile/python-textile/actions/workflows/lint_and_test.yml/badge.svg(python-textile)!:https://github.com/textile/python-textile/actions/workflows/lint_and_test.yml !https://codecov.io/github/textile/python-textile/coverage.svg!:https://codecov.io/github/textile/python-textile !https://img.shields.io/pypi/pyversions/textile! !https://img.shields.io/pypi/wheel/textile! h1. python-textile -python-textile is a Python port of "Textile":http://txstyle.org/, Dean Allen's humane web text generator. +python-textile is a Python port of "Textile":https://textile-lang.com/, Dean Allen's humane web text generator. h2. Installation @pip install textile@ Dependencies: -* "html5lib":https://pypi.org/project/html5lib/ +* "nh3":https://pypi.org/project/nh3/ * "regex":https://pypi.org/project/regex/ (The regex package causes problems with PyPy, and is not installed as a dependency in such environments. If you are upgrading a textile install on PyPy which had regex previously included, you may need to uninstall it.) Optional dependencies include: @@ -42,7 +42,7 @@ bc.. import textile h3. Notes: -* Active development supports Python 3.5 or later. +* Active development supports Python 3.8 or later. h3. Running Tests @@ -50,8 +50,8 @@ To run the test suite, use pytest. `pytest-cov` is required as well. When textile is installed locally: -bc.. pytest +bc. pytest When textile is not installed locally: -bc.. PYTHONPATH=. pytest +bc. PYTHONPATH=. pytest diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..caa03da1 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,52 @@ +[build-system] +requires = ["setuptools", "setuptools-scm", "nh3"] +build-backend = "setuptools.build_meta" + +[project] +name = "textile" +authors = [ + { name = "Dennis Burke", email = "ikirudennis@gmail.com"} +] +description = 'Textile processing for python.' +classifiers = [ + 'Development Status :: 5 - Production/Stable', + 'Environment :: Web Environment', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: BSD License', + 'Operating System :: OS Independent', + 'Programming Language :: Python', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3 :: Only', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Topic :: Software Development :: Libraries :: Python Modules', +] +dynamic = ["version",] +dependencies = [ + 'nh3', + 'regex>1.0; implementation_name != "pypy"', +] +requires-python = '>=3.8' +keywords = ['textile', 'text', 'html markup'] +# Use the following command to generate a README.txt which is compatible with +# pypi's readme rendering: +# pytextile README.textile | sed -e 's/^\t//' > README.txt +readme = {file = 'README.txt', content-type = 'text/markdown'} + +[project.optional-dependencies] +develop = ['pytest', 'pytest-cov'] +imagesize = ['Pillow>=3.0.0',] + +[project.urls] +Homepage = "https://github.com/textile/python-textile" +Repository = "https://github.com/textile/python-textile.git" +Issues = "https://github.com/textile/python-textile/issues" + +[project.scripts] +pytextile = "textile.__main__:main" + +[tool.setuptools.dynamic] +version = {attr = "textile.__version__"} diff --git a/setup.py b/setup.py deleted file mode 100644 index 118c2fb0..00000000 --- a/setup.py +++ /dev/null @@ -1,53 +0,0 @@ -from setuptools import setup, find_packages -from setuptools.command.test import test as TestCommand -import os -import sys - - -def get_version(): - basedir = os.path.dirname(__file__) - with open(os.path.join(basedir, 'textile/version.py')) as f: - variables = {} - exec(f.read(), variables) - return variables.get('VERSION') - raise RuntimeError('No version info found.') - -setup( - name='textile', - version=get_version(), - author='Dennis Burke', - author_email='ikirudennis@gmail.com', - description='Textile processing for python.', - url='http://github.com/textile/python-textile', - packages=find_packages(), - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Environment :: Web Environment', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Topic :: Software Development :: Libraries :: Python Modules', - ], - keywords='textile,text,html markup', - install_requires=[ - 'html5lib>=1.0.1', - 'regex>1.0; implementation_name != "pypy"', - ], - extras_require={ - 'develop': ['pytest', 'pytest-cov'], - 'imagesize': ['Pillow>=3.0.0'], - }, - entry_points={'console_scripts': ['pytextile=textile.__main__:main']}, - tests_require=['pytest', 'pytest-cov'], - include_package_data=True, - zip_safe=False, - python_requires='>=3.5', -) diff --git a/textile/tools/__init__.py b/tests/__init__.py similarity index 100% rename from textile/tools/__init__.py rename to tests/__init__.py diff --git a/tests/fixtures/README.txt b/tests/fixtures/README.txt index 61dc0f01..515cf860 100644 --- a/tests/fixtures/README.txt +++ b/tests/fixtures/README.txt @@ -1,8 +1,8 @@ -

+

python-textile

python-textile

-

python-textile is a Python port of Textile, Dean Allen’s humane web text generator.

+

python-textile is a Python port of Textile, Dean Allen’s humane web text generator.

Installation

@@ -10,7 +10,7 @@

Dependencies:

@@ -47,7 +47,7 @@

Notes:

Running Tests

@@ -56,8 +56,8 @@

When textile is installed locally:

-
pytest
+
pytest
-When textile is not installed locally:
+

When textile is not installed locally:

-
PYTHONPATH=. pytest
\ No newline at end of file +
PYTHONPATH=. pytest
\ No newline at end of file diff --git a/tests/test_attributes.py b/tests/test_attributes.py index 70da8422..fed235de 100644 --- a/tests/test_attributes.py +++ b/tests/test_attributes.py @@ -1,5 +1,6 @@ +from typing import OrderedDict from textile.utils import parse_attributes -import re + def test_parse_attributes(): assert parse_attributes('\\1', element='td') == {'colspan': '1'} @@ -13,3 +14,11 @@ def test_parse_attributes(): assert parse_attributes('<') == {'style': 'text-align:left;'} assert parse_attributes('(c#i)') == {'class': 'c', 'id': 'i'} assert parse_attributes('\\2 100', element='col') == {'span': '2', 'width': '100'} + + +def test_parse_attributes_edge_cases(): + result = parse_attributes('(:c#i)') + expect = OrderedDict({'id': 'i'}) + assert result == expect + + assert parse_attributes('(<)') == OrderedDict() diff --git a/tests/test_block.py b/tests/test_block.py index 44f3ea23..eed5441c 100644 --- a/tests/test_block.py +++ b/tests/test_block.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import textile from textile.objects import Block @@ -8,6 +6,7 @@ except ImportError: from ordereddict import OrderedDict + def test_block(): t = textile.Textile() result = t.block('h1. foobar baby') @@ -16,15 +15,14 @@ def test_block(): b = Block(t, "bq", "", None, "", "Hello BlockQuote") expect = ('blockquote', OrderedDict(), 'p', OrderedDict(), - 'Hello BlockQuote') + 'Hello BlockQuote') result = (b.outer_tag, b.outer_atts, b.inner_tag, b.inner_atts, b.content) assert result == expect b = Block(t, "bq", "", None, "http://google.com", "Hello BlockQuote") - citation = '{0}1:url'.format(t.uid) expect = ('blockquote', OrderedDict([('cite', - '{0.uid}{0.refIndex}:url'.format(t))]), 'p', OrderedDict(), - 'Hello BlockQuote') + '{0.uid}{0.refIndex}:url'.format(t))]), 'p', OrderedDict(), + 'Hello BlockQuote') result = (b.outer_tag, b.outer_atts, b.inner_tag, b.inner_atts, b.content) assert result == expect @@ -40,6 +38,7 @@ def test_block(): result = (b.outer_tag, b.outer_atts, b.inner_tag, b.inner_atts, b.content) assert result == expect + def test_block_tags_false(): t = textile.Textile(block_tags=False) assert t.block_tags is False @@ -48,6 +47,7 @@ def test_block_tags_false(): expect = 'test' assert result == expect + def test_blockcode_extended(): input = 'bc.. text\nmoretext\n\nevenmoretext\n\nmoremoretext\n\np. test' expect = '
text\nmoretext\n\nevenmoretext\n\nmoremoretext
\n\n\t

test

' @@ -55,6 +55,7 @@ def test_blockcode_extended(): result = t.parse(input) assert result == expect + def test_blockcode_in_README(): with open('README.textile') as f: readme = ''.join(f.readlines()) @@ -63,6 +64,7 @@ def test_blockcode_in_README(): expect = ''.join(f.readlines()) assert result == expect + def test_blockcode_comment(): input = '###.. block comment\nanother line\n\np. New line' expect = '\t

New line

' @@ -70,6 +72,7 @@ def test_blockcode_comment(): result = t.parse(input) assert result == expect + def test_extended_pre_block_with_many_newlines(): """Extra newlines in an extended pre block should not get cut down to only two.""" diff --git a/tests/test_cli.py b/tests/test_cli.py index 5f6e501f..5e6ab794 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,28 +3,30 @@ import textile + def test_console_script(): command = [sys.executable, '-m', 'textile', 'README.textile'] try: result = subprocess.check_output(command) except AttributeError: command[2] = 'textile.__main__' - result = subprocess.Popen(command, - stdout=subprocess.PIPE).communicate()[0] + result = subprocess.Popen( + command, stdout=subprocess.PIPE).communicate()[0] with open('tests/fixtures/README.txt') as f: expect = ''.join(f.readlines()) - if type(result) == bytes: + if isinstance(result, bytes): result = result.decode('utf-8') assert result == expect + def test_version_string(): command = [sys.executable, '-m', 'textile', '-v'] try: result = subprocess.check_output(command) except AttributeError: command[2] = 'textile.__main__' - result = subprocess.Popen(command, - stdout=subprocess.PIPE).communicate()[0] - if type(result) == bytes: + result = subprocess.Popen( + command, stdout=subprocess.PIPE).communicate()[0] + if isinstance(result, bytes): result = result.decode('utf-8') assert result.strip() == textile.__version__ diff --git a/tests/test_footnoteRef.py b/tests/test_footnoteRef.py index b773ad2f..5ac2ea4b 100644 --- a/tests/test_footnoteRef.py +++ b/tests/test_footnoteRef.py @@ -1,5 +1,5 @@ from textile import Textile -import re + def test_footnoteRef(): t = Textile() diff --git a/tests/test_getRefs.py b/tests/test_getRefs.py index d3cfcd72..8a22d4fb 100644 --- a/tests/test_getRefs.py +++ b/tests/test_getRefs.py @@ -1,5 +1,6 @@ from textile import Textile + def test_getRefs(): t = Textile() result = t.getRefs("some text [Google]http://www.google.com") diff --git a/tests/test_getimagesize.py b/tests/test_getimagesize.py index 43f85e3a..4cafc9dc 100644 --- a/tests/test_getimagesize.py +++ b/tests/test_getimagesize.py @@ -1,8 +1,9 @@ -from textile.tools.imagesize import getimagesize +from textile.utils import getimagesize import pytest PIL = pytest.importorskip('PIL') + def test_imagesize(): assert getimagesize("http://www.google.com/intl/en_ALL/images/logo.gif") == (276, 110) assert getimagesize("http://bad.domain/") == '' diff --git a/tests/test_github_issues.py b/tests/test_github_issues.py index 2507e5f4..6808054a 100644 --- a/tests/test_github_issues.py +++ b/tests/test_github_issues.py @@ -1,53 +1,60 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - import textile + def test_github_issue_16(): result = textile.textile('"$":http://google.com "$":https://google.com "$":mailto:blackhole@sun.comet') expect = '\t

google.com google.com blackhole@sun.comet

' assert result == expect + def test_github_issue_17(): result = textile.textile('!http://www.ox.ac.uk/favicon.ico!') expect = '\t

' assert result == expect + def test_github_issue_20(): text = 'This is a link to a ["Wikipedia article about Textile":http://en.wikipedia.org/wiki/Textile_(markup_language)].' result = textile.textile(text) expect = '\t

This is a link to a Wikipedia article about Textile.

' assert result == expect + def test_github_issue_21(): - text = '''h1. xml example + text = ('''h1. xml example -bc. +bc. ''' + ''' bar -''' +''') result = textile.textile(text) expect = '\t

xml example

\n\n
\n<foo>\n  bar\n</foo>
' assert result == expect + def test_github_issue_22(): text = '''_(artist-name)Ty Segall_’s''' result = textile.textile(text) expect = '\t

Ty Segall’s

' assert result == expect + def test_github_issue_26(): text = '' result = textile.textile(text) expect = '' assert result == expect + def test_github_issue_27(): test = """* Folders with ":" in their names are displayed with a forward slash "/" instead. (Filed as "#4581709":/test/link, which was considered "normal behaviour" - quote: "Please note that Finder presents the 'Carbon filesystem' view, regardless of the underlying filesystem.")""" result = textile.textile(test) expect = """\t""" assert result == expect + def test_github_issue_28(): test = """So here I am porting my ancient "newspipe":newspipe "front-end":blog/2006/09/30/0950 to "Snakelets":Snakelets and "Python":Python, and I've just trimmed down over 20 lines of "PHP":PHP down to essentially one line of "BeautifulSoup":BeautifulSoup retrieval: @@ -80,23 +87,26 @@ def parseWapProfile(self, url): \t

Of course there’s a lot more error handling to do (and useful data to glean off the XML), but being able to cut through all the usual parsing crap is immensely gratifying.

""") assert result == expect + def test_github_issue_30(): - text ='"Tëxtíle (Tëxtíle)":http://lala.com' + text = '"Tëxtíle (Tëxtíle)":http://lala.com' result = textile.textile(text) expect = '\t

Tëxtíle

' assert result == expect - text ='!http://lala.com/lol.gif(♡ imáges)!' + text = '!http://lala.com/lol.gif(♡ imáges)!' result = textile.textile(text) expect = '\t

♡ imáges

' assert result == expect + def test_github_issue_36(): text = '"Chögyam Trungpa":https://www.google.com/search?q=Chögyam+Trungpa' result = textile.textile(text) expect = '\t

Chögyam Trungpa

' assert result == expect + def test_github_issue_37(): text = '# xxx\n# yyy\n*blah*' result = textile.textile(text) @@ -118,24 +128,28 @@ def test_github_issue_37(): \t''' assert result == expect + def test_github_issue_40(): text = '\r\n' result = textile.textile(text) expect = '\r\n' assert result == expect + def test_github_issue_42(): text = '!./image.png!' result = textile.textile(text) expect = '\t

' assert result == expect + def test_github_issue_43(): text = 'pre. smart ‘quotes’ are not smart!' result = textile.textile(text) expect = '
smart ‘quotes’ are not smart!
' assert result == expect + def test_github_issue_45(): """Incorrect transform unicode url""" text = '"test":https://myabstractwiki.ru/index.php/%D0%97%D0%B0%D0%B3%D0%BB%D0%B0%D0%B2%D0%BD%D0%B0%D1%8F_%D1%81%D1%82%D1%80%D0%B0%D0%BD%D0%B8%D1%86%D0%B0' @@ -143,6 +157,7 @@ def test_github_issue_45(): expect = '\t

test

' assert result == expect + def test_github_issue_46(): """Key error on mal-formed numbered lists. CAUTION: both the input and the ouput are ugly.""" @@ -153,6 +168,7 @@ def test_github_issue_46(): result = textile.textile(text) assert result == expect + def test_github_issue_47(): """Incorrect wrap pre-formatted value""" text = '''pre.. word @@ -172,6 +188,7 @@ def test_github_issue_47(): yet anothe word''' assert result == expect + def test_github_issue_49(): """Key error on russian hash-route link""" s = '"link":https://ru.vuejs.org/v2/guide/components.html#Входные-параметры' @@ -179,6 +196,7 @@ def test_github_issue_49(): expect = '\t

link

' assert result == expect + def test_github_issue_50(): """Incorrect wrap code with Java generics in pre""" test = ('pre.. public class Tynopet {}\n\nfinal ' @@ -189,6 +207,7 @@ def test_github_issue_50(): 'ArrayList<>();') assert result == expect + def test_github_issue_51(): """Link build with $ sign without "http" prefix broken.""" test = '"$":www.google.com.br' @@ -196,6 +215,7 @@ def test_github_issue_51(): expect = '\t

www.google.com.br

' assert result == expect + def test_github_issue_52(): """Table build without space after aligment raise a AttributeError.""" test = '|=.First Header |=. Second Header |' @@ -205,6 +225,7 @@ def test_github_issue_52(): '\n\t\t\n\t') assert result == expect + def test_github_issue_55(): """Incorrect handling of quote entities in extended pre block""" test = ('pre.. this is the first line\n\nbut "quotes" in an extended pre ' @@ -258,15 +279,17 @@ def test_github_issue_55(): 'return configs;\n}\n}') assert result == expect + def test_github_issue_56(): """Empty description lists throw error""" result = textile.textile("- :=\n-") expect = '
\n
' assert result == expect + def test_github_pull_61(): """Fixed code block multiline encoding on quotes/span""" - test = '''bc.. This is some TEXT inside a "Code BLOCK" + test = ('''bc.. This is some TEXT inside a "Code BLOCK" { if (JSON) { @@ -275,11 +298,12 @@ def test_github_pull_61(): } } -Back to 10-4 CAPS +Back to 10-4 CAPS ''' + ''' p.. Some multiline Paragragh -Here is some output!!! "Some" CAPS''' +Here is some output!!! "Some" CAPS''') expect = '''
This is some TEXT inside a "Code BLOCK"
 
@@ -299,6 +323,7 @@ def test_github_pull_61():
     result = t.parse(test)
     assert result == expect
 
+
 def test_github_pull_62():
     """Fix for paragraph multiline, only last paragraph is rendered
     correctly"""
@@ -341,6 +366,7 @@ def test_github_pull_62():
     result = t.parse(test)
     assert result == expect
 
+
 def test_github_pull_63():
     """Forgot to set multiline_para to False"""
     test = '''p.. First one 'is'
diff --git a/tests/test_glyphs.py b/tests/test_glyphs.py
index 56b0d272..ed50ad53 100644
--- a/tests/test_glyphs.py
+++ b/tests/test_glyphs.py
@@ -1,5 +1,6 @@
 from textile import Textile
 
+
 def test_glyphs():
     t = Textile()
 
diff --git a/tests/test_image.py b/tests/test_image.py
index aad39e29..b7462924 100644
--- a/tests/test_image.py
+++ b/tests/test_image.py
@@ -1,5 +1,6 @@
 from textile import Textile
 
+
 def test_image():
     t = Textile()
     result = t.image('!/imgs/myphoto.jpg!:http://jsamsa.com')
@@ -17,5 +18,5 @@ def test_image():
     t = Textile(rel='nofollow')
     result = t.image('!/imgs/myphoto.jpg!:http://jsamsa.com')
     expect = (''.format(t.uid))
+              '/>'.format(t.uid))
     assert result == expect
diff --git a/tests/test_imagesize.py b/tests/test_imagesize.py
index 112989e1..cb3ad68a 100644
--- a/tests/test_imagesize.py
+++ b/tests/test_imagesize.py
@@ -1,10 +1,11 @@
 import textile
 
+
 def test_imagesize():
     imgurl = 'http://www.google.com/intl/en_ALL/images/srpr/logo1w.png'
-    result = textile.tools.imagesize.getimagesize(imgurl)
+    result = textile.utils.getimagesize(imgurl)
     try:
-        import PIL
+        import PIL  # noqa: F401
 
         expect = (275, 95)
         assert result == expect
diff --git a/tests/test_lists.py b/tests/test_lists.py
index 4e85f4c8..06d13c33 100644
--- a/tests/test_lists.py
+++ b/tests/test_lists.py
@@ -1,5 +1,6 @@
 from textile import Textile
 
+
 def test_lists():
     t = Textile()
     result = t.textileLists("* one\n* two\n* three")
diff --git a/tests/test_retrieve.py b/tests/test_retrieve.py
index 10bd1733..a4165240 100644
--- a/tests/test_retrieve.py
+++ b/tests/test_retrieve.py
@@ -1,5 +1,6 @@
 from textile import Textile
 
+
 def test_retrieve():
     t = Textile()
     id = t.shelve("foobar")
diff --git a/tests/test_span.py b/tests/test_span.py
index d83530dd..7ae5b4b1 100644
--- a/tests/test_span.py
+++ b/tests/test_span.py
@@ -1,19 +1,20 @@
 from textile import Textile
 
+
 def test_span():
     t = Textile()
-    result = t.span("hello %(bob)span *strong* and **bold**% goodbye")
+    result = t.retrieveTags(t.span("hello %(bob)span *strong* and **bold**% goodbye"))
     expect = ('hello span strong and '
-            'bold goodbye')
+              'bold goodbye')
     assert result == expect
 
-    result = t.span('%:http://domain.tld test%')
+    result = t.retrieveTags(t.span('%:http://domain.tld test%'))
     expect = 'test'
     assert result == expect
 
     t = Textile()
     # cover the partial branch where we exceed the max_span_depth.
     t.max_span_depth = 2
-    result = t.span('_-*test*-_')
+    result = t.retrieveTags(t.span('_-*test*-_'))
     expect = '*test*'
     assert result == expect
diff --git a/tests/test_subclassing.py b/tests/test_subclassing.py
index 9235e032..a7db99a3 100644
--- a/tests/test_subclassing.py
+++ b/tests/test_subclassing.py
@@ -1,10 +1,10 @@
 import textile
 
+
 def test_change_glyphs():
     class TextilePL(textile.Textile):
         glyph_definitions = dict(textile.Textile.glyph_definitions,
-            quote_double_open = '„'
-        )
+                                 quote_double_open='„')
 
     test = 'Test "quotes".'
     expect = '\t

Test „quotes”.

' diff --git a/tests/test_table.py b/tests/test_table.py index 0a3cb0d6..1ea34e94 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1,5 +1,6 @@ from textile import Textile + def test_table(): t = Textile() result = t.table('(rowclass). |one|two|three|\n|a|b|c|') diff --git a/tests/test_textile.py b/tests/test_textile.py index 0c37690d..84e9ddf8 100644 --- a/tests/test_textile.py +++ b/tests/test_textile.py @@ -1,13 +1,14 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals import pytest import re import textile + def test_FootnoteReference(): html = textile.textile('YACC[1]') assert re.search(r'^\t

YACC1

', html) is not None + def test_Footnote(): html = textile.textile('This is covered elsewhere[1].\n\nfn1. Down here, in fact.\n\nfn2. Here is another footnote.') assert re.search(r'^\t

This is covered elsewhere1.

\n\n\t

1 Down here, in fact.

\n\n\t

2 Here is another footnote.

$', html) is not None @@ -24,17 +25,19 @@ def test_Footnote(): html = textile.textile('''See[4!] for details.\n\nfn4^. Here are the details.''') assert re.search(r'^\t

See4 for details.

\n\n\t

4 Here are the details.

$', html) is not None + def test_issue_35(): result = textile.textile('"z"') - expect = '\t

“z”

' + expect = '\t

“z”

' assert result == expect result = textile.textile('" z"') - expect = '\t

“ z”

' + expect = '\t

“ z”

' assert result == expect + def test_restricted(): - #Note that the HTML is escaped, thus rendering the " result = textile.textile_restricted(test) expect = "\t

Here is some text.
\n<script>alert(‘hello world’)</script>

" @@ -72,7 +75,6 @@ def test_restricted(): expect = '''\ \t \t -\t \t \t\t \t\t\t @@ -93,10 +95,12 @@ def test_restricted(): assert result == expect + def test_unicode_footnote(): html = textile.textile('текст[1]') assert re.compile(r'^\t

текст1

$', re.U).search(html) is not None + def test_autolinking(): test = """some text "test":http://www.google.com http://www.google.com "$":http://www.google.com""" result = """\t

some text test http://www.google.com www.google.com

""" @@ -104,6 +108,7 @@ def test_autolinking(): assert result == expect + def test_sanitize(): test = "a paragraph of benign text" result = "\t

a paragraph of benign text

" @@ -111,7 +116,7 @@ def test_sanitize(): assert result == expect test = """

a paragraph of evil text

""" - result = '

a paragraph of evil text

' + result = '

a paragraph of evil text

' expect = textile.Textile().parse(test, sanitize=True) assert result == expect @@ -120,14 +125,16 @@ def test_sanitize(): expect = textile.Textile(html_type='html5').parse(test, sanitize=True) assert result == expect + def test_imagesize(): - PIL = pytest.importorskip('PIL') + PIL = pytest.importorskip('PIL') # noqa: F841 test = "!http://www.google.com/intl/en_ALL/images/srpr/logo1w.png!" result = '\t

' expect = textile.Textile(get_sizes=True).parse(test) assert result == expect + def test_endnotes_simple(): test = """Scientists say the moon is slowly shrinking[#my_first_label].\n\nnotelist!.\n\nnote#my_first_label Over the past billion years, about a quarter of the moon's 4.5 billion-year lifespan, it has shrunk about 200 meters (700 feet) in diameter.""" html = textile.textile(test) @@ -135,6 +142,7 @@ def test_endnotes_simple(): result_re = re.compile(result_pattern) assert result_re.search(html) is not None + def test_endnotes_complex(): test = """Tim Berners-Lee is one of the pioneer voices in favour of Net Neutrality[#netneutral] and has expressed the view that ISPs should supply "connectivity with no strings attached"[#netneutral!] [#tbl_quote]\n\nBerners-Lee admitted that the forward slashes ("//") in a web address were actually unnecessary. He told the newspaper that he could easily have designed URLs not to have the forward slashes. "... it seemed like a good idea at the time,"[#slashes]\n\nnote#netneutral. "Web creator rejects net tracking":http://news.bbc.co.uk/2/hi/technology/7613201.stm. BBC. 15 September 2008\n\nnote#tbl_quote. "Web inventor's warning on spy software":http://www.telegraph.co.uk/news/uknews/1581938/Web-inventor%27s-warning-on-spy-software.html. The Daily Telegraph (London). 25 May 2008\n\nnote#slashes. "Berners-Lee 'sorry' for slashes":http://news.bbc.co.uk/1/hi/technology/8306631.stm. BBC. 14 October 2009\n\nnotelist.""" html = textile.textile(test) @@ -142,6 +150,7 @@ def test_endnotes_complex(): result_re = re.compile(result_pattern) assert result_re.search(html) is not None + def test_endnotes_unreferenced_note(): test = """Scientists say[#lavader] the moon is quite small. But I, for one, don't believe them. Others claim it to be made of cheese[#aardman]. If this proves true I suspect we are in for troubled times[#apollo13] as people argue over their "share" of the moon's cheese. In the end, its limited size[#lavader] may prove problematic.\n\nnote#lavader(noteclass). "Proof of the small moon hypothesis":http://antwrp.gsfc.nasa.gov/apod/ap080801.html. Copyright(c) Laurent Laveder\n\nnote#aardman(#noteid). "Proof of a cheese moon":http://www.imdb.com/title/tt0104361\n\nnote#apollo13. After all, things do go "wrong":http://en.wikipedia.org/wiki/Apollo_13#The_oxygen_tank_incident.\n\nnotelist{padding:1em; margin:1em; border-bottom:1px solid gray}.\n\nnotelist{padding:1em; margin:1em; border-bottom:1px solid gray}:§^.\n\nnotelist{padding:1em; margin:1em; border-bottom:1px solid gray}:‡""" html = textile.textile(test) @@ -149,6 +158,7 @@ def test_endnotes_unreferenced_note(): result_re = re.compile(result_pattern, re.U) assert result_re.search(html) is not None + def test_endnotes_malformed(): test = """Scientists say[#lavader] the moon is quite small. But I, for one, don't believe them. Others claim it to be made of cheese[#aardman]. If this proves true I suspect we are in for troubled times[#apollo13!] as people argue over their "share" of the moon's cheese. In the end, its limited size[#lavader] may prove problematic.\n\nnote#unused An unreferenced note.\n\nnote#lavader^ "Proof of the small moon hypothesis":http://antwrp.gsfc.nasa.gov/apod/ap080801.html. Copyright(c) Laurent Laveder\n\nnote#aardman^ "Proof of a cheese moon":http://www.imdb.com/title/tt0104361\n\nnote#apollo13^ After all, things do go "wrong":http://en.wikipedia.org/wiki/Apollo_13#The_oxygen_tank_incident.\n\nnotelist{padding:1em; margin:1em; border-bottom:1px solid gray}:α!+""" html = textile.textile(test) @@ -156,13 +166,15 @@ def test_endnotes_malformed(): result_re = re.compile(result_pattern, re.U) assert result_re.search(html) is not None + def test_endnotes_undefined_note(): test = """Scientists say the moon is slowly shrinking[#my_first_label].\n\nnotelist!.""" html = textile.textile(test) - result_pattern = r"""\t

Scientists say the moon is slowly shrinking1.

\n\n\t
    \n\t\t
  1. Undefined Note \[#my_first_label\].
  2. \n\t
$""" + result_pattern = r"""\t

Scientists say the moon is slowly shrinking1.

\n\n\t
    \n\t\t
  1. Undefined Note \[#my_first_label\].
  2. \n\t
$""" result_re = re.compile(result_pattern) assert result_re.search(html) is not None + def test_encode_url(): # I tried adding these as doctests, but the unicode tests weren't # returning the correct results. @@ -198,21 +210,25 @@ def test_encode_url(): eurl = t.encode_url(url) assert eurl == result + def test_footnote_crosslink(): html = textile.textile('''See[2] for details, and later, reference it again[2].\n\nfn2^(footy#otherid)[en]. Here are the details.''') searchstring = r'\t

See2 for details, and later, reference it again2.

\n\n\t

2 Here are the details.

$' assert re.compile(searchstring).search(html) is not None + def test_footnote_without_reflink(): html = textile.textile('''See[3!] for details.\n\nfn3. Here are the details.''') searchstring = r'^\t

See3 for details.

\n\n\t

3 Here are the details.

$' assert re.compile(searchstring).search(html) is not None + def testSquareBrackets(): html = textile.textile("""1[^st^], 2[^nd^], 3[^rd^]. 2 log[~n~]\n\nA close[!http://textpattern.com/favicon.ico!]image.\nA tight["text":http://textpattern.com/]link.\nA ["footnoted link":http://textpattern.com/][182].""") searchstring = r'^\t

1st, 2nd, 3rd. 2 logn

\n\n\t

A closeimage.
\nA tighttextlink.
\nA footnoted link182.

' assert re.compile(searchstring).search(html) is not None + def test_html5(): """docstring for testHTML5""" @@ -221,6 +237,7 @@ def test_html5(): expect = textile.textile(test, html_type="html5") assert result == expect + def test_relURL(): t = textile.Textile() t.restricted = True diff --git a/tests/test_textilefactory.py b/tests/test_textilefactory.py index 846b9275..e9fc027f 100644 --- a/tests/test_textilefactory.py +++ b/tests/test_textilefactory.py @@ -1,6 +1,7 @@ from textile import textilefactory import pytest + def test_TextileFactory(): f = textilefactory.TextileFactory() result = f.process("some text here") diff --git a/tests/test_urls.py b/tests/test_urls.py index 7a9798eb..1cd09f92 100644 --- a/tests/test_urls.py +++ b/tests/test_urls.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- from textile import Textile -import re + def test_urls(): t = Textile() @@ -54,12 +54,14 @@ def test_urls(): expect = '\t

A link that contains a\nnewline raises an exception.

' assert result == expect + def test_rel_attribute(): t = Textile(rel='nofollow') result = t.parse('"$":http://domain.tld') expect = '\t

domain.tld

' assert result == expect + def test_quotes_in_link_text(): """quotes in link text are tricky.""" test = '""this is a quote in link text"":url' diff --git a/tests/test_utils.py b/tests/test_utils.py index 7f386a9b..952c7b4c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,23 +1,25 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - from textile import utils + def test_encode_html(): result = utils.encode_html('''this is a "test" of text that's safe to ''' - 'put in an attribute.') + 'put in an attribute.') expect = ('this is a "test" of text that's safe to put in ' - 'an <html> attribute.') + 'an <html> attribute.') assert result == expect + def test_has_raw_text(): assert utils.has_raw_text('

foo bar biz baz

') is False assert utils.has_raw_text(' why yes, yes it does') is True + def test_is_rel_url(): assert utils.is_rel_url("http://www.google.com/") is False assert utils.is_rel_url("/foo") is True + def test_generate_tag(): result = utils.generate_tag('span', 'inner text', {'class': 'test'}) expect = 'inner text' @@ -28,3 +30,8 @@ def test_generate_tag(): expect = 'Übermensch' result = utils.generate_tag('a', text, attributes) assert result == expect + + +def test_human_readable_url_edge_case(): + assert utils.human_readable_url('google.com') == 'google.com' + assert utils.human_readable_url('tel:1-800-555-1212') == '1-800-555-1212' diff --git a/tests/test_values.py b/tests/test_values.py index 063ed3e9..7c19e116 100644 --- a/tests/test_values.py +++ b/tests/test_values.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals import textile import pytest @@ -14,7 +13,7 @@ ('I spoke.\nAnd none replied.', '\t

I spoke.
\nAnd none replied.

'), - ('"Observe!"', '\t

“Observe!”

'), + ('"Observe!"', '\t

“Observe!”

'), ('Observe -- very nice!', '\t

Observe — very nice!

'), @@ -35,7 +34,7 @@ ('h3. Header 3', '\t

Header 3

'), ('An old text\n\nbq. A block quotation.\n\nAny old text''', - '\t

An old text

\n\n\t
\n\t\t

A block quotation.

\n\t
\n\n\t

Any old text

'), + '\t

An old text

\n\n\t
\n\t\t

A block quotation.

\n\t
\n\n\t

Any old text

'), ('I _believe_ every word.', '\t

I believe every word.

'), @@ -70,8 +69,8 @@ ('p[fr]. rouge', '\t

rouge

'), ('I seriously *{color:red}blushed*\nwhen I _(big)sprouted_ that\ncorn stalk from my\n%[es]cabeza%.', - '\t

I seriously blushed
\nwhen I sprouted' - ' that
\ncorn stalk from my
\ncabeza.

'), + '\t

I seriously blushed
\nwhen I sprouted' + ' that
\ncorn stalk from my
\ncabeza.

'), ('p<. align left', '\t

align left

'), @@ -219,14 +218,14 @@ '\t

TxStyle is a documentation project of Textile 2.4 for Textpattern CMS.

'), (""""Übermensch":http://de.wikipedia.org/wiki/Übermensch""", """\t

Übermensch

"""), ("""Here is some text with a block.\n\n\n\n\n\nbc. """, - """\t

Here is some text with a block.

\n\n\t

\n\n\t

\n\n
<!-- Here is a comment block in a code block. -->
"""), + """\t

Here is some text with a block.

\n\n\n\n\n\n
<!-- Here is a comment block in a code block. -->
"""), (""""Textile(c)" is a registered(r) 'trademark' of Textpattern(tm) -- or TXP(That's textpattern!) -- at least it was - back in '88 when 2x4 was (+/-)5(o)C ... QED!\n\np{font-size: 200%;}. 2(1/4) 3(1/2) 4(3/4)""", """\t

“Textile©” is a registered® ‘trademark’ of Textpattern™ — or TXP — at least it was – back in ’88 when 2×4 was ±5°C … QED!

\n\n\t

2¼ 3½ 4¾

"""), ("""|=. Testing colgroup and col syntax\n|:\\5. 80\n|a|b|c|d|e|\n\n|=. Testing colgroup and col syntax|\n|:\\5. 80|\n|a|b|c|d|e|""", """\t
Your caption goes here
A footer
\n\t\n\t\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t
Testing colgroup and col syntax
abcde
\n\n\t\n\t\n\t\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t
Testing colgroup and col syntax
abcde
"""), ("""table(#dvds){border-collapse:collapse}. Great films on DVD employing Textile summary, caption, thead, tfoot, two tbody elements and colgroups\n|={font-size:140%;margin-bottom:15px}. DVDs with two Textiled tbody elements\n|:\\3. 100 |{background:#ddd}|250||50|300|\n|^(header).\n|_. Title |_. Starring |_. Director |_. Writer |_. Notes |\n|~(footer).\n|\\5=. This is the tfoot, centred |\n|-(toplist){background:#c5f7f6}.\n| _The Usual Suspects_ | Benicio Del Toro, Gabriel Byrne, Stephen Baldwin, Kevin Spacey | Bryan Singer | Chris McQaurrie | One of the finest films ever made |\n| _Se7en_ | Morgan Freeman, Brad Pitt, Kevin Spacey | David Fincher | Andrew Kevin Walker | Great psychological thriller |\n| _Primer_ | David Sullivan, Shane Carruth | Shane Carruth | Shane Carruth | Amazing insight into trust and human psychology
rather than science fiction. Terrific! |\n| _District 9_ | Sharlto Copley, Jason Cope | Neill Blomkamp | Neill Blomkamp, Terri Tatchell | Social commentary layered on thick,\nbut boy is it done well |\n|-(medlist){background:#e7e895;}.\n| _Arlington Road_ | Tim Robbins, Jeff Bridges | Mark Pellington | Ehren Kruger | Awesome study in neighbourly relations |\n| _Phone Booth_ | Colin Farrell, Kiefer Sutherland, Forest Whitaker | Joel Schumacher | Larry Cohen | Edge-of-the-seat stuff in this\nshort but brilliantly executed thriller |""", """\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\n\t
DVDs with two Textiled tbody elements
Title Starring Director Writer Notes
This is the tfoot, centred
The Usual Suspects Benicio Del Toro, Gabriel Byrne, Stephen Baldwin, Kevin Spacey Bryan Singer Chris McQaurrie One of the finest films ever made
Se7en Morgan Freeman, Brad Pitt, Kevin Spacey David Fincher Andrew Kevin Walker Great psychological thriller
Primer David Sullivan, Shane Carruth Shane Carruth Shane Carruth Amazing insight into trust and human psychology
\nrather than science fiction. Terrific!
District 9 Sharlto Copley, Jason Cope Neill Blomkamp Neill Blomkamp, Terri Tatchell Social commentary layered on thick,
\nbut boy is it done well
Arlington Road Tim Robbins, Jeff Bridges Mark Pellington Ehren Kruger Awesome study in neighbourly relations
Phone Booth Colin Farrell, Kiefer Sutherland, Forest Whitaker Joel Schumacher Larry Cohen Edge-of-the-seat stuff in this
\nshort but brilliantly executed thriller
"""), ("""-(hot) *coffee* := Hot _and_ black\n-(hot#tea) tea := Also hot, but a little less black\n-(cold) milk := Nourishing beverage for baby cows.\nCold drink that goes great with cookies. =:\n\n-(hot) coffee := Hot and black\n-(hot#tea) tea := Also hot, but a little less black\n-(cold) milk :=\nNourishing beverage for baby cows.\nCold drink that goes great with cookies. =:""", - """
\n\t
coffee
\n\t
Hot and black
\n\t
tea
\n\t
Also hot, but a little less black
\n\t
milk
\n\t
Nourishing beverage for baby cows.
\nCold drink that goes great with cookies.
\n
\n\n
\n\t
coffee
\n\t
Hot and black
\n\t
tea
\n\t
Also hot, but a little less black
\n\t
milk
\n\t

Nourishing beverage for baby cows.
\nCold drink that goes great with cookies.

\n
"""), + """
\n\t
coffee
\n\t
Hot and black
\n\t
tea
\n\t
Also hot, but a little less black
\n\t
milk
\n\t
Nourishing beverage for baby cows.
\nCold drink that goes great with cookies.
\n
\n\n
\n\t
coffee
\n\t
Hot and black
\n\t
tea
\n\t
Also hot, but a little less black
\n\t
milk
\n\t

Nourishing beverage for baby cows.
\nCold drink that goes great with cookies.

\n
"""), (""";(class#id) Term 1\n: Def 1\n: Def 2\n: Def 3""", """\t
\n\t\t
Term 1
\n\t\t
Def 1
\n\t\t
Def 2
\n\t\t
Def 3
\n\t
"""), ("""*Here is a comment*\n\nHere is *(class)a comment*\n\n*(class)Here is a class* that is a little extended and is\n*followed* by a strong word!\n\nbc. ; Content-type: text/javascript\n; Cache-Control: no-store, no-cache, must-revalidate, pre-check=0, post-check=0, max-age=0\n; Expires: Sat, 24 Jul 2003 05:00:00 GMT\n; Last-Modified: Wed, 1 Jan 2025 05:00:00 GMT\n; Pragma: no-cache\n\n*123 test*\n\n*test 123*\n\n**123 test**\n\n**test 123**""", @@ -236,7 +235,7 @@ ("""# one\n##3 one.three\n## one.four\n## one.five\n# two\n\ntest\n\n#_(continuation#section2).\n# three\n# four\n##_ four.six\n## four.seven\n# five\n\ntest\n\n#21 twenty-one\n# twenty-two""", """\t
    \n\t\t
  1. one\n\t\t
      \n\t\t\t
    1. one.three
    2. \n\t\t\t
    3. one.four
    4. \n\t\t\t
    5. one.five
    6. \n\t\t
  2. \n\t\t
  3. two
  4. \n\t
\n\n\t

test

\n\n\t
    \n\t\t
  1. three
  2. \n\t\t
  3. four\n\t\t
      \n\t\t\t
    1. four.six
    2. \n\t\t\t
    3. four.seven
    4. \n\t\t
  4. \n\t\t
  5. five
  6. \n\t
\n\n\t

test

\n\n\t
    \n\t\t
  1. twenty-one
  2. \n\t\t
  3. twenty-two
  4. \n\t
"""), ("""|* Foo[^2^]\n* _bar_\n* ~baz~ |\n|#4 *Four*\n# __Five__ |\n|-(hot) coffee := Hot and black\n-(hot#tea) tea := Also hot, but a little less black\n-(cold) milk :=\nNourishing beverage for baby cows.\nCold drink that goes great with cookies. =:\n|""", - """\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\n\t
\t
    \n\t\t
  • Foo2
  • \n\t\t
  • bar
  • \n\t\t
  • baz
  • \n\t
\t
    \n\t\t
  1. Four
  2. \n\t\t
  3. Five
  4. \n\t
\n\t
coffee
\n\t
Hot and black
\n\t
tea
\n\t
Also hot, but a little less black
\n\t
milk
\n\t

Nourishing beverage for baby cows.
\nCold drink that goes great with cookies.


\n
"""), + """\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\n\t
\t
    \n\t\t
  • Foo2
  • \n\t\t
  • bar
  • \n\t\t
  • baz
  • \n\t
\t
    \n\t\t
  1. Four
  2. \n\t\t
  3. Five
  4. \n\t
\n\t
coffee
\n\t
Hot and black
\n\t
tea
\n\t
Also hot, but a little less black
\n\t
milk
\n\t

Nourishing beverage for baby cows.
\nCold drink that goes great with cookies.

\n
"""), ("""h4. A more complicated table\n\ntable(tableclass#tableid){color:blue}.\n|_. table |_. more |_. badass |\n|\\3. Horizontal span of 3|\n(firstrow). |first|HAL(open the pod bay doors)|1|\n|some|{color:green}. styled|content|\n|/2. spans 2 rows|this is|quite a|\n| deep test | don't you think?|\n(lastrow). |fifth|I'm a lumberjack|5|\n|sixth| _*bold italics*_ |6|""", """\t

A more complicated table

\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t
table more badass
Horizontal span of 3
firstHAL1
somestyledcontent
spans 2 rowsthis isquite a
deep test don’t you think?
fifthI’m a lumberjack5
sixth bold italics 6
"""), ("""| *strong* |\n\n| _em_ |\n\n| Inter-word -dashes- | ZIP-codes are 5- or 9-digit codes |""", @@ -246,7 +245,7 @@ ("""h2. A definition list\n\n;(class#id) Term 1\n: Def 1\n: Def 2\n: Def 3\n;; Center\n;; NATO(Why Em Cee Ayy)\n:: Subdef 1\n:: Subdef 2\n;;; SubSub Term\n::: SubSub Def 1\n::: SubSub Def 2\n::: Subsub Def 3\nWith newline\n::: Subsub Def 4\n:: Subdef 3\n: DEF 4\n; Term 2\n: Another def\n: And another\n: One more\n:: A def without a term\n:: More defness\n; Third term for good measure\n: My definition of a boombastic jazz""", """\t

A definition list

\n\n\t
\n\t\t
Term 1
\n\t\t
Def 1
\n\t\t
Def 2
\n\t\t
Def 3\n\t\t
\n\t\t\t
Center
\n\t\t\t
NATO
\n\t\t\t
Subdef 1
\n\t\t\t
Subdef 2\n\t\t\t
\n\t\t\t\t
SubSub Term
\n\t\t\t\t
SubSub Def 1
\n\t\t\t\t
SubSub Def 2
\n\t\t\t\t
Subsub Def 3
\nWith newline
\n\t\t\t\t
Subsub Def 4
\n\t\t\t
\n\t\t\t
Subdef 3
\n\t\t
\n\t\t
DEF 4
\n\t\t
Term 2
\n\t\t
Another def
\n\t\t
And another
\n\t\t
One more\n\t\t
\n\t\t\t
A def without a term
\n\t\t\t
More defness
\n\t\t
\n\t\t
Third term for good measure
\n\t\t
My definition of a boombastic jazz
\n\t
"""), ("""###. Here's a comment.\n\nh3. Hello\n\n###. And\nanother\none.\n\nGoodbye.""", """\t

Hello

\n\n\t

Goodbye.

"""), - ("""h2. A Definition list which covers the instance where a new definition list is created with a term without a definition\n\n- term :=\n- term2 := def""", """\t

A Definition list which covers the instance where a new definition list is created with a term without a definition

\n\n
\n\t
term2
\n\t
def
\n
"""), + ("""h2. A Definition list which covers the instance where a new definition list is created with a term without a definition\n\n- term :=\n- term2 := def""", """\t

A Definition list which covers the instance where a new definition list is created with a term without a definition

\n\n
\n\t
term
\n\t
term2
\n\t
def
\n
"""), ('!{height:20px;width:20px;}https://1.gravatar.com/avatar/!', '\t

'), ('& test', '\t

& test

'), @@ -254,12 +253,20 @@ # A few extra cases for HTML4 html_known_values = ( + ("pre.. The beginning\n\nbc.. This code\n\nis the last\n\nblock in the document\n", + "
The beginning
\n\n
This code\n\nis the last\n\nblock in the document
"), + ("bc.. This code\n\nis not\n\nsurrounded by anything\n", + "
This code\n\nis not\n\nsurrounded by anything
"), + ("bc.. Paragraph 1\n\nParagraph 2\n\nParagraph 3\n\np.. post-code paragraph", + "
Paragraph 1\n\nParagraph 2\n\nParagraph 3
\n\n

post-code paragraph

"), + ("bc.. Paragraph 1\n\nParagraph 2\n\nParagraph 3\n\npre.. post-code non-p block", + "
Paragraph 1\n\nParagraph 2\n\nParagraph 3
\n\n
post-code non-p block
"), ('I spoke.\nAnd none replied.', '\t

I spoke.
\nAnd none replied.

'), ('I __know__.\nI **really** __know__.', '\t

I know.
\nI really know.

'), ("I'm %{color:red}unaware%\nof most soft drinks.", '\t

I’m unaware
\nof most soft drinks.

'), ('I seriously *{color:red}blushed*\nwhen I _(big)sprouted_ that\ncorn stalk from my\n%[es]cabeza%.', - '\t

I seriously blushed
\nwhen I sprouted' - ' that
\ncorn stalk from my
\ncabeza.

'), + '\t

I seriously blushed
\nwhen I sprouted' + ' that
\ncorn stalk from my
\ncabeza.

'), ('
\n\na.gsub!( /\n
', '
\n\na.gsub!( /</, "" )\n\n
'), ('
\n\nh3. Sidebar\n\n"Hobix":http://hobix.com/\n"Ruby":http://ruby-lang.org/\n\n
\n\n' @@ -307,14 +314,211 @@ # cite attribute ('bq.:http://textism.com/ Text...', '\t
\n\t\t

Text…

\n\t
'), ('Hello ["(Mum) & dad"]', '\t

Hello [“(Mum) & dad”]

'), + # Dimensions + ( + ('[1/2] x [1/4] and (1/2)" x [1/4]" and (1/2)\' x (1/4)\'\n\n' + '(2 x 10) X (3 / 4) x (200 + 64)\n\n' + '1 x 1 = 1\n\n' + '1 x1 = 1\n\n' + '1x 1 = 1\n\n' + '1x1 = 1\n\n' + '1 X 1 = 1\n\n' + '1 X1 = 1\n\n' + '1X 1 = 1\n\n' + '1X1 = 1\n\n' + 'What is 1 x 1?\n\n' + 'What is 1x1?\n\n' + 'What is 1 X 1?\n\n' + 'What is 1X1?\n\n' + '1 x 2 x 3 = 6\n\n' + '1x2x3=6\n\n' + '1x2 x 1x3 = 6\n\n' + '2\' x 2\' = 4 sqft.\n\n' + '2\'x 2\' = 4 sqft.\n\n' + '2\' x2\' = 4 sqft.\n\n' + '2\'x2\' = 4 sqft.\n\n' + '2\' X 2\' = 4 sqft.\n\n' + '2\'X 2\' = 4 sqft.\n\n' + '2\' X2\' = 4 sqft.\n\n' + '2\'X2\' = 4 sqft.\n\n' + '2" x 2" = 4 sqin.\n\n' + '2"x 2" = 4 sqin.\n\n' + '2" x2" = 4 sqin.\n\n' + '2"x2" = 4 sqin.\n\n' + '2" X 2" = 4 sqin.\n\n' + '2"X 2" = 4 sqin.\n\n' + '2" X2" = 4 sqin.\n\n' + '2"X2" = 4in[^2^].\n\n' + 'What is 1.2 x 3.5?\n\n' + 'What is .2 x .5?\n\n' + 'What is 1.2x3.5?\n\n' + 'What is .2x.5?\n\n' + 'What is 1.2\' x3.5\'?\n\n' + 'What is .2"x .5"?\n\n' + '1 x $10.00 x -£ 1.23 x ¥20,000 x -¤120.00 x ฿1,000,000 x -€110,00\n\n'), + + ('\t

½ × ¼ and ½” × ¼” and ½’ × ¼’

\n\n' + '\t

(2 × 10) × (3 / 4) × (200 + 64)

\n\n' + '\t

1 × 1 = 1

\n\n' + '\t

1 ×1 = 1

\n\n' + '\t

1× 1 = 1

\n\n' + '\t

1×1 = 1

\n\n' + '\t

1 × 1 = 1

\n\n' + '\t

1 ×1 = 1

\n\n' + '\t

1× 1 = 1

\n\n' + '\t

1×1 = 1

\n\n' + '\t

What is 1 × 1?

\n\n' + '\t

What is 1×1?

\n\n' + '\t

What is 1 × 1?

\n\n' + '\t

What is 1×1?

\n\n' + '\t

1 × 2 × 3 = 6

\n\n' + '\t

1×2×3=6

\n\n' + '\t

1×2 × 1×3 = 6

\n\n' + '\t

2’ × 2’ = 4 sqft.

\n\n' + '\t

2’× 2’ = 4 sqft.

\n\n' + '\t

2’ ×2’ = 4 sqft.

\n\n' + '\t

2’×2’ = 4 sqft.

\n\n' + '\t

2’ × 2’ = 4 sqft.

\n\n' + '\t

2’× 2’ = 4 sqft.

\n\n' + '\t

2’ ×2’ = 4 sqft.

\n\n' + '\t

2’×2’ = 4 sqft.

\n\n' + '\t

2” × 2” = 4 sqin.

\n\n' + '\t

2”× 2” = 4 sqin.

\n\n' + '\t

2” ×2” = 4 sqin.

\n\n' + '\t

2”×2” = 4 sqin.

\n\n' + '\t

2” × 2” = 4 sqin.

\n\n' + '\t

2”× 2” = 4 sqin.

\n\n' + '\t

2” ×2” = 4 sqin.

\n\n' + '\t

2”×2” = 4in2.

\n\n' + '\t

What is 1.2 × 3.5?

\n\n' + '\t

What is .2 × .5?

\n\n' + '\t

What is 1.2×3.5?

\n\n' + '\t

What is .2×.5?

\n\n' + '\t

What is 1.2’ ×3.5’?

\n\n' + '\t

What is .2”× .5”?

\n\n' + '\t

1 × $10.00 × -£ 1.23 × ¥20,000 × -¤120.00 × ฿1,000,000 × -€110,00

') + ), + # Empty note lists + ('There should be nothing below.\n\nnotelist.', '\t

There should be nothing below.

\n\n\t'), + # Empty things + (('\'\'\n\n""\n\n%%\n\n^^\n\n&&\n\n**\n\n__\n\n--\n\n++\n\n~~\n\n{}\n\n' + '[]\n\n()\n\n<>\n\n\\\\\n\n//\n\n??\n\n==\n\n@@\n\n##\n\n$$\n\n!!\n\n' + '::\n\n;;\n\n..\n\n,,\n\n||\n\n` `\n\n\' \'\n\n" "\n\n% %\n\n^ ^\n\n' + '& &\n\n* *\n\n_ _\n\n- -\n\n+ +\n\n~ ~\n\n{ }\n\n[ ]\n\n( )\n\n< >\n\n' + '\\ \\\n\n/ /\n\n? ?\n\n= =\n\n@ @\n\n# #\n\n$ $\n\n! !\n\n: :\n\n; ;\n\n' + '. .\n\n, ,'), + ("\t

‘’

\n\n\t

“”

\n\n\t

%%

\n\n\t

^^

\n\n\t" + "

&&

\n\n\t

**

\n\n\t

__

\n\n\t

\n\n\t

++

\n\n\t" + "

~~

\n\n\t

{}

\n\n\t

[]

\n\n\t

()

\n\n\t

<>

\n\n\t

\\\\

\n\n\t" + "

//

\n\n\t

??

\n\n\t

==

\n\n\t

\n\n\t

##

\n\n\t

$$

\n\n\t" + "

!!

\n\n\t

::

\n\n\t

;;

\n\n\t

..

\n\n\t

,,

\n\n\t" + "\n\t\t\n\t\t\t\n\t\t\n\t
\n\n\t

` `

\n\n\t

‘ ‘

\n\n\t" + "

“ “

\n\n\t

% %

\n\n\t

^ ^

\n\n\t

& &

\n\n\t" + "
    \n\t\t
  • *
  • \n\t
\n\n\t

_ _

\n\n\t

- -

\n\n\t

+ +

\n\n\t

~ ~

\n\n\t" + "

{ }

\n\n\t

[ ]

\n\n\t

( )

\n\n\t

< >

\n\n\t

\\ \\

\n\n\t" + "

/ /

\n\n\t

? ?

\n\n\t

= =

\n\n\t

\n\n\t
    \n\t\t
  1. #
  2. \n\t
\n\n\t" + "

$ $

\n\n\t

! !

\n\n\t
\n\t\t
:
\n\t
\n\n\t
\n\t\t
;
\n\t
\n\n\t" + "

. .

\n\n\t

, ,

")), + # A lone standing comment must be preserved as is: + # withouth wrapping it into a paragraph + (('An ordinary block.\n\n' + '\n'), + '\t

An ordinary block.

\n\n'), + # Headers must be "breakable", just like paragraphs. + ('h1. Two line with *strong*\nheading\n', + '\t

Two line with strong
\nheading

'), + # Non-standalone ampersands should not be escaped + (("“test”\n\n" + "“test”\n\n" + " test \n"), + ("\t

test

\n\n" + "\t

test

\n\n" + "\t

 test 

")), + # Nested and mixed multi-level ordered and unordered lists + (("* bullet\n" + "*# number\n" + "*# number\n" + "*#* bullet\n" + "*# number\n" + "*# number with\n" + "a break\n" + "* bullet\n" + "** okay"), + ("\t
    \n" + "\t\t
  • bullet\n" + "\t\t
      \n" + "\t\t\t
    1. number
    2. \n" + "\t\t\t
    3. number\n" + "\t\t\t
        \n" + "\t\t\t\t
      • bullet
      • \n" + "\t\t\t
    4. \n" + "\t\t\t
    5. number
    6. \n" + "\t\t\t
    7. number with
      \n" + "a break
    8. \n" + "\t\t
  • \n" + "\t\t
  • bullet\n" + "\t\t
      \n" + "\t\t\t
    • okay
    • \n" + "\t\t
  • \n" + "\t\t
")), + # Checks proper insertion of
within table cells + (("|-(cold) milk :=\n" + "Nourishing beverage for baby cows. =:\n" + "|"), + ("\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\n\t
\n" + "\t
milk
\n" + "\t

Nourishing beverage for baby cows.

\n" + "
")), + # Long non-textile blocks + ("notextile.. *a very*\n\n*long*\n\n*block*\n", "*a very*\n\n*long*\n\n*block*"), + # Correct use of ‘ and ’ + ("Here is a %(example)'spanned'% word.", + '\t

Here is a ‘spanned’ word.

'), + # Using $-links with link aliases + ("\"$\":test\n[test]https://textpattern.com/start\n", + "\t

textpattern.com/start

"), + ('Please check on "$":test for any updates.\n[test]https://de.wikipedia.org/wiki/Übermensch', + '\t

Please check on de.wikipedia.org/wiki/Übermensch for any updates.

'), + # Make sure smileys don't get recognised as a definition list. + (":(\n\n:)\n\n:( \n:( \n:( \n:) \n\nPinocchio!\n:^)\n\nBaboon!\n:=)\n\nWink!\n;)\n\n:[ \n:]\n\n;(\nsomething\ndark side\n:) \n\n;(c)[de] Item", + '\t

:(

\n\n\t

:)

\n\n\t

:(
\n:(
\n:(
\n:)

\n\n\t

Pinocchio!
\n:^)

\n\n\t

Baboon!
\n:=)

\n\n\t

Wink!
\n;)

\n\n\t

:[
\n:]

\n\n\t

;(
\nsomething
\ndark side
\n:)

\n\n\t
\n\t\t
Item
\n\t
'), + # Checking proper parsing of classes and IDs + ("_(class1 class2#id1)text1_ -(foobarbaz#boom bang)text2-\n", + '\t

text1 text2

'), + # Tables with nested textile elements + ("|!http://tester.local/logo.png!| !http://tester.local/logo.png! |", + '\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t
'), + # Tables with colgroups + (("|=. Testing colgroup and col syntax | \n" + "|:\\5. 80 |\x20\n" + "|a|b|c|d|e|\x20\n"), + ('\t\n\t\n' + '\t\n\t\n' + '\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n' + '\t
Testing colgroup and col syntax
abcde
')), + # Table column with an emphasis should not be confused with a heading + ('|_touch_ this!| _touch_ this! |', + '\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t
touch this! touch this!
'), + # Table with colgroup but no caption + (("|:\\5. 80 |\x20\n" + "|a|b|c|d|e|\x20\n"), + ('\t\n' + '\t\n\t\n' + '\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n' + '\t
abcde
')), ) + @pytest.mark.parametrize("input, expected_output", xhtml_known_values) def test_KnownValuesXHTML(input, expected_output): # XHTML output = textile.textile(input, html_type='xhtml') assert output == expected_output + @pytest.mark.parametrize("input, expected_output", html_known_values) def test_KnownValuesHTML(input, expected_output): # HTML5 diff --git a/textile/__init__.py b/textile/__init__.py index bb7829f7..16418739 100644 --- a/textile/__init__.py +++ b/textile/__init__.py @@ -1,9 +1,4 @@ -from __future__ import unicode_literals - -import sys -import warnings - -from .core import textile, textile_restricted, Textile +from .core import textile, textile_restricted, Textile # noqa: F401 from .version import VERSION __all__ = ['textile', 'textile_restricted'] diff --git a/textile/__main__.py b/textile/__main__.py index 18459610..210c147d 100644 --- a/textile/__main__.py +++ b/textile/__main__.py @@ -33,5 +33,5 @@ def main(): outfile.write(output) -if __name__ == '__main__': #pragma: no cover +if __name__ == '__main__': # pragma: no cover main() diff --git a/textile/core.py b/textile/core.py index 7b66af02..4a2594f8 100644 --- a/textile/core.py +++ b/textile/core.py @@ -1,6 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - __copyright__ = """ Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ Copyright (c) 2010, Kurt Raschke @@ -20,13 +18,14 @@ import uuid from urllib.parse import urlparse, urlsplit, urlunsplit, quote, unquote from collections import OrderedDict +from nh3 import clean -from textile.tools import sanitizer, imagesize from textile.regex_strings import (align_re_s, cls_re_s, pnct_re_s, - regex_snippets, syms_re_s, table_span_re_s) + regex_snippets, syms_re_s, table_span_re_s) from textile.utils import (decode_high, encode_high, encode_html, generate_tag, - has_raw_text, is_rel_url, is_valid_url, list_type, normalize_newlines, - parse_attributes, pba) + getimagesize, has_raw_text, human_readable_url, + is_rel_url, is_valid_url, list_type, + normalize_newlines, parse_attributes, pba) from textile.objects import Block, Table try: @@ -35,43 +34,149 @@ import re +def make_glyph_replacers(html_type, uid, glyph_defs): + """ + Generates a list of "replacers" (each is a pair consiting of + a regular expression and a replacing pattern) that, + when applied sequentially, replace some characters of the original + text with their HTML codes to produce valid HTML. + """ + cur = ( + r'(?:[{0}]{1}*)?'.format(regex_snippets['cur'], regex_snippets['space']) + if regex_snippets['cur'] + else r'') + pre_result = [ + # dimension sign (before apostrophes/quotes are replaced) + (re.compile( + r'([0-9]+[\])]?[\'"]? ?)[x]( ?[\[(]?)' + r'(?=[+-]?{0}[0-9]*\.?[0-9]+)'.format(cur), + flags=re.I | re.U), + r'\1{dimension}\2'), + # apostrophe's + (re.compile( + r"({0}|\))'({0})" + .format(regex_snippets['wrd']), + flags=re.U), + r'\1{apostrophe}\2'), + # back in '88 + (re.compile( + r"({0})'(\d+{1}?)\b(?![.]?[{1}]*?')".format( + regex_snippets['space'], regex_snippets['wrd']), + flags=re.U), + r'\1{apostrophe}\2'), + # single opening following an open bracket. + (re.compile(r"([([{])'(?=\S)", flags=re.U), + r'\1{quote_single_open}'), + # single closing + (re.compile( + r"(\S)'(?={0}|{1}|<|$)".format(regex_snippets['space'], pnct_re_s), + flags=re.U), + r'\1{quote_single_close}'), + # single opening + (re.compile(r"'", re.U), r'{quote_single_open}'), + # double opening following an open bracket. Allows things like + # Hello ["(Mum) & dad"] + (re.compile(r'([([{])"(?=\S)', flags=re.U), + r'\1{quote_double_open}'), + # double closing + (re.compile( + r'(\S)"(?={0}|{1}|<|$)'.format(regex_snippets['space'], pnct_re_s), + flags=re.U), + r'\1{quote_double_close}'), + # double opening + (re.compile(r'"'), r'{quote_double_open}'), + # ellipsis + (re.compile(r'([^.]?)\.{3}'), r'\1{ellipsis}'), + # em dash + (re.compile(r'(\s?)--(\s?)'), r'\1{emdash}\2'), + # en dash + (re.compile(r' - '), r' {endash} '), + # trademark + (re.compile( + r'(\b ?|{0}|^)[([]TM[])]'.format(regex_snippets['space']), + flags=re.I | re.U), + r'\1{trademark}'), + # registered + (re.compile( + r'(\b ?|{0}|^)[([]R[])]'.format(regex_snippets['space']), + flags=re.I | re.U), + r'\1{registered}'), + # copyright + (re.compile( + r'(\b ?|{0}|^)[([]C[])]'.format(regex_snippets['space']), + flags=re.I | re.U), + r'\1{copyright}'), + # 1/2 + (re.compile(r'[([]1\/2[])]'), r'{half}'), + # 1/4 + (re.compile(r'[([]1\/4[])]'), r'{quarter}'), + # 3/4 + (re.compile(r'[([]3\/4[])]'), r'{threequarters}'), + # degrees + (re.compile(r'[([]o[])]'), r'{degrees}'), + # plus/minus + (re.compile(r'[([]\+\/-[])]'), r'{plusminus}'), + # 3+ uppercase acronym + (re.compile( + r'\b([{0}][{1}]{{2,}})\b(?:[(]([^)]*)[)])' + .format(regex_snippets['abr'], regex_snippets['acr']), + flags=re.U), + (r'\1' if html_type == 'html5' + else r'\1')), + # 3+ uppercase + (re.compile( + r'({space}|^|[>(;-])([{abr}]{{3,}})([{nab}]*)' + '(?={space}|{pnct}|<|$)(?=[^">]*?(<|$))' + .format(space=regex_snippets['space'], + abr=regex_snippets['abr'], + nab=regex_snippets['nab'], + pnct=pnct_re_s), + re.U), + r'\1{0}:glyph:\2\3'.format(uid)), + ] + return [(regex_obj, replacement.format(**glyph_defs)) + for (regex_obj, replacement) in pre_result] + + class Textile(object): restricted_url_schemes = ('http', 'https', 'ftp', 'mailto') - unrestricted_url_schemes = restricted_url_schemes + ('file', 'tel', - 'callto', 'sftp', 'data') + unrestricted_url_schemes = restricted_url_schemes + ( + 'file', 'tel', 'callto', 'sftp', 'data') btag = ('bq', 'bc', 'notextile', 'pre', 'h[1-6]', r'fn\d+', 'p', '###') btag_lite = ('bq', 'bc', 'p') note_index = 1 - doctype_whitelist = ['xhtml', 'html5'] - glyph_definitions = { - 'quote_single_open': '‘', - 'quote_single_close': '’', - 'quote_double_open': '“', - 'quote_double_close': '”', - 'apostrophe': '’', - 'prime': '′', - 'prime_double': '″', - 'ellipsis': '…', - 'ampersand': '&', - 'emdash': '—', - 'endash': '–', - 'dimension': '×', - 'trademark': '™', - 'registered': '®', - 'copyright': '©', - 'half': '½', - 'quarter': '¼', - 'threequarters': '¾', - 'degrees': '°', - 'plusminus': '±', + 'quote_single_open': '‘', # noqa: E241 + 'quote_single_close': '’', # noqa: E241 + 'quote_double_open': '“', # noqa: E241 + 'quote_double_close': '”', # noqa: E241 + 'apostrophe': '’', # noqa: E241 + 'prime': '′', # noqa: E241 + 'prime_double': '″', # noqa: E241 + 'ellipsis': '…', # noqa: E241 + 'ampersand': '&', # noqa: E241 + 'emdash': '—', # noqa: E241 + 'endash': '–', # noqa: E241 + 'dimension': '×', # noqa: E241 + 'trademark': '™', # noqa: E241 + 'registered': '®', # noqa: E241 + 'copyright': '©', # noqa: E241 + 'half': '½', # noqa: E241 + 'quarter': '¼', # noqa: E241 + 'threequarters': '¾', # noqa: E241 + 'degrees': '°', # noqa: E241 + 'plusminus': '±', # noqa: E241 } + spanWrappers = ( + ('[', ']'), + ) + def __init__(self, restricted=False, lite=False, noimage=False, - get_sizes=False, html_type='xhtml', rel='', block_tags=True): + get_sizes=False, html_type='xhtml', rel='', block_tags=True): """Textile properties that are common to regular textile and textile_restricted""" self.restricted = restricted @@ -93,119 +198,8 @@ def __init__(self, restricted=False, lite=False, noimage=False, self.refIndex = 0 self.block_tags = block_tags - cur = r'' - if regex_snippets['cur']: # pragma: no branch - cur = r'(?:[{0}]{1}*)?'.format(regex_snippets['cur'], - regex_snippets['space']) - - # We'll be searching for characters that need to be HTML-encoded to - # produce properly valid html. These are the defaults that work in - # most cases. Below, we'll copy this and modify the necessary pieces - # to make it work for characters at the beginning of the string. - self.glyph_search = [ - # apostrophe's - re.compile(r"(^|{0}|\))'({0})".format(regex_snippets['wrd']), - flags=re.U), - # back in '88 - re.compile(r"({0})'(\d+{1}?)\b(?![.]?[{1}]*?')".format( - regex_snippets['space'], regex_snippets['wrd']), - flags=re.U), - # single opening following an open bracket. - re.compile(r"([([{])'(?=\S)", flags=re.U), - # single closing - re.compile(r"(^|\S)'(?={0}|{1}|<|$)".format( - regex_snippets['space'], pnct_re_s), flags=re.U), - # single opening - re.compile(r"'", re.U), - # double opening following an open bracket. Allows things like - # Hello ["(Mum) & dad"] - re.compile(r'([([{])"(?=\S)', flags=re.U), - # double closing - re.compile(r'(^|\S)"(?={0}|{1}|<|$)'.format( - regex_snippets['space'], pnct_re_s), re.U), - # double opening - re.compile(r'"'), - # ellipsis - re.compile(r'([^.]?)\.{3}'), - # ampersand - re.compile(r'(\s?)&(\s)', re.U), - # em dash - re.compile(r'(\s?)--(\s?)'), - # en dash - re.compile(r' - '), - # dimension sign - re.compile(r'([0-9]+[\])]?[\'"]? ?)[x]( ?[\[(]?)' - r'(?=[+-]?{0}[0-9]*\.?[0-9]+)'.format(cur), flags=re.I | re.U), - # trademark - re.compile(r'(\b ?|{0}|^)[([]TM[])]'.format(regex_snippets['space'] - ), flags=re.I | re.U), - # registered - re.compile(r'(\b ?|{0}|^)[([]R[])]'.format(regex_snippets['space'] - ), flags=re.I | re.U), - # copyright - re.compile(r'(\b ?|{0}|^)[([]C[])]'.format(regex_snippets['space'] - ), flags=re.I | re.U), - # 1/2 - re.compile(r'[([]1\/2[])]'), - # 1/4 - re.compile(r'[([]1\/4[])]'), - # 3/4 - re.compile(r'[([]3\/4[])]'), - # degrees - re.compile(r'[([]o[])]'), - # plus/minus - re.compile(r'[([]\+\/-[])]'), - # 3+ uppercase acronym - re.compile(r'\b([{0}][{1}]{{2,}})\b(?:[(]([^)]*)[)])'.format( - regex_snippets['abr'], regex_snippets['acr']), flags=re.U), - # 3+ uppercase - re.compile(r'({space}|^|[>(;-])([{abr}]{{3,}})([{nab}]*)' - '(?={space}|{pnct}|<|$)(?=[^">]*?(<|$))'.format(**{ 'space': - regex_snippets['space'], 'abr': regex_snippets['abr'], - 'nab': regex_snippets['nab'], 'pnct': pnct_re_s}), re.U), - ] - # These are the changes that need to be made for characters that occur - # at the beginning of the string. - self.glyph_search_initial = list(self.glyph_search) - # apostrophe's - self.glyph_search_initial[0] = re.compile(r"({0}|\))'({0})".format( - regex_snippets['wrd']), flags=re.U) - # single closing - self.glyph_search_initial[3] = re.compile(r"(\S)'(?={0}|{1}|$)".format( - regex_snippets['space'], pnct_re_s), re.U) - # double closing - self.glyph_search_initial[6] = re.compile(r'(\S)"(?={0}|{1}|<|$)'.format( - regex_snippets['space'], pnct_re_s), re.U) - - self.glyph_replace = [x.format(**self.glyph_definitions) for x in ( - r'\1{apostrophe}\2', # apostrophe's - r'\1{apostrophe}\2', # back in '88 - r'\1{quote_single_open}', # single opening after bracket - r'\1{quote_single_close}', # single closing - r'{quote_single_open}', # single opening - r'\1{quote_double_open}', # double opening after bracket - r'\1{quote_double_close}', # double closing - r'{quote_double_open}', # double opening - r'\1{ellipsis}', # ellipsis - r'\1{ampersand}\2', # ampersand - r'\1{emdash}\2', # em dash - r' {endash} ', # en dash - r'\1{dimension}\2', # dimension sign - r'\1{trademark}', # trademark - r'\1{registered}', # registered - r'\1{copyright}', # copyright - r'{half}', # 1/2 - r'{quarter}', # 1/4 - r'{threequarters}', # 3/4 - r'{degrees}', # degrees - r'{plusminus}', # plus/minus - r'\1', # 3+ uppercase acronym - r'\1{0}:glyph:\2' # 3+ uppercase - r'\3'.format(self.uid), - )] - - if self.html_type == 'html5': - self.glyph_replace[21] = r'\1' + self.glyph_replacers = make_glyph_replacers( + html_type, self.uid, self.glyph_definitions) if self.restricted is True: self.url_schemes = self.restricted_url_schemes @@ -238,12 +232,12 @@ def parse(self, text, rel=None, sanitize=False): if self.block_tags: if self.lite: - self.blocktag_whitelist = ['bq', 'p'] + self.blocktag_allowlist = set(['bq', 'p', 'br']) text = self.block(text) else: - self.blocktag_whitelist = [ 'bq', 'p', 'bc', 'notextile', - 'pre', 'h[1-6]', - 'fn{0}+'.format(regex_snippets['digit']), '###'] + self.blocktag_allowlist = set(['bq', 'p', 'br', 'bc', 'notextile', + 'pre', 'h[1-6]', + f"fn{regex_snippets['digit']}+", '###']) text = self.block(text) text = self.placeNoteLists(text) else: @@ -265,8 +259,9 @@ def parse(self, text, rel=None, sanitize=False): text = text.replace('{0}:glyph:'.format(self.uid), '') if sanitize: - text = sanitizer.sanitize(text) + text = clean(text, tags=self.blocktag_allowlist) + text = self.retrieveTags(text) text = self.retrieveURLs(text) # if the text contains a break tag (
or
) not followed by @@ -280,9 +275,10 @@ def parse(self, text, rel=None, sanitize=False): def table(self, text): text = "{0}\n\n".format(text) pattern = re.compile(r'^(?:table(?P_?{s}{a}{c})\.' - r'(?P.*?)\n)?^(?P{a}{c}\.? ?\|.*\|)' - r'[\s]*\n\n'.format(**{'s': table_span_re_s, 'a': align_re_s, - 'c': cls_re_s}), flags=re.S | re.M | re.U) + r'(?P.*?)\n)?^(?P{a}{c}\.? ?\|.*\|)' + r'[\s]*\n\n'.format( + **{'s': table_span_re_s, 'a': align_re_s, + 'c': cls_re_s}), flags=re.S | re.M | re.U) match = pattern.search(text) if match: table = Table(self, **match.groupdict()) @@ -291,7 +287,7 @@ def table(self, text): def textileLists(self, text): pattern = re.compile(r'^((?:[*;:]+|[*;:#]*#(?:_|\d+)?){0}[ .].*)$' - r'(?![^#*;:])'.format(cls_re_s), re.U | re.M | re.S) + r'(?![^#*;:])'.format(cls_re_s), re.U | re.M | re.S) return pattern.sub(self.fTextileList, text) def fTextileList(self, match): @@ -306,7 +302,7 @@ def fTextileList(self, match): nextline = '' m = re.search(r"^(?P[#*;:]+)(?P_|\d+)?(?P{0})[ .]" - "(?P.*)$".format(cls_re_s), line, re.S) + "(?P.*)$".format(cls_re_s), line, re.S) if m: tl, start, atts, content = m.groups() content = content.strip() @@ -354,7 +350,7 @@ def fTextileList(self, match): self.olstarts[tl] = 1 nm = re.match(r"^(?P[#\*;:]+)(_|[\d]+)?{0}" - r"[ .].*".format(cls_re_s), nextline) + r"[ .].*".format(cls_re_s), nextline) if nm: nl = nm.group('nextlistitem') @@ -374,7 +370,7 @@ def fTextileList(self, match): if tl not in ls: ls[tl] = 1 itemtag = ("\n{0}\t<{1}>{2}".format(tabs, litem, content) if - showitem else '') + showitem else '') line = "<{0}l{1}{2}>{3}".format(ltype, atts, start, itemtag) else: line = ("\t<{0}{1}>{2}".format(litem, atts, content) if @@ -387,18 +383,13 @@ def fTextileList(self, match): for k, v in reversed(list(ls.items())): if len(k) > len(nl): if v != 2: - line = "{0}\n{1}".format(line, tabs, - list_type(k)) + line = "{0}\n{1}".format( + line, tabs, list_type(k)) if len(k) > 1 and v != 2: line = "{0}".format(line, litem) del ls[k] # Remember the current Textile tag: pt = tl - # This else exists in the original php version. I'm not sure how - # to come up with a case where the line would not match. I think - # it may have been necessary due to the way php returns matches. - # else: - #line = "{0}\n".format(line) result.append(line) return self.doTagBr(litem, "\n".join(result)) @@ -407,14 +398,28 @@ def doTagBr(self, tag, input): re.S).sub(self.doBr, input) def doPBr(self, in_): - return re.compile(r'<(p)([^>]*?)>(.*)()', re.S).sub(self.doBr, - in_) + return (re + .compile(r'<(p|h[1-6])([^>]*?)>(.*)()', re.S) + .sub(self.fPBr, in_)) + + def fPBr(self, m): + content = m.group(3) + content = ( + re.compile(r"{0}*\n(?![{0}|])".format(regex_snippets['space']), + re.I) + .sub("\n", content)) + content = re.compile(r"\n(?![\s|])").sub('
', content) + return '<{0}{1}>{2}{3}'.format(m.group(1), m.group(2), content, m.group(4)) def doBr(self, match): - content = re.sub(r'(.+)(?:(?)|(?))\n(?![#*;:\s|])', - r'\1
', match.group(3)) + content = ( + re.compile( + r'(.+)(?!(?<=|||
)' + r'|(?<=
)|(?<=
))\n(?![#*;:\s|])', + re.I) + .sub(r'\1
', match.group(3))) return '<{0}{1}>{2}{3}'.format(match.group(1), match.group(2), content, - match.group(4)) + match.group(4)) def block(self, text): if not self.lite: @@ -450,8 +455,8 @@ def block(self, text): eat_whitespace = False pattern = (r'^(?P{0})(?P{1}{2})\.(?P\.?)' - r'(?::(?P\S+))? (?P.*)$'.format(tre, - align_re_s, cls_re_s)) + r'(?::(?P\S+))? (?P.*)$'.format( + tre, align_re_s, cls_re_s)) match = re.search(pattern, line, flags=re.S | re.U) # tag specified on this line. if match: @@ -467,15 +472,17 @@ def block(self, text): content = out[-2] if not multiline_para: - content = generate_tag(block.inner_tag, content, - block.inner_atts) - content = generate_tag(block.outer_tag, content, - block.outer_atts) + # block will have been defined in a previous run of the + # loop + content = generate_tag(block.inner_tag, content, # noqa: F821 + block.inner_atts) # noqa: F821 + content = generate_tag(block.outer_tag, content, # noqa: F821 + block.outer_atts) # noqa: F821 out[-2] = content tag, atts, ext, cite, content = match.groups() block = Block(self, **match.groupdict()) inner_block = generate_tag(block.inner_tag, block.content, - block.inner_atts) + block.inner_atts) # code tags and raw text won't be indented inside outer_tag. if block.inner_tag != 'code' and not has_raw_text(inner_block): inner_block = "\n\t\t{0}\n\t".format(inner_block) @@ -483,7 +490,7 @@ def block(self, text): line = block.content else: line = generate_tag(block.outer_tag, inner_block, - block.outer_atts) + block.outer_atts) # pre tags and raw text won't be indented. if block.outer_tag != 'pre' and not has_raw_text(line): line = "\t{0}".format(line) @@ -515,10 +522,10 @@ def block(self, text): line = block.content else: line = generate_tag(block.outer_tag, block.content, - block.outer_atts) + block.outer_atts) line = "\t{0}".format(line) else: - if block.tag == 'pre' or block.inner_tag == 'code': + if block.tag in ('pre', 'notextile') or block.inner_tag == 'code': line = self.shelve(encode_html(line, quotes=True)) else: line = self.graf(line) @@ -559,24 +566,24 @@ def block(self, text): # at this point, we've gone through all the lines. if there's still an # extension in effect, we close it here if ext and out and not block.tag == 'p': - block.content = out.pop() - block.process() - final = generate_tag(block.outer_tag, block.content, - block.outer_atts) - out.append(final) + content = out.pop() + content = generate_tag(block.inner_tag, content, block.inner_atts) + content = generate_tag(block.outer_tag, content, block.outer_atts) + out.append(content) return ''.join(out) def footnoteRef(self, text): # somehow php-textile gets away with not capturing the space. return re.compile(r'(?<=\S)\[(?P{0}+)(?P!?)\]' - r'(?P{1}?)'.format(regex_snippets['digit'], - regex_snippets['space']), re.U).sub(self.footnoteID, text) + r'(?P{1}?)'.format( + regex_snippets['digit'], regex_snippets['space']), + re.U).sub(self.footnoteID, text) def footnoteID(self, m): fn_att = OrderedDict({'class': 'footnote'}) if m.group('id') not in self.fn: - self.fn[m.group('id')] = '{0}{1}'.format(self.linkPrefix, - self._increment_link_index()) + self.fn[m.group('id')] = '{0}{1}'.format( + self.linkPrefix, self._increment_link_index()) fnid = self.fn[m.group('id')] fn_att['id'] = 'fnrev{0}'.format(fnid) fnid = self.fn[m.group('id')] @@ -602,21 +609,31 @@ def glyphs(self, text): single quote. If it's the first character of one of those splits, it's an apostrophe or closed single quote, but the regex will bear that out. A similar situation occurs for double quotes as well. - So, for the first pass, we use the glyph_search_initial set of - regexes. For all remaining passes, we use glyph_search + So, for the first pass, we use a set of regexes from + the initial_glyph_replacers. For all remaining passes, + we use glyph_replacers """ text = text.rstrip('\n') result = [] - searchlist = self.glyph_search_initial + standalone_amp_re = re.compile( + r"&(?!#[0-9]+;|#x[a-f0-9]+;|[a-z][a-z0-9]*;)", + flags=re.I) + html_amp_symbol = self.glyph_definitions['ampersand'] # split the text by any angle-bracketed tags - for i, line in enumerate(re.compile(r'(<[\w\/!?].*?>)', re.U).split( - text)): - if not i % 2: - for s, r in zip(searchlist, self.glyph_replace): + lines = re.compile(r'(<[\w/!?].*?>)', re.U | re.S).split(text) + for i, line in enumerate(lines): + if i % 2 == 0: + if not self.restricted: + # Raw < > & chars have already been encoded + # when in restricted mode + line = ( + standalone_amp_re + .sub(html_amp_symbol, line) + .replace('<', '<') + .replace('>', '>')) + for s, r in self.glyph_replacers: line = s.sub(r, line) result.append(line) - if i == 0: - searchlist = self.glyph_search return ''.join(result) def getRefs(self, text): @@ -719,7 +736,7 @@ def markStartOfLinks(self, text): linkparts = [] i = 0 - while balanced != 0 or i == 0: # pragma: no branch + while balanced != 0 or i == 0: # pragma: no branch # Starting at the end, pop off the previous part of the # slice's fragments. @@ -728,9 +745,9 @@ def markStartOfLinks(self, text): if len(possibility) > 0: # did this part inc or dec the balanced count? - if re.search(r'^\S|=$', possibility, flags=re.U): # pragma: no branch + if re.search(r'^\S|=$', possibility, flags=re.U): # pragma: no branch balanced = balanced - 1 - if re.search(r'\S$', possibility, flags=re.U): # pragma: no branch + if re.search(r'\S$', possibility, flags=re.U): # pragma: no branch balanced = balanced + 1 try: possibility = possible_start_quotes.pop() @@ -750,7 +767,7 @@ def markStartOfLinks(self, text): try: possibility = possible_start_quotes.pop() - except IndexError: # pragma: no cover + except IndexError: # pragma: no cover # If out of possible starting segments we back the # last one from the linkparts array linkparts.pop() @@ -759,7 +776,7 @@ def markStartOfLinks(self, text): # we have a closing ". if (possibility == '' or possibility.endswith(' ')): # force search exit - balanced = 0; + balanced = 0 if balanced <= 0: possible_start_quotes.append(possibility) @@ -775,7 +792,7 @@ def markStartOfLinks(self, text): # Re-assemble the link starts with a specific marker for the # next regex. o = '{0}{1}linkStartMarker:"{2}'.format(pre_link, self.uid, - link_content) + link_content) output.append(o) # Add the last part back @@ -817,14 +834,14 @@ def fLink(self, m): ) # end of $text (?:\((?P[^)]+?)\))? # $title (if any) $'''.format(cls_re_s, regex_snippets['space']), inner, - flags=re.X | re.U) + flags=re.X | re.U) atts = (m and m.group('atts')) or '' text = (m and m.group('text')) or inner title = (m and m.group('title')) or '' pop, tight = '', '' - counts = { '[': None, ']': url.count(']'), '(': None, ')': None } + counts = {'[': None, ']': url.count(']'), '(': None, ')': None} # Look for footnotes or other square-bracket delimited stuff at the end # of the url... @@ -891,13 +908,13 @@ def _closingsquarebracket(c, pop, popped, url_chars, counts, pre): # it popped = True url_chars.pop() - counts[']'] = counts[']'] - 1; - if first: # pragma: no branch + counts[']'] = counts[']'] - 1 + if first: # pragma: no branch pre = '' return pop, popped, url_chars, counts, pre def _closingparenthesis(c, pop, popped, url_chars, counts, pre): - if counts[')'] is None: # pragma: no branch + if counts[')'] is None: # pragma: no branch counts['('] = url.count('(') counts[')'] = url.count(')') @@ -912,20 +929,20 @@ def _casesdefault(c, pop, popped, url_chars, counts, pre): return pop, popped, url_chars, counts, pre cases = { - '!': _endchar, - '?': _endchar, - ':': _endchar, - ';': _endchar, - '.': _endchar, - ',': _endchar, - '>': _rightanglebracket, - ']': _closingsquarebracket, - ')': _closingparenthesis, - } - for c in url_chars[-1::-1]: # pragma: no branch + '!': _endchar, + '?': _endchar, + ':': _endchar, + ';': _endchar, + '.': _endchar, + ',': _endchar, + '>': _rightanglebracket, + ']': _closingsquarebracket, + ')': _closingparenthesis, + } + for c in url_chars[-1::-1]: # pragma: no branch popped = False - pop, popped, url_chars, counts, pre = cases.get(c, - _casesdefault)(c, pop, popped, url_chars, counts, pre) + pop, popped, url_chars, counts, pre = cases.get( + c, _casesdefault)(c, pop, popped, url_chars, counts, pre) first = False if popped is False: break @@ -939,16 +956,19 @@ def _casesdefault(c, pop, popped, url_chars, counts, pre): return in_.replace('{0}linkStartMarker:'.format(self.uid), '') if text == '$': - text = url - if "://" in text: - text = text.split("://")[1] - elif ":" in text: - text = text.split(":")[1] + if valid_scheme: + text = human_readable_url(url) + else: + ref_url = self.urlrefs.get(url) + if ref_url is not None: + text = human_readable_url(ref_url) + else: + text = url text = text.strip() title = encode_html(title) - if not self.noimage: # pragma: no branch + if not self.noimage: # pragma: no branch text = self.image(text) text = self.span(text) text = self.glyphs(text) @@ -989,14 +1009,14 @@ def encode_url(self, url): """, re.X | re.U) netloc_parsed = netloc_pattern.match(parsed.netloc).groupdict() else: - netloc_parsed = {'user': '', 'password': '', 'host': '', 'port': - ''} + netloc_parsed = {'user': '', 'password': '', 'host': '', 'port': ''} # encode each component scheme = parsed.scheme user = netloc_parsed['user'] and quote(netloc_parsed['user']) - password = (netloc_parsed['password'] and - quote(netloc_parsed['password'])) + password = ( + netloc_parsed['password'] and quote(netloc_parsed['password']) + ) host = netloc_parsed['host'] port = netloc_parsed['port'] and netloc_parsed['port'] # the below splits the path portion of the url by slashes, translates @@ -1006,7 +1026,7 @@ def encode_url(self, url): # because the quote and unquote functions expects different input # types: unicode strings for PY2 and str for PY3. path_parts = (quote(unquote(pce), b'') for pce in - parsed.path.split('/')) + parsed.path.split('/')) path = '/'.join(path_parts) # put it back together @@ -1039,26 +1059,36 @@ def span(self, text): (?P<end>[{pnct}]*) {tag} (?P<tail>$|[\[\]}}<]|(?=[{pnct}]{{1,2}}[^0-9]|\s|\))) - """.format(**{'tag': tag, 'cls': cls_re_s, 'pnct': pnct, - 'space': regex_snippets['space']}), flags=re.X | re.U) + """.format( + **{'tag': tag, 'cls': cls_re_s, 'pnct': pnct, 'space': + regex_snippets['space']} + ), flags=re.X | re.U) text = pattern.sub(self.fSpan, text) self.span_depth = self.span_depth - 1 return text + def getSpecialOptions(self, pre, tail): + for before, after in self.spanWrappers: + if pre == before and tail == after: + pre = tail = '' + break + return (pre, tail) + def fSpan(self, match): pre, tag, atts, cite, content, end, tail = match.groups() + pre, tail = self.getSpecialOptions(pre, tail) qtags = { - '*': 'strong', - '**': 'b', - '??': 'cite', - '_': 'em', - '__': 'i', - '-': 'del', - '%': 'span', - '+': 'ins', - '~': 'sub', - '^': 'sup' + '*': 'strong', # noqa: E241 + '**': 'b', # noqa: E241 + '??': 'cite', # noqa: E241 + '_': 'em', # noqa: E241 + '__': 'i', # noqa: E241 + '-': 'del', # noqa: E241 + '%': 'span', # noqa: E241 + '+': 'ins', # noqa: E241 + '~': 'sub', # noqa: E241 + '^': 'sup' # noqa: E241 } tag = qtags[tag] @@ -1067,25 +1097,45 @@ def fSpan(self, match): atts = '{0} cite="{1}"'.format(atts, cite.rstrip()) content = self.span(content) + opentag = '<{0}{1}>'.format(tag, atts) + closetag = '</{0}>'.format(tag) + tags = self.storeTags(opentag, closetag) + return pre + tags['open'] + content + end + tags['close'] + tail + + def storeTags(self, opentag, closetag=''): + tags = {} + self.refIndex += 1 + self.refCache[self.refIndex] = opentag + tags['open'] = self.uid + str(self.refIndex) + ':ospan ' + + self.refIndex += 1 + self.refCache[self.refIndex] = closetag + tags['close'] = ' ' + self.uid + str(self.refIndex) + ':cspan' + return tags + + def retrieveTags(self, text): + text = (re.compile('{0}(?P<token>[0-9]+):ospan '.format(self.uid), re.U) + .sub(self.fRetrieveTags, text)) + text = (re.compile(' {0}(?P<token>[0-9]+):cspan'.format(self.uid), re.U) + .sub(self.fRetrieveTags, text)) + return text - out = "<{0}{1}>{2}{3}</{4}>".format(tag, atts, content, end, tag) - if pre and not tail or tail and not pre: - out = '{0}{1}{2}'.format(pre, out, tail) - return out + def fRetrieveTags(self, match): + return self.refCache[int(match.group('token'))] def image(self, text): pattern = re.compile(r""" - (?:[\[{{])? # pre - \! # opening ! - (\<|\=|\>)? # optional alignment atts - ({0}) # optional style,class atts - (?:\.\s)? # optional dot-space - ([^\s(!]+) # presume this is the src - \s? # optional space - (?:\(([^\)]+)\))? # optional title - \! # closing - (?::(\S+))? # optional href - (?:[\]}}]|(?=\s|$)) # lookahead: space or end of string + (?:[\[{{])? # pre + \! # opening ! + (\<|\=|\>)? # optional alignment atts + ({0}) # optional style,class atts + (?:\.\s)? # optional dot-space + ([^\s(!]+) # presume this is the src + \s? # optional space + (?:\(([^\)]+)\))? # optional title + \! # closing + (?::(\S+)(?<![\]).,]))? # optional href sans final punct + (?:[\]}}]|(?=[.,\s)|]|$)) # lookahead: space or end of string """.format(cls_re_s), re.U | re.X) return pattern.sub(self.fImage, text) @@ -1101,7 +1151,7 @@ def fImage(self, match): title = '' if not is_rel_url(url) and self.get_sizes: - size = imagesize.getimagesize(url) + size = getimagesize(url) if href: href = self.shelveURL(href) @@ -1137,6 +1187,7 @@ def code(self, text): def fCode(self, match): before, text, after = match.groups() after = after or '' + before, after = self.getSpecialOptions(before, after) # text needs to be escaped text = encode_html(text, quotes=False) return ''.join([before, self.shelve('<code>{0}</code>'.format(text)), after]) @@ -1145,6 +1196,7 @@ def fPre(self, match): before, text, after = match.groups() if after is None: after = '' + before, after = self.getSpecialOptions(before, after) # text needs to be escaped text = encode_html(text) return ''.join([before, '<pre>', self.shelve(text), '</pre>', after]) @@ -1161,8 +1213,9 @@ def noTextile(self, text): def fTextile(self, match): before, notextile, after = match.groups() - if after is None: # pragma: no branch + if after is None: # pragma: no branch after = '' + before, after = self.getSpecialOptions(before, after) return ''.join([before, self.shelve(notextile), after]) def getHTMLComments(self, text): @@ -1187,7 +1240,7 @@ def redcloth_list(self, text): """Parse the text for definition lists and send them to be formatted.""" pattern = re.compile(r"^([-]+{0}[ .].*:=.*)$(?![^-])".format(cls_re_s), - re.M | re.U | re.S) + re.M | re.U | re.S) return pattern.sub(self.fRCList, text) def fRCList(self, match): @@ -1197,7 +1250,7 @@ def fRCList(self, match): for line in text: # parse the attributes and content m = re.match(r'^[-]+({0})[ .](.*)$'.format(cls_re_s), line, - flags=re.M | re.S) + flags=re.M | re.S) if not m: continue @@ -1207,9 +1260,12 @@ def fRCList(self, match): atts = pba(atts, restricted=self.restricted) # split the content into the term and definition - xm = re.match(r'^(.*?)[\s]*:=(.*?)[\s]*(=:|:=)?[\s]*$', content, - re.S) - term, definition, ending = xm.groups() + xm = re.match( + r'^(.*?){0}*:=(.*?){0}*(=:|:=)?{0}*$' + .format(regex_snippets['space']), + content, + re.S) + term, definition, _ = xm.groups() # cleanup term = term.strip() definition = definition.strip(' ') @@ -1222,16 +1278,23 @@ def fRCList(self, match): dltag = "<dl>" out.append(dltag) - if definition != '' and term != '': - if definition.startswith('\n'): - definition = '<p>{0}</p>'.format(definition.lstrip()) - definition = definition.replace('\n', '<br />').strip() + if term != '': + is_newline_started_def = definition.startswith('\n') + definition = ( + definition + .strip() + .replace('\n', '<br />')) + + if is_newline_started_def: + definition = '<p>{0}</p>'.format(definition) + term = term.replace('\n', '<br />') term = self.graf(term) definition = self.graf(definition) - out.extend(['\t<dt{0}>{1}</dt>'.format(atts, term), - '\t<dd>{0}</dd>'.format(definition)]) + out.append('\t<dt{0}>{1}</dt>'.format(atts, term)) + if definition: + out.append('\t<dd>{0}</dd>'.format(definition)) out.append('</dl>') out = '\n'.join(out) @@ -1249,12 +1312,12 @@ def placeNoteLists(self, text): else: self.unreferencedNotes[label] = info - if o: # pragma: no branch + if o: # pragma: no branch # sort o by key o = OrderedDict(sorted(o.items(), key=lambda t: t[0])) self.notes = o text_re = re.compile(r'<p>notelist({0})(?:\:([\w|{1}]))?([\^!]?)(\+?)' - r'\.?[\s]*</p>'.format(cls_re_s, syms_re_s), re.U) + r'\.?[\s]*</p>'.format(cls_re_s, syms_re_s), re.U) text = text_re.sub(self.fNoteLists, text) return text @@ -1265,9 +1328,9 @@ def fNoteLists(self, match): index = '{0}{1}{2}'.format(g_links, extras, start_char) result = '' - if index not in self.notelist_cache: # pragma: no branch + if index not in self.notelist_cache: # pragma: no branch o = [] - if self.notes: # pragma: no branch + if self.notes: # pragma: no branch for seq, info in self.notes.items(): links = self.makeBackrefLink(info, g_links, start_char) atts = '' @@ -1276,11 +1339,11 @@ def fNoteLists(self, match): atts = info['def']['atts'] content = info['def']['content'] li = ('\t\t<li{0}>{1}<span id="note{2}"> ' - '</span>{3}</li>').format(atts, links, infoid, - content) + '</span>{3}</li>').format(atts, links, infoid, + content) else: - li = ('\t\t<li{0}>{1} Undefined Note [#{2}].<li>' - ).format(atts, links, info['seq']) + li = ('\t\t<li{0}>{1} Undefined Note [#{2}].</li>' + ).format(atts, links, info['seq']) o.append(li) if '+' == extras and self.unreferencedNotes: for seq, info in self.unreferencedNotes.items(): @@ -1290,13 +1353,14 @@ def fNoteLists(self, match): o.append(li) self.notelist_cache[index] = "\n".join(o) result = self.notelist_cache[index] - list_atts = pba(att, restricted=self.restricted) - result = '<ol{0}>\n{1}\n\t</ol>'.format(list_atts, result) + if result: + list_atts = pba(att, restricted=self.restricted) + result = '<ol{0}>\n{1}\n\t</ol>'.format(list_atts, result) return result def makeBackrefLink(self, info, g_links, i): """Given the pieces of a back reference link, create an <a> tag.""" - atts, content, infoid, link = '', '', '', '' + link = '' if 'def' in info: link = info['def']['link'] backlink_type = link or g_links @@ -1314,7 +1378,7 @@ def makeBackrefLink(self, info, g_links, i): for refid in info['refids']: i_entity = decode_high(i_) sup = """<sup><a href="#noteref{0}">{1}</a></sup>""".format( - refid, i_entity) + refid, i_entity) if allow_inc: i_ = i_ + 1 result.append(sup) @@ -1330,13 +1394,14 @@ def fParseNoteDefs(self, m): # Assign an id if the note reference parse hasn't found the label yet. if label not in self.notes: - self.notes[label] = {'id': '{0}{1}'.format(self.linkPrefix, - self._increment_link_index())} + self.notes[label] = {'id': '{0}{1}'.format( + self.linkPrefix, self._increment_link_index())} # Ignores subsequent defs using the same label - if 'def' not in self.notes[label]: # pragma: no branch - self.notes[label]['def'] = {'atts': pba(att, restricted=self.restricted), 'content': - self.graf(content), 'link': link} + if 'def' not in self.notes[label]: # pragma: no branch + self.notes[label]['def'] = { + 'atts': pba(att, restricted=self.restricted), 'content': + self.graf(content), 'link': link} return '' def noteRef(self, text): @@ -1378,8 +1443,8 @@ def fParseNoteRefs(self, match): # If we are referencing a note that hasn't had the definition parsed # yet, then assign it an ID... if not self.notes[label]['id']: - self.notes[label]['id'] = '{0}{1}'.format(self.linkPrefix, - self._increment_link_index()) + self.notes[label]['id'] = '{0}{1}'.format( + self.linkPrefix, self._increment_link_index()) labelid = self.notes[label]['id'] # Build the link (if any)... @@ -1445,5 +1510,4 @@ def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'): """ return Textile(restricted=True, lite=lite, noimage=noimage, - html_type=html_type, rel='nofollow').parse( - text) + html_type=html_type, rel='nofollow').parse(text) diff --git a/textile/objects/block.py b/textile/objects/block.py index de993e87..6d611ed1 100644 --- a/textile/objects/block.py +++ b/textile/objects/block.py @@ -1,6 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - from collections import OrderedDict try: import regex as re @@ -40,7 +38,7 @@ def process(self): [{space}]+ # whitespace ends def marker (?P<content>.*)$ # content""".format( space=regex_snippets['space'], cls=cls_re_s), - flags=re.X | re.U) + flags=re.X | re.U) notedef = notedef_re.sub(self.textile.fParseNoteDefs, self.content) # It will be empty if the regex matched and ate it. @@ -49,13 +47,13 @@ def process(self): self.eat = True fns = re.search(r'fn(?P<fnid>{0}+)'.format(regex_snippets['digit']), - self.tag, flags=re.U) + self.tag, flags=re.U) if fns: self.tag = 'p' fnid = self.textile.fn.get(fns.group('fnid'), None) if fnid is None: fnid = '{0}{1}'.format(self.textile.linkPrefix, - self.textile._increment_link_index()) + self.textile._increment_link_index()) # If there is an author-specified ID goes on the wrapper & the # auto-id gets pushed to the <sup> @@ -71,12 +69,11 @@ def process(self): else: supp_id = parse_attributes('(#fn{0})'.format(fnid), restricted=self.textile.restricted) - if '^' not in self.atts: sup = generate_tag('sup', fns.group('fnid'), supp_id) else: fnrev = generate_tag('a', fns.group('fnid'), {'href': - '#fnrev{0}'.format(fnid)}) + '#fnrev{0}'.format(fnid)}) sup = generate_tag('sup', fnrev, supp_id) self.content = '{0} {1}'.format(sup, self.content) diff --git a/textile/objects/table.py b/textile/objects/table.py index 60b68040..72781ad1 100644 --- a/textile/objects/table.py +++ b/textile/objects/table.py @@ -1,11 +1,9 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - from xml.etree import ElementTree from textile.regex_strings import (align_re_s, cls_re_s, regex_snippets, - table_span_re_s, valign_re_s) -from textile.utils import encode_html, generate_tag, parse_attributes + table_span_re_s, valign_re_s, pnct_re_s) +from textile.utils import generate_tag, parse_attributes try: import regex as re @@ -14,6 +12,18 @@ class Table(object): + caption_re = re.compile( + (r'^\|\=(?P<capts>{s}{a}{c})\. ' + r'(?P<cap>[^\n]*)(?P<row>.*)' + .format(**{'s': table_span_re_s, 'a': align_re_s, 'c': cls_re_s})), + re.S) + colgroup_re = re.compile( + r'^\|:(?P<cols>{s}{a}{c}\. .*)' + .format(**{'s': table_span_re_s, 'a': align_re_s, 'c': cls_re_s}), + re.M) + heading_re = re.compile( + r'^_(?={0}|{1})'.format(regex_snippets['space'], pnct_re_s)) + def __init__(self, textile, tatts, rows, summary): self.textile = textile self.attributes = parse_attributes(tatts, 'table', restricted=self.textile.restricted) @@ -27,27 +37,21 @@ def __init__(self, textile, tatts, rows, summary): def process(self): rgrp = None groups = [] - if self.input[-1] == '|': # pragma: no branch - self.input = '{0}\n'.format(self.input) - split = self.input.split('|\n') + split = ( + re.compile(r'\|{0}*?$'.format(regex_snippets['space']), re.M) + .split(self.input)) for i, row in enumerate([x for x in split if x]): row = row.lstrip() # Caption -- only occurs on row 1, otherwise treat '|=. foo |...' # as a normal center-aligned cell. - if i == 0 and row[:2] == '|=': - captionpattern = (r"^\|\=(?P<capts>{s}{a}{c})\. " - r"(?P<cap>[^\n]*)(?P<row>.*)".format(**{ - 's': table_span_re_s, 'a': align_re_s, - 'c': cls_re_s})) - caption_re = re.compile(captionpattern, re.S) - cmtch = caption_re.match(row) - if cmtch: - caption = Caption(restricted=self.textile.restricted, **cmtch.groupdict()) - self.caption = '\n{0}'.format(caption.caption) - row = cmtch.group('row').lstrip() - if row == '': - continue + cmtch = self.caption_re.match(row) + if i == 0 and cmtch: + caption = Caption(restricted=self.textile.restricted, **cmtch.groupdict()) + self.caption = '\n{0}'.format(caption.caption) + row = cmtch.group('row').lstrip() + if row == '': + continue # Colgroup -- A colgroup row will not necessarily end with a |. # Hence it may include the next row of actual table data. @@ -66,8 +70,9 @@ def process(self): # search the row for a table group - thead, tfoot, or tbody grpmatchpattern = (r"(:?^\|(?P<part>{v})(?P<rgrpatts>{s}{a}{c})" - r"\.\s*$\n)?^(?P<row>.*)").format(**{'v': valign_re_s, 's': - table_span_re_s, 'a': align_re_s, 'c': cls_re_s}) + r"\.\s*$\n)?^(?P<row>.*)").format( + **{'v': valign_re_s, 's': table_span_re_s, + 'a': align_re_s, 'c': cls_re_s}) grpmatch_re = re.compile(grpmatchpattern, re.S | re.M) grpmatch = grpmatch_re.match(row.lstrip()) @@ -93,12 +98,13 @@ def process(self): r = Row(row_atts, row) for cellctr, cell in enumerate(row.split('|')[1:]): ctag = 'td' - if cell.startswith('_'): + if self.heading_re.match(cell): ctag = 'th' cmtch = re.search(r'^(?P<catts>_?{0}{1}{2}\. )' - '(?P<cell>.*)'.format(table_span_re_s, align_re_s, - cls_re_s), cell, flags=re.S) + '(?P<cell>.*)'.format( + table_span_re_s, align_re_s, cls_re_s), + cell, flags=re.S) if cmtch: catts = cmtch.group('catts') cell_atts = parse_attributes(catts, 'td', restricted=self.textile.restricted) @@ -108,7 +114,7 @@ def process(self): if not self.textile.lite: a_pattern = r'(?P<space>{0}*)(?P<cell>.*)'.format( - regex_snippets['space']) + regex_snippets['space']) a = re.search(a_pattern, cell, flags=re.S) cell = self.textile.redcloth_list(a.group('cell')) cell = self.textile.textileLists(cell) @@ -131,8 +137,8 @@ def process(self): if rgrp: groups.append('\n\t{0}'.format(rgrp.process())) - content = '{0}{1}{2}{3}\n\t'.format(self.caption, self.colgroup, - ''.join(groups), ''.join(self.content)) + content = '{0}{1}{2}{3}\n\t'.format( + self.caption, self.colgroup, ''.join(groups), ''.join(self.content)) tbl = generate_tag('table', content, self.attributes) return '\t{0}\n\n'.format(tbl) @@ -143,8 +149,8 @@ def __init__(self, capts, cap, row, restricted): self.caption = self.process(cap) def process(self, cap): - tag = generate_tag('caption', cap, self.attributes) - return '\t{0}\n\t'.format(tag) + tag = generate_tag('caption', cap.strip(), self.attributes) + return '\t{0}'.format(tag) class Colgroup(object): @@ -161,7 +167,6 @@ def process(self): colgroup = ElementTree.Element('colgroup', attrib=group_atts) colgroup.text = '\n\t' if self.cols is not None: - has_newline = "\n" in self.cols match_cols = self.cols.replace('.', '').split('|') # colgroup is the first item in match_cols, the remaining items are # cols. @@ -174,7 +179,8 @@ def process(self): # tab between cols and a newline at the end xml_declaration = "<?xml version='1.0' encoding='UTF-8'?>\n" colgrp = colgrp.replace(xml_declaration, '') - return colgrp.replace('><', '>\n\t<') + colgrp = colgrp.replace('><', '>\n\t<') + return f"\n\t{colgrp}" class Row(object): diff --git a/textile/regex_strings.py b/textile/regex_strings.py index 470203cb..c3691bb5 100644 --- a/textile/regex_strings.py +++ b/textile/regex_strings.py @@ -1,10 +1,8 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - try: # Use regex module for matching uppercase characters if installed, # otherwise fall back to finding all the uppercase chars in a loop. - import regex as re + import regex as re # noqa: F401 upper_re_s = r'\p{Lu}' regex_snippets = { 'acr': r'\p{Lu}\p{Nd}', @@ -15,22 +13,25 @@ 'digit': r'\p{N}', 'space': r'(?:\p{Zs}|\v)', 'char': r'(?:[^\p{Zs}\v])', - } + } except ImportError: from sys import maxunicode upper_re_s = "".join( - [chr(c) for c in range(maxunicode) if chr(c).isupper()] - ) + [chr(c) for c in range(maxunicode) if chr(c).isupper()] + ) regex_snippets = { 'acr': r'{0}0-9'.format(upper_re_s), 'abr': r'{0}'.format(upper_re_s), 'nab': r'a-z', 'wrd': r'\w', - 'cur': r'', + # All codepoints identified as currency symbols + # by the [mrab-regex library](https://pypi.org/project/regex/) + # and the UNICODE standard. + 'cur': r'$¢-¥֏؋৲৳৻૱௹฿៛\u20a0-\u20cf\ua838﷼﹩$¢£¥₩', 'digit': r'\d', 'space': r'(?:\s|\v)', 'char': r'\S', - } + } halign_re_s = r'(?:\<(?!>)|(?<!<)\>|\<\>|\=|[()]+(?! ))' valign_re_s = r'[\-^~]' @@ -43,10 +44,10 @@ table_span_re_s = r'(?:{0}|{1})*'.format(colspan_re_s, rowspan_re_s) # regex string to match class, style and language attributes cls_re_s = (r'(?:' - r'{c}(?:{l}(?:{s})?|{s}(?:{l})?)?|' - r'{l}(?:{c}(?:{s})?|{s}(?:{c})?)?|' - r'{s}(?:{c}(?:{l})?|{l}(?:{c})?)?' + r'{c}(?:{l}(?:{s})?|{s}(?:{l})?)?|' + r'{l}(?:{c}(?:{s})?|{s}(?:{c})?)?|' + r'{s}(?:{c}(?:{l})?|{l}(?:{c})?)?' r')?' - ).format(c=class_re_s, s=style_re_s, l=language_re_s) + ).format(c=class_re_s, s=style_re_s, l=language_re_s) pnct_re_s = r'[-!"#$%&()*+,/:;<=>?@\'\[\\\]\.^_`{|}~]' syms_re_s = '¤§µ¶†‡•∗∴◊♠♣♥♦' diff --git a/textile/textilefactory.py b/textile/textilefactory.py index e5e2458e..402bf868 100644 --- a/textile/textilefactory.py +++ b/textile/textilefactory.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals from .core import Textile @@ -21,10 +20,7 @@ def __init__(self, restricted=False, lite=False, sanitize=False, self.method_parms['rel'] = 'nofollow' if noimage is None: - if restricted: - noimage = True - else: - noimage = False + noimage = bool(restricted) self.class_parms['noimage'] = noimage self.method_parms['sanitize'] = sanitize diff --git a/textile/tools/imagesize.py b/textile/tools/imagesize.py deleted file mode 100644 index 6fba73eb..00000000 --- a/textile/tools/imagesize.py +++ /dev/null @@ -1,27 +0,0 @@ -def getimagesize(url): - """ - Attempts to determine an image's width and height, and returns a tuple, - (width, height), in pixels or an empty string in case of failure. - Requires that PIL is installed. - - """ - - try: - from PIL import ImageFile - except ImportError: - return '' - - from urllib.request import urlopen - - try: - p = ImageFile.Parser() - f = urlopen(url) - while True: - s = f.read(1024) - if not s: - break - p.feed(s) - if p.image: - return p.image.size - except (IOError, ValueError): - return '' diff --git a/textile/tools/sanitizer.py b/textile/tools/sanitizer.py deleted file mode 100644 index 3c7209c6..00000000 --- a/textile/tools/sanitizer.py +++ /dev/null @@ -1,11 +0,0 @@ -def sanitize(string): - """ - Ensure that the text does not contain any malicious HTML code which might - break the page. - """ - from html5lib import parseFragment, serialize - - parsed = parseFragment(string) - clean = serialize(parsed, sanitize=True, omit_optional_tags=False, - quote_attr_values='always') - return clean diff --git a/textile/utils.py b/textile/utils.py index 1b18945a..578af4ed 100644 --- a/textile/utils.py +++ b/textile/utils.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - try: import regex as re except ImportError: @@ -14,16 +12,31 @@ from textile.regex_strings import valign_re_s, halign_re_s +# Regular expressions for stripping chunks of HTML, +# leaving only content not wrapped in a tag or a comment +RAW_TEXT_REVEALERS = ( + # The php version has orders the below list of tags differently. The + # important thing to note here is that the pre must occur before the p or + # else the regex module doesn't properly match pre-s. It only matches the + # p in pre. + re.compile(r'<(pre|p|blockquote|div|form|table|ul|ol|dl|h[1-6])[^>]*?>.*</\1>', + re.S), + re.compile(r'<(hr|br)[^>]*?/>'), + re.compile(r'<!--.*?-->'), +) + def decode_high(text): """Decode encoded HTML entities.""" text = '&#{0};'.format(text) return html.unescape(text) + def encode_high(text): """Encode the text so that it is an appropriate HTML entity.""" return ord(text) + def encode_html(text, quotes=True): """Return text that's safe for an HTML attribute.""" a = ( @@ -39,6 +52,7 @@ def encode_html(text, quotes=True): text = text.replace(k, v) return text + def generate_tag(tag, content, attributes=None): """Generate a complete html tag using the ElementTree module. tag and content are strings, the attributes argument is a dictionary. As @@ -59,49 +73,87 @@ def generate_tag(tag, content, attributes=None): # non-ascii text being html-entity encoded. Not bad, but not entirely # matching php-textile either. element_tag = ElementTree.tostringlist(element, encoding=enc, - method='html') + method='html') element_tag.insert(len(element_tag) - 1, content) element_text = ''.join(element_tag) return element_text + +def getimagesize(url): + """ + Attempts to determine an image's width and height, and returns a tuple, + (width, height), in pixels or an empty string in case of failure. + Requires that PIL is installed. + + """ + + try: + from PIL import ImageFile + except ImportError: + return '' + + from urllib.request import urlopen + + try: + p = ImageFile.Parser() + f = urlopen(url) + while True: + s = f.read(1024) + if not s: + break + p.feed(s) + if p.image: + return p.image.size + except (IOError, ValueError): + return '' + + def has_raw_text(text): """checks whether the text has text not already enclosed by a block tag""" - # The php version has orders the below list of tags differently. The - # important thing to note here is that the pre must occur before the p or - # else the regex module doesn't properly match pre-s. It only matches the - # p in pre. - r = re.compile(r'<(pre|p|blockquote|div|form|table|ul|ol|dl|h[1-6])[^>]*?>.*</\1>', - re.S).sub('', text.strip()).strip() - r = re.compile(r'<(hr|br)[^>]*?/>').sub('', r) - return '' != r + r = text.strip() + for pattern in RAW_TEXT_REVEALERS: + r = pattern.sub('', r).strip() + return r != '' + + +def human_readable_url(url): + if "://" in url: + url = url.split("://")[1] + elif ":" in url: + url = url.split(":")[1] + return url + def is_rel_url(url): """Identify relative urls.""" (scheme, netloc) = urlparse(url)[0:2] return not scheme and not netloc + def is_valid_url(url): parsed = urlparse(url) if parsed.scheme == '': return True return False + def list_type(list_string): listtypes = { - list_string.startswith('*'): 'u', - list_string.startswith('#'): 'o', - (not list_string.startswith('*') and not list_string.startswith('#')): + list_string.endswith('*'): 'u', + list_string.endswith('#'): 'o', + (not list_string.endswith('*') and not list_string.endswith('#')): 'd' } return listtypes.get(True, False) + def normalize_newlines(string): - out = string.strip() - out = re.sub(r'\r\n?', '\n', out) + out = re.sub(r'\r\n?', '\n', string) out = re.compile(r'^[ \t]*\n', flags=re.M).sub('\n', out) - out = re.sub(r'"$', '" ', out) + out = out.strip('\n') return out + def parse_attributes(block_attributes, element=None, include_id=True, restricted=False): vAlign = {'^': 'top', '-': 'middle', '~': 'bottom'} hAlign = {'<': 'left', '=': 'center', '>': 'right', '<>': 'justify'} @@ -146,8 +198,27 @@ def parse_attributes(block_attributes, element=None, include_id=True, restricted m = re.search(r'\(([^()]+)\)', matched, re.U) if m: - aclass = m.group(1) matched = matched.replace(m.group(0), '') + # Only allow a restricted subset of the CSS standard characters for classes/ids. + # No encoding markers allowed. + id_class_match = re.compile(r"^([-a-zA-Z 0-9_\/\[\]\.\:\#]+)$", re.U).match(m.group(1)) + if id_class_match: + class_regex = re.compile(r"^([-a-zA-Z 0-9_\.\/\[\]]*)$") + id_class = id_class_match.group(1) + # If a textile class block attribute was found with a '#' in it + # split it into the css class and css id... + hashpos = id_class.find('#') + if hashpos >= 0: + id_match = re.match(r"^#([-a-zA-Z0-9_\.\:]*)$", id_class[hashpos:]) + if id_match: + block_id = id_match.group(1) + + cls_match = class_regex.match(id_class[:hashpos]) + else: + cls_match = class_regex.match(id_class) + + if cls_match: + aclass = cls_match.group(1) m = re.search(r'([(]+)', matched) if m: @@ -163,11 +234,6 @@ def parse_attributes(block_attributes, element=None, include_id=True, restricted if m: style.append("text-align:{0}".format(hAlign[m.group(1)])) - m = re.search(r'^(.*)#(.*)$', aclass) - if m: - block_id = m.group(2) - aclass = m.group(1) - if element == 'col': pattern = r'(?:\\(\d+)\.?)?\s*(\d+)?' csp = re.match(pattern, matched) @@ -195,6 +261,7 @@ def parse_attributes(block_attributes, element=None, include_id=True, restricted result['width'] = width return result + def pba(block_attributes, element=None, include_id=True, restricted=False): """Parse block attributes.""" attrs = parse_attributes(block_attributes, element, include_id, restricted) diff --git a/textile/version.py b/textile/version.py index f3c42a78..ad53acbb 100644 --- a/textile/version.py +++ b/textile/version.py @@ -1 +1 @@ -VERSION = '4.0.2' +VERSION = '4.0.3'