Skip to content

Commit

Permalink
Replace html purifier with the symfony/html-sanitizer package for readme sanitization
Browse files Browse the repository at this point in the history
  • Loading branch information
Seldaek committed Jun 14, 2024
1 parent 5c6920e commit a68427a
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 102 deletions.
1 change: 0 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
"doctrine/doctrine-bundle": "^2.2",
"doctrine/orm": "^2.7",
"endroid/qr-code": "^5",
"ezyang/htmlpurifier": "^4.6",
"graze/dog-statsd": "^1",
"knplabs/knp-menu-bundle": "^3",
"knpuniversity/oauth2-client-bundle": "^2.8",
Expand Down
63 changes: 1 addition & 62 deletions composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

75 changes: 40 additions & 35 deletions src/Package/Updater.php
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
use Doctrine\DBAL\Connection;
use App\Service\VersionCache;
use Composer\Package\CompletePackageInterface;
use DOMElement;
use Symfony\Component\HtmlSanitizer\HtmlSanitizer;
use Symfony\Component\HtmlSanitizer\HtmlSanitizerConfig;
use Symfony\Component\Mailer\MailerInterface;
use Symfony\Component\Mime\Address;
use Symfony\Component\Mime\Email;
Expand Down Expand Up @@ -710,6 +713,15 @@ private function updateGitLabInfo(HttpDownloader $httpDownloader, IOInterface $i
*/
private function prepareReadme(string $readme, ?string $host = null, ?string $owner = null, ?string $repo = null): string
{
// detect base path for github readme if file is located in a subfolder like docs/README.md
$basePath = '';
if ($host === 'github.com' && Preg::isMatchStrictGroups('{^<div id="readme" [^>]+?data-path="([^"]+)"}', $readme, $match) && false !== strpos($match[1], '/')) {
$basePath = dirname($match[1]);
}
if ($basePath) {
$basePath .= '/';
}

$elements = [
'p',
'br',
Expand All @@ -726,51 +738,40 @@ private function prepareReadme(string $readme, ?string $host = null, ?string $ow
'q', 'blockquote', 'abbr', 'cite',
'table', 'thead', 'tbody', 'th', 'tr', 'td',
'a', 'span',
'img',
'details', 'summary',
];

$attributes = [
'img.src', 'img.title', 'img.alt', 'img.width', 'img.height', 'img.style',
'a.href', 'a.target', 'a.rel', 'a.id',
'td.colspan', 'td.rowspan', 'th.colspan', 'th.rowspan',
'th.align', 'td.align', 'p.align',
'h1.align', 'h2.align', 'h3.align', 'h4.align', 'h5.align', 'h6.align',
'*.class', 'details.open',
];

// detect base path for github readme if file is located in a subfolder like docs/README.md
$basePath = '';
if ($host === 'github.com' && Preg::isMatchStrictGroups('{^<div id="readme" [^>]+?data-path="([^"]+)"}', $readme, $match) && false !== strpos($match[1], '/')) {
$basePath = dirname($match[1]);
}
if ($basePath) {
$basePath .= '/';
}

$config = \HTMLPurifier_Config::createDefault();
$config->set('HTML.AllowedElements', implode(',', $elements));
$config->set('HTML.AllowedAttributes', implode(',', $attributes));
$config->set('Attr.EnableID', true);
$config->set('Attr.AllowedFrameTargets', ['_blank']);

// add custom HTML tag definitions
$def = $config->getHTMLDefinition(true);
Assert::notNull($def);
$def->addElement('details', 'Block', 'Flow', 'Common', [
'open' => 'Bool#open',
]);
$def->addElement('summary', 'Inline', 'Inline', 'Common');

$purifier = new \HTMLPurifier($config);
$readme = $purifier->purify($readme);
$config = (new HtmlSanitizerConfig());
foreach ($elements as $el) {
$config = $config->allowElement($el);
}

$config = $config
->blockElement('div')
->blockElement('article')
->blockElement('g-emoji')
->allowElement('img', ['src', 'title', 'alt', 'width', 'height'])
->allowElement('a', ['href', 'target', 'id'])
->allowElement('td', ['colspan', 'rowspan'])
->allowElement('th', ['colspan', 'rowspan'])
->allowElement('details', ['open'])
->allowAttribute('align', ['th', 'td', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
->allowAttribute('class', '*')
->allowLinkSchemes(['https', 'http', 'mailto'])
->allowRelativeLinks()
->allowRelativeMedias()
->withMaxInputLength(10_000_000);

$sanitizer = new HtmlSanitizer($config);
$readme = $sanitizer->sanitizeFor('body', $readme);

libxml_use_internal_errors(true);
$dom = new \DOMDocument();
$dom->loadHTML('<?xml encoding="UTF-8">' . $readme);

// Links can not be trusted, mark them nofollow and convert relative to absolute links
$links = $dom->getElementsByTagName('a');
/** @var DOMElement $link */
foreach ($links as $link) {
$link->setAttribute('rel', 'nofollow noindex noopener external ugc');
if ('#' === substr($link->getAttribute('href'), 0, 1)) {
Expand All @@ -788,11 +789,15 @@ private function prepareReadme(string $readme, ?string $host = null, ?string $ow
'https://gitlab.com/'.$owner.'/'.$repo.'/-/blob/HEAD/'.$basePath.$link->getAttribute('href')
);
}
if ($link->getAttribute('target') !== '' && $link->getAttribute('target') !== '_blank') {
$link->setAttribute('target', '_blank');
}
}

// embed images of selected hosts by converting relative links to accessible URLs
if (in_array($host, ['github.com', 'gitlab.com', 'bitbucket.org'], true)) {
$images = $dom->getElementsByTagName('img');
/** @var DOMElement $img */
foreach ($images as $img) {
if (!str_contains($img->getAttribute('src'), '//')) {
$imgSrc = match ($host) {
Expand Down
3 changes: 0 additions & 3 deletions symfony.lock
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,6 @@
"endroid/qr-code": {
"version": "3.9.6"
},
"ezyang/htmlpurifier": {
"version": "v4.13.0"
},
"friendsofphp/proxy-manager-lts": {
"version": "v1.0.7"
},
Expand Down
Loading

0 comments on commit a68427a

Please sign in to comment.