Skip to content

Commit

Permalink
Remove DOMDocument post-processing by using html-sanitizer fully
Browse files (browse the repository at this point in the history)
  • Loading branch information
Seldaek committed Jun 14, 2024
1 parent a68427a commit a9dc0cb
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 82 deletions.
68 changes: 6 additions & 62 deletions src/Package/Updater.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

use App\Entity\Dependent;
use App\Entity\PackageFreezeReason;
use App\HtmlSanitizer\ReadmeImageSanitizer;
use App\HtmlSanitizer\ReadmeLinkSanitizer;
use App\Util\HttpDownloaderOptionsFactory;
use cebe\markdown\GithubMarkdown;
use Composer\Package\AliasPackage;
Expand All @@ -39,7 +41,6 @@
use Doctrine\DBAL\Connection;
use App\Service\VersionCache;
use Composer\Package\CompletePackageInterface;
use DOMElement;
use Symfony\Component\HtmlSanitizer\HtmlSanitizer;
use Symfony\Component\HtmlSanitizer\HtmlSanitizerConfig;
use Symfony\Component\Mailer\MailerInterface;
Expand Down Expand Up @@ -758,76 +759,19 @@ private function prepareReadme(string $readme, ?string $host = null, ?string $ow
->allowAttribute('align', ['th', 'td', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
->allowAttribute('class', '*')
->allowLinkSchemes(['https', 'http', 'mailto'])
->forceAttribute('a', 'rel', 'nofollow noindex noopener external ugc')
->withAttributeSanitizer(new ReadmeLinkSanitizer($host, $owner.'/'.$repo, $basePath))

Check failure on line 763 in src/Package/Updater.php

View workflow job for this annotation

GitHub Actions / PHPStan

Instantiated class App\HtmlSanitizer\ReadmeLinkSanitizer not found.

Check failure on line 763 in src/Package/Updater.php

View workflow job for this annotation

GitHub Actions / PHPStan

Parameter #1 $sanitizer of method Symfony\Component\HtmlSanitizer\HtmlSanitizerConfig::withAttributeSanitizer() expects Symfony\Component\HtmlSanitizer\Visitor\AttributeSanitizer\AttributeSanitizerInterface, App\HtmlSanitizer\ReadmeLinkSanitizer given.
->withAttributeSanitizer(new ReadmeImageSanitizer($host, $owner.'/'.$repo, $basePath))

Check failure on line 764 in src/Package/Updater.php

View workflow job for this annotation

GitHub Actions / PHPStan

Instantiated class App\HtmlSanitizer\ReadmeImageSanitizer not found.

Check failure on line 764 in src/Package/Updater.php

View workflow job for this annotation

GitHub Actions / PHPStan

Parameter #1 $sanitizer of method Symfony\Component\HtmlSanitizer\HtmlSanitizerConfig::withAttributeSanitizer() expects Symfony\Component\HtmlSanitizer\Visitor\AttributeSanitizer\AttributeSanitizerInterface, App\HtmlSanitizer\ReadmeImageSanitizer given.
->allowRelativeLinks()
->allowRelativeMedias()
->withMaxInputLength(10_000_000);

$sanitizer = new HtmlSanitizer($config);
$readme = $sanitizer->sanitizeFor('body', $readme);

libxml_use_internal_errors(true);
$dom = new \DOMDocument();
$dom->loadHTML('<?xml encoding="UTF-8">' . $readme);

// Links can not be trusted, mark them nofollow and convert relative to absolute links
$links = $dom->getElementsByTagName('a');
/** @var DOMElement $link */
foreach ($links as $link) {
$link->setAttribute('rel', 'nofollow noindex noopener external ugc');
if ('#' === substr($link->getAttribute('href'), 0, 1)) {
$link->setAttribute('href', '#user-content-'.substr($link->getAttribute('href'), 1));
} elseif ('mailto:' === substr($link->getAttribute('href'), 0, 7)) {
// do nothing
} elseif ($host === 'github.com' && !str_contains($link->getAttribute('href'), '//')) {
$link->setAttribute(
'href',
'https://github.com/'.$owner.'/'.$repo.'/blob/HEAD/'.$basePath.$link->getAttribute('href')
);
} elseif ($host === 'gitlab.com' && !str_contains($link->getAttribute('href'), '//')) {
$link->setAttribute(
'href',
'https://gitlab.com/'.$owner.'/'.$repo.'/-/blob/HEAD/'.$basePath.$link->getAttribute('href')
);
}
if ($link->getAttribute('target') !== '' && $link->getAttribute('target') !== '_blank') {
$link->setAttribute('target', '_blank');
}
}

// embed images of selected hosts by converting relative links to accessible URLs
if (in_array($host, ['github.com', 'gitlab.com', 'bitbucket.org'], true)) {
$images = $dom->getElementsByTagName('img');
/** @var DOMElement $img */
foreach ($images as $img) {
if (!str_contains($img->getAttribute('src'), '//')) {
$imgSrc = match ($host) {
'github.com' => 'https://raw.github.com/'.$owner.'/'.$repo.'/HEAD/'.$basePath.$img->getAttribute('src'),
'gitlab.com' => 'https://gitlab.com/'.$owner.'/'.$repo.'/-/raw/HEAD/'.$basePath.$img->getAttribute('src'),
'bitbucket.org' => 'https://bitbucket.org/'.$owner.'/'.$repo.'/raw/HEAD/'.$basePath.$img->getAttribute('src'),
};
$img->setAttribute('src', $imgSrc);
}
}
}

// remove first page element if it's a <h1> or <h2>, because it's usually
// the project name or the `README` string which we don't need
$first = $dom->getElementsByTagName('body')->item(0);
if ($first) {
$first = $first->childNodes->item(0);
}

if ($first && ('h1' === $first->nodeName || 'h2' === $first->nodeName)) {
$first->parentNode?->removeChild($first);
}

$readme = $dom->saveHTML();
Assert::string($readme);
$readme = substr($readme, strpos($readme, '<body>') + 6);
$readme = substr($readme, 0, strrpos($readme, '</body>') ?: PHP_INT_MAX);

libxml_use_internal_errors(false);
libxml_clear_errors();
$readme = Preg::replace('{^<(h[12])>.*</(?1)>}', '', $readme);

return str_replace("\r\n", "\n", $readme);
}
Expand Down
Loading

0 comments on commit a9dc0cb

Please sign in to comment.