diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php index f2660156..5e2911eb 100644 --- a/src/Smalot/PdfParser/PDFObject.php +++ b/src/Smalot/PdfParser/PDFObject.php @@ -214,13 +214,31 @@ private function formatContent(?string $content): string return ''; } - // Outside of (String) content in PDF document streams, all - // text should conform to UTF-8. Test for binary content by - // deleting everything after the first open-parenthesis ( which - // indicates the beginning of a string. Then test what remains - // for valid UTF-8. If it's not UTF-8, return an empty string - // as this $content is most likely binary. - if (false === mb_check_encoding(preg_replace('/\(.*$/s', '', $content), 'UTF-8')) { + // Outside of (String) and inline image content in PDF document + // streams, all text should conform to UTF-8. Test for binary + // content by deleting everything after the first open- + // parenthesis ( which indicates the beginning of a string, or + // the first ID command which indicates the beginning of binary + // inline image content. Then test what remains for valid + // UTF-8. If it's not UTF-8, return an empty string as this + // $content is most likely binary. Unfortunately, using + // mb_check_encoding(..., 'UTF-8') is not strict enough, so the + // following regexp, adapted from the W3C, is used. See: + // https://www.w3.org/International/questions/qa-forms-utf-8.en + // We use preg_replace() instead of preg_match() to avoid "JIT + // stack limit exhausted" errors on larger files. 
+ $utf8Filter = preg_replace('/( + [\x09\x0A\x0D\x20-\x7E] | # ASCII + [\xC2-\xDF][\x80-\xBF] | # non-overlong 2-byte + \xE0[\xA0-\xBF][\x80-\xBF] | # excluding overlongs + [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} | # straight 3-byte + \xED[\x80-\x9F][\x80-\xBF] | # excluding surrogates + \xF0[\x90-\xBF][\x80-\xBF]{2} | # planes 1-3 + [\xF1-\xF3][\x80-\xBF]{3} | # planes 4-15 + \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 + )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content)); + + if ('' !== $utf8Filter) { return ''; } diff --git a/tests/PHPUnit/Integration/PDFObjectTest.php b/tests/PHPUnit/Integration/PDFObjectTest.php index 025c11b1..3c83be53 100644 --- a/tests/PHPUnit/Integration/PDFObjectTest.php +++ b/tests/PHPUnit/Integration/PDFObjectTest.php @@ -284,6 +284,13 @@ public function testFormatContent(): void // Binary check is done before a regexp that causes an error $this->assertStringContainsString('Marko Nestorović PR', $pages[0]->getText()); + + // mb_check_encoding(..., 'UTF-8') returns true here, + // necessitating a test for UTF-8 that's more strict + $content = hex2bin('0101010101010101'); + $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content); + + $this->assertEquals('', $cleaned); } public function testGetSectionsText(): void