diff --git a/samples/bugs/Issue715.pdf b/samples/bugs/Issue715.pdf new file mode 100644 index 00000000..b5b1fc7b Binary files /dev/null and b/samples/bugs/Issue715.pdf differ diff --git a/src/Smalot/PdfParser/Element/ElementHexa.php b/src/Smalot/PdfParser/Element/ElementHexa.php index 3fc34136..6d92a86c 100644 --- a/src/Smalot/PdfParser/Element/ElementHexa.php +++ b/src/Smalot/PdfParser/Element/ElementHexa.php @@ -47,12 +47,11 @@ public static function parse(string $content, ?Document $document = null, int &$ if (preg_match('/^\s*\<(?P[A-F0-9]+)\>/is', $content, $match)) { $name = $match['name']; $offset += strpos($content, '<'.$name) + \strlen($name) + 2; // 1 for '>' - // repackage string as standard - $name = '('.self::decode($name).')'; - $element = ElementDate::parse($name, $document); + $name = self::decode($name); + $element = ElementDate::parse('('.$name.')', $document); if (!$element) { - $element = ElementString::parse($name, $document); + $element = new self($name); } return $element; diff --git a/src/Smalot/PdfParser/Element/ElementString.php b/src/Smalot/PdfParser/Element/ElementString.php index 011bcf46..7bdcc38e 100644 --- a/src/Smalot/PdfParser/Element/ElementString.php +++ b/src/Smalot/PdfParser/Element/ElementString.php @@ -51,6 +51,51 @@ public function equals($value): bool return $value == $this->value; } + /** + * Part of parsing process to handle escaped characters. + * Note, most parameters are passed by reference. + * + * Further information in PDF specification (page 53): + * https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf + */ + private static function handleEscapedCharacters(string &$name, int &$position, string &$processedName, string $char): void + { + // escaped chars + $nextChar = substr($name, 0, 1); + switch ($nextChar) { + // end-of-line markers (CR, LF, CRLF) should be ignored + case "\r": + case "\n": + preg_match('/^\\r?\\n?/', $name, $matches); + $name = substr($name, \strlen($matches[0])); + $position += \strlen($matches[0]); + break; + // process LF, CR, HT, BS, FF + case 'n': + case 't': + case 'r': + case 'b': + case 'f': + $processedName .= stripcslashes('\\'.$nextChar); + $name = substr($name, 1); + ++$position; + break; + // decode escaped parentheses and backslash + case '(': + case ')': + case '\\': + case ' ': // TODO: this should probably be removed - kept for compatibility + $processedName .= $nextChar; + $name = substr($name, 1); + ++$position; + break; + // TODO: process octal encoding (but it is also processed later) + // keep backslash in other cases + default: + $processedName .= $char; + } + } + /** * @return bool|ElementString */ @@ -59,25 +104,38 @@ public static function parse(string $content, ?Document $document = null, int &$ if (preg_match('/^\s*\((?P.*)/s', $content, $match)) { $name = $match['name']; - // Find next ')' not escaped. - $cur_start_text = $start_search_end = 0; - while (false !== ($cur_start_pos = strpos($name, ')', $start_search_end))) { - $cur_extract = substr($name, $cur_start_text, $cur_start_pos - $cur_start_text); - preg_match('/(?P[\\\]*)$/s', $cur_extract, $match); - if (!(\strlen($match['escape']) % 2)) { - break; + $delimiterCount = 0; + $position = 0; + $processedName = ''; + do { + $char = substr($name, 0, 1); + $name = substr($name, 1); + ++$position; + switch ($char) { + // matched delimiters should be treated as part of string + case '(': + $processedName .= $char; + ++$delimiterCount; + break; + case ')': + if (0 === $delimiterCount) { + $name = substr($name, 1); + break 2; + } + $processedName .= $char; + --$delimiterCount; + break; + case '\\': + self::handleEscapedCharacters($name, $position, $processedName, $char); + break; + default: + $processedName .= $char; } - $start_search_end = $cur_start_pos + 1; - } + } while ('' !== $name); + + $offset += strpos($content, '(') + 1 + $position; - // Extract string. - $name = substr($name, 0, (int) $cur_start_pos); - $offset += strpos($content, '(') + $cur_start_pos + 2; // 2 for '(' and ')' - $name = str_replace( - ['\\\\', '\\ ', '\\/', '\(', '\)', '\n', '\r', '\t'], - ['\\', ' ', '/', '(', ')', "\n", "\r", "\t"], - $name - ); + $name = $processedName; // Decode string. $name = Font::decodeOctal($name); diff --git a/src/Smalot/PdfParser/Parser.php b/src/Smalot/PdfParser/Parser.php index b051f114..f808ab2b 100644 --- a/src/Smalot/PdfParser/Parser.php +++ b/src/Smalot/PdfParser/Parser.php @@ -296,7 +296,7 @@ protected function parseHeaderElement(?string $type, $value, ?Document $document return ElementString::parse('('.$value.')', $document); case '<': - return $this->parseHeaderElement('(', ElementHexa::decode($value), $document); + return ElementHexa::parse('<'.$value.'>', $document); case '/': return ElementName::parse('/'.$value, $document); diff --git a/tests/PHPUnit/Integration/Element/ElementHexaTest.php b/tests/PHPUnit/Integration/Element/ElementHexaTest.php index 1c22a5c3..d106918e 100644 --- a/tests/PHPUnit/Integration/Element/ElementHexaTest.php +++ b/tests/PHPUnit/Integration/Element/ElementHexaTest.php @@ -131,4 +131,17 @@ public function testParse(): void $this->assertEquals('pasqua, primavera, resurrezione, festa cristiana, gesù, uova di cioccolata, coniglietti, pulcini, pasquale, campane, dina rebucci, uova di pasqua, ', $element); } + + /** + * Closing round bracket encoded in hexadecimal format breaks parsing - string is truncated. + * + * @see https://github.com/smalot/pdfparser/issues/715 + */ + public function testIssue715(): void + { + $offset = 0; + $testString = '()\\'; + $element = ElementHexa::parse('<'.bin2hex($testString).'>', null, $offset); + $this->assertEquals($testString, (string) $element); + } } diff --git a/tests/PHPUnit/Integration/Element/ElementStringTest.php b/tests/PHPUnit/Integration/Element/ElementStringTest.php index 61d3ad6b..8207426f 100644 --- a/tests/PHPUnit/Integration/Element/ElementStringTest.php +++ b/tests/PHPUnit/Integration/Element/ElementStringTest.php @@ -133,6 +133,38 @@ public function testParse(): void $this->assertEquals(27, $offset); } + /** + * @see https://github.com/smalot/pdfparser/issues/715 + */ + public function testParseIssue715(): void + { + $element = ElementString::parse('(())'); + $this->assertEquals('()', $element->getContent()); + + // source: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf + // page 54 + $string = '(Strings may contain balanced parentheses ( ) and +special characters (*!&}^% and so on).)'; + $element = ElementString::parse($string); + $this->assertEquals('Strings may contain balanced parentheses ( ) and +special characters (*!&}^% and so on).', $element->getContent()); + + // source: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf + // page 55 + $string = '( This string has an end−of−line at the end of it. +)'; + $element = ElementString::parse($string); + $this->assertEquals(' This string has an end−of−line at the end of it.'.\PHP_EOL, $element->getContent()); + + // source: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf + // page 55 + $string = '( These \ +two strings \ +are the same.)'; + $element = ElementString::parse($string); + $this->assertEquals(' These two strings are the same.', $element->getContent()); + } + public function testGetContent(): void { $element = new ElementString('Copyright'); diff --git a/tests/PHPUnit/Integration/ParserTest.php b/tests/PHPUnit/Integration/ParserTest.php index fa0d3f42..b26384b0 100644 --- a/tests/PHPUnit/Integration/ParserTest.php +++ b/tests/PHPUnit/Integration/ParserTest.php @@ -438,6 +438,27 @@ public function testIgnoreEncryption(): void // without the configuration option set, an exception would be thrown. } + + /** + * Tests special chars encoded as hex. + * + * @see https://github.com/smalot/pdfparser/issues/715 + */ + public function testIssue715SpecialCharsEncodedAsHex(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue715.pdf'; + + $this->fixture = new Parser(); + $document = $this->fixture->parseFile($filename); + $sigObject = $document->getObjectsByType('Sig'); + + $this->assertTrue(isset($sigObject['4_0'])); + $this->assertEquals('()\\', (string) $sigObject['4_0']->getHeader()->get('Contents')); + + $details = $document->getDetails(); + $this->assertEquals('x(y)', $details['Producer'] ?? null); + $this->assertEquals('a(b)', $details['Creator'] ?? null); + } } class ParserSub extends Parser