smalot · krzyc · May 27, 2024 · May 27, 2024 · May 28, 2024 · May 29, 2024
diff --git a/samples/bugs/Issue715.pdf b/samples/bugs/Issue715.pdf
diff --git a/src/Smalot/PdfParser/Element/ElementHexa.php b/src/Smalot/PdfParser/Element/ElementHexa.php
@@ -47,12 +47,11 @@ public static function parse(string $content, ?Document $document = null, int &$
         if (preg_match('/^\s*\<(?P<name>[A-F0-9]+)\>/is', $content, $match)) {
             $name = $match['name'];
             $offset += strpos($content, '<'.$name) + \strlen($name) + 2; // 1 for '>'
-            // repackage string as standard
-            $name = '('.self::decode($name).')';
-            $element = ElementDate::parse($name, $document);
+            $name = self::decode($name);
+            $element = ElementDate::parse('('.$name.')', $document);
 
             if (!$element) {
-                $element = ElementString::parse($name, $document);
+                $element = new self($name);
             }
 
             return $element;

diff --git a/src/Smalot/PdfParser/Element/ElementString.php b/src/Smalot/PdfParser/Element/ElementString.php
@@ -51,6 +51,51 @@ public function equals($value): bool
         return $value == $this->value;
     }
 
+    /**
+     * Part of parsing process to handle escaped characters.
+     * Note, most parameters are passed by reference.
+     *
+     * Further information in PDF specification (page 53):
+     * https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf
+     */
+    private static function handleEscapedCharacters(string &$name, int &$position, string &$processedName, string $char): void
+    {
+        // escaped chars
+        $nextChar = substr($name, 0, 1);
+        switch ($nextChar) {
+            // end-of-line markers (CR, LF, CRLF) should be ignored
+            case "\r":
+            case "\n":
+                preg_match('/^\\r?\\n?/', $name, $matches);
+                $name = substr($name, \strlen($matches[0]));
+                $position += \strlen($matches[0]);
+                break;
+                // process LF, CR, HT, BS, FF
+            case 'n':
+            case 't':
+            case 'r':
+            case 'b':
+            case 'f':
+                $processedName .= stripcslashes('\\'.$nextChar);
+                $name = substr($name, 1);
+                ++$position;
+                break;
+                // decode escaped parentheses and backslash
+            case '(':
+            case ')':
+            case '\\':
+            case ' ': // TODO: this should probably be removed - kept for compatibility
+                $processedName .= $nextChar;
+                $name = substr($name, 1);
+                ++$position;
+                break;
+                // TODO: process octal encoding (but it is also processed later)
+                // keep backslash in other cases
+            default:
+                $processedName .= $char;
+        }
+    }
+
     /**
      * @return bool|ElementString
      */
@@ -59,25 +104,38 @@ public static function parse(string $content, ?Document $document = null, int &$
         if (preg_match('/^\s*\((?P<name>.*)/s', $content, $match)) {
             $name = $match['name'];
 
-            // Find next ')' not escaped.
-            $cur_start_text = $start_search_end = 0;
-            while (false !== ($cur_start_pos = strpos($name, ')', $start_search_end))) {
-                $cur_extract = substr($name, $cur_start_text, $cur_start_pos - $cur_start_text);
-                preg_match('/(?P<escape>[\\\]*)$/s', $cur_extract, $match);
-                if (!(\strlen($match['escape']) % 2)) {
-                    break;
+            $delimiterCount = 0;
+            $position = 0;
+            $processedName = '';
+            do {
+                $char = substr($name, 0, 1);
+                $name = substr($name, 1);
+                ++$position;
+                switch ($char) {
+                    // matched delimiters should be treated as part of string
+                    case '(':
+                        $processedName .= $char;
+                        ++$delimiterCount;
+                        break;
+                    case ')':
+                        if (0 === $delimiterCount) {
+                            $name = substr($name, 1);
+                            break 2;
+                        }
+                        $processedName .= $char;
+                        --$delimiterCount;
+                        break;
+                    case '\\':
+                        self::handleEscapedCharacters($name, $position, $processedName, $char);
+                        break;
+                    default:
+                        $processedName .= $char;
                 }
-                $start_search_end = $cur_start_pos + 1;
-            }
+            } while ('' !== $name);
+
+            $offset += strpos($content, '(') + 1 + $position;
 
-            // Extract string.
-            $name = substr($name, 0, (int) $cur_start_pos);
-            $offset += strpos($content, '(') + $cur_start_pos + 2; // 2 for '(' and ')'
-            $name = str_replace(
-                ['\\\\', '\\ ', '\\/', '\(', '\)', '\n', '\r', '\t'],
-                ['\\',   ' ',   '/',   '(',  ')',  "\n", "\r", "\t"],
-                $name
-            );
+            $name = $processedName;
 
             // Decode string.
             $name = Font::decodeOctal($name);

diff --git a/src/Smalot/PdfParser/Parser.php b/src/Smalot/PdfParser/Parser.php
@@ -296,7 +296,7 @@ protected function parseHeaderElement(?string $type, $value, ?Document $document
                 return ElementString::parse('('.$value.')', $document);
 
             case '<':
-                return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);
+                return ElementHexa::parse('<'.$value.'>', $document);
 
             case '/':
                 return ElementName::parse('/'.$value, $document);

diff --git a/tests/PHPUnit/Integration/Element/ElementHexaTest.php b/tests/PHPUnit/Integration/Element/ElementHexaTest.php
@@ -131,4 +131,17 @@ public function testParse(): void
 
         $this->assertEquals('pasqua, primavera, resurrezione, festa cristiana, gesù, uova di cioccolata, coniglietti, pulcini, pasquale, campane, dina rebucci, uova di pasqua, ', $element);
     }
+
+    /**
+     * Closing round bracket encoded in hexadecimal format breaks parsing - string is truncated.
+     *
+     * @see https://github.com/smalot/pdfparser/issues/715
+     */
+    public function testIssue715(): void
+    {
+        $offset = 0;
+        $testString = '()\\';
+        $element = ElementHexa::parse('<'.bin2hex($testString).'>', null, $offset);
+        $this->assertEquals($testString, (string) $element);
+    }
 }
diff --git a/tests/PHPUnit/Integration/Element/ElementStringTest.php b/tests/PHPUnit/Integration/Element/ElementStringTest.php
@@ -133,6 +133,38 @@ public function testParse(): void
         $this->assertEquals(27, $offset);
     }
 
+    /**
+     * @see https://github.com/smalot/pdfparser/issues/715
+     */
+    public function testParseIssue715(): void
+    {
+        $element = ElementString::parse('(())');
+        $this->assertEquals('()', $element->getContent());
+
+        // source: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf
+        // page 54
+        $string = '(Strings may contain balanced parentheses ( ) and
+special characters (*!&}^% and so on).)';
+        $element = ElementString::parse($string);
+        $this->assertEquals('Strings may contain balanced parentheses ( ) and
+special characters (*!&}^% and so on).', $element->getContent());
+
+        // source: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf
+        // page 55
+        $string = '( This string has an end−of−line at the end of it.
+)';
+        $element = ElementString::parse($string);
+        $this->assertEquals(' This string has an end−of−line at the end of it.'.\PHP_EOL, $element->getContent());
+
+        // source: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf
+        // page 55
+        $string = '( These \
+two strings \
+are the same.)';
+        $element = ElementString::parse($string);
+        $this->assertEquals(' These two strings are the same.', $element->getContent());
+    }
+
     public function testGetContent(): void
     {
         $element = new ElementString('Copyright');

diff --git a/tests/PHPUnit/Integration/ParserTest.php b/tests/PHPUnit/Integration/ParserTest.php
@@ -438,6 +438,27 @@ public function testIgnoreEncryption(): void
 
         // without the configuration option set, an exception would be thrown.
     }
+
+    /**
+     * Tests special chars encoded as hex.
+     *
+     * @see https://github.com/smalot/pdfparser/issues/715
+     */
+    public function testIssue715SpecialCharsEncodedAsHex(): void
+    {
+        $filename = $this->rootDir.'/samples/bugs/Issue715.pdf';
+
+        $this->fixture = new Parser();
+        $document = $this->fixture->parseFile($filename);
+        $sigObject = $document->getObjectsByType('Sig');
+
+        $this->assertTrue(isset($sigObject['4_0']));
+        $this->assertEquals('()\\', (string) $sigObject['4_0']->getHeader()->get('Contents'));
+
+        $details = $document->getDetails();
+        $this->assertEquals('x(y)', $details['Producer'] ?? null);
+        $this->assertEquals('a(b)', $details['Creator'] ?? null);
+    }
 }
 
 class ParserSub extends Parser