Better octal and hex-entity decode (#640)

* Better octal and hex-entity decode

Octal strings can include series of backslashes of arbitrary length. If there is an odd number of backslashes, a following octal code is valid, but if there is an even number, the following octal code should not be translated.

Previously PdfParser would only account for two backslashes directly preceding an octal code. A commit from in-progress PR #634 extended this to three, which probably covers 99.99% of all cases. This change raises that to 100%: a string may contain any number of backslashes in a row, and codes will still be translated correctly.

Also update decodeEntities() to use a preg_replace_callback() instead of the bulkier preg_split() + foreach loop. Make sure it matches all hexadecimal digits including a-f.

Add new tests for both of these.

* Use #2D to ensure we're capturing hex letters

* Change order of special string replacement

Move the special string replacement after the unescaping of parentheses so we don't unescape any parentheses we shouldn't. Add more tests to make sure this is working.

* Apply suggestions from code review

Co-authored-by: Konrad Abicht <[email protected]>

---------

Co-authored-by: Konrad Abicht <[email protected]>
smalot · Sep 26, 2023 · 5c48261 · 5c48261
1 parent 53538eb
commit 5c48261
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 22 deletions.
diff --git a/src/Smalot/PdfParser/Font.php b/src/Smalot/PdfParser/Font.php
@@ -349,37 +349,30 @@ public static function decodeHexadecimal(string $hexa, bool $add_braces = false)
      */
     public static function decodeOctal(string $text): string
     {
-        $parts = preg_split('/(?<!\\\\)(\\\\[0-7]{1,3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
-        $text = '';
+        // Replace all double backslashes \\ with a special string
+        $text = strtr($text, ['\\\\' => '[**pdfparserdblslsh**]']);
 
-        foreach ($parts as $part) {
-            if (preg_match('/^\\\\[0-7]{1,3}$/', $part)) {
-                $text .= \chr(octdec(trim($part, '\\')));
-            } else {
-                $text .= str_replace(['\\\\', '\\(', '\\)'], ['\\', '(', ')'], $part);
-            }
-        }
+        // Now we can replace all octal codes without worrying about
+        // escaped backslashes
+        $text = preg_replace_callback('/\\\\([0-7]{1,3})/', function ($m) {
+            return \chr(octdec($m[1]));
+        }, $text);
 
-        return $text;
+        // Unescape any parentheses
+        $text = str_replace(['\\(', '\\)'], ['(', ')'], $text);
+
+        // Replace instances of the special string with a single backslash
+        return str_replace('[**pdfparserdblslsh**]', '\\', $text);
     }
 
     /**
      * Decode string with html entity encoded chars.
      */
     public static function decodeEntities(string $text): string
     {
-        $parts = preg_split('/(#\d{2})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
-        $text = '';
-
-        foreach ($parts as $part) {
-            if (preg_match('/^#\d{2}$/', $part)) {
-                $text .= \chr(hexdec(trim($part, '#')));
-            } else {
-                $text .= $part;
-            }
-        }
-
-        return $text;
+        return preg_replace_callback('/#([0-9a-f]{2})/i', function ($m) {
+            return \chr(hexdec($m[1]));
+        }, $text);
     }
 
     /**

diff --git a/tests/PHPUnit/Integration/FontTest.php b/tests/PHPUnit/Integration/FontTest.php
@@ -281,12 +281,29 @@ public function testDecodeOctal(): void
         $this->assertEquals('AB C', Font::decodeOctal('\\101\\102\\040\\103'));
         $this->assertEquals('AB CD', Font::decodeOctal('\\101\\102\\040\\103D'));
         $this->assertEquals('AB \199', Font::decodeOctal('\\101\\102\\040\\\\199'));
+
+        // Test that series of backslashes of arbitrary length are decoded properly
+        $this->assertEquals('-', Font::decodeOctal('\\055')); // \055
+        $this->assertEquals('\\055', Font::decodeOctal('\\\\055')); // \\055
+        $this->assertEquals('\\-', Font::decodeOctal('\\\\\\055')); // \\\055
+        $this->assertEquals('\\\\055', Font::decodeOctal('\\\\\\\\055')); // \\\\055
+        $this->assertEquals('\\\\-', Font::decodeOctal('\\\\\\\\\\055')); // \\\\\055
+        $this->assertEquals('\\\\\\055', Font::decodeOctal('\\\\\\\\\\\\055')); // \\\\\\055
+        $this->assertEquals('\\\\\\-', Font::decodeOctal('\\\\\\\\\\\\\\055')); // \\\\\\\055
+
+        // Make sure we're unescaping ( and ) before returning the escaped
+        // backslashes to the string
+        $this->assertEquals('\\(', Font::decodeOctal('\\\\(')); // \\( - nothing to unescape
+        $this->assertEquals('\\(', Font::decodeOctal('\\\\\\(')); // \\\( - parenthesis unescaped
+        $this->assertEquals('\\\\(', Font::decodeOctal('\\\\\\\\(')); // \\\\( - nothing to unescape
+        $this->assertEquals('\\\\(', Font::decodeOctal('\\\\\\\\\\(')); // \\\\\( - parenthesis unescaped
     }
 
     public function testDecodeEntities(): void
     {
         $this->assertEquals('File Type', Font::decodeEntities('File#20Type'));
         $this->assertEquals('File# Ty#pe', Font::decodeEntities('File##20Ty#pe'));
+        $this->assertEquals('Fi#le#-Ty#p#e ', Font::decodeEntities('Fi#23le##2DTy#p#e '));
     }
 
     public function testDecodeUnicode(): void