Add PCRE dotall modifier

Add the /s (dotall) modifier to the inline image regex so the `.` token matches newlines as well. Thanks to @iGrog for supplying another PDF that demonstrated this issue. Add the same modifier to the dictionary (`<< >>`) regex as well, fixing an earlier oversight. Move the inline image replacement before the string replacement, because parentheses in binary image data may be interpreted as the start of a string. Move the inline images test into its own function and add a newline to the sample data to exercise the dotall modifier change.
smalot · Mar 25, 2024 · 6f0ef9c
1 parent 4ae52e7
commit 6f0ef9c
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 28 deletions.
diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php
@@ -224,6 +224,20 @@ private function formatContent(?string $content): string
             return '';
         }
 
+        // Find all inline image content and replace them so they aren't
+        // affected by the next steps
+        $pdfInlineImages = [];
+        while (preg_match('/\sBI\s(.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text)) {
+            $id = uniqid('IMAGE_', true);
+            $pdfInlineImages[$id] = [$text[1], $text[2]];
+            $content = preg_replace(
+                '/'.preg_quote($text[0], '/').'/',
+                '^^^'.$id.'^^^',
+                $content,
+                1
+            );
+        }
+
         // Find all strings () and replace them so they aren't affected
         // by the next steps
         $pdfstrings = [];
@@ -254,27 +268,13 @@ private function formatContent(?string $content): string
             }
         }
 
-        // Find all inline image content and replace them so they aren't
-        // affected by the next steps
-        $pdfInlineImages = [];
-        while (preg_match('/\sBI(.+?)\sID\s(.+?)\sEI(?=\s|$)/', $content, $text)) {
-            $id = uniqid('IMAGE_', true);
-            $pdfInlineImages[$id] = [$text[1], $text[2]];
-            $content = preg_replace(
-                '/'.preg_quote($text[0], '/').'/',
-                '^^^'.$id.'^^^',
-                $content,
-                1
-            );
-        }
-
         // Remove all carriage returns and line-feeds from the document stream
         $content = str_replace(["\r", "\n"], ' ', trim($content));
 
         // Find all dictionary << >> commands and replace them so they
         // aren't affected by the next steps
         $dictstore = [];
-        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $dicttext)) {
+        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext)) {
             $dictid = uniqid('DICT_', true);
             $dictstore[$dictid] = $dicttext[1];
             $content = preg_replace(
@@ -317,16 +317,6 @@ private function formatContent(?string $content): string
             $content = str_replace('###'.$id.'###', $dict, $content);
         }
 
-        // Restore the original content of any inline images
-        $pdfInlineImages = array_reverse($pdfInlineImages, true);
-        foreach ($pdfInlineImages as $id => $image) {
-            $content = str_replace(
-                '^^^'.$id.'^^^',
-                "\r\nBI\r\n".$image[0]."\r\nID\r\n".$image[1]."\r\nEI\r\n",
-                $content
-            );
-        }
-
         // Restore the original string content
         $pdfstrings = array_reverse($pdfstrings, true);
         foreach ($pdfstrings as $id => $text) {
@@ -343,6 +333,16 @@ private function formatContent(?string $content): string
             $content = str_replace('@@@'.$id.'@@@', $text, $content);
         }
 
+        // Restore the original content of any inline images
+        $pdfInlineImages = array_reverse($pdfInlineImages, true);
+        foreach ($pdfInlineImages as $id => $image) {
+            $content = str_replace(
+                '^^^'.$id.'^^^',
+                "\r\nBI\r\n".$image[0]."\r\nID\r\n".$image[1]."\r\nEI\r\n",
+                $content
+            );
+        }
+
         $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));
 
         return $content;

diff --git a/tests/PHPUnit/Integration/PDFObjectTest.php b/tests/PHPUnit/Integration/PDFObjectTest.php
@@ -284,12 +284,22 @@ public function testFormatContent(): void
 
         // Binary check is done before a regexp that causes an error
         $this->assertStringContainsString('Marko Nestorović PR', $pages[0]->getText());
+    }
+
+    /**
+     * Check that inline image data does not corrupt the stream
+     *
+     * @see: https://github.com/smalot/pdfparser/issues/691
+     */
+    public function testFormatContentInlineImages(): void
+    {
+        $formatContent = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent');
+        $formatContent->setAccessible(true);
 
-        // Check that inline image data does not corrupt the stream
-        // See: https://github.com/smalot/pdfparser/issues/691
         $cleaned = $formatContent->invoke(
             $this->getPdfObjectInstance(new Document()),
-            'q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150 /BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?<Q)riWgtEe:Po0dY_-er6$jM@#?n`E+#(sa"0Gk3&K>CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g'
+            'q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150
+/BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?<Q)riWgtEe:Po0dY_-er6$jM@#?n`E+#(sa"0Gk3&K>CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g'
         );
 
         // PdfParser should not be fooled by Q's in inline image data;