From 8d005080cc21dc1e476260f31b6e85681a307379 Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Mon, 25 Mar 2024 16:10:21 -0400 Subject: [PATCH] More robust check for BI within strings `BI` "commands" within strings should not be parsed as the beginning of inline image blocks. Detect if the `BI` we found is inside a (string) and if it is, note the offset and move past it for the next match. --- src/Smalot/PdfParser/PDFObject.php | 59 +++++++++++++++++---- tests/PHPUnit/Integration/PDFObjectTest.php | 18 ++++++- 2 files changed, 65 insertions(+), 12 deletions(-) diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php index 20e92334..ab3b66e8 100644 --- a/src/Smalot/PdfParser/PDFObject.php +++ b/src/Smalot/PdfParser/PDFObject.php @@ -227,15 +227,54 @@ private function formatContent(?string $content): string // Find all inline image content and replace them so they aren't // affected by the next steps $pdfInlineImages = []; - while (preg_match('/\sBI\s(.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text)) { - $id = uniqid('IMAGE_', true); - $pdfInlineImages[$id] = [$text[1], $text[2]]; - $content = preg_replace( - '/'.preg_quote($text[0], '/').'/', - '^^^'.$id.'^^^', - $content, - 1 - ); + $offsetBI = 0; + while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) { + // Attempt to determine if this instance of the 'BI' command + // actually occurred within a (string) using the following + // steps: + + // Remove any escaped parentheses from the alleged image + // characteristics data + $para = str_replace(['\\(', '\\)'], '', $text[1][0]); + + // Remove all correctly ordered and balanced parentheses + // from (strings) + do { + $paraTest = $para; + $para = preg_replace('/\(([^)]*)\)/', '$1', $paraTest); + } while ($para != $paraTest); + + $paraOpen = strpos($para, '('); + $paraClose = strpos($para, ')'); + + // If the remaining text contains a close parenthesis ')' + // AND it occurs before 
any open parenthesis, then we are + // almost certain to be inside a (string) + if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) { + // Bump the search offset forward and match again + $offsetBI = (int) $text[1][1]; + continue; + } + + // Double check that this is actually inline image data by + // parsing the alleged image characteristics as a dictionary + $dict = $this->parseDictionary('<<'.$text[1][0].'>>'); + + // Check if an image Width and Height are set in the dict + if ((isset($dict['W']) || isset($dict['Width'])) + && (isset($dict['H']) || isset($dict['Height']))) { + $id = uniqid('IMAGE_', true); + $pdfInlineImages[$id] = [ + preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]), + preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]), + ]; + $content = preg_replace( + '/'.preg_quote($text[0][0], '/').'/', + '^^^'.$id.'^^^', + $content, + 1 + ); + } } // Find all strings () and replace them so they aren't affected @@ -338,7 +377,7 @@ private function formatContent(?string $content): string foreach ($pdfInlineImages as $id => $image) { $content = str_replace( '^^^'.$id.'^^^', - "\r\nBI\r\n".$image[0]."\r\nID\r\n".$image[1]."\r\nEI\r\n", + "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." 
EI\r\n", $content ); } diff --git a/tests/PHPUnit/Integration/PDFObjectTest.php b/tests/PHPUnit/Integration/PDFObjectTest.php index a2ee699f..947a6944 100644 --- a/tests/PHPUnit/Integration/PDFObjectTest.php +++ b/tests/PHPUnit/Integration/PDFObjectTest.php @@ -298,15 +298,29 @@ public function testFormatContentInlineImages(): void $cleaned = $formatContent->invoke( $this->getPdfObjectInstance(new Document()), - 'q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150 + 'BT (This BI /W 258 /H 51 /should not trigger /as a /PDF command) TD ET q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150 /BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g' ); // PdfParser should not be fooled by Q's in inline image data; // Only one 'Q' command should be found $commandQ = preg_match_all('/Q\r\n/', $cleaned); - $this->assertEquals(1, $commandQ); + + // The 'BI' inside a string should not be interpreted as the + // beginning of an inline image command + $this->assertStringContainsString('(This BI /W 258 /H 51 /should not trigger /as a /PDF command) TD', $cleaned); + + $cleaned = $formatContent->invoke( + $this->getPdfObjectInstance(new Document()), + 'BT (This BI /W 258 /H 51 /should not () \) trigger /as a /PDF command) TD (There is no ID inline image in this data) TD (Nothing but text EI should be found) TD ET' + ); + + $this->assertEquals('BT'."\r\n". +'(This BI /W 258 /H 51 /should not () \) trigger /as a /PDF command) TD'."\r\n". +'(There is no ID inline image in this data) TD'."\r\n". +'(Nothing but text EI should be found) TD'."\r\n". +'ET', $cleaned); } public function testGetSectionsText(): void