Skip to content

Commit

Permalink
More robust check for BI within strings
Browse files Browse the repository at this point in the history
`BI` "commands" within strings should not be parsed as the beginning of inline image blocks. Detect if the `BI` we found is inside a (string) and if it is, note the offset and move past it for the next match.
  • Loading branch information
GreyWyvern committed Mar 25, 2024
1 parent 6f0ef9c commit 8d00508
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 12 deletions.
59 changes: 49 additions & 10 deletions src/Smalot/PdfParser/PDFObject.php
Original file line number Diff line number Diff line change
Expand Up @@ -227,15 +227,54 @@ private function formatContent(?string $content): string
// Find all inline image content and replace them so they aren't
// affected by the next steps
$pdfInlineImages = [];
while (preg_match('/\sBI\s(.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text)) {
$id = uniqid('IMAGE_', true);
$pdfInlineImages[$id] = [$text[1], $text[2]];
$content = preg_replace(
'/'.preg_quote($text[0], '/').'/',
'^^^'.$id.'^^^',
$content,
1
);
$offsetBI = 0;
while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) {
// Attempt to determine if this instance of the 'BI' command
// actually occurred within a (string) using the following
// steps:

// Remove any escaped parentheses from the alleged image
// characteristics data
$para = str_replace(['\\(', '\\)'], '', $text[1][0]);

// Remove all correctly ordered and balanced parentheses
// from (strings)
do {
$paraTest = $para;
$para = preg_replace('/\(([^)]*)\)/', '$1', $paraTest);
} while ($para != $paraTest);

$paraOpen = strpos($para, '(');
$paraClose = strpos($para, ')');

// If the remaining text contains a close parenthesis ')'
// AND it occurs before any open parenthesis, then we are
// almost certain to be inside a (string)
if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) {
// Bump the search offset forward and match again
$offsetBI = (int) $text[1][1];
continue;
}

// Double check that this is actually inline image data by
// parsing the alleged image characteristics as a dictionary
$dict = $this->parseDictionary('<<'.$text[1][0].'>>');

// Check if an image Width and Height are set in the dict
if ((isset($dict['W']) || isset($dict['Width']))
&& (isset($dict['H']) || isset($dict['Height']))) {
$id = uniqid('IMAGE_', true);
$pdfInlineImages[$id] = [
preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]),
preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]),
];
$content = preg_replace(
'/'.preg_quote($text[0][0], '/').'/',
'^^^'.$id.'^^^',
$content,
1
);
}
}

// Find all strings () and replace them so they aren't affected
Expand Down Expand Up @@ -338,7 +377,7 @@ private function formatContent(?string $content): string
foreach ($pdfInlineImages as $id => $image) {
$content = str_replace(
'^^^'.$id.'^^^',
"\r\nBI\r\n".$image[0]."\r\nID\r\n".$image[1]."\r\nEI\r\n",
"\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n",
$content
);
}
Expand Down
18 changes: 16 additions & 2 deletions tests/PHPUnit/Integration/PDFObjectTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -298,15 +298,29 @@ public function testFormatContentInlineImages(): void

$cleaned = $formatContent->invoke(
$this->getPdfObjectInstance(new Document()),
'q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150
'BT (This BI /W 258 /H 51 /should not trigger /as a /PDF command) TD ET q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150
/BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?<Q)riWgtEe:Po0dY_-er6$jM@#?n`E+#(sa"0Gk3&K>CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g'
);

// PdfParser should not be fooled by Q's in inline image data;
// only one 'Q' command should be found
$commandQ = preg_match_all('/Q\r\n/', $cleaned);

$this->assertEquals(1, $commandQ);

// The 'BI' inside a string should not be interpreted as the
// beginning of an inline image command
$this->assertStringContainsString('(This BI /W 258 /H 51 /should not trigger /as a /PDF command) TD', $cleaned);

$cleaned = $formatContent->invoke(
$this->getPdfObjectInstance(new Document()),
'BT (This BI /W 258 /H 51 /should not () \) trigger /as a /PDF command) TD (There is no ID inline image in this data) TD (Nothing but text EI should be found) TD ET'
);

$this->assertEquals('BT'."\r\n".
'(This BI /W 258 /H 51 /should not () \) trigger /as a /PDF command) TD'."\r\n".
'(There is no ID inline image in this data) TD'."\r\n".
'(Nothing but text EI should be found) TD'."\r\n".
'ET', $cleaned);
}

public function testGetSectionsText(): void
Expand Down

0 comments on commit 8d00508

Please sign in to comment.