Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix hexadecimal decoding (fixes #715) #716

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added samples/bugs/Issue715.pdf
Binary file not shown.
7 changes: 3 additions & 4 deletions src/Smalot/PdfParser/Element/ElementHexa.php
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,11 @@ public static function parse(string $content, ?Document $document = null, int &$
if (preg_match('/^\s*\<(?P<name>[A-F0-9]+)\>/is', $content, $match)) {
$name = $match['name'];
$offset += strpos($content, '<'.$name) + \strlen($name) + 2; // 1 for '>'
// repackage string as standard
$name = '('.self::decode($name).')';
$element = ElementDate::parse($name, $document);
$name = self::decode($name);
$element = ElementDate::parse('('.$name.')', $document);

if (!$element) {
$element = ElementString::parse($name, $document);
$element = new self($name);
}
Comment on lines -50 to 55
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please give me a short summary what you try to accomplish here.

Why removing the reference to $document?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also why switching from ElementString to ElementHexa? Is it to match others existing behavior from other elements classes?


return $element;
Expand Down
92 changes: 75 additions & 17 deletions src/Smalot/PdfParser/Element/ElementString.php
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,51 @@ public function equals($value): bool
return $value == $this->value;
}

/**
* Part of parsing process to handle escaped characters.
* Note, most parameters are passed by reference.
*
* Further information in PDF specification (page 53):
* https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf
*/
private static function handleEscapedCharacters(string &$name, int &$position, string &$processedName, string $char): void
{
// escaped chars
$nextChar = substr($name, 0, 1);
switch ($nextChar) {
// end-of-line markers (CR, LF, CRLF) should be ignored
case "\r":
case "\n":
preg_match('/^\\r?\\n?/', $name, $matches);
$name = substr($name, \strlen($matches[0]));
$position += \strlen($matches[0]);
break;
// process LF, CR, HT, BS, FF
case 'n':
case 't':
case 'r':
case 'b':
case 'f':
$processedName .= stripcslashes('\\'.$nextChar);
$name = substr($name, 1);
++$position;
break;
// decode escaped parentheses and backslash
case '(':
case ')':
case '\\':
case ' ': // TODO: this should probably be removed - kept for compatibility
$processedName .= $nextChar;
$name = substr($name, 1);
++$position;
break;
Comment on lines +87 to +91
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean with "kept for compatibility"? To what?

// TODO: process octal encoding (but it is also processed later)
// keep backslash in other cases
default:
$processedName .= $char;
}
}

/**
* @return bool|ElementString
*/
Expand All @@ -59,25 +104,38 @@ public static function parse(string $content, ?Document $document = null, int &$
if (preg_match('/^\s*\((?P<name>.*)/s', $content, $match)) {
$name = $match['name'];

// Find next ')' not escaped.
$cur_start_text = $start_search_end = 0;
while (false !== ($cur_start_pos = strpos($name, ')', $start_search_end))) {
$cur_extract = substr($name, $cur_start_text, $cur_start_pos - $cur_start_text);
preg_match('/(?P<escape>[\\\]*)$/s', $cur_extract, $match);
if (!(\strlen($match['escape']) % 2)) {
break;
$delimiterCount = 0;
$position = 0;
$processedName = '';
do {
$char = substr($name, 0, 1);
$name = substr($name, 1);
++$position;
switch ($char) {
// matched delimiters should be treated as part of string
case '(':
$processedName .= $char;
++$delimiterCount;
break;
case ')':
if (0 === $delimiterCount) {
$name = substr($name, 1);
break 2;
}
$processedName .= $char;
--$delimiterCount;
break;
case '\\':
self::handleEscapedCharacters($name, $position, $processedName, $char);
break;
default:
$processedName .= $char;
}
$start_search_end = $cur_start_pos + 1;
}
} while ('' !== $name);

$offset += strpos($content, '(') + 1 + $position;

// Extract string.
$name = substr($name, 0, (int) $cur_start_pos);
$offset += strpos($content, '(') + $cur_start_pos + 2; // 2 for '(' and ')'
$name = str_replace(
['\\\\', '\\ ', '\\/', '\(', '\)', '\n', '\r', '\t'],
['\\', ' ', '/', '(', ')', "\n", "\r", "\t"],
$name
);
$name = $processedName;

// Decode string.
$name = Font::decodeOctal($name);
Expand Down
2 changes: 1 addition & 1 deletion src/Smalot/PdfParser/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ protected function parseHeaderElement(?string $type, $value, ?Document $document
return ElementString::parse('('.$value.')', $document);

case '<':
return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);
return ElementHexa::parse('<'.$value.'>', $document);

case '/':
return ElementName::parse('/'.$value, $document);
Expand Down
13 changes: 13 additions & 0 deletions tests/PHPUnit/Integration/Element/ElementHexaTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -131,4 +131,17 @@ public function testParse(): void

$this->assertEquals('pasqua, primavera, resurrezione, festa cristiana, gesù, uova di cioccolata, coniglietti, pulcini, pasquale, campane, dina rebucci, uova di pasqua, ', $element);
}

/**
* Closing round bracket encoded in hexadecimal format breaks parsing - string is truncated.
*
* @see https://github.com/smalot/pdfparser/issues/715
*/
public function testIssue715(): void
{
$offset = 0;
$testString = '()\\';
$element = ElementHexa::parse('<'.bin2hex($testString).'>', null, $offset);
$this->assertEquals($testString, (string) $element);
}
}
32 changes: 32 additions & 0 deletions tests/PHPUnit/Integration/Element/ElementStringTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,38 @@ public function testParse(): void
$this->assertEquals(27, $offset);
}

/**
* @see https://github.com/smalot/pdfparser/issues/715
*/
public function testParseIssue715(): void
{
$element = ElementString::parse('(())');
$this->assertEquals('()', $element->getContent());

// source: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf
// page 54
$string = '(Strings may contain balanced parentheses ( ) and
special characters (*!&}^% and so on).)';
$element = ElementString::parse($string);
$this->assertEquals('Strings may contain balanced parentheses ( ) and
special characters (*!&}^% and so on).', $element->getContent());

// source: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf
// page 55
$string = '( This string has an end−of−line at the end of it.
)';
$element = ElementString::parse($string);
$this->assertEquals(' This string has an end−of−line at the end of it.'.\PHP_EOL, $element->getContent());

// source: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf
// page 55
$string = '( These \
two strings \
are the same.)';
$element = ElementString::parse($string);
$this->assertEquals(' These two strings are the same.', $element->getContent());
}

public function testGetContent(): void
{
$element = new ElementString('Copyright');
Expand Down
21 changes: 21 additions & 0 deletions tests/PHPUnit/Integration/ParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,27 @@ public function testIgnoreEncryption(): void

// without the configuration option set, an exception would be thrown.
}

/**
* Tests special chars encoded as hex.
*
* @see https://github.com/smalot/pdfparser/issues/715
*/
public function testIssue715SpecialCharsEncodedAsHex(): void
{
$filename = $this->rootDir.'/samples/bugs/Issue715.pdf';

$this->fixture = new Parser();
$document = $this->fixture->parseFile($filename);
$sigObject = $document->getObjectsByType('Sig');

$this->assertTrue(isset($sigObject['4_0']));
$this->assertEquals('()\\', (string) $sigObject['4_0']->getHeader()->get('Contents'));

$details = $document->getDetails();
$this->assertEquals('x(y)', $details['Producer'] ?? null);
$this->assertEquals('a(b)', $details['Creator'] ?? null);
}
}

class ParserSub extends Parser
Expand Down
Loading