diff --git a/.php-cs-fixer.php b/.php-cs-fixer.php index c860c516..683637a2 100644 --- a/.php-cs-fixer.php +++ b/.php-cs-fixer.php @@ -14,10 +14,8 @@ $config = new Config(); $config ->setFinder($finder) - ->setRiskyAllowed(true) ->setRules([ - '@Symfony' => true, - '@Symfony:risky' => true, + '@PSR12' => true, 'array_syntax' => ['syntax' => 'short'], 'no_empty_phpdoc' => true, 'no_unused_imports' => true, diff --git a/samples/bugs/Issue727.pdf b/samples/bugs/Issue727.pdf new file mode 100644 index 00000000..1d2f89ae Binary files /dev/null and b/samples/bugs/Issue727.pdf differ diff --git a/src/Smalot/PdfParser/Document.php b/src/Smalot/PdfParser/Document.php index df0a6402..964592e4 100644 --- a/src/Smalot/PdfParser/Document.php +++ b/src/Smalot/PdfParser/Document.php @@ -255,7 +255,7 @@ public function extractXMPMetadata(string $content): void if ('rdf:li' == $val['tag']) { $metadata[] = $val['value']; - // Else assign a value to this property + // Else assign a value to this property } else { $metadata[$val['tag']] = $val['value']; } diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php index 407614d2..84f3f15e 100644 --- a/src/Smalot/PdfParser/PDFObject.php +++ b/src/Smalot/PdfParser/PDFObject.php @@ -451,7 +451,7 @@ public function getSectionsText(?string $content): array $inTextBlock = true; $sections[] = $line; - // If an 'ET' is encountered, unset the $inTextBlock flag + // If an 'ET' is encountered, unset the $inTextBlock flag } elseif ('ET' == $line) { $inTextBlock = false; $sections[] = $line; diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index 5e17083a..a6256f21 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -214,8 +214,11 @@ protected function decodeXref(string $pdfData, int $startxref, array $xref = []) } } if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { - // get previous xref - $xref = $this->getXrefData($pdfData, (int) $matches[1], $xref); + $offset = (int) $matches[1]; + if (0 != $offset) { + // get previous xref + $xref = $this->getXrefData($pdfData, $offset, $xref); + } } } else { throw new \Exception('Unable to find trailer'); @@ -264,7 +267,8 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref if ( ('/' == $v[0]) && ('Type' == $v[1]) - && (isset($sarr[$k + 1]) + && ( + isset($sarr[$k + 1]) && '/' == $sarr[$k + 1][0] && 'XRef' == $sarr[$k + 1][1] ) @@ -290,7 +294,8 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref if ( '/' == $vdc[0] && 'Columns' == $vdc[1] - && (isset($decpar[$kdc + 1]) + && ( + isset($decpar[$kdc + 1]) && 'numeric' == $decpar[$kdc + 1][0] ) ) { @@ -298,7 +303,8 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref } elseif ( '/' == $vdc[0] && 'Predictor' == $vdc[1] - && (isset($decpar[$kdc + 1]) + && ( + isset($decpar[$kdc + 1]) && 'numeric' == $decpar[$kdc + 1][0] ) ) { @@ -404,7 +410,7 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref } $prev_row = $ddata[$k]; } // end for each row - // complete decoding + // complete decoding } else { // number of bytes in a row $rowlen = array_sum($wb); diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index 7a586932..c0b7cf9f 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -194,4 +194,23 @@ public function testGetXrefDataIssue673(): void self::assertStringContainsString('6 rue des Goutais', $text); } + + /** + * Handle self referencing xref + * + * It seems that some PDF creators output `Prev 0` when there is no previous xref. + * + * @see https://github.com/smalot/pdfparser/pull/727 + */ + public function testDecodeXrefIssue727(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue727.pdf'; + + // Parsing this document would previously cause an infinite loop + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $text = $document->getText(); + + self::assertStringContainsString('', $text); + } }