From e1edb34ef860b549880cf62cb2306a93c7edf567 Mon Sep 17 00:00:00 2001 From: Alastair Irvine Date: Fri, 19 Jan 2024 17:29:58 +0800 Subject: [PATCH 01/20] Fix for two bugs related to Unicode translation support by Font objects Symptom was that some documents' contents was rendering as a bunch of control characters. These are the untranslated strings. This was happening because for two different reasons, these strings weren't being translated \Smalot\PdfParser\Font::decodeContent() in some circumstances. First fix is to \Smalot\PdfParser\Font::loadTranslateTable(): - Fixed bug where bfchar sections weren't loaded due to mistake in regexp. - It now uses `*` instead of `+` and thus supports translation tables with lines like `<0000><0000>`. (Required `<0000> <0000>` before.) Second fix is for documents that attach their Font objects to the Pages object instead of each Page object: - \Smalot\PdfParser\Page now has a setFonts() method - \Smalot\PdfParser\Pages now declares its $fonts variable - \Smalot\PdfParser\Pages::getPages() now applies the object's fonts to each child Page - \Smalot\PdfParser\Pages::getFonts() copied from Page class --- src/Smalot/PdfParser/Font.php | 2 +- src/Smalot/PdfParser/Page.php | 7 +++++ src/Smalot/PdfParser/Pages.php | 53 ++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 1 deletion(-) diff --git a/src/Smalot/PdfParser/Font.php b/src/Smalot/PdfParser/Font.php index cfe85d79..af2d0f9f 100644 --- a/src/Smalot/PdfParser/Font.php +++ b/src/Smalot/PdfParser/Font.php @@ -191,7 +191,7 @@ public function loadTranslateTable(): array // Support for multiple bfchar sections if (preg_match_all('/beginbfchar(?P.*?)endbfchar/s', $content, $matches)) { foreach ($matches['sections'] as $section) { - $regexp = '/<(?P[0-9A-F]+)> +<(?P[0-9A-F]+)>[ \r\n]+/is'; + $regexp = '/<(?P[0-9A-F]+)> *<(?P[0-9A-F]+)>[ \r\n]+/is'; preg_match_all($regexp, $section, $matches); diff --git a/src/Smalot/PdfParser/Page.php b/src/Smalot/PdfParser/Page.php index d6ffaf08..c3bddc21 100644 --- a/src/Smalot/PdfParser/Page.php +++ b/src/Smalot/PdfParser/Page.php @@ -54,6 +54,13 @@ class Page extends PDFObject */ protected $dataTm; + public function setFonts($fonts) + { + if (empty($this->fonts)) { + $this->fonts = $fonts; + } + } + /** * @return Font[] */ diff --git a/src/Smalot/PdfParser/Pages.php b/src/Smalot/PdfParser/Pages.php index 6b878650..edaa24dd 100644 --- a/src/Smalot/PdfParser/Pages.php +++ b/src/Smalot/PdfParser/Pages.php @@ -39,6 +39,11 @@ */ class Pages extends PDFObject { + /** + * @var Font[] + */ + protected $fonts; + /** * @todo Objects other than Pages or Page might need to be treated specifically in order to get Page objects out of them, * @@ -57,6 +62,9 @@ public function getPages(bool $deep = false): array return $kidsElement->getContent(); } + // Prepare to apply the Pages' object's fonts to each page + $fonts = $this->getFonts(); + $kids = $kidsElement->getContent(); $pages = []; @@ -64,10 +72,55 @@ public function getPages(bool $deep = false): array if ($kid instanceof self) { $pages = array_merge($pages, $kid->getPages(true)); } elseif ($kid instanceof Page) { + if (!empty($this->fonts)) { + $kid->setFonts($fonts); + } $pages[] = $kid; } } return $pages; } + + /** + * @return Font[] + */ + public function getFonts() + { + if (null !== $this->fonts) { + return $this->fonts; + } + + $resources = $this->get('Resources'); + + if (method_exists($resources, 'has') && $resources->has('Font')) { + if ($resources->get('Font') instanceof ElementMissing) { + return []; + } + + if ($resources->get('Font') instanceof Header) { + $fonts = $resources->get('Font')->getElements(); + } else { + $fonts = $resources->get('Font')->getHeader()->getElements(); + } + + $table = []; + + foreach ($fonts as $id => $font) { + if ($font instanceof Font) { + $table[$id] = $font; + + // Store too on cleaned id value (only numeric) + $id = preg_replace('/[^0-9\.\-_]/', '', $id); + if ('' != $id) { + $table[$id] = $font; + } + } + } + + return $this->fonts = $table; + } + + return []; + } } From c96c1459bda4be3cf50cfaafe11898389d6a6a28 Mon Sep 17 00:00:00 2001 From: Alastair Irvine Date: Fri, 5 Apr 2024 18:12:29 +0800 Subject: [PATCH 02/20] Pages.php typo fix Added relative namespace for `ElementMissing` --- src/Smalot/PdfParser/Pages.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Smalot/PdfParser/Pages.php b/src/Smalot/PdfParser/Pages.php index edaa24dd..32b01ce1 100644 --- a/src/Smalot/PdfParser/Pages.php +++ b/src/Smalot/PdfParser/Pages.php @@ -94,7 +94,7 @@ public function getFonts() $resources = $this->get('Resources'); if (method_exists($resources, 'has') && $resources->has('Font')) { - if ($resources->get('Font') instanceof ElementMissing) { + if ($resources->get('Font') instanceof Element\ElementMissing) { return []; } From cd947ad3cb83023b08b34ed878559b5723736268 Mon Sep 17 00:00:00 2001 From: Alastair Irvine Date: Wed, 10 Apr 2024 18:57:54 +0800 Subject: [PATCH 03/20] Update src/Smalot/PdfParser/Page.php to make doc comment for setFonts() Co-authored-by: Konrad Abicht --- src/Smalot/PdfParser/Page.php | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Smalot/PdfParser/Page.php b/src/Smalot/PdfParser/Page.php index c3bddc21..aceb5bc7 100644 --- a/src/Smalot/PdfParser/Page.php +++ b/src/Smalot/PdfParser/Page.php @@ -54,6 +54,9 @@ class Page extends PDFObject */ protected $dataTm; + /** + * @param array<\Smalot\PdfParser\Font> $fonts + */ public function setFonts($fonts) { if (empty($this->fonts)) { From 6c1c3497e3006a879de367a26086a719a8915e14 Mon Sep 17 00:00:00 2001 From: Alastair Irvine Date: Wed, 10 Apr 2024 19:10:23 +0800 Subject: [PATCH 04/20] Update Pages.php to make getFonts() protected Requested by @k00ni --- src/Smalot/PdfParser/Pages.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Smalot/PdfParser/Pages.php b/src/Smalot/PdfParser/Pages.php index 32b01ce1..11b3b908 100644 --- a/src/Smalot/PdfParser/Pages.php +++ b/src/Smalot/PdfParser/Pages.php @@ -85,7 +85,7 @@ public function getPages(bool $deep = false): array /** * @return Font[] */ - public function getFonts() + protected function getFonts() { if (null !== $this->fonts) { return $this->fonts; From 8a8ea2cee1ab4ac0d93105fd0c9fd5214bf1ba85 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Fri, 19 Apr 2024 12:42:13 +0200 Subject: [PATCH 05/20] Refined Pages instance new getFonts method not only returned stored fonts but also built related fonts list. With this changes its easier to test and the replacement (setupFonts) only builds the fonts list. Also refined some phpdoc comments. --- src/Smalot/PdfParser/Pages.php | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/Smalot/PdfParser/Pages.php b/src/Smalot/PdfParser/Pages.php index 11b3b908..1d677d43 100644 --- a/src/Smalot/PdfParser/Pages.php +++ b/src/Smalot/PdfParser/Pages.php @@ -40,7 +40,7 @@ class Pages extends PDFObject { /** - * @var Font[] + * @var array<\Smalot\PdfParser\Font>|null */ protected $fonts; @@ -63,7 +63,10 @@ public function getPages(bool $deep = false): array } // Prepare to apply the Pages' object's fonts to each page - $fonts = $this->getFonts(); + if (false === \is_array($this->fonts)) { + $this->setupFonts(); + } + $fontsAvailable = 0 < \count($this->fonts); $kids = $kidsElement->getContent(); $pages = []; @@ -72,8 +75,8 @@ public function getPages(bool $deep = false): array if ($kid instanceof self) { $pages = array_merge($pages, $kid->getPages(true)); } elseif ($kid instanceof Page) { - if (!empty($this->fonts)) { - $kid->setFonts($fonts); + if ($fontsAvailable) { + $kid->setFonts($this->fonts); } $pages[] = $kid; } @@ -83,19 +86,18 @@ public function getPages(bool $deep = false): array } /** - * @return Font[] + * Gathers information about fonts and collects them in a list. + * + * @return void */ - protected function getFonts() + protected function setupFonts() { - if (null !== $this->fonts) { - return $this->fonts; - } - $resources = $this->get('Resources'); if (method_exists($resources, 'has') && $resources->has('Font')) { + // no fonts available, therefore stop here if ($resources->get('Font') instanceof Element\ElementMissing) { - return []; + return; } if ($resources->get('Font') instanceof Header) { @@ -118,9 +120,9 @@ protected function getFonts() } } - return $this->fonts = $table; + $this->fonts = $table; + } else { + $this->fonts = []; } - - return []; } } From b20ee6a06b7b11fb57192ccf27ee4c886feda2f8 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Fri, 19 Apr 2024 12:43:15 +0200 Subject: [PATCH 06/20] Added PagesTest.php with two integration tests --- tests/PHPUnit/Integration/PagesTest.php | 125 ++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 tests/PHPUnit/Integration/PagesTest.php diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php new file mode 100644 index 00000000..69a0ce9e --- /dev/null +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -0,0 +1,125 @@ + + * + * @date 2024-04-19 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Element\ElementArray; +use Smalot\PdfParser\Font; +use Smalot\PdfParser\Header; +use Smalot\PdfParser\Page; +use Smalot\PdfParser\Pages; + +/** + * @internal only for test purposes + */ +class PagesDummy extends Pages +{ + /** + * @param array<\Smalot\PdfParser\Font> $fonts + * + * @return void + */ + public function setFonts($fonts) + { + $this->fonts = $fonts; + } +} + +class PagesTest extends TestCase +{ + /** + * If fonts are not stored in Page instances but in the Pages instance. + * + * @see https://github.com/smalot/pdfparser/pull/698 + */ + public function testPullRequest698NoFontsSet(): void + { + $document = $this->createMock(Document::class); + + // create a Page mock and tell PHPUnit that its setFonts has to be called once + // otherwise an error is raised + $page1 = $this->createMock(Page::class); + $page1->expects($this->once())->method('setFonts'); + + // setup header + $header = new Header([ + 'Kids' => new ElementArray([ + $page1, + ]), + ], $document); + + $font1 = $this->createMock(Font::class); + + $pages = new PagesDummy($document, $header); + $pages->setFonts([$font1]); + + // we expect setFonts is called on $page1 + $pages->getPages(true); + } + + /** + * Dont override fonts list in Page, if available. + * + * @see https://github.com/smalot/pdfparser/pull/698 + */ + public function testPullRequest698DontOverride(): void + { + $document = $this->createMock(Document::class); + + // create a Page mock and tell PHPUnit that its setFonts has to be called once + // otherwise an error is raised + $font2 = new Font($document); + $page1 = new Page($document); + $page1->setFonts([$font2]); + + // setup header + $header = new Header([ + 'Kids' => new ElementArray([ + $page1, + ]), + ], $document); + + $font1 = $this->createMock(Font::class); + + $pages = new PagesDummy($document, $header); + $pages->setFonts([$font1]); + + $pages->getPages(true); + + // note: $font1 and $font2 are intenionally not both of the same type. + // one is a mock and the other one a real instance of Font. + // this way we can simply check the return value of getFonts here. + // if both were one of the other, we had to use a different assertation approach. + $this->assertEquals([$font2], $page1->getFonts()); + } +} From f761dd7deea6854f142f802f5e3a40ff2f6ac9a9 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Fri, 19 Apr 2024 12:49:10 +0200 Subject: [PATCH 07/20] Update Pages.php --- src/Smalot/PdfParser/Pages.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Smalot/PdfParser/Pages.php b/src/Smalot/PdfParser/Pages.php index 1d677d43..1bd232fa 100644 --- a/src/Smalot/PdfParser/Pages.php +++ b/src/Smalot/PdfParser/Pages.php @@ -42,7 +42,7 @@ class Pages extends PDFObject /** * @var array<\Smalot\PdfParser\Font>|null */ - protected $fonts; + protected $fonts = null; /** * @todo Objects other than Pages or Page might need to be treated specifically in order to get Page objects out of them, From ab1c62b6ab39ae03c5de5379fb69e73638e8769d Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Fri, 19 Apr 2024 12:52:50 +0200 Subject: [PATCH 08/20] Update Pages.php --- src/Smalot/PdfParser/Pages.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Smalot/PdfParser/Pages.php b/src/Smalot/PdfParser/Pages.php index 1bd232fa..1d677d43 100644 --- a/src/Smalot/PdfParser/Pages.php +++ b/src/Smalot/PdfParser/Pages.php @@ -42,7 +42,7 @@ class Pages extends PDFObject /** * @var array<\Smalot\PdfParser\Font>|null */ - protected $fonts = null; + protected $fonts; /** * @todo Objects other than Pages or Page might need to be treated specifically in order to get Page objects out of them, From ff83e115f5f1bd557ed651e73a0623bea9493b53 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Mon, 29 Apr 2024 09:00:37 +0200 Subject: [PATCH 09/20] added more information to PagesTest.php --- tests/PHPUnit/Integration/PagesTest.php | 27 ++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php index 69a0ce9e..534187f1 100644 --- a/tests/PHPUnit/Integration/PagesTest.php +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -45,6 +45,9 @@ class PagesDummy extends Pages { /** + * The purpose of this function is to bypass the tedious + * work to setup instances which lead to a valid $fonts variable. + * * @param array<\Smalot\PdfParser\Font> $fonts * * @return void @@ -60,13 +63,20 @@ class PagesTest extends TestCase /** * If fonts are not stored in Page instances but in the Pages instance. * + * Pages + * | `--- Font[] + * | + * | + * `--+ Page1 (no fonts) + * `--+ ... + * * @see https://github.com/smalot/pdfparser/pull/698 */ public function testPullRequest698NoFontsSet(): void { $document = $this->createMock(Document::class); - // create a Page mock and tell PHPUnit that its setFonts has to be called once + // Create a Page mock and tell PHPUnit that its setFonts has to be called once // otherwise an error is raised $page1 = $this->createMock(Page::class); $page1->expects($this->once())->method('setFonts'); @@ -80,15 +90,26 @@ public function testPullRequest698NoFontsSet(): void $font1 = $this->createMock(Font::class); + // Preset fonts variable so we don't have to prepare all the + // prerequisites manually (like creating a Ressources instance + // with Font instances, see Pages::setupFonts()) $pages = new PagesDummy($document, $header); $pages->setFonts([$font1]); - // we expect setFonts is called on $page1 + // We expect setFonts is called on $page1, therefore no assertion here $pages->getPages(true); } /** - * Dont override fonts list in Page, if available. + * Dont override fonts list in a Page instance, if available. + * + * Pages + * | `--- Font[] <=== less important than fonts in Page instance + * | + * | + * `--+ Page1 + * `--- Font[] <=== must be kept + * `--+ ... * * @see https://github.com/smalot/pdfparser/pull/698 */ From 9831b93afc63f49d3bef9278b4e74481d34a69b2 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Mon, 29 Apr 2024 09:02:22 +0200 Subject: [PATCH 10/20] Update Pages.php --- src/Smalot/PdfParser/Pages.php | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Smalot/PdfParser/Pages.php b/src/Smalot/PdfParser/Pages.php index 1d677d43..f95134b1 100644 --- a/src/Smalot/PdfParser/Pages.php +++ b/src/Smalot/PdfParser/Pages.php @@ -45,7 +45,8 @@ class Pages extends PDFObject protected $fonts; /** - * @todo Objects other than Pages or Page might need to be treated specifically in order to get Page objects out of them, + * @todo Objects other than Pages or Page might need to be treated specifically + * in order to get Page objects out of them. * * @see https://github.com/smalot/pdfparser/issues/331 */ @@ -89,6 +90,8 @@ public function getPages(bool $deep = false): array * Gathers information about fonts and collects them in a list. * * @return void + * + * @internal */ protected function setupFonts() { From ab010ff25c0cab76e28751ed6751ab86b9bda651 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Mon, 29 Apr 2024 09:14:34 +0200 Subject: [PATCH 11/20] Update Page.php --- src/Smalot/PdfParser/Page.php | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Smalot/PdfParser/Page.php b/src/Smalot/PdfParser/Page.php index aceb5bc7..0dafbfcc 100644 --- a/src/Smalot/PdfParser/Page.php +++ b/src/Smalot/PdfParser/Page.php @@ -56,6 +56,8 @@ class Page extends PDFObject /** * @param array<\Smalot\PdfParser\Font> $fonts + * + * @internal */ public function setFonts($fonts) { From fe212466543492d0b09d1e5913770c9b867139ea Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Mon, 29 Apr 2024 09:17:48 +0200 Subject: [PATCH 12/20] Update PagesTest.php --- tests/PHPUnit/Integration/PagesTest.php | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php index 534187f1..0dcdd443 100644 --- a/tests/PHPUnit/Integration/PagesTest.php +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -135,12 +135,14 @@ public function testPullRequest698DontOverride(): void $pages = new PagesDummy($document, $header); $pages->setFonts([$font1]); + // Trigger setupFonts method in $pages $pages->getPages(true); - // note: $font1 and $font2 are intenionally not both of the same type. - // one is a mock and the other one a real instance of Font. - // this way we can simply check the return value of getFonts here. - // if both were one of the other, we had to use a different assertation approach. + // Note: + // $font1 and $font2 are intenionally not both of the same type. + // One is a mock and the other one a real instance of Font. + // This way we can simply check the return value of getFonts here. + // If both were one of the other, we had to use a different assertation approach. $this->assertEquals([$font2], $page1->getFonts()); } } From 6899a17fb4cccbf7f9e529cba6bbecdaab5301c2 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Mon, 29 Apr 2024 09:22:37 +0200 Subject: [PATCH 13/20] Update PagesTest.php --- tests/PHPUnit/Integration/PagesTest.php | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php index 0dcdd443..ac8c188b 100644 --- a/tests/PHPUnit/Integration/PagesTest.php +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -64,10 +64,11 @@ class PagesTest extends TestCase * If fonts are not stored in Page instances but in the Pages instance. * * Pages - * | `--- Font[] + * | `--- fonts = Font[] <=== will be used to override fonts in Page1 ... * | * | - * `--+ Page1 (no fonts) + * `--+ Page1 + * | `--- fonts = null <=== Will be overwritten with the content of Pages.fonts * `--+ ... * * @see https://github.com/smalot/pdfparser/pull/698 @@ -104,11 +105,11 @@ public function testPullRequest698NoFontsSet(): void * Dont override fonts list in a Page instance, if available. * * Pages - * | `--- Font[] <=== less important than fonts in Page instance + * | `--- fonts = Font[] <=== Has to be ignored because fonts in Page1 is set * | * | * `--+ Page1 - * `--- Font[] <=== must be kept + * | `--- fonts = Font[] <=== must not be overwritten * `--+ ... * * @see https://github.com/smalot/pdfparser/pull/698 From 34012c29278aeccd8c97163d2f0864bd8d50acad Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Mon, 29 Apr 2024 09:24:01 +0200 Subject: [PATCH 14/20] Update PagesTest.php --- tests/PHPUnit/Integration/PagesTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php index ac8c188b..b75ccb5c 100644 --- a/tests/PHPUnit/Integration/PagesTest.php +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -139,7 +139,7 @@ public function testPullRequest698DontOverride(): void // Trigger setupFonts method in $pages $pages->getPages(true); - // Note: + // Note: // $font1 and $font2 are intenionally not both of the same type. // One is a mock and the other one a real instance of Font. // This way we can simply check the return value of getFonts here. From 7d93afe2e819436c9fa78bef61ec9b47a7eb4031 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Tue, 30 Apr 2024 14:04:30 +0200 Subject: [PATCH 15/20] PagesTest.php: fixed comment --- tests/PHPUnit/Integration/PagesTest.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php index b75ccb5c..dfe346b2 100644 --- a/tests/PHPUnit/Integration/PagesTest.php +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -118,8 +118,8 @@ public function testPullRequest698DontOverride(): void { $document = $this->createMock(Document::class); - // create a Page mock and tell PHPUnit that its setFonts has to be called once - // otherwise an error is raised + // Setup an empty Page instance and insert a Font instance. + // We wanna see later on, if $font2 is overwritten by $font1. $font2 = new Font($document); $page1 = new Page($document); $page1->setFonts([$font2]); From 80b3e3008047a7d9fc0ac2f43a53739ce1e7256d Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Fri, 10 May 2024 13:45:59 +0200 Subject: [PATCH 16/20] PagesTest.php: added test case which doesn't depend on setFonts explicitly --- tests/PHPUnit/Integration/PagesTest.php | 39 +++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php index dfe346b2..41a3504d 100644 --- a/tests/PHPUnit/Integration/PagesTest.php +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -146,4 +146,43 @@ public function testPullRequest698DontOverride(): void // If both were one of the other, we had to use a different assertation approach. $this->assertEquals([$font2], $page1->getFonts()); } + + /** + * In this example a Document instance is created, which has one Pages instance with a few Font instances. + * With the new functionality, related Font instances are passed down to related Page instances. + * + * @see https://github.com/smalot/pdfparser/pull/698 + */ + public function testPullRequest698WithoutUsageOfSetFonts(): void + { + $path = $this->rootDir.'/samples/grouped-by-generator/RichDocument_Generated_by_Libreoffice-6.4_PDF-v1.4.pdf'; + $document = (new Parser())->parseFile($path); + + // get Pages instance from generated Document instance + $objectsOfTypePages = $document->getObjectsByType('Pages'); + $this->assertEquals(1, count($objectsOfTypePages)); + + $pagesInstance = array_values($objectsOfTypePages)[0]; + + // collect Font instance(s) of Page instances + $fonts = []; + foreach ($pagesInstance->getPages() as $page) { + foreach ($page->getFonts() as $font) { + $fonts[$font->getName()] = $font; + } + } + + // collect Font instance(s) of Pages instance itself + $list = $pagesInstance->get('Resources')->get('Font')->getHeader()->getElements(); + $fontsToCompareAgainst = []; + foreach ($list as $font) { + $fontsToCompareAgainst[$font->getName()] = $font; + } + + // check if font names are equal + $this->assertEquals( + array_keys($fonts), + array_keys($fontsToCompareAgainst), + ); + } } From 4c64f4e7de40afaed6715fbafb5c5cf0d32cc039 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Fri, 10 May 2024 13:49:19 +0200 Subject: [PATCH 17/20] PagesTest.php: fixed syntax error --- tests/PHPUnit/Integration/PagesTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php index 41a3504d..2abafb5a 100644 --- a/tests/PHPUnit/Integration/PagesTest.php +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -182,7 +182,7 @@ public function testPullRequest698WithoutUsageOfSetFonts(): void // check if font names are equal $this->assertEquals( array_keys($fonts), - array_keys($fontsToCompareAgainst), + array_keys($fontsToCompareAgainst) ); } } From bf91ad2fe5dce190541b8d3812979ac431607fdc Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Fri, 10 May 2024 13:51:17 +0200 Subject: [PATCH 18/20] Update PagesTest.php --- tests/PHPUnit/Integration/PagesTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php index 2abafb5a..096d7a6b 100644 --- a/tests/PHPUnit/Integration/PagesTest.php +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -160,7 +160,7 @@ public function testPullRequest698WithoutUsageOfSetFonts(): void // get Pages instance from generated Document instance $objectsOfTypePages = $document->getObjectsByType('Pages'); - $this->assertEquals(1, count($objectsOfTypePages)); + $this->assertEquals(1, \count($objectsOfTypePages)); $pagesInstance = array_values($objectsOfTypePages)[0]; From 4b213190d6904f651e18cd988d6ee2eb38b001e1 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Fri, 10 May 2024 13:57:20 +0200 Subject: [PATCH 19/20] Update PagesTest.php --- tests/PHPUnit/Integration/PagesTest.php | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php index 096d7a6b..708a900a 100644 --- a/tests/PHPUnit/Integration/PagesTest.php +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -38,6 +38,7 @@ use Smalot\PdfParser\Header; use Smalot\PdfParser\Page; use Smalot\PdfParser\Pages; +use Smalot\PdfParser\Parser; /** * @internal only for test purposes From aa3adfc1c0c412ebeb327d631245e8fa2e4e2768 Mon Sep 17 00:00:00 2001 From: Konrad Abicht Date: Thu, 16 May 2024 08:58:21 +0200 Subject: [PATCH 20/20] PagesTest.php: deployed suggested test from @GreyWyvern Reference: https://github.com/smalot/pdfparser/pull/698#issuecomment-2112946865 --- tests/PHPUnit/Integration/PagesTest.php | 129 +++++------------------- 1 file changed, 23 insertions(+), 106 deletions(-) diff --git a/tests/PHPUnit/Integration/PagesTest.php b/tests/PHPUnit/Integration/PagesTest.php index 708a900a..fb069c08 100644 --- a/tests/PHPUnit/Integration/PagesTest.php +++ b/tests/PHPUnit/Integration/PagesTest.php @@ -38,7 +38,6 @@ use Smalot\PdfParser\Header; use Smalot\PdfParser\Page; use Smalot\PdfParser\Pages; -use Smalot\PdfParser\Parser; /** * @internal only for test purposes @@ -61,129 +60,47 @@ public function setFonts($fonts) class PagesTest extends TestCase { - /** - * If fonts are not stored in Page instances but in the Pages instance. - * - * Pages - * | `--- fonts = Font[] <=== will be used to override fonts in Page1 ... - * | - * | - * `--+ Page1 - * | `--- fonts = null <=== Will be overwritten with the content of Pages.fonts - * `--+ ... - * - * @see https://github.com/smalot/pdfparser/pull/698 - */ - public function testPullRequest698NoFontsSet(): void + public function testFontsArePassedFromPagesToPage(): void { + // Create mock Document, Font and Page objects $document = $this->createMock(Document::class); + $font1 = new Font($document); + $page = new Page($document); - // Create a Page mock and tell PHPUnit that its setFonts has to be called once - // otherwise an error is raised - $page1 = $this->createMock(Page::class); - $page1->expects($this->once())->method('setFonts'); - - // setup header + // Create a Header object that indicates $page is a child $header = new Header([ 'Kids' => new ElementArray([ - $page1, + $page, ]), ], $document); - $font1 = $this->createMock(Font::class); - - // Preset fonts variable so we don't have to prepare all the - // prerequisites manually (like creating a Ressources instance - // with Font instances, see Pages::setupFonts()) + // Use this header to create a mock Pages object $pages = new PagesDummy($document, $header); - $pages->setFonts([$font1]); - - // We expect setFonts is called on $page1, therefore no assertion here - $pages->getPages(true); - } - - /** - * Dont override fonts list in a Page instance, if available. - * - * Pages - * | `--- fonts = Font[] <=== Has to be ignored because fonts in Page1 is set - * | - * | - * `--+ Page1 - * | `--- fonts = Font[] <=== must not be overwritten - * `--+ ... - * - * @see https://github.com/smalot/pdfparser/pull/698 - */ - public function testPullRequest698DontOverride(): void - { - $document = $this->createMock(Document::class); - // Setup an empty Page instance and insert a Font instance. - // We wanna see later on, if $font2 is overwritten by $font1. - $font2 = new Font($document); - $page1 = new Page($document); - $page1->setFonts([$font2]); - - // setup header - $header = new Header([ - 'Kids' => new ElementArray([ - $page1, - ]), - ], $document); - - $font1 = $this->createMock(Font::class); - - $pages = new PagesDummy($document, $header); + // Apply $font1 as a Font object to this Pages object; + // setFonts is used here as part of PagesDummy, only to access + // the protected Pages::fonts variable; it is not a method + // available in production $pages->setFonts([$font1]); // Trigger setupFonts method in $pages $pages->getPages(true); - // Note: - // $font1 and $font2 are intenionally not both of the same type. - // One is a mock and the other one a real instance of Font. - // This way we can simply check the return value of getFonts here. - // If both were one of the other, we had to use a different assertation approach. - $this->assertEquals([$font2], $page1->getFonts()); - } + // Since the $page object font list is empty, $font1 from Pages + // object must be passed to the Page object + $this->assertEquals([$font1], $page->getFonts()); - /** - * In this example a Document instance is created, which has one Pages instance with a few Font instances. - * With the new functionality, related Font instances are passed down to related Page instances. - * - * @see https://github.com/smalot/pdfparser/pull/698 - */ - public function testPullRequest698WithoutUsageOfSetFonts(): void - { - $path = $this->rootDir.'/samples/grouped-by-generator/RichDocument_Generated_by_Libreoffice-6.4_PDF-v1.4.pdf'; - $document = (new Parser())->parseFile($path); + // Create a second $font2 using a different method + $font2 = $this->createMock(Font::class); - // get Pages instance from generated Document instance - $objectsOfTypePages = $document->getObjectsByType('Pages'); - $this->assertEquals(1, \count($objectsOfTypePages)); + // Update the fonts in $pages + $pages->setFonts([$font1, $font2]); - $pagesInstance = array_values($objectsOfTypePages)[0]; - - // collect Font instance(s) of Page instances - $fonts = []; - foreach ($pagesInstance->getPages() as $page) { - foreach ($page->getFonts() as $font) { - $fonts[$font->getName()] = $font; - } - } - - // collect Font instance(s) of Pages instance itself - $list = $pagesInstance->get('Resources')->get('Font')->getHeader()->getElements(); - $fontsToCompareAgainst = []; - foreach ($list as $font) { - $fontsToCompareAgainst[$font->getName()] = $font; - } + // Trigger setupFonts method in $pages + $pages->getPages(true); - // check if font names are equal - $this->assertEquals( - array_keys($fonts), - array_keys($fontsToCompareAgainst) - ); + // Now that $page already has a font, updates from $pages + // should not overwrite it + $this->assertEquals([$font1], $page->getFonts()); } }