From e826d4ddef290ce1e67f2643406d95c667f8757b Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Thu, 14 Mar 2024 12:42:39 -0400 Subject: [PATCH 1/2] Account for inaccurate offsets in getXrefData() Normally offset pointers to `xref` keywords in a PDF document are exact to the byte. However, in some cases the pointer may point to some whitespace *before* the `xref` keyword. Adobe Acrobat takes these 'errors' in stride, displaying the document anyway, and so should PdfParser. Clean up the getXrefData() function in **RawDataParser.php**. It now only needs to do one `preg_match_all()` and pushes the caret past whitespace when looking for `xref` keywords. Use existing **Issue557.pdf** to create a new file: **Issue673.pdf** where the last `/Prev 13486` command has been decremented to `/Prev 13485`. Trying to parse this file would cause an Exception without this fix. --- samples/bugs/Issue673.pdf | Bin 0 -> 23847 bytes .../PdfParser/RawData/RawDataParser.php | 52 +++++++++--------- .../Integration/RawData/RawDataParserTest.php | 22 ++++++++ 3 files changed, 48 insertions(+), 26 deletions(-) create mode 100644 samples/bugs/Issue673.pdf diff --git a/samples/bugs/Issue673.pdf b/samples/bugs/Issue673.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a2138b51e81f73b0d5e9746740d249732c8fc084 GIT binary patch literal 23847 zcmeHv2Ur$Yx2_cx#TKz8#?M$mf$7EArYcqh?3#%1sSpK2u||!Gn%H{}b|vvlq|8q~yeeS^wGqda3d$0BGcdb37YJhI~(7vLFyWpT4 zQC%WD-0RfwaQ6%D*URbf3kd1v#7jU(v@x}j@dxXR~b!TW)nA5MRGqPK3cZWnOl4>!r(aw;b9`1eTtn73?`oo5k z2DXWRHGj`2kMr9Cm)|b4t7WP1FKQiU47rQ=@I;>3vl1M+x90wNR?^ccPxB3&RI$u+ z&)KJ@d*>Ot`}VTnUq8wB^|zCkGA*2~s&5@M_{Hz{7Jgo_e*Z7;Jt^T+<3*+O^=Gvh zU$Jie#Xn9Oz5J)~`Fs|g98*>+ywtN!KmVO47qx41eO-IBnAWPD->&LqUiW?Rd%mWx z61snOR+SEQc-p>37PjeD z=6Y1apLXr6->>0k2@}pVyqj`1d2;-C`uu_KYpvinyqq!mNP4k@r8k}ry|*j=^3itl z^tEBhO>ZVle!iqd^0AwpXQmElYHpZxgH3J~vUu9c)b)P9{dA}JX3vrz-}N0;;>q@6 z_c~n+u35yq^Py*8N%vdcp@m))ZCsq{)TmmspVE$dxm|x=ZL#l$PY+dm;pMwk?MyEb*HujJ+wPIulnKQTmJXaSdt8-6 z506#3xpnnw8qzO$?~Y<}UD55p z>1lz}66kv6Yo~6W%iKtr^b(Pp-avT1bA*^t`eD(+?{C(yfE<81HGa zy05~7A}2p{pVa-jV^X2vOKO%KIpe~t+VX&-zsEF}=N2zGrl?VBOvU&u!)`TCNPX6* z^OW6*8|VK}*thNUv%z0R>VakB^Z3;)QuIup(7YwTZxpw!HYr^v;eN zF+E46e&;uJr|*#QWu|+|J6A1#8qqku{I(C~kL%hn@ymhnjb{u# zv!YPpYX`T+_r2G9RB*$N)HkP(t(md8!GuNZ#iF%p2tPl1bp6HkT?g6wt5*D0^L|qF zrq3oO^ZkO3RA0F;Peeeu{VP=C$cssrQ)?TIfxpkSHPxsDaT*&)76Q1{+lM|Ry8*u8gu z+&(Di-r8*zyV9|5WBjCpf#FHvui`o#ZId3s4(aK2(f7Q!qA%L8tw-9;n-y2)+w@>X z=d~VH{0p%MN}rFK{fzBbfA#s5&k9YcxooR{)xg_Z`+rd~^-__ZM~Y0@du&yY*$rB6 zU;+}t^M4ziKiNGpb==)I3+a($1F~=TS1@qv#Pb!t;bt~+>7 zLSMGU{psBwp9xy}n_KCrQyM=>Y5Qb!OvL6=CuY4U#^j0L7dv`FnQl?zn;%QrS-0Nn zs)LQq&L^k7+`sX0=jbmkwm&-f^ERKi`!1kz`2r;?k1+Z-9$T=mG$T*D8t2>nq0a9U zusiS4XVq4eS$*(D;USIko*J26g!ek%HDG)n_0E}DA1|9cVEXEj6&`h6pbxlo;nIoz zdyXdm+w^ADPn-`h;(@$T0IGHLl(%-G7 z;z)bErf+Qjty;*Q#radu+@OwWr@Ttm7`>Be)hnvouGN)2haWszu<78F{l>R0s?PM? zK5B^5ck9J@XVN>ZuQYhlpl!L_zMiwa-@)bffe2OO*@U4Bj3!p625BR*j%*&0uOcEIdaUjq9cyB8#V3tiO;t{nU%-x*bmS@$A>;$n??Ge6}pU)?oh5Vr^n3<~j4svr^fi_iAjueQ#%r58HCV zfq6!so6$3PThEb|`2xi}uP<5Oy63_ZFTC1>?-@3tssEE^i~9=elz|^N5CR9@eQ~mM zm7#x9gTH)~zBRJ(p<(MAuamxQvFq1$lkZPxvZnT(YG3Zp9or>(e4aok3C&6t(N!yEKS+$c|Q%xZdJVdt&Xxka&be&>)s^m{vedWSA7S$Oh-fd#&D zODsNhq%wAFWA04s>wQCF&n)<6;l8h1ot-!In}tKlZ|!$dF-rWDr>f_X{GLqPwQ4}b zT|NJt*af?4O(abGa0i1#S>(up;^ z56)BFl2&dvuYNnK&xy9v^A;){5ZI_r%Zckc&#AuR!27rLJSw?t#65lS%hDB6o_p+H zJE3###T)m1;JN$7jfa;$T{gzkvsTlV!6%gbE#)E+D-!2zDXmvMyRn9Mce=Py>N|&C>ghmBY+mUNq@>)J3Y~q|kcdr4kYY28S{UYd$HwVCV?1QH^USD#0J;;sb;4b}YPL z(yjKzJ-3%FhzX4-?penYE`LiUuS>XvFWV%1G%ImK^YRI6k5n#FYhHZs?hEovZM%DB z^R^Q)h&`qBs(FKpPu)9j|0wraUkpoxvKje(+}v6o%;l-)_xOb@31*orNm1{xHKo^7 zncR1v_sYr5ojo2D^e9}T*}Z4M@y_8Te3!P!?-5kWQi!6%%GdPnz6x5=+h>$}`y%-) z?aA+fe!>EvP7R<=Zl27qgMUgczrCL4ohqTe*SwPg)_lD#&*~QWW4B*!fBoV2^WIZQ z3KuAG=QoG$RIvw~~H-_Y29`h{t=l9g$Esv6p|CAOK{@pL$-DVf~ zxVwT;{;)$0n0|7pYY80Quq{td?#$4{Dj8EYG;z@|_XR`UyuWG8Kylk*+Nun5FIU9- zYOw~pKk&S>C{8|K=ZC^U7^6|Jb=ykO?=P0uu&U+urUl(=9zE?rg@#w|%;;R?v$>-> z|2gp3dpD-J#k_9yl{dz-4{fc{iSV7C8j5k!-9|>9Ny(DlP^!KtCVLgDDVkoc#%gx9q<})X6cFA;W0bM z{m9pvw`g}-(3NSPXAhW**=na}zq)Y0KD{RR)K)2$)Hb@y4nmEF^9=qjUTf%z}a)jmC|ONWYkHZ|Rt zG$4Is4{7n83cD{Kb50%FBBVpq!Cm#^iWR)m`f|$l4ri8+df~rnmychWq*p)JjczwF zZPND7etC6#_CDvS!)Z==do0u8a2YYqsQ1<5XGac>ymRnundaXQ>^f#p<5kP6Zn@BL zdisf7>6=z8pB=IDlk~4z@y9N=47wFqhtSGbYwMl&x}JVl=YH(= z)$JhW^A*#tAAQ{_Y4zUK#RuYowQ zdUt#@_T&#I$5y-$QLk^{YGF{~^wh0hX`b63tlPOP?dCr6*&^$Q zZy0!|!GMxS#ic#x^*-xAXsaV^%c?tvo-Rv$lGyGP`}%T$#IABbBVl8Gzvf3@R!$uE z`Kp8W#;I}YVVW7o^GBQwd#*{f3EGg%76XS;FKq^b=$8leQZ@) z%9(WMyw*_zH!$%lX;c)Rk@SJ z8npD8QmgU`wxp<+EYojEuV4D*-MVH^?8H37s3D;@qK^J@<8gy%DenOHgO{3C{~&KN zb0y!>?#_AgN%xY+F8%!Rg5>ny3N?K`v`5WZBlr4MFX7R8(?WT%qnPP7>%kbu+8#Z3 zFKTtZ%B~vIgT|FjFEC-_(vR~^+xTblJ^w2`p7pPD;#SGtG$ra%7KyZspVLu=c*?GIXWVf~~deFrZZ+2!K0 zGi+q7KbKrRx!Q}cBoCz9BO~DQr!#u9DQ}G zXN^+ZR_xd^vi_|TYgaXxJ;=A&M?tFwbgw(UOIqzczaB69(cuqI?rj^mKlp2}BR{q- z{W5l9BhS3IPwnXN>+(f+m#$hrso|-RT3*2$s}G_lSC#0i1^h}>{y4B>*-Poa(XWE5 znRV{^jt%XX`_Ak<+b6D2>WrxpQlvtu`g0!qS!uh}{KGxFjqN4kH;!xla(e5rUq$@B z!C3#XaXzm(B6ityRd&HV=06dFb`s6913~xfjg#o^|Oef4StUUX$)`+J5a-*sk8qwhyS4^6TQ- zrM=zOoI9B(sq?*qvBOXKl%|s1eYgC&|5WR}VeJ>~Z zE$J`Cdp{UCqe(*2S+}Rv@1*qW8*y__Z{K^*z05ye^}cb^H*xs9@*gFi>$5xh@Z&CF zwM#$fd$(59wW7&<#ZmKp8$I;7aXutC^nuTqfv0_ome-c-YW~&N^S|8oaKfvm{*5=> zN}0LAznJIz>yDK6iQ|@dJ-%K5Fx;|ASp~SDjDW81!i3#@`=J zEU+r%S@U1b)7x8|8PX$}-88Wn%a=Svg?TkvP+}4Npv=_8tIKUFu<~NjM&9L$1iK$y zG6eZ>vv6mkleqW&)iAvUKUB{xN7>qwA1_R-|Vny+MmO# zrCh!7;h1q{?iSwpXPEz>3x~yK{VwtqU!I=1a(dF}elL&L-_lXMKED2|8p}>?-yD%} ztlmY>r0H!IIdXN`w0X;r7OT#USUsuZnLe+cwLV`lsY=q3tI?fSPy1YJKWSU^cgH8@ zzxUge-yeA-pIx!FlSik1mwz3TYe`d_4hqGf3l+P z?)86^uNOgg;71SMe`(h8`=LR{F4w&j|ME~iM?~)qgR1rVq)3haDe}4CK~JWo{`Blz zpMv2Ck7{7RF+WG!P&DT+sdHZAx5B+-$W?`fnaZ{HBzDJ@m?eEi?Jzk01EWVO|xPHp>0L=ryk^ z>U`t5Rw&#KYuizA67RO6;=IIiZ-V6{pq^96J;n;M$06iSg(u0p*%11 zzKqBV4vC|EB}tY#p-7YPuoy=N62T87uYt~(sJQ4*XN-dV{vCe7 z&c3k@KmVQx^=qhXi%e?1k`dmB^TII{|s5hVG0BjSlY>8x`%K zT_WMwB045k>mCyAp!}MKxE>jvM-ySO-D5g9BwGGQFGge?EM}OMp$?qj;3?4|@scGI z4b5PA+PX+OFYykmJksoXRvl9BUtA2!S^bbU?28j5hb;0Ak)s_V#bjJ~76jHRW2NKW zmXEv(4CjzpA)^l7Nn3V(@=WsWdYNsz>XEV7@{_)a%%pAmIqN-ZI;lf0G6y7Glw5O_ zWz5z*VGcZ89J$J4WGrv|Ee=+k&4G6H3lmG0%9>LV7iFhIi|3zmwKTF3Izc4e1dcszi3{ z;k4d^Vx2u(V@GiHqh{_Fu4T(EF+uH|Bu4$efm*T|y@6UKIy|HYQfx&1cOcfa9sO&F z&8{;4-S5P7Ecmk_7HVg!w+)MyOs;%8&A#WTLM}7Snn%i$D{C5grYYHltv1FIn;^DD zu+n9Vc14m#Dv>?F&a<%DRW1{!U2V`5RMu+9)nL}ywIv~Gq>SCT{Y*Nw)3QoWo{0dY zzFn5wNjoIX7M48QhDx4^ff7I>`Q%RWNS&Mv`M)zQ>)42#_GOR5o}bK37-j4;$=;+pKXEdyOnNhK(@QpLAEqYbI>`dmvsTNKU)Hj z2C`?^9ou)3pHsQ4=_EhdyAtK}ZgCsCBeKuC z_WDfDNwnX#3StG|F*et?&6Kuc(*6QezvgIH2{)k*WnmBW3pU%gh*u)TI7*@Vxi=^v$58mTUWWPb581KE;3m#@}2|PBxy9o zvl)fSie|ZpKv^%lL0en0-pRs{ce_m1JJB3#@gRgXuxxiyHm6N5bD*vP+mn)08=37x z4(r*zyA+EkM|P;q>07|J-DLtS@Sk&Fv^G}#1Gt-0JcURJVgH8KvdGRxYwa<-54UxF z4Ee9|Hmd^s6VJBt*vLYZ!s512Hezv>wZMXiRauBdlnn2g^=t+D7kAr0NE?|K6h4E& zxF9ie0$^QPag=mM1S9!b(=AIv*-VSFtrF!jaoLqd>SYR^4H8J(w)NR}QkFbh^=!*R zIUI3-|X4629$M1{((*UD?kJ{Q8^$*}I3Fy8*za`O$H#i#{(j75c zB)t#DTHlSM&cEce*3SJu_?M+fuo_}XL_vvCWZ)vS)B<4_!yp-!DH01HX|6j-|937M zgfp#xbcP+D@YhUGhQ$!1tIRuXWXa2B3#7_l4c%4^ zyE*$#y3cx9DDF~+cMzR`F4^K;s$}0upLTmWl2t4XQ6FpB@SL>_QGu*iW;?Fw%KHA6 znq|$mrwaG%Ae?zfm*o#&{*Lt80QL_6a1QvZg&-yz)eYcC{%9x0{$ri2f$FzmTZxP9 zj$^MHUZ|qsuW(|+qat6Cef~;?+Q3A~(IQ~{SqzO2id^-)@yi<;%p!M0x@iMF?TbjT~*=3Qao5y&O#oaC_gbi&+ui6x!{@ zMu&v=a7MfSRxB_o3MbXjNpcYEAK5j^K{SoL1cmo^T8l*%F(x)7IyPekX_k`&5BI87 zjTWYdyN7#D0kgkQXg?)1I?9ZS?um9YddaFFR@xhCqAQLXevObby!LwC}#p6WJZw$PLc#uq;yy}wk#zhm&z!Guecbb(=1JyEQK55Lv1|x{NA*m{- zFf=b|x~2&#i#NOXY!YLS$3G%1ttpUFML8LAusjdh2(sZT2A*YUT4X7j&&*MfWAK8& zi;N)itRM@bU@#g4lm$az83Q#iVw#LNqb(JaAiPt9;yDqYriiS_p{{}UI8nrOGLV{P zAvG&yZ2=TWLhCX^0k9!8D=^eMZDx@IQ=_9@fwahAIOqv9!D?R?6^MggN$&!WToaOO zqRi49kA(qxxg>`mtoBYXC?RNQkri1;gA4qK7Z{|;kdVA#W&|B27!mTKH0BDu%K~6q zV3AIq(I@iWmWw0hExAP&lF$YX?IL4^AeeSmvPMoukAJ*k<+8}7APS8;!?LI+lZ(d; z{_o2EPhDBqGtEGcGjQP@%Oti5(-MaWg3i3F{aFml%%i#A)e+L(d129(KU&sC7IV5#*i2VKQIlI zl}uAoWu8|>l~&QN%9@5`iUu(nLo-Z{)m7*-phloI6aDK7r%SqFN(QA7BGj&v2^FNa#wWc$(97NG8als)-_H z0vL3Sh9&6`iq|=n7j#w@RFmdKQK41J1oSby$tWhR$$&#zGI$eM26!=LN|gR_HB5$LP=>7QoKB;AN)SaB zu!$8F4TEP**%Vn-0^CCkUT014A5j)0g=Auluy#OxzyoxIR#=Hvp-B?0kbQ>iO+Z~p zO7dxpk;AbBQ6cLMPljCrW$h9i@`;84Nd$u15<_;Rj2%M6T?3G!LEcnoURNj{dj&7! z!p?xc$j)Zr9d-g9(tfj>>re_%dCO@$AD_#H&>;uo9>dnU5C%Kk;G#IN{|c!;WK07*#&<6s#e86zoz3d5sCT?DQJ z;|#&%SryYQabO;b37n-&0EeM52I6yoCI%dU>^*=*0oDq%RZUaDt_*DAK_v`^hv!78 zuz$dzVbC0qSpmoh0%k=t6ag*~2m4__WmE-vB5GhBm~UVl_zFp9O^(BEz$?If&LDJ0 zY%C!7uzqYSx=F(w5qVmL#;a)B0I<*oNCoHy z5FeRPMdA!Gm@or13~aa}tFxFjGJ8NDnbBA@1~-D@Oioi}O(qVV3OIzupbx>o2PFoD zEfw+#3YY+D@mM5~Jr($d$rMdMry>EMDG|VEs0KJ3%kWrOh{6Cpz|#Z{I~c}-AyKl7 z)d8A-w_zuFi-%wpft!|25q4mElNJ|&w~G|Av;H;zKo}P2;X*Ie0(T+4FX71y$S-Z+ z(qM?n+iU=S7}Ns43CaOLm^U22$`Jsr0xVRp!jps>kqIOI5&pgc8yIX#5U~BiMi4L~ zM#iBuI?c|5Fg_aQZ1)c9&0r&3&b^~wEErlMzVq9R3QiuZSq4u0+pSr64cTlAz$l2H z4oNXJ+1n@32X^*OI`|&=DRh^^`dD#{x8y=wH0C!O+sZb-7;Of}!pO+nWp4ol<^N=b zM69$Wk;PsyJsfsjgh~)DL0|z&&6*TEbQTVeDe4kyfc-J5rfG^QqDKR349=Mozk^co zAn+<}Xaf8t04~V23W^S=nKQ8pa7i>-L~noLjafVI8-6DcLI_eNQ-)$l1`I~f0ChTy zQ#Ux!0v-;EDKQ++D<*JSK`~$#Qjwr?p!AXkFaRS1`2`*+AZ-9H42&lGhRA|PDR9!z zjwHj;(=?IMF-ilDfC}rEQ3oCc2VMi@hs~+rAO@|1#u^3!$k1gV5Tjv`jDqb7xCTAs zK+;siZ2&uPyg<4XnNej)CWa3F3kX)huoT0jv3h6<3E+P`7>XhyAR{Uqiw1yK7!gJW7T(z@90EVZjLnY=@Er9MlX20FE;nY`(;pFd7K70tXYDC`=l}8vdS))j@3y z+>+uAodYHysKa1a!@#g>LZhrZ2Nq3WjqG4nG>#=3p9ts&XbO}{_?2z~I-z~=b{LHL z@3ahU{%;g0it0L9{2nOKkh#bHpRXHoz1oQ&)WGMudKs6C5$2NjYL=+HE(G}S+ zL8stci(riaP&n1Z^M*gK$hyMAm*xy`D1?0A;=;e9foL))8n6igF$M83jx%JQp%E&B zp8|yC6tFr4^i5)Agq)BBP6g=IOaO$6kTbBIM+gL`3cO)}+X)&XSBMXRI53<9EC%sJ zUkw_*>9fj}UfHXhL` zxWL#E6dIT4-z|;NThqt}cLx^BU*@!heWYV4vU(4Qz{SS)2r)p9OhFSGvKN zI?x+Cr-T>)HUmxofU^>22XR6~O~BNE^RfzBo;B(P#3dmzqEoOe1UIvD5wgjEK@tW= zc+)%if5moKFvxe=BwD~h!8@@v!Wu}Vj6~YV9snc|5YWpgM}~?$1z|N8`&2O>)(H>} zA|#GSd3Z_|R|8_a%ce++V4T_cu1J0|Wpte&*sKiEF`IXi^OLvO8QGyQ4vj{{!Dg`; z9M)>`Z`!okr#0G)_#$EF*aPtuAQ&#fWbjHD!X6PR$r`I8K5nM0Z?kjE3&NNoH> z{tTY}hQkqtOA}T{Vz_UQ1fAxPhI#%Q|H6uIBC2JtBGC>q?-aa3Py=}J;6Av&Id=lc z6UY1Q_z8qUOdB>y>=G_GXk?DO^F3H=X4DqqWZ}L4abz+ZbA?s$SbIYN{b5Dog{z3# zgHIa>UtydkqG=KWkKn8zMnF>u@CH_EA}$KhBXJ1XR7GG9Fa~5+5hctUB7yLx1RCrQ z5hWd`j})(2HiTfIY{HR~(K3JmQAV`QBGxX;AhY=5#{g3j5p`67u)sce*@zz_NCzZ9 zJX1ip9e@C~4&?yngD#kef2v@r2->nVx&q?`)q?{LLXB7*qN>F2MeGvi9pI>-eToD| z2fj``J3~Q7PrGmm2dL_bj;dy$F=Bpv}5IL1-n$cpge5kn+FUl!BEs5oy$ z-zseQ(1$Yd%;?Wk1E6F z5RyVf6rmX&Ou$5RT)^Q8Lgp|ZlQ(!MzKRe%IT@GHEg&2~4%OCaRny^w!Wn|+hj=>ib#TN6H-?iDYBCVD zS2e_$L30uPMwAaOHQq!8z9@yjrw-)=mIwlRfG9Xp6L4sNEnHLai2^JL3QiO1Ploz#w9>+arC1KplXl<#0ya^oC3n#1$0175jX^OBatRRC1{O_Pbw5q2OO!e zO!(J0SCDXk!6F`pb2qRY5Djd>a52fmOIT+_;bg?z;i$=AVhRb!5+)}B8(7R24rFlX zDPvQRaNuPi><2!@!`y*3i1g#UfCL<|YB+L4xX{FKaTEYBMOz?EV3sn2163JkocI7i z$N4J=BRTz~C0bC)r$TDKSZ82JB))^g|E2>$HhlI4qJod6FqTe^zxzRD>(6LhU(w+M zIjEHN6b&&zsM`@b`JH!id~v{jLt5 zU*WSZs}1b%>^onU0q^lwoa86-%RCzAuJ5MdEc~5v_*UYbG)~U)W!`&zmB#Rx$be{P ZZ+s7hgY1ks$?}OBh%b*X)?J^d{TK9XUWWhx literal 0 HcmV?d00001 diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index b136afb1..81ec964a 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -864,39 +864,39 @@ private function getHeaderValue(?array $headerDic, string $key, string $type, $d */ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array { - $startxrefPreg = preg_match( - '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', + // If the $offset is currently pointed at whitespace, bump it + // forward until it isn't; affects loosely targetted offsets + // for the 'xref' keyword + // See: https://github.com/smalot/pdfparser/issues/673 + $bumpOffset = $offset; + while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) { + ++$bumpOffset; + } + + // Find all startxref tables from this $offset forward + $startxrefPreg = preg_match_all( + '/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', $pdfData, - $matches, - \PREG_OFFSET_CAPTURE, + $startxrefMatches, + \PREG_SET_ORDER, $offset ); - if (0 == $offset) { - // find last startxref - $pregResult = preg_match_all( - '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', - $pdfData, - $matches, - \PREG_SET_ORDER, - $offset - ); - if (0 == $pregResult) { - throw new \Exception('Unable to find startxref'); - } - $matches = array_pop($matches); - $startxref = $matches[1]; - } elseif (strpos($pdfData, 'xref', $offset) == $offset) { + if (0 == $startxrefPreg) { + // No startxref tables were found + throw new \Exception('Unable to find startxref'); + } elseif (0 == $offset) { + // Use the last startxref in the document + $startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1]; + } elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) { // Already pointing at the xref table - $startxref = $offset; - } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) { + $startxref = $bumpOffset; + } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $bumpOffset)) { // Cross-Reference Stream object - $startxref = $offset; - } elseif ($startxrefPreg) { - // startxref found - $startxref = $matches[1][0]; + $startxref = $bumpOffset; } else { - throw new \Exception('Unable to find startxref'); + // Use the next startxref from this $offset + $startxref = (int) $startxrefMatches[0][1]; } if ($startxref > \strlen($pdfData)) { diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index dec70977..7a586932 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -172,4 +172,26 @@ public function testDecodeXrefStreamIssue479(): void $this->assertArrayHasKey('Subject', $details); $this->assertArrayHasKey('Title', $details); } + + /** + * Account for inaccurate offset values in getXrefData. + * + * Normally offset values extracted from the PDF document are exact. + * However in some cases, they may point to whitespace *before* a + * valid xref keyword. Move the offset forward past whitespace to + * make this function a little more lenient. + * + * @see https://github.com/smalot/pdfparser/issues/673 + */ + public function testGetXrefDataIssue673(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue673.pdf'; + + // Parsing this document would previously throw an Exception + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $text = $document->getText(); + + self::assertStringContainsString('6 rue des Goutais', $text); + } } From 45f7e5382b86865d00b6177ad0c6ab6c297ca3b2 Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Mon, 25 Mar 2024 16:22:58 -0400 Subject: [PATCH 2/2] Drop unnecessary PREG_OFFSET_CAPTURE No need to use `PREG_OFFSET_CAPTURE` here. --- src/Smalot/PdfParser/RawData/RawDataParser.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index 81ec964a..5e17083a 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -891,7 +891,7 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [ } elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) { // Already pointing at the xref table $startxref = $bumpOffset; - } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $bumpOffset)) { + } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) { // Cross-Reference Stream object $startxref = $bumpOffset; } else {