From 7a07955c94f43cd391337f433109a403a00e2a8f Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Mon, 5 Aug 2024 14:50:24 +0900 Subject: [PATCH 1/6] feat: Bump Lucene@8.11.3 and lucene-gosen@8.11.0 --- .../org/languagetool/commandline/Main.java | 16 +++- .../languagetool/commandline/MainTest.java | 2 +- .../resource/yy/ngram-index/1grams/_0.cfe | Bin 284 -> 352 bytes .../resource/yy/ngram-index/1grams/_0.cfs | Bin 1233 -> 1670 bytes .../resource/yy/ngram-index/1grams/_0.si | Bin 248 -> 397 bytes .../resource/yy/ngram-index/1grams/segments_1 | Bin 102 -> 154 bytes .../resource/yy/ngram-index/2grams/_0.cfe | Bin 284 -> 352 bytes .../resource/yy/ngram-index/2grams/_0.cfs | Bin 979 -> 1445 bytes .../resource/yy/ngram-index/2grams/_0.si | Bin 248 -> 397 bytes .../resource/yy/ngram-index/2grams/segments_1 | Bin 102 -> 154 bytes .../resource/yy/ngram-index/3grams/_0.cfe | Bin 284 -> 352 bytes .../resource/yy/ngram-index/3grams/_0.cfs | Bin 962 -> 1428 bytes .../resource/yy/ngram-index/3grams/_0.si | Bin 248 -> 397 bytes .../resource/yy/ngram-index/3grams/segments_1 | Bin 102 -> 154 bytes languagetool-dev/pom.xml | 4 + .../dev/archive/StartTokenCounter.java | 51 +++++------ .../dev/bigdata/AggregatedNgramToLucene.java | 17 ++-- .../dev/bigdata/CommonCrawlToNgram.java | 19 ++-- .../bigdata/GermanUppercasePhraseFinder.java | 83 +++++++++--------- .../dev/bigdata/LargestNGramFinder.java | 4 +- .../dev/bigdata/NeededNGramCounter.java | 3 +- .../dev/bigdata/TextIndexCreator.java | 2 +- languagetool-language-modules/ja/pom.xml | 2 +- .../dev/dumpcheck/SentenceSourceIndexer.java | 2 +- .../dev/index/AnyCharTokenizer.java | 7 +- .../dev/index/PatternRuleQueryBuilder.java | 2 +- .../org/languagetool/dev/index/Searcher.java | 9 +- .../dev/index/SearcherResult.java | 6 +- .../index/PatternRuleQueryBuilderTest.java | 4 +- pom.xml | 7 +- 30 files changed, 126 insertions(+), 114 deletions(-) diff --git a/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java 
b/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java index 5cbe62d89365..f9ae81837db7 100644 --- a/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java +++ b/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java @@ -20,9 +20,13 @@ import org.apache.commons.io.ByteOrderMark; import org.apache.commons.io.input.BOMInputStream; -import org.languagetool.*; +import org.languagetool.JLanguageTool; +import org.languagetool.Language; +import org.languagetool.Languages; +import org.languagetool.MultiThreadedJLanguageTool; import org.languagetool.bitext.TabBitextReader; -import org.languagetool.language.*; +import org.languagetool.language.AmericanEnglish; +import org.languagetool.language.English; import org.languagetool.language.identifier.LanguageIdentifier; import org.languagetool.language.identifier.LanguageIdentifierService; import org.languagetool.rules.Rule; @@ -35,7 +39,13 @@ import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; -import java.io.*; +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; diff --git a/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java b/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java index f9e6c3f87917..f130c02887fd 100644 --- a/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java +++ b/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java @@ -277,7 +277,7 @@ public void testEnglishStdIn4() throws Exception { String[] args = {"-l", "en", "--json", "-"}; Main.main(args); String output = new String(this.out.toByteArray()); - assertTrue("Got: " + output, 
output.contains("{\"software\":{\"name\":\"LanguageTool\",\"version\":\"")); + assertTrue("Got: " + output, output.contains("{\"software\":{\"name\":\"LanguageTool\",\"version\":")); assertTrue("Got: " + output, output.contains("\"language\":{\"name\":\"English\",\"code\":\"en\"")); assertTrue("Got: " + output, output.contains("{\"message\":\"Use \\\"a\\\" instead of 'an' if the following word doesn't start with a vowel sound, e.g. 'a sentence', 'a university'.\"")); assertTrue("Got: " + output, output.contains("\"replacements\":[{\"value\":\"a\"}]")); diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe index b798387ded9d6ce8c58ee6f175bcc9d66732ed6e..ee091ce488475241577147c62052f889c8e1ad9e 100644 GIT binary patch literal 352 zcmcD&o+B>qQ<|KbmuhO@oS$2eUz(TVnpaYknOe*M1V-{FSFT<4E&9@*#XWgTrrc)W z=8uOcw=jt}&@0FXs{$(21E~gr5Ei|(lw1g#p$SUQL(!Cyp9~RaRDjC+p~#nHLiID| zLFHRn^zzD5ARNZeP{DLGO$88frYNYq7pi+&tj1dPkCcl6xU|4W>&p`lSN=9%1 literal 284 zcmcD&o+B>loS$2eUz(TVmYI_pUX)pqTI8BnQk0om%)r3F$jPFYmX`};fq*=OV)WvV z_bE+I%}X^gj5p9r$xnugXF_O(U=;a+e2_^%z_;<5TVZO2ka(oyy(EodoMHddo#b8*|Cr1!5xz| zO_(C3OjwnhR?XBiMW;wWPVIlY^s4o9`u*>hkIcnqpUD_l@A0fhho`KnW0lpY{#8xIswmh@{z!F#wQY z^}@n4N`Oz)rhAY41j_Vy7v(&qQaHNUndP-E8D}gF-sqO`WEd^~c<2G54UCs9ZLVMo zQ4n^enlUs7V0zlq3=tPUkw_@1R1am0#^Mcjj#EzKjcYZdtV01zLle$*(IZRTRja7x zeMwF!mt2YFO^islZuM@HktK&`x_jKWBjvc!34xiZ` z&DE{zUy=oN80qTr_k|5u^P8g^Yv{E%=)P#Ly2N8%T}W>tiENc}&S`?BQX`1Rv-tw} z#X+|U#8AT7YX%l_Pzw%)7>!zj1z0l5T38U8q{-7!1Kx6>02k!Ew~_l{5*E^n_t0n8 z24j-b>*&&+Nrc4dhj&!;AM*W2@{VN--5a)SvFi8@&^c>lfGtVSKTwIqeBu2>#Vl1B zNKuJo8mcB_rY003t4M>2J*mnx(SLv@!&$uevfCYiosr(dO6pzu&*y_2_YeJqY=I*W z{-u&0$VN8U$0!%Pjr4gqfh(cwAmMO-BtNx~v;i;NwBpj0GfVD=j&%^uW9F<~ ztYByeg~Ikt3EOoG`Y#LPR&-95+)eh07~agRS}T=|I#ozagU_-9?e_1Xw>p+nEElXf YVNT#9?0-BG=Jkm5m4)w5X6E1g0fBS+0ssI2 literal 1233 zcmcIj%}*0S6rVSrJj=+T>r2V?Z&&7*$-IiWN*4=8$tA^JmS~X8g>1? 
zHOFI?z3CMlw{5a3P5+@ws?6WoyiT)eZNq%1&iybr5$*_cnAxg252$Sj6?`HpZ)cx< z=Yja=h5mR(Bsmd`AA}>rcIvzh%*r`pE)}ciN;3Hjl~n=$%)CS^QjBd zD&?Eq521fhasffvmq#jaZt)-`fHCRi+A6;pPu@2-At5{Ri2&p`{l97fZ09#lRp0ye zynMuVg*h(ei^n(1E7=XY%?U8(nt;;$gVt5_O=j#1bo_|-70S#;e_sWxeVcqI5UrP zxgp>d01X?Lu!s&&IIVAVtX6~n7c|?dQ^M}kK~q{bkq~6LO=8Rv0tELkAp~>ETn-U@ z=)VnE1OW!z(*OWN7P9@}&{K7=`1MC4!g?~WuI4N+8+W^ak%nJuFF*bmLxNWqioQ*q zkAnu9I%J}dRRvx>!COKgaV)vgYQAtUh>jF1evHVU%6AG#;MBrlf9Z}Dv|dn{UR;+` SZmA{?Hr~Bg-@nf7fB6Mo06QQ6 diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si index 7fce7afe0e397b9faf6fe36c7177f8f0f9e81734..4a4978085455384adc642e3561270b455a65f521 100644 GIT binary patch literal 397 zcmcD&o+B>oQ<|Kbmug`aoSL4SnpfhPmzK`}1V-{FSFT<4E&9@*#XWgTrrZXJaR4zl z5Hm9(bAW6XMlPoOVpgBbywVEpti-ZJy|UE2l>8#L(4^A5l2RV1P*HJaejdA_xt@WZ zp&>s+Fc+v4EG`KZ*S6G!=+ZSbGSV|J(KF=B0Xq?^QF*n7` zLk!0Jl(fGynhq literal 248 zcmXwzIS#@w5Jkh1upjU5mb1r-8`f`SrR&IF7S8`%ls9$bi9&{1;?ES4!| z{d)7~kEGvQ=K>V4$rcVNsBEcX?Rj1yV2EHuU0W`=RIOz)1)K5o+#-yQr(-h1A!f9$ zExLfQQmYO{gd#i&>FD5)Mv9`g{#vU1+D<-S{>mRbusSya{5Z+&vEo47h@;QbXu_H& ze&K(4m7poY`;at#TNP>3-54$s7%a3oYr`{V!1*i%dyGVEzXg0ZS$QAcUhejZ-Rc>= E0bDgmaR2}S diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 index 6e93fa9e6973c01b51bcc3a79747c632d654f624..e49de5d67abe728d773a509daaccb59008afc4cf 100644 GIT binary patch literal 154 zcmcD&o+HjtoSL4SnpaZHz`(#|B!6<{+Ew48Fa25Elec8bJw`(gZe|Xk3<$6=0trT- y08_jHn!?*0KBdX2d8roW|Dgb+0mg>WATdTXop(Wk3=9V}Ccl8N7#M7t5<>tlctT+S literal 102 zcmcD&o+HjtoSL4SnpaZHz`(!^#2^5o89|sS-hk7mG&wad)x^-?KNNsez}PSvBnATq OG$y}*u^(Ul;0XZK{w37_ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe 
b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe index d04ba2ee5c95f28b6760c81745ffd5063850c7bc..15e6f67ed0a2125695017585030aa47d803fac0d 100644 GIT binary patch literal 352 zcmcD&o+B>qQ<|KbmuhO@oS$2eUz(TVnpaYknOe*M1V-{FSFT<4E&9@*#XWgTrrc-X z=8uOcw=jt}&@0FXs{$(21E~grKo-5Ulw1g#p&m-lL(!Cyp9~Ral!eOsp~#nHLiICd zL*>g^^zzD5pqd^*1+CFE6+pz9V1|04%I8A$Go?ZGzXiFp0>WVqh6*}>1WTZTlc9nw iAnT!SViAW5b^}%B!2}mU1wS9qnEV3DX4t|meh>floS$2eUz(TVmYI_pUX)pqTI8BnQk0om%)r3F$jPFYmX`};fq*=OVtB$I z?^BwbnwM%~7;m7LlAjC_XJm%T2cyUr&^1&0xl&msgeo;V||<1+{=SrBpx# zw?PH%Kq{fuGk$>zx}z9hk_nY(vWChZM3XOoh%-k(<+DLHl|VR5f1rXf2Q((XfU+6N H99mKVhXFI| diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs index 36ff0f65c13957d1c9f0b2008c4a2bb1d9faf582..5d62d79a170c8194813d55beab1a1753385c3aba 100644 GIT binary patch literal 1445 zcmcD&o+B>eQ<|KbmuhO@oS$2eUz(TVl30?+00c(zCs(dr^)33+pT#|SOQzgsum`G^ zgs8SK3CJ%l$;?YH4ll|qNi72MfI1k_wQ#{y#~Xl5JfJc81&9WL?(kVVK?aILm6*C^ zrskv+d*-F2R`{lZUBH5F9796`D-%#J2y;PcAjJSS9K>({5oiD=Y35XcCIjTcBuf|> znUGz#Q|)XC9`Cy3Cj&L(^R5rFX^!O*wjk4loO1G$vqOqfQ$tdVa*JIulS_ccu%UYs zDb!d+jEq?s8JSsY7;TvuS=TV;WhSR8B$a07qyPiIBqLP;B*4PU0ajwq%!q95;AGTjRu^wk!Qypj4CC`UNS8D${h+2c zaOe?;fCfe;Mkc0PK+MEw#As{|GLG3TI0UGTospUG7BCGlGB7bs0?IS-qJ%03ELbg2 z^#Ij^^gw_olrBdOwZj#SS3!Eju%v0v6marGk86}X5tN{41=24A)o&hLl3$dX0?8R} ziNz(*xWmjC!0f=pn$y6@%)|l*0t^N~uNVN6hygH-7;rEe8!+%f*uWHI08By#z%&FD zH$wKqkBvSJAUBAjBpOhBrQk|9prH5<1<+st@qtWGMg;>Inh6y7mnOO7EixP7g7?|PCVUi6ioS$2eUz(TVmYI_pUX)pqTI7;glE}coz-SMY7V;@gPR&a-F>}jI%}Mdh zOUo|?N-;69=A{=U<}xq}{D%UNsBA!L5m;S7esM`=UV5=xeo<~>iC!9*;(!`uh)Y>< zXAU{gN90qeXG#K)efy$Ve6cm}1Syqap@xFo+Q6&QEG$SL;BOG&K&JDCv}dkm^T%;1A;kLd4beIVVU zDE9cKmL#S?q8#crpcyrcU{``zpa|wiHm3CUQBSZjPCow5-ti$puCDPRu0g)R@h+au zAwVrGAX#bbp&F73^cu*K4UDXuj7E&cEQ~;RvN z0EDbL4Ida7;4a|+TFJ}FZ~);(Mk8diPyKv831qeyM&Nk*xwu9EodOOWX18E)NHH)n 
l13d$FA|nGxAF{+Pn1s9qSOo(^AhK0_+)d@kd}SqbK>(306J-DZ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si index 3b116de2a4f01b45a4aab36a13ec4eeab71413d6..ce3c306f6fabfc7e50966bc58493e21b978653cd 100644 GIT binary patch literal 397 zcmcD&o+B>oQ<|Kbmug`aoSL4SnpfhPmzK`}1V-{FSFT<4E&9@*#XWgTrrZaKaR4zl z5Hm9(bAW6nMlPoOVpgBbywVEpti-ZJy|UE2l>8#L(4^A5l2RV1P*HJaejdA_xt@WZ zp&>s+Fc+v4EG`KZ*S6G!=+ZSbGSV|J(KF=B0Xq?^QF*n7` zL*0J*DnApigX literal 248 zcmcD&o+B>oQ<|Kbmug}boSL4SnpfhPmzK}Kz`)36qGxEJXAERAF#-t=&XUaB)Z&uF z+yY)h6JsNDBSSMw14|Q2uKZ%XvecsD%=|n-V?9G-Jp)}+W8HM1kwuxwO!>vEKACx? z6?{2hTVN`ohO-stmlh?bvZm#f7H6;n4N5FZ&R|WWATdTXoex2R3=9V}Ccl8N7#P0D%bNlKG!#N% literal 102 zcmcD&o+HjtoSL4SnpaZHz`(!^#2^5o89|sS-hk7mG&wad)x^-?KNNsez}PSvBnATq OG$y}*u^(Ul;0XZK{w37_ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe index dfbaafafcd618835374131435fef57f611757a9d..fb7f2188cace8a2fb106dc01d8e091b5d980d4d0 100644 GIT binary patch literal 352 zcmcD&o+B>qQ<|KbmuhO@oS$2eUz(TVnpaYknOe*M1V-{FSFT<4E&9@*#XWgTraWfg z=8uOcw=jt}&@0FXs{$(21E~fAe-^#8lw1g#p%zNdL(!Cyp9~Ral!nUtp~#nHLiICd zLgjN=^zzD5pqj2h1+CFE6+pz91flYtsPef`{Y+6%`ClNHRzNt+Ay7dFkYEW^a0*ng j2xL9fO)P9s!ET_+Jec4_sNm-V8k1i@*$iEKx9S4`@s&kY literal 284 zcmcD&o+B>loS$2eUz(TVmYI_pUX)pqTI8BnQk0om%)r3F$jPFYmX`};fq*=OVtB$I z?^BwbnwM%~7;m7LlAjC_XJm%T2cyUr&^1&0xZ!msgeo;V^bV1+{=SrBpx# zw?GB$Kq{fuGk$^!x}z9hk_nY(vV_X-MUyXph%>uG<+DLHl|VR5-=Knl2Q((XfU+6b I`rB6l0DKBF;s5{u diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs index 552a0d09868e01398720e16e1a527905fa834359..c853d3bd23f01fcedbb784d2aacfbd47e6fd3473 100644 GIT binary 
patch literal 1428 zcmcD&o+B>eQ<|KbmuhO@oS$2eUz(TVl30?+00c(zCs(dr^)33+pT#|SOQt+#um`G^ zgs8SK3CJ%l$;?YH4ll|qNi72MfI1k_wQ#{y#~Uy(9MG8j0%R@-@Fwtn2N@?0Rbc9t znVOSQ?3tI6TH%`twx0#vFouQ(RwkfcMj++_Vh|eyfJQ@reFKbzpkb0`PB0ci4z4qa zk&zMEb#o8Y*y8c5OMWuY5PY8XK{jnwfYK3=X+lmp`N`QKMX9MFsYSWPE}6+CKx5d@ zy@?cN%sht7K&LU+FxWEpFqUMbD&%D*rz#|sX6B>-!=70XEN6#oS$M~EJCJ39(111Y z%LmzoD@u{IUOVkG3#3&T)t%rtA>dHd$bCHbT6rVpx*3XG#S~J$iJbWDAA$&OIRgGEn{I!6o@csVR`m;g(oj z0*y$_yurZ6B*w_h!~y{V3IX zg$X0zm{{}DixP7g7?|PCVUi6ioS$2eUz(TVmYI_pUX)pqTI7;glE}coz-SMY7V;@gPR&a-F>}jI%}Mdh zOUo|?N-;69=A{=U<}xq}{D%UNsBA!L5m;S7esM`=UV5=xeo<~>iC!9*;(!`uh)Y>< zXAU{gN90qeXG#K)efy$Ve6cm}1SyoQ<|Kbmug`aoSL4SnpfhPmzK`}1V-{FSFT<4E&9@*#XWgTraT6TaR4zl z5Hm9(bAW6{MlPoOVpgBbywVEpti-ZJy|UE2l>8#L(4^A5l2RV1P*HJaejdA_xt@WZ zp&>s+Fc+v4EG`KZ*S6G!=+ZSbGSV|J(KF=B0Xq?^QF*n7` zLoQ<|Kbmug}boSL4SnpfhPmzK}Kz`)36qGxEJXAEL80tpVzlFZ!H;*!MN z0$xKCV$rsb3tXRreeN-RpwU`@WATdTXolil63=9V}Ccl8N7#Nc6JevSERzh+B literal 102 zcmcD&o+HjtoSL4SnpaZHz`(!^#2^5o89|sS-hk7mG&wad)x^-?KNNsez}PSvBnATq OG$y}*u^(Ul;0XZK{w37_ diff --git a/languagetool-dev/pom.xml b/languagetool-dev/pom.xml index de8a11b203bb..bba8bd88b0cd 100644 --- a/languagetool-dev/pom.xml +++ b/languagetool-dev/pom.xml @@ -58,6 +58,10 @@ org.languagetool languagetool-wikipedia + + org.apache.lucene + lucene-core + org.mariadb.jdbc mariadb-java-client diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java b/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java index ab45341f7edc..39c001d227a5 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java @@ -47,32 +47,33 @@ public static void main(String[] args) throws IOException { try (FSDirectory directory = FSDirectory.open(dir.toPath()); IndexReader reader = DirectoryReader.open(directory)) { IndexSearcher searcher = new IndexSearcher(reader); - Fields 
fields = MultiFields.getFields(reader); - Terms ngrams = fields.terms("ngram"); - TermsEnum iterator = ngrams.iterator(); - BytesRef next; - int i = 0; - while ((next = iterator.next()) != null) { - String term = next.utf8ToString(); - if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) { - if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) { - //System.out.println("ignore: " + term); - continue; - } - TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3); - if (topDocs.totalHits == 0) { - throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits); - } else if (topDocs.totalHits == 1) { - int docId = topDocs.scoreDocs[0].doc; - Document document = reader.document(docId); - Long count = Long.parseLong(document.get("count")); - //System.out.println(term + " -> " + count); - totalCount += count; - if (++i % 10_000 == 0) { - System.out.println(i + " ... " + totalCount); + for (String field : FieldInfos.getIndexedFields(reader)) { + Terms ngrams = MultiTerms.getTerms(reader, field); + TermsEnum iterator = ngrams.iterator(); + BytesRef next; + int i = 0; + while ((next = iterator.next()) != null) { + String term = next.utf8ToString(); + if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) { + if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) { + //System.out.println("ignore: " + term); + continue; + } + TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3); + if (topDocs.totalHits.value == 0) { + throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits.value); + } else if (topDocs.totalHits.value == 1) { + int docId = topDocs.scoreDocs[0].doc; + Document document = reader.document(docId); + Long count = Long.parseLong(document.get("count")); + //System.out.println(term + " -> " + count); + totalCount += count; + if (++i % 10_000 == 0) { + System.out.println(i + " ... 
" + totalCount); + } + } else { + throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits); } - } else { - throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits); } } } diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java index 8b352f75be7b..3a06ee9819d0 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java @@ -1,4 +1,4 @@ -/* LanguageTool, a natural language style checker +/* LanguageTool, a natural language style checker * Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or @@ -20,7 +20,11 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.*; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.document.StringField; import org.apache.lucene.index.*; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; @@ -95,13 +99,8 @@ private Document getDoc(String ngram, long count) { } @NotNull - private LongField getCountField(long count) { - FieldType fieldType = new FieldType(); - fieldType.setStored(true); - fieldType.setOmitNorms(true); - fieldType.setNumericType(FieldType.NumericType.LONG); - fieldType.setDocValuesType(DocValuesType.NUMERIC); - return new LongField("count", count, fieldType); + private LongPoint getCountField(long count) { + return new LongPoint("count", count); } private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException 
{ diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java index 4e130e87bcb5..8e2e92b25566 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java @@ -177,16 +177,16 @@ private void writeToLucene(int ngramSize, Map ngramToCount) throws if (newReader != null) { reader = newReader; }*/ - index.reader = DirectoryReader.open(index.indexWriter, true); + index.reader = DirectoryReader.open(index.indexWriter, true, true); index.searcher = new IndexSearcher(index.reader); for (Map.Entry entry : ngramToCount.entrySet()) { Term ngram = new Term("ngram", entry.getKey()); TopDocs topDocs = index.searcher.search(new TermQuery(ngram), 2); //System.out.println(ngram + " ==> " + topDocs.totalHits); - if (topDocs.totalHits == 0) { + if (topDocs.totalHits.value == 0) { Document doc = getDoc(entry.getKey(), entry.getValue()); index.indexWriter.addDocument(doc); - } else if (topDocs.totalHits == 1) { + } else if (topDocs.totalHits.value == 1) { int docNumber = topDocs.scoreDocs[0].doc; Document document = index.reader.document(docNumber); long oldCount = Long.parseLong(document.getField("count").stringValue()); @@ -195,7 +195,7 @@ private void writeToLucene(int ngramSize, Map ngramToCount) throws index.indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue())); // would probably be faster, but we currently rely on the count being a common field: //indexWriter.updateNumericDocValue(ngram, "count", oldCount + entry.getValue()); - } else if (topDocs.totalHits > 1) { + } else if (topDocs.totalHits.value > 1) { throw new RuntimeException("Got more than one hit for: " + ngram); } //System.out.println(" " + entry.getKey() + " -> " + entry.getValue()); @@ -221,13 +221,8 @@ private Document getDoc(String ngram, long count) 
{ } @NotNull - private LongField getCountField(long count) { - FieldType fieldType = new FieldType(); - fieldType.setStored(true); - fieldType.setOmitNorms(true); - fieldType.setNumericType(FieldType.NumericType.LONG); - fieldType.setDocValuesType(DocValuesType.NUMERIC); - return new LongField("count", count, fieldType); + private LongPoint getCountField(long count) { + return new LongPoint("count", count); } private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException { @@ -269,7 +264,7 @@ static class LuceneLiveIndex { IndexWriterConfig config = new IndexWriterConfig(analyzer); directory = FSDirectory.open(dir.toPath()); indexWriter = new IndexWriter(directory, config); - reader = DirectoryReader.open(indexWriter, false); + reader = DirectoryReader.open(indexWriter, false, false); searcher = new IndexSearcher(reader); } diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java index ad0e60af36a3..e8a2365c3111 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java @@ -34,7 +34,9 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.List; +import java.util.Set; /** * Prototype to find potential upper-only phrases like "Persischer Golf". 
@@ -57,47 +59,48 @@ public static void main(String[] args) throws IOException { FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath()); IndexReader reader = DirectoryReader.open(fsDir); IndexSearcher searcher = new IndexSearcher(reader); - Fields fields = MultiFields.getFields(reader); - Terms terms = fields.terms("ngram"); - TermsEnum termsEnum = terms.iterator(); - int count = 0; - BytesRef next; - while ((next = termsEnum.next()) != null) { - String term = next.utf8ToString(); - count++; - //term = "persischer Golf"; // for testing - String[] parts = term.split(" "); - boolean useful = true; - int lcCount = 0; - List ucParts = new ArrayList<>(); - for (String part : parts) { - if (part.length() < MIN_TERM_LEN) { - useful = false; - break; + for (String field: FieldInfos.getIndexedFields(reader)) { + Terms terms = MultiTerms.getTerms(reader, field); + TermsEnum termsEnum = terms.iterator(); + int count = 0; + BytesRef next; + while ((next = termsEnum.next()) != null) { + String term = next.utf8ToString(); + count++; + //term = "persischer Golf"; // for testing + String[] parts = term.split(" "); + boolean useful = true; + int lcCount = 0; + List ucParts = new ArrayList<>(); + for (String part : parts) { + if (part.length() < MIN_TERM_LEN) { + useful = false; + break; + } + String uc = StringTools.uppercaseFirstChar(part); + if (!part.equals(uc)) { + lcCount++; + } + ucParts.add(uc); } - String uc = StringTools.uppercaseFirstChar(part); - if (!part.equals(uc)) { - lcCount++; + if (!useful || lcCount == 0 || lcCount == 2) { + continue; } - ucParts.add(uc); - } - if (!useful || lcCount == 0 || lcCount == 2) { - continue; - } - String uppercase = String.join(" ", ucParts); - if (term.equals(uppercase)){ - continue; - } - long thisCount = getOccurrenceCount(reader, searcher, term); - long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase); - if (count % 10_000 == 0) { - System.err.println(count + " @ " + term); - } - if (thisCount > LIMIT || 
thisUpperCount > LIMIT) { - if (thisUpperCount > thisCount) { - if (isRelevant(lt, term)) { - float factor = (float)thisUpperCount / thisCount; - System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor); + String uppercase = String.join(" ", ucParts); + if (term.equals(uppercase)) { + continue; + } + long thisCount = getOccurrenceCount(reader, searcher, term); + long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase); + if (count % 10_000 == 0) { + System.err.println(count + " @ " + term); + } + if (thisCount > LIMIT || thisUpperCount > LIMIT) { + if (thisUpperCount > thisCount) { + if (isRelevant(lt, term)) { + float factor = (float) thisUpperCount / thisCount; + System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor); + } } } } @@ -117,7 +120,7 @@ private static boolean isRelevant(JLanguageTool lt, String term) throws IOExcept private static long getOccurrenceCount(IndexReader reader, IndexSearcher searcher, String term) throws IOException { TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5); - if (topDocs.totalHits == 0) { + if (topDocs.totalHits.value == 0) { return 0; } int docId = topDocs.scoreDocs[0].doc; diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java index 54d6dc8d8587..74a6b0c8f887 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java @@ -47,10 +47,9 @@ public static void main(String[] args) throws IOException { FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath()); IndexReader reader = DirectoryReader.open(fsDir); IndexSearcher searcher = new IndexSearcher(reader); - Fields fields = MultiFields.getFields(reader); + Terms terms = 
MultiTerms.getTerms(reader, "ngram"); long max = 0; String maxTerm = ""; - Terms terms = fields.terms("ngram"); TermsEnum termsEnum = terms.iterator(); int count = 0; BytesRef next; @@ -71,5 +70,6 @@ public static void main(String[] args) throws IOException { } System.out.println("Max: " + max + " for " + maxTerm); } + } diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java index d5caea350778..b7c43dc46619 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java @@ -58,8 +58,7 @@ public static void main(String[] args) throws IOException { String ngramIndexDir = args[0]; FSDirectory fsDir = FSDirectory.open(new File(ngramIndexDir).toPath()); IndexReader reader = DirectoryReader.open(fsDir); - Fields fields = MultiFields.getFields(reader); - Terms terms = fields.terms("ngram"); + Terms terms = MultiTerms.getTerms(reader, "ngram"); TermsEnum termsEnum = terms.iterator(); int i = 0; int needed = 0; diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java index 0d97a4df0bab..e4ed2832f7a0 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java @@ -20,7 +20,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; diff --git a/languagetool-language-modules/ja/pom.xml 
b/languagetool-language-modules/ja/pom.xml index 12dfea7942b2..58ee2b8b9d68 100644 --- a/languagetool-language-modules/ja/pom.xml +++ b/languagetool-language-modules/ja/pom.xml @@ -40,7 +40,7 @@ - com.github.lucene-gosen + org.omegat.lucene lucene-gosen ipadic diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java index d3b2adfc4abe..5f99596bf348 100644 --- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java +++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java @@ -19,8 +19,8 @@ package org.languagetool.dev.dumpcheck; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java index effff0cf9460..8cb404ed5924 100644 --- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java +++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java @@ -20,8 +20,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.CharacterUtils; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.util.AttributeFactory; import java.io.IOException; @@ -36,7 +36,6 @@ public final class AnyCharTokenizer extends Tokenizer { private static final int 
MAX_WORD_LEN = Integer.MAX_VALUE; // extend the word length! private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(4096); - private final CharacterUtils charUtils = CharacterUtils.getInstance(); private final CharTermAttribute termAtt = this.addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = this.addAttribute(OffsetAttribute.class); @@ -83,7 +82,7 @@ public boolean incrementToken() throws IOException { while(true) { if(this.bufferIndex >= this.dataLen) { this.offset += this.dataLen; - this.charUtils.fill(this.ioBuffer, this.input); + CharacterUtils.fill(this.ioBuffer, this.input); if(this.ioBuffer.getLength() == 0) { this.dataLen = 0; if(length <= 0) { @@ -97,7 +96,7 @@ public boolean incrementToken() throws IOException { this.bufferIndex = 0; } - int c = this.charUtils.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex, this.ioBuffer.getLength()); + int c = Character.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex); int charCount = Character.charCount(c); this.bufferIndex += charCount; if(this.isTokenChar(c)) { diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java index 622c7f7291f5..e356998b9f95 100644 --- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java +++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java @@ -124,7 +124,7 @@ private SpanQuery asSpanQuery(BooleanClause query) { } else { Set terms = new HashSet<>(); try { - indexSearcher.createWeight(query.getQuery(), false).extractTerms(terms); + indexSearcher.createWeight(query.getQuery(), ScoreMode.COMPLETE_NO_SCORES, 1.0f).extractTerms(terms); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java 
b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java index 22dbb89e625c..8564e197a446 100644 --- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java +++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java @@ -33,6 +33,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; @@ -101,7 +102,7 @@ public int getDocCount() throws IOException { private int getDocCount(IndexSearcher indexSearcher) throws IOException { Term searchTerm = new Term(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL); TopDocs search = indexSearcher.search(new TermQuery(searchTerm), 1); - if (search.totalHits != 1) { + if (search.totalHits.value != 1) { return -1; } ScoreDoc scoreDoc = search.scoreDocs[0]; @@ -334,7 +335,7 @@ class SearchRunnable implements Runnable { private List matchingSentences; private Exception exception; private boolean tooManyLuceneMatches; - private int luceneMatchCount; + private long luceneMatchCount; private int maxDocChecked; private int docsChecked; private int numDocs; @@ -356,7 +357,7 @@ public void run() { PossiblyLimitedTopDocs limitedTopDocs = getTopDocs(query); long luceneTime = System.currentTimeMillis() - t2; long t3 = System.currentTimeMillis(); - luceneMatchCount = limitedTopDocs.topDocs.totalHits; + luceneMatchCount = limitedTopDocs.topDocs.totalHits.value; tooManyLuceneMatches = limitedTopDocs.topDocs.scoreDocs.length >= maxHits; MatchingSentencesResult res = findMatchingSentences(indexSearcher, limitedTopDocs.topDocs, languageTool); matchingSentences = res.matchingSentences; @@ -382,7 +383,7 @@ boolean hasTooManyLuceneMatches() { return tooManyLuceneMatches; } - int getLuceneMatchCount() { + long getLuceneMatchCount() { return 
luceneMatchCount; } diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java index 40c860af0650..6c39036346e0 100644 --- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java +++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java @@ -35,7 +35,7 @@ public class SearcherResult { private int docCount; private int maxDocChecked; private boolean hasTooManyLuceneMatches; - private int luceneMatchCount; + private long luceneMatchCount; private int skipHits; private int numDocs; @@ -81,11 +81,11 @@ public boolean hasTooManyLuceneMatches() { return hasTooManyLuceneMatches; } - public void setLuceneMatchCount(int luceneMatchCount) { + public void setLuceneMatchCount(long luceneMatchCount) { this.luceneMatchCount = luceneMatchCount; } - public int getLuceneMatchCount() { + public long getLuceneMatchCount() { return luceneMatchCount; } diff --git a/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java b/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java index 8febf19486a4..be0b8a93e752 100644 --- a/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java +++ b/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java @@ -271,11 +271,11 @@ public void testSeveralElements() throws Exception { assertMatches(makeRule("How do you"), 1); // known overmatching } - private void assertMatches(AbstractPatternRule patternRule, int expectedMatches) throws Exception { + private void assertMatches(AbstractPatternRule patternRule, long expectedMatches) throws Exception { PatternRuleQueryBuilder queryBuilder = new PatternRuleQueryBuilder(language, searcher); Query query = queryBuilder.buildRelaxedQuery(patternRule); 
//System.out.println("QUERY: " + query); - int matches = searcher.search(query, 1000).totalHits; + long matches = searcher.search(query, 1000).totalHits.value; assertEquals("Query failed: " + query, expectedMatches, matches); } diff --git a/pom.xml b/pom.xml index f6ccc7e61d3c..b1423e960ece 100644 --- a/pom.xml +++ b/pom.xml @@ -163,6 +163,7 @@ 0.8.2 2.1.2 6.2.1 + 8.11.0 1.2.2 portable-1.8.2 70.1 @@ -217,7 +218,7 @@ 2.16.1 0.02 1.18.30 - 5.5.5 + 8.11.3 2.1.9 0.6 @@ -275,9 +276,9 @@ ${jackson.version} - com.github.lucene-gosen + org.omegat.lucene lucene-gosen - ${com.github.lucene-gosen.version} + ${org.omegat.lucene.lucene-gosen.version} ipadic From b3ae1658347d2323d3cb4293c622a542fe70e090 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 6 Aug 2024 16:23:46 +0900 Subject: [PATCH 2/6] fix: update for migrating lucene 8.11.3 Signed-off-by: Hiroshi Miura --- .../languagemodel/LuceneSingleIndexLanguageModel.java | 10 +++++----- .../languagetool/dev/HomophoneOccurrenceDumper.java | 6 ++---- .../main/java/org/languagetool/dev/index/Searcher.java | 2 +- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/languagetool-core/src/main/java/org/languagetool/languagemodel/LuceneSingleIndexLanguageModel.java b/languagetool-core/src/main/java/org/languagetool/languagemodel/LuceneSingleIndexLanguageModel.java index 0b0bf65fe165..77bf868ee913 100644 --- a/languagetool-core/src/main/java/org/languagetool/languagemodel/LuceneSingleIndexLanguageModel.java +++ b/languagetool-core/src/main/java/org/languagetool/languagemodel/LuceneSingleIndexLanguageModel.java @@ -146,10 +146,10 @@ public long getTotalTokenCount() { try { RegexpQuery query = new RegexpQuery(new Term("totalTokenCount", ".*")); TopDocs docs = luceneSearcher.searcher.search(query, 1000); // Integer.MAX_VALUE might cause OOE on wrong index - if (docs.totalHits == 0) { + if (docs.totalHits.value == 0) { throw new RuntimeException("Expected 'totalTokenCount' meta documents not found in 1grams index: " + 
luceneSearcher.directory); - } else if (docs.totalHits > 1000) { - throw new RuntimeException("Did not expect more than 1000 'totalTokenCount' meta documents: " + docs.totalHits + " in " + luceneSearcher.directory); + } else if (docs.totalHits.value > 1000) { + throw new RuntimeException("Did not expect more than 1000 'totalTokenCount' meta documents: " + docs.totalHits.value + " in " + luceneSearcher.directory); } else { long result = 0; for (ScoreDoc scoreDoc : docs.scoreDocs) { @@ -194,9 +194,9 @@ private long getCount(Term term, LuceneSearcher luceneSearcher) { long result = 0; try { TopDocs docs = luceneSearcher.searcher.search(new TermQuery(term), 2000); - if (docs.totalHits > 2000) { + if (docs.totalHits.value > 2000) { throw new RuntimeException("More than 2000 matches for '" + term + "' not supported for performance reasons: " + - docs.totalHits + " matches in " + luceneSearcher.directory); + docs.totalHits.value + " matches in " + luceneSearcher.directory); } for (ScoreDoc scoreDoc : docs.scoreDocs) { String countStr = luceneSearcher.reader.document(scoreDoc.doc).get("count"); diff --git a/languagetool-standalone/src/main/java/org/languagetool/dev/HomophoneOccurrenceDumper.java b/languagetool-standalone/src/main/java/org/languagetool/dev/HomophoneOccurrenceDumper.java index 2b4d22c4c684..3fa2ff50743a 100644 --- a/languagetool-standalone/src/main/java/org/languagetool/dev/HomophoneOccurrenceDumper.java +++ b/languagetool-standalone/src/main/java/org/languagetool/dev/HomophoneOccurrenceDumper.java @@ -18,8 +18,7 @@ */ package org.languagetool.dev; -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; @@ -112,8 +111,7 @@ private void dumpOccurrences(Set tokens) throws IOException { private TermsEnum getIterator() throws IOException { LuceneSearcher luceneSearcher = 
getLuceneSearcher(3); - Fields fields = MultiFields.getFields(luceneSearcher.getReader()); - Terms terms = fields.terms("ngram"); + Terms terms = MultiTerms.getTerms(luceneSearcher.getReader(), "ngram"); return terms.iterator(); } diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java index 8564e197a446..8bc871e3cd57 100644 --- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java +++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java @@ -201,7 +201,7 @@ public SearcherResult findRuleMatchesOnIndex(PatternRule rule, Language language } private PossiblyLimitedTopDocs getTopDocs(Query query) throws IOException { - TopScoreDocCollector topCollector = TopScoreDocCollector.create(maxHits); + TopScoreDocCollector topCollector = TopScoreDocCollector.create(maxHits, Integer.MAX_VALUE); Counter clock = Counter.newCounter(true); int waitMillis = 1000; // TODO: if we interrupt the whole thread anyway, do we still need the TimeLimitingCollector? 
From f7fa3cf36c64d34de21dd4d9deaa54b1d804fc09 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 6 Aug 2024 22:37:12 +0900 Subject: [PATCH 3/6] chore: import lucene-core in -wikipedia --- languagetool-wikipedia/pom.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/languagetool-wikipedia/pom.xml b/languagetool-wikipedia/pom.xml index 844bddb8351d..72f31e458c5e 100644 --- a/languagetool-wikipedia/pom.xml +++ b/languagetool-wikipedia/pom.xml @@ -62,6 +62,10 @@ org.apache.commons commons-compress + + org.apache.lucene + lucene-core + org.apache.lucene lucene-analyzers-common From 23aba0ef7041ce18be08a6b0fc90a61da90a6a32 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Wed, 7 Aug 2024 07:44:22 +0900 Subject: [PATCH 4/6] chore: bump lucene 8.11.3 for lt-wikipedia Signed-off-by: Hiroshi Miura --- languagetool-wikipedia/pom.xml | 2 +- .../main/java/org/languagetool/dev/index/AnyCharTokenizer.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/languagetool-wikipedia/pom.xml b/languagetool-wikipedia/pom.xml index 72f31e458c5e..7201ec773787 100644 --- a/languagetool-wikipedia/pom.xml +++ b/languagetool-wikipedia/pom.xml @@ -38,7 +38,7 @@ - 5.5.5 + 8.11.3 diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java index 8cb404ed5924..e7474412aced 100644 --- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java +++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java @@ -18,9 +18,9 @@ */ package org.languagetool.dev.index; +import org.apache.lucene.analysis.CharacterUtils; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.CharacterUtils; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.util.AttributeFactory;
From 9940b686d2da9302e299d9c10766485b79bc79c2 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 13 Aug 2024 17:04:22 +0900 Subject: [PATCH 5/6] style: revert non-mandatory changes Signed-off-by: Hiroshi Miura --- .../java/org/languagetool/commandline/Main.java | 16 +++------------- .../org/languagetool/commandline/MainTest.java | 2 +- languagetool-dev/pom.xml | 4 ---- .../dev/bigdata/AggregatedNgramToLucene.java | 6 +----- .../dev/bigdata/LargestNGramFinder.java | 1 - .../dev/bigdata/TextIndexCreator.java | 2 +- languagetool-wikipedia/pom.xml | 4 ---- 7 files changed, 6 insertions(+), 29 deletions(-) diff --git a/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java b/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java index f9ae81837db7..5cbe62d89365 100644 --- a/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java +++ b/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java @@ -20,13 +20,9 @@ import org.apache.commons.io.ByteOrderMark; import org.apache.commons.io.input.BOMInputStream; -import org.languagetool.JLanguageTool; -import org.languagetool.Language; -import org.languagetool.Languages; -import org.languagetool.MultiThreadedJLanguageTool; +import org.languagetool.*; import org.languagetool.bitext.TabBitextReader; -import org.languagetool.language.AmericanEnglish; -import org.languagetool.language.English; +import org.languagetool.language.*; import org.languagetool.language.identifier.LanguageIdentifier; import org.languagetool.language.identifier.LanguageIdentifierService; import org.languagetool.rules.Rule; @@ -39,13 +35,7 @@ import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; -import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.io.*; import 
java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; diff --git a/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java b/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java index f130c02887fd..f9e6c3f87917 100644 --- a/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java +++ b/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java @@ -277,7 +277,7 @@ public void testEnglishStdIn4() throws Exception { String[] args = {"-l", "en", "--json", "-"}; Main.main(args); String output = new String(this.out.toByteArray()); - assertTrue("Got: " + output, output.contains("{\"software\":{\"name\":\"LanguageTool\",\"version\":")); + assertTrue("Got: " + output, output.contains("{\"software\":{\"name\":\"LanguageTool\",\"version\":\"")); assertTrue("Got: " + output, output.contains("\"language\":{\"name\":\"English\",\"code\":\"en\"")); assertTrue("Got: " + output, output.contains("{\"message\":\"Use \\\"a\\\" instead of 'an' if the following word doesn't start with a vowel sound, e.g. 
'a sentence', 'a university'.\"")); assertTrue("Got: " + output, output.contains("\"replacements\":[{\"value\":\"a\"}]")); diff --git a/languagetool-dev/pom.xml b/languagetool-dev/pom.xml index bba8bd88b0cd..de8a11b203bb 100644 --- a/languagetool-dev/pom.xml +++ b/languagetool-dev/pom.xml @@ -58,10 +58,6 @@ org.languagetool languagetool-wikipedia - - org.apache.lucene - lucene-core - org.mariadb.jdbc mariadb-java-client diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java index 3a06ee9819d0..8a0b5fb0ddc0 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java @@ -20,11 +20,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.LongPoint; -import org.apache.lucene.document.StringField; +import org.apache.lucene.document.*; import org.apache.lucene.index.*; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java index 74a6b0c8f887..693dc866b2d5 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java @@ -71,5 +71,4 @@ public static void main(String[] args) throws IOException { System.out.println("Max: " + max + " for " + maxTerm); } - } diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java 
b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java index e4ed2832f7a0..5ddbf1d7a423 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java @@ -19,8 +19,8 @@ package org.languagetool.dev.bigdata; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; diff --git a/languagetool-wikipedia/pom.xml b/languagetool-wikipedia/pom.xml index 7201ec773787..575a81cfd8f1 100644 --- a/languagetool-wikipedia/pom.xml +++ b/languagetool-wikipedia/pom.xml @@ -62,10 +62,6 @@ org.apache.commons commons-compress - - org.apache.lucene - lucene-core - org.apache.lucene lucene-analyzers-common From 4edeeb2d5053f24a59108f77ec7aa5cee5be8b3d Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 1 Oct 2024 16:52:09 +0900 Subject: [PATCH 6/6] fix: adjust for review comment comment from CodeRabbit ## language-dev sub-project - Add StoredField addition to LongPoint for ngram - Use DirectoryReader.open(IndexWriter) instead of open(IndexWriter, boolean, boolean) - Null and empty check for FieldInfo and Terms Signed-off-by: Hiroshi Miura --- .../dev/bigdata/AggregatedNgramToLucene.java | 8 ++------ .../dev/bigdata/CommonCrawlToNgram.java | 12 ++++-------- .../dev/bigdata/GermanUppercasePhraseFinder.java | 13 +++++++++---- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java index 8a0b5fb0ddc0..08ce8200c9a2 100644 ---
a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java @@ -90,15 +90,11 @@ private void indexLine(String line) throws IOException { private Document getDoc(String ngram, long count) { Document doc = new Document(); doc.add(new Field("ngram", ngram, StringField.TYPE_NOT_STORED)); // use StringField.TYPE_STORED for easier debugging with e.g. Luke - doc.add(getCountField(count)); + doc.add(new LongPoint("count", count)); + doc.add(new StoredField("count", count)); return doc; } - @NotNull - private LongPoint getCountField(long count) { - return new LongPoint("count", count); - } - private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException { FieldType fieldType = new FieldType(); fieldType.setIndexOptions(IndexOptions.DOCS); diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java index 8e2e92b25566..43f87f102592 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java @@ -177,7 +177,7 @@ private void writeToLucene(int ngramSize, Map ngramToCount) throws if (newReader != null) { reader = newReader; }*/ - index.reader = DirectoryReader.open(index.indexWriter, true, true); + index.reader = DirectoryReader.open(index.indexWriter); index.searcher = new IndexSearcher(index.reader); for (Map.Entry entry : ngramToCount.entrySet()) { Term ngram = new Term("ngram", entry.getKey()); @@ -216,15 +216,11 @@ private void writeToLucene(int ngramSize, Map ngramToCount) throws private Document getDoc(String ngram, long count) { Document doc = new Document(); doc.add(new Field("ngram", ngram, StringField.TYPE_NOT_STORED)); - doc.add(getCountField(count)); + doc.add(new 
LongPoint("count", count)); + doc.add(new StoredField("count", count)); return doc; } - @NotNull - private LongPoint getCountField(long count) { - return new LongPoint("count", count); - } - private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException { FieldType fieldType = new FieldType(); fieldType.setIndexOptions(IndexOptions.DOCS); @@ -264,7 +260,7 @@ static class LuceneLiveIndex { IndexWriterConfig config = new IndexWriterConfig(analyzer); directory = FSDirectory.open(dir.toPath()); indexWriter = new IndexWriter(directory, config); - reader = DirectoryReader.open(indexWriter, false, false); + reader = DirectoryReader.open(indexWriter); searcher = new IndexSearcher(reader); } diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java index e8a2365c3111..b70700a1ed18 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java @@ -34,9 +34,7 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; -import java.util.Collection; import java.util.List; -import java.util.Set; /** * Prototype to find potential upper-only phrases like "Persischer Golf". 
@@ -59,8 +57,15 @@ public static void main(String[] args) throws IOException { FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath()); IndexReader reader = DirectoryReader.open(fsDir); IndexSearcher searcher = new IndexSearcher(reader); - for (String field: FieldInfos.getIndexedFields(reader)) { - Terms terms = MultiTerms.getTerms(reader, field); + FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader); + for (FieldInfo fieldInfo: fieldInfos) { + if (fieldInfo.getIndexOptions() == IndexOptions.NONE) { + continue; + } + Terms terms = MultiTerms.getTerms(reader, fieldInfo.name); + if (terms == null) { + continue; + } TermsEnum termsEnum = terms.iterator(); int count = 0; BytesRef next;