From b5e340b3a5c143e6abf5a0a93b91754bd43657bc Mon Sep 17 00:00:00 2001
From: Andriy Rysin <arysin@gmail.com>
Date: Sun, 8 Sep 2024 20:54:35 -0400
Subject: [PATCH] move disambig stats to new artifact

---
 .github/workflows/gradle.yml                  |  2 +-
 README.md                                     |  2 +-
 build.gradle                                  | 73 +++++++++++++------
 gradle.properties                             |  6 +-
 .../groovy/ua/net/nlp/tools/TagText.groovy    |  3 +-
 .../ua/net/nlp/tools/tag/DisambigStats.groovy | 34 ---------
 .../ua/net/nlp/tools/tag/SemTags.groovy       |  5 +-
 .../ua/net/nlp/tools/tag/TagOptions.groovy    |  2 -
 .../ua/net/nlp/tools/tag/TagTextCore.groovy   | 36 ++-------
 .../ua/net/nlp/tools/tag/TagUnknown.groovy    | 40 ++--------
 .../ua/net/nlp/tools/tag/UdModule.groovy      |  4 +-
 11 files changed, 75 insertions(+), 132 deletions(-)

diff --git a/.github/workflows/gradle.yml b/.github/workflows/gradle.yml
index 3ff4a5c..6bd9726 100644
--- a/.github/workflows/gradle.yml
+++ b/.github/workflows/gradle.yml
@@ -31,4 +31,4 @@ jobs:
     - name: Build with Gradle
       uses: gradle/gradle-build-action@67421db6bd0bf253fb4bd25b31ebb98943c375e1
       with:
-        arguments: --info downloadResources processResources test
+        arguments: --info test
diff --git a/README.md b/README.md
index 6173bc5..6192a71 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ This is a project to demonstrate NLP API from LanguageTool for Ukrainian languag
 Це — проект демонстрації API для обробляння природної мови в LanguageTool для української мови.
 
 Використовує мову [groovy](http://www.groovy-lang.org/), засоби для токенізації та тегування також мають скрипти-обгортки для python3 та java.
-Рекомендована версія groovy - 4.0.10 або новіше.
+Рекомендована версія groovy - 4.0.22 або новіше.
 
 Для запуску скриптів потрібно встановити мову [groovy](http://www.groovy-lang.org/) 
 
diff --git a/build.gradle b/build.gradle
index fd6abd9..d813bd2 100644
--- a/build.gradle
+++ b/build.gradle
@@ -17,6 +17,9 @@ compileJava.options.encoding = 'UTF-8'
 group = 'ua.net.nlp'
 ext.artifactId = 'nlp_uk'
 
+ext.statsArtifactId="${artifactId}-stats" // artifact id of the separately published stats jar
+ext.statsJarName="${statsArtifactId}-${statsVersion}" // stats are published under statsVersion, not project.version
+
 repositories {
     mavenCentral()
     mavenLocal()
@@ -106,29 +109,6 @@ eclipse {
   }
 }
 
-task downloadResources(type: JavaExec) {
-    classpath = sourceSets.main.runtimeClasspath
-    mainClass = "ua.net.nlp.tools.tag.TagTextCore"
-    args "--download"
-
-    // ugly hack - downloadResources depends on compiling and thus processResources
-    // but after the download we need to call/force processResources again
-    // TODO: find how to do it nicely
-    // for now - just copy the files manually
-    doLast {
-        def outputDir = tasks.processResources.outputs.files.files.iterator().next()
-        File semtags = new File(outputDir, "/ua/net/nlp/tools/semtags")
-        semtags.mkdirs()
-        File stats = new File(outputDir, "/ua/net/nlp/tools/stats")
-        stats.mkdirs()
-        tasks.processResources.inputs.files.files
-            .findAll { File file -> file.name =~ /\.(csv|txt)$/ }
-            .each { File file ->
-                def toDir = file.path.contains("semtag") ? semtags : stats
-                Files.copy(file.toPath(), new File(toDir, file.name).toPath(), StandardCopyOption.REPLACE_EXISTING)
-            }
-    }
-}
 
 test {
     useJUnitPlatform()
@@ -216,6 +196,14 @@ jar {
     exclude("ua/net/nlp/tools/StressText.class")
 }
 
+// Packages only the stats *.txt resources into their own jar, versioned by statsVersion.
+task statsJar(type: Jar) {
+    setArchiveFileName "${statsJarName}.jar"
+    archiveVersion = statsVersion // archive tasks expose archiveVersion; bare 'version' is deprecated/removed
+    from('src/main/resources') {
+        include '/ua/net/nlp/tools/stats/*.txt'
+    }
+}
 
 task sourceJar(type: Jar) {
     archiveClassifier = "sources"
@@ -236,7 +224,7 @@ if( project.hasProperty("ossrhUsername") ) {
 
 publishing {
     publications {
-        maven(MavenPublication) {
+        mainJar(MavenPublication) {
 //            groupId = group
             artifactId = artifactId
 //            version = version
@@ -274,6 +262,43 @@ publishing {
                   url = "https://github.com/brown-uk/nlp_uk.git"
                 }
             }
+        
+        }
+
+        statsJarPublication(MavenPublication) { // separate Maven publication for the stats jar
+            artifact statsJar {
+                // no classifier is set for the stats jar artifact
+            }
+            // groupId is not set here; presumably it defaults to the project group - TODO confirm
+            artifactId = statsArtifactId
+            version=statsVersion // uses statsVersion rather than the project version
+            
+            pom {
+                name = 'NLP Stats for Ukrainian'
+                description = 'NLP statistic files for Ukrainian language'
+
+                url = "https://github.com/brown-uk/nlp_uk"
+                
+                licenses {
+                  license {
+                    name = 'GNU General Public License v3'
+                    url = 'https://www.gnu.org/licenses/gpl-3.0.txt'
+                  }
+                }
+                
+                developers {
+                  developer {
+                    id = 'arysin'
+                    name = 'Andriy Rysin'
+                    email = 'arysin@gmail.com'
+                  }
+                }
+                
+                scm {
+                  url = "https://github.com/brown-uk/nlp_uk.git"
+                }
+            }
+
         }
     }
 
diff --git a/gradle.properties b/gradle.properties
index 5a0453a..2ab7f6e 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,6 +1,8 @@
+# for LT snapshots we can use https://repo.languagetool.org/ui/native/languagetool-os-snapshot
 ltBaseVersion=6.5-SNAPSHOT
 ltDevVersion=6.5-SNAPSHOT
-morfologik_ukrainian_lt_version=6.5.1-SNAPSHOT
+morfologik_ukrainian_lt_version=6.5.1
 groovyVersion=4.0.22
 # nlp_uk version
-version=3.3.2-SNAPSHOT
+version=3.3.5-SNAPSHOT
+statsVersion=3.3.5
diff --git a/src/main/groovy/ua/net/nlp/tools/TagText.groovy b/src/main/groovy/ua/net/nlp/tools/TagText.groovy
index e5bd88f..025f5ce 100755
--- a/src/main/groovy/ua/net/nlp/tools/TagText.groovy
+++ b/src/main/groovy/ua/net/nlp/tools/TagText.groovy
@@ -7,7 +7,8 @@ package ua.net.nlp.tools
 @Grab(group='org.languagetool', module='languagetool-core', version='6.5-SNAPSHOT')
 @Grab(group='org.languagetool', module='language-uk', version='6.5-SNAPSHOT')
 @Grab(group='org.languagetool', module='language-ru', version='6.4')
-//@Grab(group='ua.net.nlp', module='morfologik-ukrainian-lt', version='6.3.1-SNAPSHOT')
+@Grab(group='ua.net.nlp', module='morfologik-ukrainian-lt', version='6.5.1')
+@Grab(group='ua.net.nlp', module='nlp_uk-stats', version='3.3.5')
 
 @Grab(group='ch.qos.logback', module='logback-classic', version='1.4.+')
 @Grab(group='info.picocli', module='picocli', version='4.6.+')
diff --git a/src/main/groovy/ua/net/nlp/tools/tag/DisambigStats.groovy b/src/main/groovy/ua/net/nlp/tools/tag/DisambigStats.groovy
index 95b8917..74a73b7 100644
--- a/src/main/groovy/ua/net/nlp/tools/tag/DisambigStats.groovy
+++ b/src/main/groovy/ua/net/nlp/tools/tag/DisambigStats.groovy
@@ -1,6 +1,5 @@
 package ua.net.nlp.tools.tag;
 
-import java.util.function.Consumer
 import java.util.regex.Pattern
 import java.util.stream.Collectors
 
@@ -13,10 +12,7 @@ import groovy.transform.ToString
 import ua.net.nlp.bruk.ContextToken
 import ua.net.nlp.bruk.WordContext
 import ua.net.nlp.bruk.WordReading
-import ua.net.nlp.tools.tag.TagTextCore
-import ua.net.nlp.tools.tag.TagTextCore.TTR
 import ua.net.nlp.tools.tag.TagTextCore.TokenInfo
-import ua.net.nlp.tools.tag.TagOptions
 
 
 @CompileStatic
@@ -88,12 +84,6 @@ public class DisambigStats {
     // lemma_xpN -> rate
     Map<String, Map<String, Double>> statsForLemmaXp = new HashMap<>(256).withDefault{ new HashMap<>() }
     
-    @groovy.transform.SourceURI
-    static URI SOURCE_URI
-    // if this script is called from GroovyScriptEngine SourceURI is data: and does not work for File()
-    static File SCRIPT_DIR = SOURCE_URI.scheme == "data"
-        ? null // new File("src/main/groovy/ua/net/nlp/tools/tag")
-        : new File(SOURCE_URI).getParentFile()
 
     @CompileStatic
     static double round(double d) {
@@ -629,26 +619,6 @@ public class DisambigStats {
         [new WordContext(contextToken, offset)] as Set
     }
 
-    void download() {
-        if( SCRIPT_DIR == null ) { // should not happen - jar will bundle the stats
-            System.err.println "Can't download from inside the jar"
-            System.exit 1
-        }
-        
-        def targetDir = new File(SCRIPT_DIR, "../../../../../../resources/")
-        targetDir.mkdirs()
-        assert targetDir.isDirectory()
-
-        File targetFile = new File(targetDir, statsFile)
-        targetFile.parentFile.mkdirs()
-        
-        def remoteStats = "https://github.com/brown-uk/nlp_uk/releases/download/v${statsVersion}/lemma_freqs_hom.txt"
-        System.err.println("Downloading $remoteStats...");
-        def statTxt = new URL(remoteStats).getText('UTF-8')
-        
-        targetFile.setText(statTxt, 'UTF-8')
-    }
-    
     
     @CompileStatic
     def loadDisambigStats() {
@@ -658,10 +628,6 @@ public class DisambigStats {
         long tm1 = System.currentTimeMillis()
 
         def statsFileRes = getClass().getResource(statsFile)
-        if( statsFileRes == null ) {
-            throw new IllegalStateException("Disambiguation stats not found, run \"TagText.groovy --download\" to download it from github, and then retry")
-        }
-        
         
         String word
         WordReading wordReading
diff --git a/src/main/groovy/ua/net/nlp/tools/tag/SemTags.groovy b/src/main/groovy/ua/net/nlp/tools/tag/SemTags.groovy
index 57ff072..81f4577 100644
--- a/src/main/groovy/ua/net/nlp/tools/tag/SemTags.groovy
+++ b/src/main/groovy/ua/net/nlp/tools/tag/SemTags.groovy
@@ -27,6 +27,7 @@ public class SemTags {
     TagOptions options
     Map<String, Map<String,List<String>>> semanticTags = new HashMap<>()
     
+
     def loadSemTags() {
         if( semanticTags.size() > 0 )
             return
@@ -85,7 +86,7 @@ public class SemTags {
         }
     }
 
-    @CompileStatic
+
     String getSemTags(AnalyzedToken tkn, String posTag) {
         if( options.semanticTags && tkn.getLemma() != null && posTag != null ) {
             def lemma = tkn.getLemma()
@@ -108,7 +109,6 @@ public class SemTags {
     }
 
     
-    @CompileStatic
     private static boolean filterSemtag(String lemma, String posTag, String semtag) {
         if( posTag.contains("pron") )
             return semtag =~ ":deictic|:quantif"
@@ -133,6 +133,5 @@ public class SemTags {
         }
         true
     }
-
     
 }
diff --git a/src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy b/src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy
index 83fa552..c9e59a8 100644
--- a/src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy
+++ b/src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy
@@ -76,8 +76,6 @@ public class TagOptions extends OptionsBase {
     boolean singleThread
     @Option(names = ["--timing"], description = "Pring timing information", hidden = true)
     boolean timing
-    @Option(names = ["--download"], description = "Download file with disambiguation statistics and semantic tags (for tagging from CLI only)")
-    boolean download
     @Option(names = ["--progress"], description = "Pring progress information every <n> files", hidden = true)
     int progress=0
 
diff --git a/src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy b/src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy
index 79f2630..9679521 100644
--- a/src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy
+++ b/src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy
@@ -19,6 +19,7 @@ import org.languagetool.MultiThreadedJLanguageTool
 import org.languagetool.language.Ukrainian
 
 import groovy.transform.Canonical
+import groovy.transform.CompileDynamic
 import groovy.transform.CompileStatic
 import picocli.CommandLine
 import picocli.CommandLine.ParameterException
@@ -28,6 +29,7 @@ import ua.net.nlp.tools.TextUtils.OutputFormat
 import ua.net.nlp.tools.TextUtils.ResultBase
 
 
+@CompileStatic
 class TagTextCore {
 
     public static final Pattern PUNCT_PATTERN = Pattern.compile(/[,.:;!?\/()\[\]{}«»„“"'…\u2013\u2014\u201D\u201C•■♦-]+/)               // "
@@ -39,7 +41,7 @@ class TagTextCore {
     private final Pattern CONTROL_CHAR_PATTERN_R = Pattern.compile(/[\u0000-\u0008\u000B-\u0012\u0014-\u001F\u0A0D]/, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE)
     enum TaggingLevel { tagger, stats }
     
-    def language = new Ukrainian() {
+    Ukrainian language = new Ukrainian() {
         @Override
         protected synchronized List<?> getPatternRules() { return [] }
     }
@@ -187,19 +189,17 @@ class TagTextCore {
         tagTextCore(analyzedSentences, stats)
     }
     
-//    @CompileDynamic
+    @CompileDynamic
     List<AnalyzedSentence> analyzeText(String text) {
         options.sentencePerLine
             ? langTool.analyzeSentences( text.split("\n") as List )
             : langTool.analyzeText(text)
     }
 
-    @CompileStatic
     public List<TaggedSentence> tagTextCore(List<AnalyzedSentence> analyzedSentences) {
         tagTextCore(analyzedSentences, null);
     }
         
-    @CompileStatic
     List<TaggedSentence> tagTextCore(List<AnalyzedSentence> analyzedSentences, TagStats stats) {
         List<TaggedSentence> taggedSentences = 
           analyzedSentences.parallelStream().map { AnalyzedSentence analyzedSentence ->
@@ -248,7 +248,7 @@ class TagTextCore {
         taggedSentences
     }
 
-    @CompileStatic
+
     void cleanup(AnalyzedSentence analyzedSentence) {
         // multiwords are very LT-specific
         analyzedSentence.getTokens().each { AnalyzedTokenReadings t ->
@@ -261,13 +261,11 @@ class TagTextCore {
         }
     }
         
-    @CompileStatic
     private static boolean hasPosTag(AnalyzedTokenReadings tokenReadings) {
         tokenReadings.getReadings().stream()
             .anyMatch{ t -> ! isTagEmpty(t.getPOSTag()) }
     }   
 
-    @CompileStatic
     @Canonical
     static class TaggedToken {
         String value
@@ -541,6 +539,7 @@ class TagTextCore {
     }
 
 
+    @CompileDynamic
     def process() {
 //        def stats = new TagStats()
 //        stats.options = options
@@ -554,6 +553,7 @@ class TagTextCore {
         });
     }
 
+    @CompileDynamic
     def process(IOFiles fileInfo) {
         def stats = new TagStats()
         stats.options = options
@@ -571,6 +571,7 @@ class TagTextCore {
         addUnknownPct(stats, fileInfo)
     }
 
+    @CompileDynamic
     def addUnknownPct(TagStats stats, IOFiles fileInfo) {
 //    println "== ${fileInfo.filename}, ${stats.knownCnt}, ${stats.unknownMap}"
       if( fileInfo.filename
@@ -606,14 +607,10 @@ class TagTextCore {
         }
     }
 
-    
-    @CompileStatic
     static boolean isTagEmpty(String posTag) {
         posTag == null || posTag.endsWith("_END")
     }
     
-
-    @CompileStatic
     static TagOptions parseOptions(String[] argv) {
         TagOptions options = new TagOptions()
         CommandLine commandLine = new CommandLine(options)
@@ -657,17 +654,11 @@ class TagTextCore {
                 System.err.println ("Semantic tagging only available in xml/json output")
                 System.exit 1
             }
-            
             semTags.loadSemTags()
         }
 
         disambigStats.setOptions(options)
         if( options.disambiguate ) {
-//            if( options.outputFormat == OutputFormat.txt ) {
-//                System.err.println ("Disambiguation only available in xml/json output")
-//                System.exit 1
-//            }
-
             disambigStats.loadDisambigStats()
         }
         if( options.tagUnknown ) {
@@ -703,12 +694,6 @@ class TagTextCore {
 
     }
     
-    void download() {
-        disambigStats.download()
-        tagUnknown.download()
-        semTags.download()
-    }
-    
 	
     static void main(String[] args) {
 
@@ -723,11 +708,6 @@ class TagTextCore {
             System.err.println(e.getMessage())
             System.exit(1)
         }
-        
-        if( options.download ) {
-            nlpUk.download()
-            return
-        }
 
         if( ! options.quiet ) {
             printLtVersion()
diff --git a/src/main/groovy/ua/net/nlp/tools/tag/TagUnknown.groovy b/src/main/groovy/ua/net/nlp/tools/tag/TagUnknown.groovy
index cbf26f9..48702e8 100644
--- a/src/main/groovy/ua/net/nlp/tools/tag/TagUnknown.groovy
+++ b/src/main/groovy/ua/net/nlp/tools/tag/TagUnknown.groovy
@@ -5,35 +5,29 @@ import java.util.regex.Pattern
 import org.languagetool.AnalyzedTokenReadings
 import org.languagetool.tools.StringTools
 
+import groovy.transform.CompileDynamic
 import groovy.transform.CompileStatic
 import ua.net.nlp.bruk.WordReading
 import ua.net.nlp.tools.tag.TagTextCore.TaggedToken
 
+
+@CompileStatic
 public class TagUnknown {
     private static final String statsFile = "/ua/net/nlp/tools/stats/lemma_suffix_freqs.txt"
 
-    @groovy.transform.SourceURI
-    static SOURCE_URI
-    // if this script is called from GroovyScriptEngine SourceURI is data: and does not work for File()
-    static File SCRIPT_DIR = SOURCE_URI.scheme == "data"
-        ? null // new File("src/main/groovy/ua/net/nlp/tools/tag")
-        : new File(SOURCE_URI).getParentFile()
-
     Map<String, Map<WordReading, Integer>> lemmaSuffixStatsF = [:].withDefault { [:].withDefault { 0 } }
     int lemmaSuffixLenB = 4
         
     TagUnknown() {
     }
 
+    @CompileDynamic
     void loadStats() {
         if( lemmaSuffixStatsF.size() > 0 )
             return
         
         def statsFileRes = getClass().getResource(statsFile)
-        if( statsFileRes == null ) {
-            System.err.println "Lemma stats not found, run with --download to download it from github"
-            System.exit 1
-        }
+        assert statsFileRes, "Disambig stats not found :$statsFile"
 
         statsFileRes.eachLine { String line ->
             def (suffix, rs, postag, cnt) = line.split("\t+")
@@ -44,7 +38,6 @@ public class TagUnknown {
         
 //    private static Pattern DASHED = ~/(?iu)([а-яіїєґ']{4,})-([а-яіїєґ']{4,})/
     
-    @CompileStatic
     List<TaggedToken> tag(String token, int idx, AnalyzedTokenReadings[] tokens) {
 //        def m = DASHED.matcher(token)
 //        m.find()
@@ -67,7 +60,6 @@ public class TagUnknown {
     // НС-фільтрів
     static final Pattern PREFIXED = Pattern.compile(/([А-ЯІЇЄҐA-Z0-9]+[-\u2013])([а-яіїєґ].*)/)
     
-    @CompileStatic
     List<TaggedToken> tagInternal(String token, int idx, AnalyzedTokenReadings[] tokens) {
         if( token ==~ /[А-ЯІЇЄҐ]+-[0-9]+[а-яіїєґА-ЯІЇЄҐ]*/ ) // ФАТ-10
             return [new TaggedToken(value: token, lemma: token, tags: 'noninfl', confidence: -0.7)]
@@ -147,14 +139,12 @@ public class TagUnknown {
     static Pattern mascPrefix = ~/пан|містер|гер|сеньйор|монсеньйор|добродій|князь/
     static Pattern femPrefix = ~/пані|міс|місіс|княгиня|фрау|сеньора|сеньйоріта|мадам|маде?муазель|добродійка/
 
-    @CompileStatic
     private static String gen(String postag) {
         def m = postag =~ /:[mf]:/
         return m ? m[0] : null
     }
 
         
-    @CompileStatic
     private static int getCoeff(Map.Entry<WordReading, Integer> e, String token, int idx, AnalyzedTokenReadings[] tokens) {
         if( e.key.postag.contains("prop") ) {
             if( ! StringTools.isCapitalizedWord(token) ) {
@@ -188,24 +178,4 @@ public class TagUnknown {
         return e.value
     }
     
-    void download() {
-        if( SCRIPT_DIR == null ) { // should not happen - jar will bundle the stats
-            System.err.println "Can't download from inside the jar"
-            System.exit 1
-        }
-        
-        def targetDir = new File(SCRIPT_DIR, "../../../../../../resources/")
-        targetDir.mkdirs()
-        assert targetDir.isDirectory()
-
-        File targetFile = new File(targetDir, statsFile)
-        targetFile.parentFile.mkdirs()
-        
-        def remoteStats = "https://github.com/brown-uk/nlp_uk/releases/download/v${DisambigStats.statsVersion}/lemma_suffix_freqs.txt"
-        System.err.println("Downloading $remoteStats...");
-        def statTxt = new URL(remoteStats).getText('UTF-8')
-        
-        targetFile.setText(statTxt, 'UTF-8')
-    }
-
 }
diff --git a/src/main/groovy/ua/net/nlp/tools/tag/UdModule.groovy b/src/main/groovy/ua/net/nlp/tools/tag/UdModule.groovy
index d840c86..d5b62b4 100644
--- a/src/main/groovy/ua/net/nlp/tools/tag/UdModule.groovy
+++ b/src/main/groovy/ua/net/nlp/tools/tag/UdModule.groovy
@@ -221,7 +221,9 @@ class UdModule {
             VESUM_TO_UD[vesum]=ud
         }
         
-        println "Got ${VESUM_TO_UD.size()} UD conversions"
+        if( ! options.quiet ) {
+            println "Got ${VESUM_TO_UD.size()} UD conversions"
+        }
         
         NEGATIVES = new File(getClass().getResource('/ua/net/nlp/tools/ud/negatives.txt').toURI()).readLines('UTF-8')
     }