From b5e340b3a5c143e6abf5a0a93b91754bd43657bc Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Sun, 8 Sep 2024 20:54:35 -0400 Subject: [PATCH] move disambig stats to new artifact --- .github/workflows/gradle.yml | 2 +- README.md | 2 +- build.gradle | 73 +++++++++++++------ gradle.properties | 6 +- .../groovy/ua/net/nlp/tools/TagText.groovy | 3 +- .../ua/net/nlp/tools/tag/DisambigStats.groovy | 34 --------- .../ua/net/nlp/tools/tag/SemTags.groovy | 5 +- .../ua/net/nlp/tools/tag/TagOptions.groovy | 2 - .../ua/net/nlp/tools/tag/TagTextCore.groovy | 36 ++------- .../ua/net/nlp/tools/tag/TagUnknown.groovy | 40 ++-------- .../ua/net/nlp/tools/tag/UdModule.groovy | 4 +- 11 files changed, 75 insertions(+), 132 deletions(-) diff --git a/.github/workflows/gradle.yml b/.github/workflows/gradle.yml index 3ff4a5c..6bd9726 100644 --- a/.github/workflows/gradle.yml +++ b/.github/workflows/gradle.yml @@ -31,4 +31,4 @@ jobs: - name: Build with Gradle uses: gradle/gradle-build-action@67421db6bd0bf253fb4bd25b31ebb98943c375e1 with: - arguments: --info downloadResources processResources test + arguments: --info test diff --git a/README.md b/README.md index 6173bc5..6192a71 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ This is a project to demonstrate NLP API from LanguageTool for Ukrainian languag Це — проект демонстрації API для обробляння природної мови в LanguageTool для української мови. Використовує мову [groovy](http://www.groovy-lang.org/), засоби для токенізації та тегування також мають скрипти-обгортки для python3 та java. -Рекомендована версія groovy - 4.0.10 або новіше. +Рекомендована версія groovy - 4.0.22 або новіше. Для запуску скриптів потрібно встановити мову [groovy](http://www.groovy-lang.org/) diff --git a/build.gradle b/build.gradle index fd6abd9..d813bd2 100644 --- a/build.gradle +++ b/build.gradle @@ -17,6 +17,9 @@ compileJava.options.encoding = 'UTF-8' group = 'ua.net.nlp' ext.artifactId = 'nlp_uk' +ext.statsArtifactId="${artifactId}-stats" +ext.statsJarName="${statsArtifactId}-${project.version}" + repositories { mavenCentral() mavenLocal() @@ -106,29 +109,6 @@ eclipse { } } -task downloadResources(type: JavaExec) { - classpath = sourceSets.main.runtimeClasspath - mainClass = "ua.net.nlp.tools.tag.TagTextCore" - args "--download" - - // ugly hack - downloadResources depends on compiling and thus processResources - // but after the download we need to call/force processResources again - // TODO: find how to do it nicely - // for now - just copy the files manually - doLast { - def outputDir = tasks.processResources.outputs.files.files.iterator().next() - File semtags = new File(outputDir, "/ua/net/nlp/tools/semtags") - semtags.mkdirs() - File stats = new File(outputDir, "/ua/net/nlp/tools/stats") - stats.mkdirs() - tasks.processResources.inputs.files.files - .findAll { File file -> file.name =~ /\.(csv|txt)$/ } - .each { File file -> - def toDir = file.path.contains("semtag") ? semtags : stats - Files.copy(file.toPath(), new File(toDir, file.name).toPath(), StandardCopyOption.REPLACE_EXISTING) - } - } -} test { useJUnitPlatform() @@ -216,6 +196,14 @@ jar { exclude("ua/net/nlp/tools/StressText.class") } +task statsJar(type: Jar) { + setArchiveFileName "${statsJarName}.jar" + version=statsVersion + + from('src/main/resources') { + include '/ua/net/nlp/tools/stats/*.txt' + } +} task sourceJar(type: Jar) { archiveClassifier = "sources" @@ -236,7 +224,7 @@ if( project.hasProperty("ossrhUsername") ) { publishing { publications { - maven(MavenPublication) { + mainJar(MavenPublication) { // groupId = group artifactId = artifactId // version = version @@ -274,6 +262,43 @@ publishing { url = "https://github.com/brown-uk/nlp_uk.git" } } + + } + + statsJarPublication(MavenPublication) { + artifact statsJar { +// classifier = 'stats' + } +// groupId = 'com.example' + artifactId = statsArtifactId + version=statsVersion + + pom { + name = 'NLP Stats for Ukrainian' + description = 'NLP statistic files for Ukrainian language' + + url = "https://github.com/brown-uk/nlp_uk" + + licenses { + license { + name = 'GNU General Public License v3' + url = 'https://www.gnu.org/licenses/gpl-3.0.txt' + } + } + + developers { + developer { + id = 'arysin' + name = 'Andriy Rysin' + email = 'arysin@gmail.com' + } + } + + scm { + url = "https://github.com/brown-uk/nlp_uk.git" + } + } + } } diff --git a/gradle.properties b/gradle.properties index 5a0453a..2ab7f6e 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,6 +1,8 @@ +# for LT snapshots we can use https://repo.languagetool.org/ui/native/languagetool-os-snapshot ltBaseVersion=6.5-SNAPSHOT ltDevVersion=6.5-SNAPSHOT -morfologik_ukrainian_lt_version=6.5.1-SNAPSHOT +morfologik_ukrainian_lt_version=6.5.1 groovyVersion=4.0.22 # nlp_uk version -version=3.3.2-SNAPSHOT +version=3.3.5-SNAPSHOT +statsVersion=3.3.5 diff --git a/src/main/groovy/ua/net/nlp/tools/TagText.groovy b/src/main/groovy/ua/net/nlp/tools/TagText.groovy index e5bd88f..025f5ce 100755 --- a/src/main/groovy/ua/net/nlp/tools/TagText.groovy +++ b/src/main/groovy/ua/net/nlp/tools/TagText.groovy @@ -7,7 +7,8 @@ package ua.net.nlp.tools @Grab(group='org.languagetool', module='languagetool-core', version='6.5-SNAPSHOT') @Grab(group='org.languagetool', module='language-uk', version='6.5-SNAPSHOT') @Grab(group='org.languagetool', module='language-ru', version='6.4') -//@Grab(group='ua.net.nlp', module='morfologik-ukrainian-lt', version='6.3.1-SNAPSHOT') +@Grab(group='ua.net.nlp', module='morfologik-ukrainian-lt', version='6.5.1') +@Grab(group='ua.net.nlp', module='nlp_uk-stats', version='3.3.5') @Grab(group='ch.qos.logback', module='logback-classic', version='1.4.+') @Grab(group='info.picocli', module='picocli', version='4.6.+') diff --git a/src/main/groovy/ua/net/nlp/tools/tag/DisambigStats.groovy b/src/main/groovy/ua/net/nlp/tools/tag/DisambigStats.groovy index 95b8917..74a73b7 100644 --- a/src/main/groovy/ua/net/nlp/tools/tag/DisambigStats.groovy +++ b/src/main/groovy/ua/net/nlp/tools/tag/DisambigStats.groovy @@ -1,6 +1,5 @@ package ua.net.nlp.tools.tag; -import java.util.function.Consumer import java.util.regex.Pattern import java.util.stream.Collectors @@ -13,10 +12,7 @@ import groovy.transform.ToString import ua.net.nlp.bruk.ContextToken import ua.net.nlp.bruk.WordContext import ua.net.nlp.bruk.WordReading -import ua.net.nlp.tools.tag.TagTextCore -import ua.net.nlp.tools.tag.TagTextCore.TTR import ua.net.nlp.tools.tag.TagTextCore.TokenInfo -import ua.net.nlp.tools.tag.TagOptions @CompileStatic @@ -88,12 +84,6 @@ public class DisambigStats { // lemma_xpN -> rate Map> statsForLemmaXp = new HashMap<>(256).withDefault{ new HashMap<>() } - @groovy.transform.SourceURI - static URI SOURCE_URI - // if this script is called from GroovyScriptEngine SourceURI is data: and does not work for File() - static File SCRIPT_DIR = SOURCE_URI.scheme == "data" - ? null // new File("src/main/groovy/ua/net/nlp/tools/tag") - : new File(SOURCE_URI).getParentFile() @CompileStatic static double round(double d) { @@ -629,26 +619,6 @@ public class DisambigStats { [new WordContext(contextToken, offset)] as Set } - void download() { - if( SCRIPT_DIR == null ) { // should not happen - jar will bundle the stats - System.err.println "Can't download from inside the jar" - System.exit 1 - } - - def targetDir = new File(SCRIPT_DIR, "../../../../../../resources/") - targetDir.mkdirs() - assert targetDir.isDirectory() - - File targetFile = new File(targetDir, statsFile) - targetFile.parentFile.mkdirs() - - def remoteStats = "https://github.com/brown-uk/nlp_uk/releases/download/v${statsVersion}/lemma_freqs_hom.txt" - System.err.println("Downloading $remoteStats..."); - def statTxt = new URL(remoteStats).getText('UTF-8') - - targetFile.setText(statTxt, 'UTF-8') - } - @CompileStatic def loadDisambigStats() { @@ -658,10 +628,6 @@ public class DisambigStats { long tm1 = System.currentTimeMillis() def statsFileRes = getClass().getResource(statsFile) - if( statsFileRes == null ) { - throw new IllegalStateException("Disambiguation stats not found, run \"TagText.groovy --download\" to download it from github, and then retry") - } - String word WordReading wordReading diff --git a/src/main/groovy/ua/net/nlp/tools/tag/SemTags.groovy b/src/main/groovy/ua/net/nlp/tools/tag/SemTags.groovy index 57ff072..81f4577 100644 --- a/src/main/groovy/ua/net/nlp/tools/tag/SemTags.groovy +++ b/src/main/groovy/ua/net/nlp/tools/tag/SemTags.groovy @@ -27,6 +27,7 @@ public class SemTags { TagOptions options Map>> semanticTags = new HashMap<>() + def loadSemTags() { if( semanticTags.size() > 0 ) return @@ -85,7 +86,7 @@ public class SemTags { } } - @CompileStatic + String getSemTags(AnalyzedToken tkn, String posTag) { if( options.semanticTags && tkn.getLemma() != null && posTag != null ) { def lemma = tkn.getLemma() @@ -108,7 +109,6 @@ public class SemTags { } - @CompileStatic private static boolean filterSemtag(String lemma, String posTag, String semtag) { if( posTag.contains("pron") ) return semtag =~ ":deictic|:quantif" @@ -133,6 +133,5 @@ public class SemTags { } true } - } diff --git a/src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy b/src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy index 83fa552..c9e59a8 100644 --- a/src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy +++ b/src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy @@ -76,8 +76,6 @@ public class TagOptions extends OptionsBase { boolean singleThread @Option(names = ["--timing"], description = "Pring timing information", hidden = true) boolean timing - @Option(names = ["--download"], description = "Download file with disambiguation statistics and semantic tags (for tagging from CLI only)") - boolean download @Option(names = ["--progress"], description = "Pring progress information every files", hidden = true) int progress=0 diff --git a/src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy b/src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy index 79f2630..9679521 100644 --- a/src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy +++ b/src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy @@ -19,6 +19,7 @@ import org.languagetool.MultiThreadedJLanguageTool import org.languagetool.language.Ukrainian import groovy.transform.Canonical +import groovy.transform.CompileDynamic import groovy.transform.CompileStatic import picocli.CommandLine import picocli.CommandLine.ParameterException @@ -28,6 +29,7 @@ import ua.net.nlp.tools.TextUtils.OutputFormat import ua.net.nlp.tools.TextUtils.ResultBase +@CompileStatic class TagTextCore { public static final Pattern PUNCT_PATTERN = Pattern.compile(/[,.:;!?\/()\[\]{}«»„“"'…\u2013\u2014\u201D\u201C•■♦-]+/) // " @@ -39,7 +41,7 @@ class TagTextCore { private final Pattern CONTROL_CHAR_PATTERN_R = Pattern.compile(/[\u0000-\u0008\u000B-\u0012\u0014-\u001F\u0A0D]/, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE) enum TaggingLevel { tagger, stats } - def language = new Ukrainian() { + Ukrainian language = new Ukrainian() { @Override protected synchronized List getPatternRules() { return [] } } @@ -187,19 +189,17 @@ class TagTextCore { tagTextCore(analyzedSentences, stats) } -// @CompileDynamic + @CompileDynamic List analyzeText(String text) { options.sentencePerLine ? langTool.analyzeSentences( text.split("\n") as List ) : langTool.analyzeText(text) } - @CompileStatic public List tagTextCore(List analyzedSentences) { tagTextCore(analyzedSentences, null); } - @CompileStatic List tagTextCore(List analyzedSentences, TagStats stats) { List taggedSentences = analyzedSentences.parallelStream().map { AnalyzedSentence analyzedSentence -> @@ -248,7 +248,7 @@ class TagTextCore { taggedSentences } - @CompileStatic + void cleanup(AnalyzedSentence analyzedSentence) { // multiwords are very LT-specific analyzedSentence.getTokens().each { AnalyzedTokenReadings t -> @@ -261,13 +261,11 @@ class TagTextCore { } } - @CompileStatic private static boolean hasPosTag(AnalyzedTokenReadings tokenReadings) { tokenReadings.getReadings().stream() .anyMatch{ t -> ! isTagEmpty(t.getPOSTag()) } } - @CompileStatic @Canonical static class TaggedToken { String value @@ -541,6 +539,7 @@ class TagTextCore { } + @CompileDynamic def process() { // def stats = new TagStats() // stats.options = options @@ -554,6 +553,7 @@ class TagTextCore { }); } + @CompileDynamic def process(IOFiles fileInfo) { def stats = new TagStats() stats.options = options @@ -571,6 +571,7 @@ class TagTextCore { addUnknownPct(stats, fileInfo) } + @CompileDynamic def addUnknownPct(TagStats stats, IOFiles fileInfo) { // println "== ${fileInfo.filename}, ${stats.knownCnt}, ${stats.unknownMap}" if( fileInfo.filename @@ -606,14 +607,10 @@ class TagTextCore { } } - - @CompileStatic static boolean isTagEmpty(String posTag) { posTag == null || posTag.endsWith("_END") } - - @CompileStatic static TagOptions parseOptions(String[] argv) { TagOptions options = new TagOptions() CommandLine commandLine = new CommandLine(options) @@ -657,17 +654,11 @@ class TagTextCore { System.err.println ("Semantic tagging only available in xml/json output") System.exit 1 } - semTags.loadSemTags() } disambigStats.setOptions(options) if( options.disambiguate ) { -// if( options.outputFormat == OutputFormat.txt ) { -// System.err.println ("Disambiguation only available in xml/json output") -// System.exit 1 -// } - disambigStats.loadDisambigStats() } if( options.tagUnknown ) { @@ -703,12 +694,6 @@ class TagTextCore { } - void download() { - disambigStats.download() - tagUnknown.download() - semTags.download() - } - static void main(String[] args) { @@ -723,11 +708,6 @@ class TagTextCore { System.err.println(e.getMessage()) System.exit(1) } - - if( options.download ) { - nlpUk.download() - return - } if( ! options.quiet ) { printLtVersion() diff --git a/src/main/groovy/ua/net/nlp/tools/tag/TagUnknown.groovy b/src/main/groovy/ua/net/nlp/tools/tag/TagUnknown.groovy index cbf26f9..48702e8 100644 --- a/src/main/groovy/ua/net/nlp/tools/tag/TagUnknown.groovy +++ b/src/main/groovy/ua/net/nlp/tools/tag/TagUnknown.groovy @@ -5,35 +5,29 @@ import java.util.regex.Pattern import org.languagetool.AnalyzedTokenReadings import org.languagetool.tools.StringTools +import groovy.transform.CompileDynamic import groovy.transform.CompileStatic import ua.net.nlp.bruk.WordReading import ua.net.nlp.tools.tag.TagTextCore.TaggedToken + +@CompileStatic public class TagUnknown { private static final String statsFile = "/ua/net/nlp/tools/stats/lemma_suffix_freqs.txt" - @groovy.transform.SourceURI - static SOURCE_URI - // if this script is called from GroovyScriptEngine SourceURI is data: and does not work for File() - static File SCRIPT_DIR = SOURCE_URI.scheme == "data" - ? null // new File("src/main/groovy/ua/net/nlp/tools/tag") - : new File(SOURCE_URI).getParentFile() - Map> lemmaSuffixStatsF = [:].withDefault { [:].withDefault { 0 } } int lemmaSuffixLenB = 4 TagUnknown() { } + @CompileDynamic void loadStats() { if( lemmaSuffixStatsF.size() > 0 ) return def statsFileRes = getClass().getResource(statsFile) - if( statsFileRes == null ) { - System.err.println "Lemma stats not found, run with --download to download it from github" - System.exit 1 - } + assert statsFileRes, "Disambig stats not found :$statsFile" statsFileRes.eachLine { String line -> def (suffix, rs, postag, cnt) = line.split("\t+") @@ -44,7 +38,6 @@ public class TagUnknown { // private static Pattern DASHED = ~/(?iu)([а-яіїєґ']{4,})-([а-яіїєґ']{4,})/ - @CompileStatic List tag(String token, int idx, AnalyzedTokenReadings[] tokens) { // def m = DASHED.matcher(token) // m.find() @@ -67,7 +60,6 @@ public class TagUnknown { // НС-фільтрів static final Pattern PREFIXED = Pattern.compile(/([А-ЯІЇЄҐA-Z0-9]+[-\u2013])([а-яіїєґ].*)/) - @CompileStatic List tagInternal(String token, int idx, AnalyzedTokenReadings[] tokens) { if( token ==~ /[А-ЯІЇЄҐ]+-[0-9]+[а-яіїєґА-ЯІЇЄҐ]*/ ) // ФАТ-10 return [new TaggedToken(value: token, lemma: token, tags: 'noninfl', confidence: -0.7)] @@ -147,14 +139,12 @@ public class TagUnknown { static Pattern mascPrefix = ~/пан|містер|гер|сеньйор|монсеньйор|добродій|князь/ static Pattern femPrefix = ~/пані|міс|місіс|княгиня|фрау|сеньора|сеньйоріта|мадам|маде?муазель|добродійка/ - @CompileStatic private static String gen(String postag) { def m = postag =~ /:[mf]:/ return m ? m[0] : null } - @CompileStatic private static int getCoeff(Map.Entry e, String token, int idx, AnalyzedTokenReadings[] tokens) { if( e.key.postag.contains("prop") ) { if( ! StringTools.isCapitalizedWord(token) ) { @@ -188,24 +178,4 @@ public class TagUnknown { return e.value } - void download() { - if( SCRIPT_DIR == null ) { // should not happen - jar will bundle the stats - System.err.println "Can't download from inside the jar" - System.exit 1 - } - - def targetDir = new File(SCRIPT_DIR, "../../../../../../resources/") - targetDir.mkdirs() - assert targetDir.isDirectory() - - File targetFile = new File(targetDir, statsFile) - targetFile.parentFile.mkdirs() - - def remoteStats = "https://github.com/brown-uk/nlp_uk/releases/download/v${DisambigStats.statsVersion}/lemma_suffix_freqs.txt" - System.err.println("Downloading $remoteStats..."); - def statTxt = new URL(remoteStats).getText('UTF-8') - - targetFile.setText(statTxt, 'UTF-8') - } - } diff --git a/src/main/groovy/ua/net/nlp/tools/tag/UdModule.groovy b/src/main/groovy/ua/net/nlp/tools/tag/UdModule.groovy index d840c86..d5b62b4 100644 --- a/src/main/groovy/ua/net/nlp/tools/tag/UdModule.groovy +++ b/src/main/groovy/ua/net/nlp/tools/tag/UdModule.groovy @@ -221,7 +221,9 @@ class UdModule { VESUM_TO_UD[vesum]=ud } - println "Got ${VESUM_TO_UD.size()} UD conversions" + if( ! options.quiet ) { + println "Got ${VESUM_TO_UD.size()} UD conversions" + } NEGATIVES = new File(getClass().getResource('/ua/net/nlp/tools/ud/negatives.txt').toURI()).readLines('UTF-8') }