Skip to content

Commit

Permalink
move disambig stats to new artifact
Browse files Browse the repository at this point in the history
  • Loading branch information
arysin committed Sep 9, 2024
1 parent b513856 commit b5e340b
Show file tree
Hide file tree
Showing 11 changed files with 75 additions and 132 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/gradle.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ jobs:
- name: Build with Gradle
uses: gradle/gradle-build-action@67421db6bd0bf253fb4bd25b31ebb98943c375e1
with:
arguments: --info downloadResources processResources test
arguments: --info test
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ This is a project to demonstrate NLP API from LanguageTool for Ukrainian languag
Це — проект демонстрації API для обробляння природної мови в LanguageTool для української мови.

Використовує мову [groovy](http://www.groovy-lang.org/), засоби для токенізації та тегування також мають скрипти-обгортки для python3 та java.
Рекомендована версія groovy - 4.0.10 або новіше.
Рекомендована версія groovy - 4.0.22 або новіше.

Для запуску скриптів потрібно встановити мову [groovy](http://www.groovy-lang.org/)

Expand Down
73 changes: 49 additions & 24 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ compileJava.options.encoding = 'UTF-8'
// Group and artifact id of the main artifact
group = 'ua.net.nlp'
ext.artifactId = 'nlp_uk'

// Id of the separate statistics artifact: the main artifact id plus a "-stats" suffix
ext.statsArtifactId="${artifactId}-stats"
// Base file name (without extension) of the statistics jar; note it is built from
// project.version, not from statsVersion
ext.statsJarName="${statsArtifactId}-${project.version}"

repositories {
mavenCentral()
mavenLocal()
Expand Down Expand Up @@ -106,29 +109,6 @@ eclipse {
}
}

// Runs TagTextCore with the --download flag and then mirrors the csv/txt resource
// files into the processResources output directory.
task downloadResources(type: JavaExec) {
    classpath = sourceSets.main.runtimeClasspath
    mainClass = "ua.net.nlp.tools.tag.TagTextCore"
    args "--download"

    // Workaround: this task depends on compilation (and therefore on processResources),
    // but once the download has finished processResources would have to run again.
    // TODO: find a clean way to do that; until then the files are copied by hand.
    doLast {
        def resourcesOut = tasks.processResources.outputs.files.files.first()

        File semtagsDir = new File(resourcesOut, "/ua/net/nlp/tools/semtags")
        semtagsDir.mkdirs()
        File statsDir = new File(resourcesOut, "/ua/net/nlp/tools/stats")
        statsDir.mkdirs()

        for (File src : tasks.processResources.inputs.files.files) {
            // only .csv and .txt inputs are copied
            if (!(src.name =~ /\.(csv|txt)$/)) {
                continue
            }
            // files whose path mentions "semtag" go to the semtags directory, the rest to stats
            File destDir = src.path.contains("semtag") ? semtagsDir : statsDir
            Files.copy(src.toPath(), new File(destDir, src.name).toPath(), StandardCopyOption.REPLACE_EXISTING)
        }
    }
}

test {
useJUnitPlatform()
Expand Down Expand Up @@ -216,6 +196,14 @@ jar {
exclude("ua/net/nlp/tools/StressText.class")
}

// Builds a separate jar that contains only the statistics text files.
task statsJar(type: Jar) {
// archive file name is the statsJarName value plus the ".jar" extension
setArchiveFileName "${statsJarName}.jar"
// NOTE(review): statsVersion is declared in gradle.properties; confirm that this assignment
// targets the jar task's version and not project.version on the Gradle release in use
version=statsVersion

// take only the *.txt files from the stats resource directory
from('src/main/resources') {
include '/ua/net/nlp/tools/stats/*.txt'
}
}

task sourceJar(type: Jar) {
archiveClassifier = "sources"
Expand All @@ -236,7 +224,7 @@ if( project.hasProperty("ossrhUsername") ) {

publishing {
publications {
maven(MavenPublication) {
mainJar(MavenPublication) {
// groupId = group
artifactId = artifactId
// version = version
Expand Down Expand Up @@ -274,6 +262,43 @@ publishing {
url = "https://github.com/brown-uk/nlp_uk.git"
}
}

}

// Maven publication for the statistics jar, published under its own artifact id and version.
statsJarPublication(MavenPublication) {
// publish the output of the statsJar task as the artifact of this publication
artifact statsJar {
// classifier = 'stats'
}
// groupId = 'com.example'
artifactId = statsArtifactId
version=statsVersion

// POM metadata: name, description, project URL, license, developer and SCM details
pom {
name = 'NLP Stats for Ukrainian'
description = 'NLP statistic files for Ukrainian language'

url = "https://github.com/brown-uk/nlp_uk"

licenses {
license {
name = 'GNU General Public License v3'
url = 'https://www.gnu.org/licenses/gpl-3.0.txt'
}
}

developers {
developer {
id = 'arysin'
name = 'Andriy Rysin'
// NOTE(review): this value looks like an e-mail placeholder rather than a real address - confirm
email = '[email protected]'
}
}

scm {
url = "https://github.com/brown-uk/nlp_uk.git"
}
}

}
}

Expand Down
6 changes: 4 additions & 2 deletions gradle.properties
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# for LT snapshots we can use https://repo.languagetool.org/ui/native/languagetool-os-snapshot
ltBaseVersion=6.5-SNAPSHOT
ltDevVersion=6.5-SNAPSHOT
morfologik_ukrainian_lt_version=6.5.1-SNAPSHOT
morfologik_ukrainian_lt_version=6.5.1
groovyVersion=4.0.22
# nlp_uk version
version=3.3.2-SNAPSHOT
version=3.3.5-SNAPSHOT
statsVersion=3.3.5
3 changes: 2 additions & 1 deletion src/main/groovy/ua/net/nlp/tools/TagText.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ package ua.net.nlp.tools
@Grab(group='org.languagetool', module='languagetool-core', version='6.5-SNAPSHOT')
@Grab(group='org.languagetool', module='language-uk', version='6.5-SNAPSHOT')
@Grab(group='org.languagetool', module='language-ru', version='6.4')
//@Grab(group='ua.net.nlp', module='morfologik-ukrainian-lt', version='6.3.1-SNAPSHOT')
@Grab(group='ua.net.nlp', module='morfologik-ukrainian-lt', version='6.5.1')
@Grab(group='ua.net.nlp', module='nlp_uk-stats', version='3.3.5')

@Grab(group='ch.qos.logback', module='logback-classic', version='1.4.+')
@Grab(group='info.picocli', module='picocli', version='4.6.+')
Expand Down
34 changes: 0 additions & 34 deletions src/main/groovy/ua/net/nlp/tools/tag/DisambigStats.groovy
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package ua.net.nlp.tools.tag;

import java.util.function.Consumer
import java.util.regex.Pattern
import java.util.stream.Collectors

Expand All @@ -13,10 +12,7 @@ import groovy.transform.ToString
import ua.net.nlp.bruk.ContextToken
import ua.net.nlp.bruk.WordContext
import ua.net.nlp.bruk.WordReading
import ua.net.nlp.tools.tag.TagTextCore
import ua.net.nlp.tools.tag.TagTextCore.TTR
import ua.net.nlp.tools.tag.TagTextCore.TokenInfo
import ua.net.nlp.tools.tag.TagOptions


@CompileStatic
Expand Down Expand Up @@ -88,12 +84,6 @@ public class DisambigStats {
// lemma_xpN -> rate
Map<String, Map<String, Double>> statsForLemmaXp = new HashMap<>(256).withDefault{ new HashMap<>() }

// URI of this source file, filled in by the @SourceURI annotation
@groovy.transform.SourceURI
static URI SOURCE_URI
// Parent directory of this source file, or null when SOURCE_URI has the "data" scheme:
// if this script is called from GroovyScriptEngine SourceURI is data: and does not work for File()
static File SCRIPT_DIR = SOURCE_URI.scheme == "data"
? null // new File("src/main/groovy/ua/net/nlp/tools/tag")
: new File(SOURCE_URI).getParentFile()

@CompileStatic
static double round(double d) {
Expand Down Expand Up @@ -629,26 +619,6 @@ public class DisambigStats {
[new WordContext(contextToken, offset)] as Set
}

/**
 * Downloads the lemma frequency statistics file from the GitHub release that
 * matches statsVersion and stores it as statsFile under the resources
 * directory, which is resolved relative to SCRIPT_DIR.
 *
 * Prints an error and exits the JVM with status 1 when SCRIPT_DIR is null.
 */
void download() {
    // should not happen - jar will bundle the stats
    if( SCRIPT_DIR == null ) {
        System.err.println "Can't download from inside the jar"
        System.exit 1
    }

    // resources directory, six levels above the directory of this source file
    File targetDir = new File(SCRIPT_DIR, "../../../../../../resources/")
    targetDir.mkdirs()
    assert targetDir.isDirectory()

    // make sure the parent directory of the stats file exists
    File localStats = new File(targetDir, statsFile)
    localStats.parentFile.mkdirs()

    def remoteStats = "https://github.com/brown-uk/nlp_uk/releases/download/v${statsVersion}/lemma_freqs_hom.txt"
    System.err.println("Downloading $remoteStats...");

    // read the remote text and write it to the local file, both as UTF-8
    localStats.setText(new URL(remoteStats).getText('UTF-8'), 'UTF-8')
}


@CompileStatic
def loadDisambigStats() {
Expand All @@ -658,10 +628,6 @@ public class DisambigStats {
long tm1 = System.currentTimeMillis()

def statsFileRes = getClass().getResource(statsFile)
if( statsFileRes == null ) {
throw new IllegalStateException("Disambiguation stats not found, run \"TagText.groovy --download\" to download it from github, and then retry")
}


String word
WordReading wordReading
Expand Down
5 changes: 2 additions & 3 deletions src/main/groovy/ua/net/nlp/tools/tag/SemTags.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public class SemTags {
TagOptions options
Map<String, Map<String,List<String>>> semanticTags = new HashMap<>()


def loadSemTags() {
if( semanticTags.size() > 0 )
return
Expand Down Expand Up @@ -85,7 +86,7 @@ public class SemTags {
}
}

@CompileStatic

String getSemTags(AnalyzedToken tkn, String posTag) {
if( options.semanticTags && tkn.getLemma() != null && posTag != null ) {
def lemma = tkn.getLemma()
Expand All @@ -108,7 +109,6 @@ public class SemTags {
}


@CompileStatic
private static boolean filterSemtag(String lemma, String posTag, String semtag) {
if( posTag.contains("pron") )
return semtag =~ ":deictic|:quantif"
Expand All @@ -133,6 +133,5 @@ public class SemTags {
}
true
}


}
2 changes: 0 additions & 2 deletions src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,6 @@ public class TagOptions extends OptionsBase {
boolean singleThread
// Hidden option: print timing information
@Option(names = ["--timing"], description = "Print timing information", hidden = true)
boolean timing
// Download the disambiguation statistics and semantic tag files (CLI tagging only)
@Option(names = ["--download"], description = "Download file with disambiguation statistics and semantic tags (for tagging from CLI only)")
boolean download
// Hidden option: print progress information every <n> files; defaults to 0
@Option(names = ["--progress"], description = "Print progress information every <n> files", hidden = true)
int progress=0

Expand Down
Loading

0 comments on commit b5e340b

Please sign in to comment.