Skip to content

Commit

Permalink
add recursive tagging option
Browse files Browse the repository at this point in the history
  • Loading branch information
arysin committed Jan 4, 2024
1 parent ed3f362 commit dee53ad
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 17 deletions.
3 changes: 3 additions & 0 deletions src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ public class TagOptions extends OptionsBase {

@Parameters(index = "0", description = "Input files. Default: stdin", arity="0..")
List<String> inputFiles
@Option(names = ["-r", "--recursive"], description = "Tag all files recursively in the given directories")
boolean recursive

@Option(names = ["--lemmaOnly"], description = "Prints only lemmas, implies: --outputFormat=txt --disambiguate=true")
boolean lemmaOnly

Expand Down
56 changes: 40 additions & 16 deletions src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
package ua.net.nlp.tools.tag

import java.math.RoundingMode
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.Paths
import java.nio.file.attribute.BasicFileAttributes
import java.util.concurrent.ExecutorService
import java.util.concurrent.Executors
import java.util.concurrent.TimeUnit
Expand Down Expand Up @@ -676,30 +680,50 @@ class TagTextCore {
println("dict_uk version: ${dictUkversion}")
}

// TODO: quick hack to support multiple files
if( options.inputFiles && options.inputFiles != ["-"] ) {
// TODO: quick hack to support recursive processing
if( options.recursive ) {
def dirs = options.inputFiles ?: ["."]

ExecutorService executors = Executors.newWorkStealingPool()
options.singleThread = true
options.inputFiles.forEach{ filename ->
options.output = ""
options.input = filename
nlpUk.setInputOutput(options)
IOFiles files = TextUtils.prepareInputOutput(options)

executors.submit({
nlpUk.process(files)
} as Runnable)
List<File> files = []
dirs.collect { d ->
new File(d).eachFileRecurse { f ->
if( f.name.toLowerCase().endsWith(".txt") ) {
files << f
}
}
}

executors.shutdown()
executors.awaitTermination(1, TimeUnit.DAYS)
nlpUk.postProcess()
println "Found ${files.size()} files with .txt extension" // + files
options.singleThread = true
processFilesParallel(nlpUk, options, files.collect { it.path })
}
// TODO: quick hack to support multiple files
else if( options.inputFiles && options.inputFiles != ["-"] ) {
options.singleThread = true
processFilesParallel(nlpUk, options, options.inputFiles)
}
else {
nlpUk.process()
nlpUk.postProcess()
}
}

/**
 * Tags each of the given files on a work-stealing thread pool and then runs
 * {@code nlpUk.postProcess()} once the pool has been drained.
 *
 * The shared {@code options} object is modified in place for every file:
 * {@code output} is reset to an empty string and {@code input} is set to the file name.
 * Failures of individual tagging tasks are reported on stderr instead of being
 * silently dropped; they do not abort the remaining files.
 *
 * @param nlpUk tagger instance used to process every file and to post-process at the end
 * @param options tagging options; mutated per file as described above
 * @param inputFiles names of the files to tag
 * @return whatever {@code nlpUk.postProcess()} returns
 */
static processFilesParallel(TagTextCore nlpUk, TagOptions options, List<String> inputFiles) {
    ExecutorService executors = Executors.newWorkStealingPool()
    // [filename, Future] pairs, kept so that task failures can be reported afterwards
    def submitted = []

    inputFiles.forEach { filename ->
        options.output = ""
        options.input = filename
        nlpUk.setInputOutput(options)
        // input/output are resolved here, on the submitting thread, before options
        // is overwritten for the next file
        IOFiles files = TextUtils.prepareInputOutput(options)

        def future = executors.submit({
            nlpUk.process(files)
        } as Runnable)
        submitted << [filename, future]
    }

    executors.shutdown()
    if( ! executors.awaitTermination(1, TimeUnit.DAYS) ) {
        System.err.println("Timed out waiting for tagging tasks to finish")
    }

    // submit() stores any exception thrown by the task inside its Future;
    // without inspecting the futures those failures would go unnoticed
    for( pair in submitted ) {
        def name = pair[0]
        def future = pair[1]
        if( ! future.isDone() ) {
            // only possible after a timeout above; get() would block, so just report
            System.err.println("Tagging did not finish for ${name}")
            continue
        }
        try {
            future.get()
        }
        catch(java.util.concurrent.ExecutionException e) {
            System.err.println("Failed to tag ${name}: ${e.cause}")
        }
    }

    nlpUk.postProcess()
}

}
2 changes: 1 addition & 1 deletion src/main/python/tag_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# This script tags Ukrainian text
# by invoking TagText.groovy that uses LanguageTool API
# groovy >= 3.0 (http://www.groovy-lang.org) needs to be installed and in the path
# JDK > 17 and groovy >= 4.0 (http://www.groovy-lang.org) needs to be installed and in the path
# Usage: tag_text.py <inputfile>

import os
Expand Down
66 changes: 66 additions & 0 deletions src/main/python/tag_text_recursive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/python3

# This script tags Ukrainian text files found recursively in a directory
# by invoking TagText.groovy that uses LanguageTool API
# JDK > 17 and groovy >= 4.0 (http://www.groovy-lang.org) needs to be installed and in the path
# Usage: tag_text_recursive.py [-v] [-g] <dir>

import os
import sys
import subprocess
import threading
import argparse


ENCODING='utf-8'
SCRIPT_PATH=os.path.dirname(__file__) + '/../groovy/ua/net/nlp/tools'

in_txt = None

parser = argparse.ArgumentParser()
parser.add_argument("-v", help="Verbose", action="store_true")
parser.add_argument("-g", help="Disambiguate and print first token only", action="store_true")
parser.add_argument("dir", default=".", type=str, help="Directory to look for txt files in")

args = parser.parse_args()

def print_output(p):
    """Read everything from the process's stdout, decode it and print it after an "output: " label."""
    raw = p.stdout.read()
    text = raw.decode(ENCODING)
    print("output: ", text)

def print_error(p):
    """Read everything from the process's stderr and, if it is non-empty, echo it to our stderr."""
    raw = p.stderr.read()
    decoded = raw.decode(ENCODING)
    if not decoded:
        return
    print("stderr: ", decoded, "\n", file=sys.stderr)


# Child environment: a copy of ours with the JVM default encoding forced to UTF-8
# (technically only needed on Windows).
my_env = dict(os.environ)
my_env["JAVA_TOOL_OPTIONS"] = "-Dfile.encoding=UTF-8"


# The groovy launcher is a .bat file on Windows.
if sys.platform == "win32":
    groovy_cmd = 'groovy.bat'
else:
    groovy_cmd = 'groovy'

# Command line: launcher, script, optional disambiguation flags, then the recursive target.
cmd = [groovy_cmd, SCRIPT_PATH + '/TagText.groovy']

if args.g:
    cmd += ['-g', '-t1']

cmd += ['-r', args.dir]

# Verbose mode shows the command; otherwise the tagger is asked to be quiet.
if not args.v:
    cmd.append('-q')
else:
    print('Running: ' + str(cmd))


p = subprocess.Popen(cmd, env=my_env, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Drain stdout and stderr on separate threads.
for reader in (print_output, print_error):
    threading.Thread(target=reader, args=(p,)).start()


# Nothing is sent to the child; close its stdin.
p.stdin.close()

0 comments on commit dee53ad

Please sign in to comment.