From dee53adff87398af845a3dd93dcbd423b389b9c5 Mon Sep 17 00:00:00 2001
From: Andriy Rysin
Date: Thu, 4 Jan 2024 13:30:46 -0500
Subject: [PATCH] add recursive tagging option

Add a -r/--recursive option to TagText: every file whose name ends in
.txt under the given directories (default: the current directory) is
tagged, one task per file. The per-file parallel loop is extracted into
TagTextCore.processFilesParallel() and shared with the existing
multiple-input-files mode.

Add src/main/python/tag_text_recursive.py, a wrapper that runs
TagText.groovy with -r on a directory and exits with the tagger's
exit status.

---
Review notes (this section is ignored by git am):
- line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of the patch; if context lines do not match the tree, apply with
  `git apply --ignore-whitespace`
- blob ids on the `index` lines are those of the original commit

 .../ua/net/nlp/tools/tag/TagOptions.groovy    |  3 +
 .../ua/net/nlp/tools/tag/TagTextCore.groovy   | 58 +++++++++++-----
 src/main/python/tag_text.py                   |  2 +-
 src/main/python/tag_text_recursive.py         | 66 +++++++++++++++++++
 4 files changed, 112 insertions(+), 17 deletions(-)
 create mode 100755 src/main/python/tag_text_recursive.py

diff --git a/src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy b/src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy
index d3f6c5f..08be122 100644
--- a/src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy
+++ b/src/main/groovy/ua/net/nlp/tools/tag/TagOptions.groovy
@@ -12,6 +12,9 @@ public class TagOptions extends OptionsBase {
     @Parameters(index = "0", description = "Input files. Default: stdin", arity="0..")
     List inputFiles
 
+    @Option(names = ["-r", "--recursive"], description = "Tag all .txt files recursively in the given directories (default: current directory)")
+    boolean recursive
+
     @Option(names = ["--lemmaOnly"], description = "Prints only lemmas, implies: --outputFormat=txt --disambiguate=true")
     boolean lemmaOnly
 
diff --git a/src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy b/src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy
index 54e671f..711e5a9 100644
--- a/src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy
+++ b/src/main/groovy/ua/net/nlp/tools/tag/TagTextCore.groovy
@@ -3,6 +3,10 @@
 package ua.net.nlp.tools.tag
 
 import java.math.RoundingMode
+import java.nio.file.Files
+import java.nio.file.Path
+import java.nio.file.Paths
+import java.nio.file.attribute.BasicFileAttributes
 import java.util.concurrent.ExecutorService
 import java.util.concurrent.Executors
 import java.util.concurrent.TimeUnit
@@ -676,25 +680,27 @@ class TagTextCore {
             println("dict_uk version: ${dictUkversion}")
         }
 
-        // TODO: quick hack to support multiple files
-        if( options.inputFiles && options.inputFiles != ["-"] ) {
+        // TODO: quick hack to support recursive processing
+        if( options.recursive ) {
+            def dirs = options.inputFiles ?: ["."]
 
-            ExecutorService executors = Executors.newWorkStealingPool()
-            options.singleThread = true
-            options.inputFiles.forEach{ filename ->
-                options.output = ""
-                options.input = filename
-                nlpUk.setInputOutput(options)
-                IOFiles files = TextUtils.prepareInputOutput(options)
-
-                executors.submit({
-                    nlpUk.process(files)
-                } as Runnable)
+            List files = []
+            dirs.each { d ->
+                new File(d).eachFileRecurse { f ->
+                    if( f.name.toLowerCase().endsWith(".txt") ) {
+                        files << f
+                    }
+                }
             }
 
-            executors.shutdown()
-            executors.awaitTermination(1, TimeUnit.DAYS)
-            nlpUk.postProcess()
+            println "Found ${files.size()} files with .txt extension" // + files
+            options.singleThread = true
+            processFilesParallel(nlpUk, options, files.collect { it.path })
+        }
+        // TODO: quick hack to support multiple files
+        else if( options.inputFiles && options.inputFiles != ["-"] ) {
+            options.singleThread = true
+            processFilesParallel(nlpUk, options, options.inputFiles)
         }
         else {
             nlpUk.process()
@@ -702,4 +708,24 @@ class TagTextCore {
         }
     }
 
+    // Tags every input file as a separate task on a work-stealing pool, waits
+    // (up to one day) for all tasks to finish, then calls nlpUk.postProcess() once.
+    static processFilesParallel(TagTextCore nlpUk, TagOptions options, List inputFiles) {
+        ExecutorService executors = Executors.newWorkStealingPool()
+        inputFiles.forEach{ filename ->
+            options.output = ""
+            options.input = filename
+            nlpUk.setInputOutput(options)
+            IOFiles files = TextUtils.prepareInputOutput(options)
+
+            executors.submit({
+                nlpUk.process(files)
+            } as Runnable)
+        }
+
+        executors.shutdown()
+        executors.awaitTermination(1, TimeUnit.DAYS)
+        nlpUk.postProcess()
+    }
+
 }
diff --git a/src/main/python/tag_text.py b/src/main/python/tag_text.py
index 6298adb..9e7e8fc 100755
--- a/src/main/python/tag_text.py
+++ b/src/main/python/tag_text.py
@@ -2,7 +2,7 @@
 
 # This script allows to tag Ukrinian text
 # by invoking TagText.groovy that uses LanguageTool API
-# groovy >= 3.0 (http://www.groovy-lang.org) needs to be installed and in the path
+# JDK > 17 and groovy >= 4.0 (http://www.groovy-lang.org) needs to be installed and in the path
 # Usage: tag_text.py
 
 import os
diff --git a/src/main/python/tag_text_recursive.py b/src/main/python/tag_text_recursive.py
new file mode 100755
index 0000000..bb116e5
--- /dev/null
+++ b/src/main/python/tag_text_recursive.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python3
+
+# This script tags all .txt files with Ukrainian text under a directory (recursively)
+# by invoking TagText.groovy that uses LanguageTool API
+# JDK > 17 and groovy >= 4.0 (http://www.groovy-lang.org) needs to be installed and in the path
+# Usage: tag_text_recursive.py [-v] [-g] [dir]
+
+import os
+import sys
+import subprocess
+import threading
+import argparse
+
+
+ENCODING='utf-8'
+SCRIPT_PATH=os.path.dirname(os.path.abspath(__file__)) + '/../groovy/ua/net/nlp/tools'
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-v", help="Verbose", action="store_true")
+parser.add_argument("-g", help="Disambiguate and print first token only", action="store_true")
+# nargs="?" makes the positional optional; without it argparse never uses the default
+parser.add_argument("dir", nargs="?", default=".", type=str, help="Directory to look for txt files in")
+
+args = parser.parse_args()
+
+def print_output(p):
+
+    print("output: ", p.stdout.read().decode(ENCODING))
+
+def print_error(p):
+
+    error_txt = p.stderr.read().decode(ENCODING)
+    if error_txt:
+        print("stderr: ", error_txt, "\n", file=sys.stderr)
+
+
+# technically only needed on Windows
+my_env = os.environ.copy()
+my_env["JAVA_TOOL_OPTIONS"] = "-Dfile.encoding=UTF-8"
+
+
+groovy_cmd = 'groovy.bat' if sys.platform == "win32" else 'groovy'
+cmd = [groovy_cmd, SCRIPT_PATH + '/TagText.groovy']
+
+if args.g:
+    cmd.append('-g')
+    cmd.append('-t1')
+
+cmd.append('-r')
+cmd.append(args.dir)
+
+if args.v:
+    print('Running: ' + str(cmd))
+else:
+    cmd.append('-q')
+
+
+p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE, env=my_env)
+
+threading.Thread(target=print_output, args=(p,)).start()
+threading.Thread(target=print_error, args=(p,)).start()
+
+p.stdin.close()
+
+# the reader threads drain both pipes, so wait() cannot block on a full pipe; propagate the exit status
+sys.exit(p.wait())