diff --git a/languagetool-core/src/main/java/org/languagetool/languagemodel/LuceneSingleIndexLanguageModel.java b/languagetool-core/src/main/java/org/languagetool/languagemodel/LuceneSingleIndexLanguageModel.java index 0b0bf65fe165..77bf868ee913 100644 --- a/languagetool-core/src/main/java/org/languagetool/languagemodel/LuceneSingleIndexLanguageModel.java +++ b/languagetool-core/src/main/java/org/languagetool/languagemodel/LuceneSingleIndexLanguageModel.java @@ -146,10 +146,10 @@ public long getTotalTokenCount() { try { RegexpQuery query = new RegexpQuery(new Term("totalTokenCount", ".*")); TopDocs docs = luceneSearcher.searcher.search(query, 1000); // Integer.MAX_VALUE might cause OOE on wrong index - if (docs.totalHits == 0) { + if (docs.totalHits.value == 0) { throw new RuntimeException("Expected 'totalTokenCount' meta documents not found in 1grams index: " + luceneSearcher.directory); - } else if (docs.totalHits > 1000) { - throw new RuntimeException("Did not expect more than 1000 'totalTokenCount' meta documents: " + docs.totalHits + " in " + luceneSearcher.directory); + } else if (docs.totalHits.value > 1000) { + throw new RuntimeException("Did not expect more than 1000 'totalTokenCount' meta documents: " + docs.totalHits.value + " in " + luceneSearcher.directory); } else { long result = 0; for (ScoreDoc scoreDoc : docs.scoreDocs) { @@ -194,9 +194,9 @@ private long getCount(Term term, LuceneSearcher luceneSearcher) { long result = 0; try { TopDocs docs = luceneSearcher.searcher.search(new TermQuery(term), 2000); - if (docs.totalHits > 2000) { + if (docs.totalHits.value > 2000) { throw new RuntimeException("More than 2000 matches for '" + term + "' not supported for performance reasons: " + - docs.totalHits + " matches in " + luceneSearcher.directory); + docs.totalHits.value + " matches in " + luceneSearcher.directory); } for (ScoreDoc scoreDoc : docs.scoreDocs) { String countStr = luceneSearcher.reader.document(scoreDoc.doc).get("count"); diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe index b798387ded9d..ee091ce48847 100644 Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe differ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs index af3abe0553ad..ba80b675fe1d 100644 Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs differ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si index 7fce7afe0e39..4a4978085455 100644 Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si differ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 index 6e93fa9e6973..e49de5d67abe 100644 Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 differ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe index d04ba2ee5c95..15e6f67ed0a2 100644 Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe differ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs index 36ff0f65c139..5d62d79a170c 100644 Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs differ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si index 3b116de2a4f0..ce3c306f6fab 100644 Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si differ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1 b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1 index 6e93fa9e6973..15102c4ff7f2 100644 Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1 and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1 differ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe index dfbaafafcd61..fb7f2188cace 100644 Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe differ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs index 552a0d09868e..c853d3bd23f0 100644 Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs differ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si index 3f231fb03f90..10159d3fd64c 100644 Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si differ diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1 b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1 index 6e93fa9e6973..4b452afb987e 100644 Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1 and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1 differ diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java b/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java index ab45341f7edc..39c001d227a5 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java @@ -47,32 +47,33 @@ public static void main(String[] args) throws IOException { try (FSDirectory directory = FSDirectory.open(dir.toPath()); IndexReader reader = DirectoryReader.open(directory)) { IndexSearcher searcher = new IndexSearcher(reader); - Fields fields = MultiFields.getFields(reader); - Terms ngrams = fields.terms("ngram"); - TermsEnum iterator = ngrams.iterator(); - BytesRef next; - int i = 0; - while ((next = iterator.next()) != null) { - String term = next.utf8ToString(); - if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) { - if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) { - //System.out.println("ignore: " + term); - continue; - } - TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3); - if (topDocs.totalHits == 0) { - throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits); - } else if (topDocs.totalHits == 1) { - int docId = topDocs.scoreDocs[0].doc; - Document document = reader.document(docId); - Long count = Long.parseLong(document.get("count")); - //System.out.println(term + " -> " + count); - totalCount += count; - if (++i % 10_000 == 0) { - System.out.println(i + " ... " + totalCount); + for (String field : FieldInfos.getIndexedFields(reader)) { + Terms ngrams = MultiTerms.getTerms(reader, field); + TermsEnum iterator = ngrams.iterator(); + BytesRef next; + int i = 0; + while ((next = iterator.next()) != null) { + String term = next.utf8ToString(); + if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) { + if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) { + //System.out.println("ignore: " + term); + continue; + } + TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3); + if (topDocs.totalHits.value == 0) { + throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits.value); + } else if (topDocs.totalHits.value == 1) { + int docId = topDocs.scoreDocs[0].doc; + Document document = reader.document(docId); + Long count = Long.parseLong(document.get("count")); + //System.out.println(term + " -> " + count); + totalCount += count; + if (++i % 10_000 == 0) { + System.out.println(i + " ... " + totalCount); + } + } else { + throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits); } - } else { - throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits); } } } diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java index 8b352f75be7b..08ce8200c9a2 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java @@ -1,4 +1,4 @@ -/* LanguageTool, a natural language style checker +/* LanguageTool, a natural language style checker * Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or @@ -90,20 +90,11 @@ private void indexLine(String line) throws IOException { private Document getDoc(String ngram, long count) { Document doc = new Document(); doc.add(new Field("ngram", ngram, StringField.TYPE_NOT_STORED)); // use StringField.TYPE_STORED for easier debugging with e.g. Luke - doc.add(getCountField(count)); + doc.add(new LongPoint("count", count)); + doc.add(new StoredField("count", count)); return doc; } - @NotNull - private LongField getCountField(long count) { - FieldType fieldType = new FieldType(); - fieldType.setStored(true); - fieldType.setOmitNorms(true); - fieldType.setNumericType(FieldType.NumericType.LONG); - fieldType.setDocValuesType(DocValuesType.NUMERIC); - return new LongField("count", count, fieldType); - } - private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException { FieldType fieldType = new FieldType(); fieldType.setIndexOptions(IndexOptions.DOCS); diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java index 4e130e87bcb5..43f87f102592 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java @@ -177,16 +177,16 @@ private void writeToLucene(int ngramSize, Map ngramToCount) throws if (newReader != null) { reader = newReader; }*/ - index.reader = DirectoryReader.open(index.indexWriter, true); + index.reader = DirectoryReader.open(index.indexWriter); index.searcher = new IndexSearcher(index.reader); for (Map.Entry entry : ngramToCount.entrySet()) { Term ngram = new Term("ngram", entry.getKey()); TopDocs topDocs = index.searcher.search(new TermQuery(ngram), 2); //System.out.println(ngram + " ==> " + topDocs.totalHits); - if (topDocs.totalHits == 0) { + if (topDocs.totalHits.value == 0) { Document doc = getDoc(entry.getKey(), entry.getValue()); index.indexWriter.addDocument(doc); - } else if (topDocs.totalHits == 1) { + } else if (topDocs.totalHits.value == 1) { int docNumber = topDocs.scoreDocs[0].doc; Document document = index.reader.document(docNumber); long oldCount = Long.parseLong(document.getField("count").stringValue()); @@ -195,7 +195,7 @@ private void writeToLucene(int ngramSize, Map ngramToCount) throws index.indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue())); // would probably be faster, but we currently rely on the count being a common field: //indexWriter.updateNumericDocValue(ngram, "count", oldCount + entry.getValue()); - } else if (topDocs.totalHits > 1) { + } else if (topDocs.totalHits.value > 1) { throw new RuntimeException("Got more than one hit for: " + ngram); } //System.out.println(" " + entry.getKey() + " -> " + entry.getValue()); @@ -216,20 +216,11 @@ private void writeToLucene(int ngramSize, Map ngramToCount) throws private Document getDoc(String ngram, long count) { Document doc = new Document(); doc.add(new Field("ngram", ngram, StringField.TYPE_NOT_STORED)); - doc.add(getCountField(count)); + doc.add(new LongPoint("count", count)); + doc.add(new StoredField("count", count)); return doc; } - @NotNull - private LongField getCountField(long count) { - FieldType fieldType = new FieldType(); - fieldType.setStored(true); - fieldType.setOmitNorms(true); - fieldType.setNumericType(FieldType.NumericType.LONG); - fieldType.setDocValuesType(DocValuesType.NUMERIC); - return new LongField("count", count, fieldType); - } - private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException { FieldType fieldType = new FieldType(); fieldType.setIndexOptions(IndexOptions.DOCS); @@ -269,7 +260,7 @@ static class LuceneLiveIndex { IndexWriterConfig config = new IndexWriterConfig(analyzer); directory = FSDirectory.open(dir.toPath()); indexWriter = new IndexWriter(directory, config); - reader = DirectoryReader.open(indexWriter, false); + reader = DirectoryReader.open(indexWriter); searcher = new IndexSearcher(reader); } diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java index ad0e60af36a3..b70700a1ed18 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java @@ -57,47 +57,55 @@ public static void main(String[] args) throws IOException { FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath()); IndexReader reader = DirectoryReader.open(fsDir); IndexSearcher searcher = new IndexSearcher(reader); - Fields fields = MultiFields.getFields(reader); - Terms terms = fields.terms("ngram"); - TermsEnum termsEnum = terms.iterator(); - int count = 0; - BytesRef next; - while ((next = termsEnum.next()) != null) { - String term = next.utf8ToString(); - count++; - //term = "persischer Golf"; // for testing - String[] parts = term.split(" "); - boolean useful = true; - int lcCount = 0; - List ucParts = new ArrayList<>(); - for (String part : parts) { - if (part.length() < MIN_TERM_LEN) { - useful = false; - break; - } - String uc = StringTools.uppercaseFirstChar(part); - if (!part.equals(uc)) { - lcCount++; - } - ucParts.add(uc); - } - if (!useful || lcCount == 0 || lcCount == 2) { + FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader); + for (FieldInfo fieldInfo: fieldInfos) { + if (fieldInfo.getIndexOptions() == IndexOptions.NONE) { continue; } - String uppercase = String.join(" ", ucParts); - if (term.equals(uppercase)){ + Terms terms = MultiTerms.getTerms(reader, fieldInfo.name); + if (terms == null) { continue; } - long thisCount = getOccurrenceCount(reader, searcher, term); - long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase); - if (count % 10_000 == 0) { - System.err.println(count + " @ " + term); - } - if (thisCount > LIMIT || thisUpperCount > LIMIT) { - if (thisUpperCount > thisCount) { - if (isRelevant(lt, term)) { - float factor = (float)thisUpperCount / thisCount; - System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor); + TermsEnum termsEnum = terms.iterator(); + int count = 0; + BytesRef next; + while ((next = termsEnum.next()) != null) { + String term = next.utf8ToString(); + count++; + //term = "persischer Golf"; // for testing + String[] parts = term.split(" "); + boolean useful = true; + int lcCount = 0; + List ucParts = new ArrayList<>(); + for (String part : parts) { + if (part.length() < MIN_TERM_LEN) { + useful = false; + break; + } + String uc = StringTools.uppercaseFirstChar(part); + if (!part.equals(uc)) { + lcCount++; + } + ucParts.add(uc); + } + if (!useful || lcCount == 0 || lcCount == 2) { + continue; + } + String uppercase = String.join(" ", ucParts); + if (term.equals(uppercase)) { + continue; + } + long thisCount = getOccurrenceCount(reader, searcher, term); + long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase); + if (count % 10_000 == 0) { + System.err.println(count + " @ " + term); + } + if (thisCount > LIMIT || thisUpperCount > LIMIT) { + if (thisUpperCount > thisCount) { + if (isRelevant(lt, term)) { + float factor = (float) thisUpperCount / thisCount; + System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor); + } } } } @@ -117,7 +125,7 @@ private static boolean isRelevant(JLanguageTool lt, String term) throws IOExcept private static long getOccurrenceCount(IndexReader reader, IndexSearcher searcher, String term) throws IOException { TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5); - if (topDocs.totalHits == 0) { + if (topDocs.totalHits.value == 0) { return 0; } int docId = topDocs.scoreDocs[0].doc; diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java index 54d6dc8d8587..693dc866b2d5 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java @@ -47,10 +47,9 @@ public static void main(String[] args) throws IOException { FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath()); IndexReader reader = DirectoryReader.open(fsDir); IndexSearcher searcher = new IndexSearcher(reader); - Fields fields = MultiFields.getFields(reader); + Terms terms = MultiTerms.getTerms(reader, "ngram"); long max = 0; String maxTerm = ""; - Terms terms = fields.terms("ngram"); TermsEnum termsEnum = terms.iterator(); int count = 0; BytesRef next; @@ -71,5 +70,5 @@ public static void main(String[] args) throws IOException { } System.out.println("Max: " + max + " for " + maxTerm); } - + } diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java index d5caea350778..b7c43dc46619 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java @@ -58,8 +58,7 @@ public static void main(String[] args) throws IOException { String ngramIndexDir = args[0]; FSDirectory fsDir = FSDirectory.open(new File(ngramIndexDir).toPath()); IndexReader reader = DirectoryReader.open(fsDir); - Fields fields = MultiFields.getFields(reader); - Terms terms = fields.terms("ngram"); + Terms terms = MultiTerms.getTerms(reader, "ngram"); TermsEnum termsEnum = terms.iterator(); int i = 0; int needed = 0; diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java index 0d97a4df0bab..5ddbf1d7a423 100644 --- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java +++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java @@ -19,8 +19,8 @@ package org.languagetool.dev.bigdata; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; diff --git a/languagetool-language-modules/ja/pom.xml b/languagetool-language-modules/ja/pom.xml index 12dfea7942b2..58ee2b8b9d68 100644 --- a/languagetool-language-modules/ja/pom.xml +++ b/languagetool-language-modules/ja/pom.xml @@ -40,7 +40,7 @@ - com.github.lucene-gosen + org.omegat.lucene lucene-gosen ipadic diff --git a/languagetool-standalone/src/main/java/org/languagetool/dev/HomophoneOccurrenceDumper.java b/languagetool-standalone/src/main/java/org/languagetool/dev/HomophoneOccurrenceDumper.java index 2b4d22c4c684..3fa2ff50743a 100644 --- a/languagetool-standalone/src/main/java/org/languagetool/dev/HomophoneOccurrenceDumper.java +++ b/languagetool-standalone/src/main/java/org/languagetool/dev/HomophoneOccurrenceDumper.java @@ -18,8 +18,7 @@ */ package org.languagetool.dev; -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; @@ -112,8 +111,7 @@ private void dumpOccurrences(Set tokens) throws IOException { private TermsEnum getIterator() throws IOException { LuceneSearcher luceneSearcher = getLuceneSearcher(3); - Fields fields = MultiFields.getFields(luceneSearcher.getReader()); - Terms terms = fields.terms("ngram"); + Terms terms = MultiTerms.getTerms(luceneSearcher.getReader(), "ngram"); return terms.iterator(); } diff --git a/languagetool-wikipedia/pom.xml b/languagetool-wikipedia/pom.xml index 844bddb8351d..575a81cfd8f1 100644 --- a/languagetool-wikipedia/pom.xml +++ b/languagetool-wikipedia/pom.xml @@ -38,7 +38,7 @@ - 5.5.5 + 8.11.3 diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java index d3b2adfc4abe..5f99596bf348 100644 --- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java +++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java @@ -19,8 +19,8 @@ package org.languagetool.dev.dumpcheck; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java index effff0cf9460..e7474412aced 100644 --- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java +++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java @@ -18,10 +18,10 @@ */ package org.languagetool.dev.index; +import org.apache.lucene.analysis.CharacterUtils; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.util.AttributeFactory; import java.io.IOException; @@ -36,7 +36,6 @@ public final class AnyCharTokenizer extends Tokenizer { private static final int MAX_WORD_LEN = Integer.MAX_VALUE; // extend the word length! private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(4096); - private final CharacterUtils charUtils = CharacterUtils.getInstance(); private final CharTermAttribute termAtt = this.addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = this.addAttribute(OffsetAttribute.class); @@ -83,7 +82,7 @@ public boolean incrementToken() throws IOException { while(true) { if(this.bufferIndex >= this.dataLen) { this.offset += this.dataLen; - this.charUtils.fill(this.ioBuffer, this.input); + CharacterUtils.fill(this.ioBuffer, this.input); if(this.ioBuffer.getLength() == 0) { this.dataLen = 0; if(length <= 0) { @@ -97,7 +96,7 @@ public boolean incrementToken() throws IOException { this.bufferIndex = 0; } - int c = this.charUtils.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex, this.ioBuffer.getLength()); + int c = Character.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex); int charCount = Character.charCount(c); this.bufferIndex += charCount; if(this.isTokenChar(c)) { diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java index 622c7f7291f5..e356998b9f95 100644 --- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java +++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java @@ -124,7 +124,7 @@ private SpanQuery asSpanQuery(BooleanClause query) { } else { Set terms = new HashSet<>(); try { - indexSearcher.createWeight(query.getQuery(), false).extractTerms(terms); + indexSearcher.createWeight(query.getQuery(), ScoreMode.COMPLETE_NO_SCORES, 1.0f).extractTerms(terms); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java index 22dbb89e625c..8bc871e3cd57 100644 --- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java +++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java @@ -33,6 +33,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; @@ -101,7 +102,7 @@ public int getDocCount() throws IOException { private int getDocCount(IndexSearcher indexSearcher) throws IOException { Term searchTerm = new Term(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL); TopDocs search = indexSearcher.search(new TermQuery(searchTerm), 1); - if (search.totalHits != 1) { + if (search.totalHits.value != 1) { return -1; } ScoreDoc scoreDoc = search.scoreDocs[0]; @@ -200,7 +201,7 @@ public SearcherResult findRuleMatchesOnIndex(PatternRule rule, Language language } private PossiblyLimitedTopDocs getTopDocs(Query query) throws IOException { - TopScoreDocCollector topCollector = TopScoreDocCollector.create(maxHits); + TopScoreDocCollector topCollector = TopScoreDocCollector.create(maxHits, Integer.MAX_VALUE); Counter clock = Counter.newCounter(true); int waitMillis = 1000; // TODO: if we interrupt the whole thread anyway, do we still need the TimeLimitingCollector? @@ -334,7 +335,7 @@ class SearchRunnable implements Runnable { private List matchingSentences; private Exception exception; private boolean tooManyLuceneMatches; - private int luceneMatchCount; + private long luceneMatchCount; private int maxDocChecked; private int docsChecked; private int numDocs; @@ -356,7 +357,7 @@ public void run() { PossiblyLimitedTopDocs limitedTopDocs = getTopDocs(query); long luceneTime = System.currentTimeMillis() - t2; long t3 = System.currentTimeMillis(); - luceneMatchCount = limitedTopDocs.topDocs.totalHits; + luceneMatchCount = limitedTopDocs.topDocs.totalHits.value; tooManyLuceneMatches = limitedTopDocs.topDocs.scoreDocs.length >= maxHits; MatchingSentencesResult res = findMatchingSentences(indexSearcher, limitedTopDocs.topDocs, languageTool); matchingSentences = res.matchingSentences; @@ -382,7 +383,7 @@ boolean hasTooManyLuceneMatches() { return tooManyLuceneMatches; } - int getLuceneMatchCount() { + long getLuceneMatchCount() { return luceneMatchCount; } diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java index 40c860af0650..6c39036346e0 100644 --- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java +++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java @@ -35,7 +35,7 @@ public class SearcherResult { private int docCount; private int maxDocChecked; private boolean hasTooManyLuceneMatches; - private int luceneMatchCount; + private long luceneMatchCount; private int skipHits; private int numDocs; @@ -81,11 +81,11 @@ public boolean hasTooManyLuceneMatches() { return hasTooManyLuceneMatches; } - public void setLuceneMatchCount(int luceneMatchCount) { + public void setLuceneMatchCount(long luceneMatchCount) { this.luceneMatchCount = luceneMatchCount; } - public int getLuceneMatchCount() { + public long getLuceneMatchCount() { return luceneMatchCount; } diff --git a/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java b/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java index 8febf19486a4..be0b8a93e752 100644 --- a/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java +++ b/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java @@ -271,11 +271,11 @@ public void testSeveralElements() throws Exception { assertMatches(makeRule("How do you"), 1); // known overmatching } - private void assertMatches(AbstractPatternRule patternRule, int expectedMatches) throws Exception { + private void assertMatches(AbstractPatternRule patternRule, long expectedMatches) throws Exception { PatternRuleQueryBuilder queryBuilder = new PatternRuleQueryBuilder(language, searcher); Query query = queryBuilder.buildRelaxedQuery(patternRule); //System.out.println("QUERY: " + query); - int matches = searcher.search(query, 1000).totalHits; + long matches = searcher.search(query, 1000).totalHits.value; assertEquals("Query failed: " + query, expectedMatches, matches); } diff --git a/pom.xml b/pom.xml index ae7bdc0d64db..4c12b32d45a8 100644 --- a/pom.xml +++ b/pom.xml @@ -161,7 +161,7 @@ 0.8.2 2.1.2 - 6.2.1 + 8.11.0 3.25.5 1.2.2 portable-1.8.2 @@ -211,11 +211,10 @@ 2.26 0.1 1.10 - 33.3.1-jre 2.18.0 1.18.34 - 5.5.5 + 8.11.3 2.1.9 0.6 @@ -275,9 +274,9 @@ ${jackson.version} - com.github.lucene-gosen + org.omegat.lucene lucene-gosen - ${com.github.lucene-gosen.version} + ${org.omegat.lucene.lucene-gosen.version} ipadic