From 575f18c30c8ddac2fafeb2b3f9b84fc316071eef Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Thu, 6 Jun 2019 16:36:30 +0200 Subject: [PATCH] #1381 - Annotations starting/ending in inter-token space cause exception - Ported fix and unit tests --- .../tsv3x/Tsv3XCasDocumentBuilder.java | 20 +++++ .../tsv/WebAnnoTsv3WriterTestBase.java | 85 +++++++++++++++++++ .../reference.tsv | 7 ++ .../reference.xmi | 10 +++ .../reference.tsv | 7 ++ .../reference.xmi | 10 +++ .../reference.tsv | 7 ++ .../reference.xmi | 10 +++ .../reference.tsv | 7 ++ .../reference.xmi | 10 +++ 10 files changed, 173 insertions(+) create mode 100644 dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.tsv create mode 100644 dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.xmi create mode 100644 dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.tsv create mode 100644 dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.xmi create mode 100644 dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.tsv create mode 100644 dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.xmi create mode 100644 dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.tsv create mode 100644 dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.xmi diff --git a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasDocumentBuilder.java b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasDocumentBuilder.java index afb53bf999..5cdba30afc 100644 --- 
a/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasDocumentBuilder.java +++ b/dkpro-core-io-webanno-asl/src/main/java/org/dkpro/core/io/webanno/tsv/internal/tsv3x/Tsv3XCasDocumentBuilder.java @@ -135,6 +135,16 @@ public static TsvDocument of(TsvSchema aSchema, JCas aJCas) } Entry beginTokenEntry = tokenBeginIndex.floorEntry(begin); + // If the current annotation has leading whitespace, we have wrongly fetched the + // token before the start token using floorEntry(begin) - so let's try to correct this + if ( + // found begin token but found the wrong one + (beginTokenEntry != null && beginTokenEntry.getValue().getEnd() < begin) || + // didn't find begin token because annotation starts before the first token + beginTokenEntry == null + ) { + beginTokenEntry = tokenEndIndex.higherEntry(begin); + } if (beginTokenEntry == null) { throw new IllegalStateException( "Unable to find begin token starting at or before " + begin @@ -144,6 +154,16 @@ public static TsvDocument of(TsvSchema aSchema, JCas aJCas) } Entry endTokenEntry = tokenEndIndex.ceilingEntry(end); + // If the current annotation has trailing whitespace, we have wrongly fetched the + // token after the end token using ceilingEntry(end) - so let's try to correct this + if ( + // found end token but found the wrong one + (endTokenEntry != null && endTokenEntry.getValue().getBegin() > end) || + // didn't find end token because annotation ends beyond the last token + endTokenEntry == null + ) { + endTokenEntry = tokenEndIndex.lowerEntry(end); + } if (endTokenEntry == null) { throw new IllegalStateException("Unable to find end token ending at or after " + end + " (last token ends at " + tokenEndIndex.pollLastEntry().getKey() diff --git a/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3WriterTestBase.java b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3WriterTestBase.java index df3cc956be..99a09c973a 100644 --- 
a/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3WriterTestBase.java +++ b/dkpro-core-io-webanno-asl/src/test/java/org/dkpro/core/io/webanno/tsv/WebAnnoTsv3WriterTestBase.java @@ -60,6 +60,7 @@ import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; @@ -1666,7 +1667,91 @@ public void testTwoSentencesWithNoSpaceInBetween() throws Exception writeAndAssertEquals(jcas); } + + /* + * This is something that cannot be done through the editor UI but can happen when working with + * externally created data. + */ + @Test + public void testAnnotationWithTrailingWhitespace() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText("one two"); + new Token(jcas, 0, 3).addToIndexes(); + new Token(jcas, 5, 8).addToIndexes(); + new Sentence(jcas, 0, 8).addToIndexes(); + + // NE has trailing whitespace - on export this should be silently dropped + new NamedEntity(jcas, 0, 4).addToIndexes(); + + writeAndAssertEquals(jcas); + } + /* + * This is something that cannot be done through the editor UI but can happen when working with + * externally created data. 
+ */ + @Test + public void testAnnotationWithTrailingWhitespaceAtEnd() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText("one two "); + new Token(jcas, 0, 3).addToIndexes(); + new Token(jcas, 4, 7).addToIndexes(); + new Sentence(jcas, 0, 7).addToIndexes(); + + // NE has trailing whitespace - on export this should be silently dropped + new NamedEntity(jcas, 4, 8).addToIndexes(); + + writeAndAssertEquals(jcas); + } + + /* + * This is something that cannot be done through the editor UI but can happen when working with + * externally created data. + */ + @Test + public void testAnnotationWithLeadingWhitespaceAtStart() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText(" one two"); + new Token(jcas, 1, 4).addToIndexes(); + new Token(jcas, 5, 8).addToIndexes(); + new Sentence(jcas, 1, 8).addToIndexes(); + + // NE has leading whitespace - on export this should be silently dropped + new NamedEntity(jcas, 0, 4).addToIndexes(); + + writeAndAssertEquals(jcas); + } + + /* + * This is something that cannot be done through the editor UI but can happen when working with + * externally created data. + */ + @Test + public void testAnnotationWithLeadingWhitespace() throws Exception + { + JCas jcas = JCasFactory.createJCas(); + + DocumentMetaData.create(jcas).setDocumentId("doc"); + jcas.setDocumentText("one two"); + new Token(jcas, 0, 3).addToIndexes(); + new Token(jcas, 5, 8).addToIndexes(); + new Sentence(jcas, 0, 8).addToIndexes(); + + // NE has leading whitespace - on export this should be silently dropped + new NamedEntity(jcas, 4, 8).addToIndexes(); + + writeAndAssertEquals(jcas); + } + private void writeAndAssertEquals(JCas aJCas, Object... 
aParams) throws IOException, ResourceInitializationException, AnalysisEngineProcessException { diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.tsv new file mode 100644 index 0000000000..16073bc943 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.tsv @@ -0,0 +1,7 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value + + +#Text=one two +1-1 0-3 one _ _ +1-2 5-8 two * * diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.xmi new file mode 100644 index 0000000000..2760952d04 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespace/reference.xmi @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.tsv new file mode 100644 index 0000000000..73422f62f7 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.tsv @@ -0,0 +1,7 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value + + +#Text=one two +1-1 1-4 one * * +1-2 5-8 two _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.xmi new file mode 100644 index 0000000000..1d50fbbb13 
--- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithLeadingWhitespaceAtStart/reference.xmi @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.tsv new file mode 100644 index 0000000000..4656ebf8d1 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.tsv @@ -0,0 +1,7 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value + + +#Text=one two +1-1 0-3 one * * +1-2 5-8 two _ _ diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.xmi b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.xmi new file mode 100644 index 0000000000..e05cab5977 --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespace/reference.xmi @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.tsv b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.tsv new file mode 100644 index 0000000000..d0399a28cc --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.tsv @@ -0,0 +1,7 @@ +#FORMAT=WebAnno TSV 3.2 +#T_SP=de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity|identifier|value + + +#Text=one two +1-1 0-3 one _ _ +1-2 4-7 two * * diff --git a/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.xmi 
b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.xmi new file mode 100644 index 0000000000..cfb1ce5aca --- /dev/null +++ b/dkpro-core-io-webanno-asl/src/test/resources/tsv3-suite/testAnnotationWithTrailingWhitespaceAtEnd/reference.xmi @@ -0,0 +1,10 @@ + + + + + + + + + +