add additional tokenizer mode

kermitt2 · Feb 5, 2024 · 53f8c1d · 53f8c1d
1 parent 80ca203
commit 53f8c1d
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 0 deletions.
diff --git a/grobid-core/src/main/java/org/grobid/core/analyzers/Analyzer.java b/grobid-core/src/main/java/org/grobid/core/analyzers/Analyzer.java
@@ -21,6 +21,8 @@ public interface Analyzer {
 
  List<LayoutToken> tokenizeWithLayoutToken(String text);
 
+ List<LayoutToken> retokenizeFromLayoutToken(List<LayoutToken> tokens); // tokenize again an already tokenized list of layout tokens
+
  List<String> retokenizeSubdigits(List<String> chunks);
 
  List<LayoutToken> retokenizeSubdigitsWithLayoutToken(List<String> chunks);

diff --git a/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidAnalyzer.java b/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidAnalyzer.java
@@ -195,6 +195,10 @@ public List<LayoutToken> tokenizeWithLayoutToken(String text) {
  return tokenizeWithLayoutToken(text, null);
  }
 
+ public List<LayoutToken> retokenizeFromLayoutToken(List<LayoutToken> tokens) {
+ return GrobidDefaultAnalyzer.getInstance().retokenizeFromLayoutToken(tokens); // always delegated to the default analyzer singleton
+ }
+
  public List<LayoutToken> tokenizeWithLayoutToken(String text, Language lang) {
  text = UnicodeUtil.normaliseText(text);
  List<String> tokens = tokenize(text, lang);

diff --git a/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidDefaultAnalyzer.java b/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidDefaultAnalyzer.java
@@ -123,6 +123,44 @@ public List<LayoutToken> tokenizeWithLayoutToken(String text, Language language)
  return result;
  }
 
+ /**
+ * To tokenize an existing list of tokens. Only useful if input tokens have
+ * been tokenized with a non-default Grobid tokenizer, e.g. 1/74 -> "1", "/", "74".
+ * Tokens with null or whitespace-only text are kept as is; every other token is replaced by
+ * new sub-tokens carrying only text, offset, x, y, height, width and page (nothing else is set).
+ * Note: the coordinates of the subtokens are not recomputed here (at least for the moment).
+ * @param tokens layout tokens to re-tokenize; must not be null (iterated without a null check)
+ * @return a new list holding the pass-through tokens and the created sub-tokens, in input order
+ */
+ public List<LayoutToken> retokenizeFromLayoutToken(List<LayoutToken> tokens) {
+ List<LayoutToken> result = new ArrayList<>();
+ for(LayoutToken token : tokens) {
+ if (token.getText() == null || token.getText().trim().length() == 0) {
+ result.add(token); // nothing to split: the original token instance is reused, not copied
+ } else {
+ String tokenText = token.getText();
+ List<String> subtokens = tokenize(tokenText);
+ int offset = token.getOffset(); // running offset, starting at the offset of the token being split
+ for (int i = 0; i < subtokens.size(); i++) {
+ LayoutToken layoutToken = new LayoutToken();
+ layoutToken.setText(subtokens.get(i));
+ layoutToken.setOffset(offset);
+
+ // coordinates - TODO: refine the width/X for the sub token (for now each sub-token gets the full box of the token it comes from)
+ layoutToken.setX(token.getX());
+ layoutToken.setY(token.getY());
+ layoutToken.setHeight(token.getHeight());
+ layoutToken.setWidth(token.getWidth());
+ layoutToken.setPage(token.getPage());
+
+ offset += subtokens.get(i).length(); // the next sub-token starts right after this one
+ result.add(layoutToken);
+ }
+ }
+ }
+ return result;
+ }
+
  /**
  * To tokenize mixture of alphabetical and numerical characters by separating 
  * separate alphabetical and numerical character subsequences. To be used