Skip to content

Commit

Permalink
add additional tokenizer mode
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Feb 5, 2024
1 parent 80ca203 commit 53f8c1d
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ public interface Analyzer {

/**
 * Tokenizes the given text and returns the tokens as a list of {@code LayoutToken}.
 *
 * @param text the text to tokenize
 * @return the resulting layout tokens
 */
List<LayoutToken> tokenizeWithLayoutToken(String text);

/**
 * Re-tokenizes an existing list of layout tokens, e.g. a token "1/74" may be
 * split into "1", "/", "74".
 *
 * @param tokens the already tokenized layout tokens
 * @return the layout tokens obtained after re-tokenization
 */
List<LayoutToken> retokenizeFromLayoutToken(List<LayoutToken> tokens);

/**
 * Re-tokenizes a list of string chunks and returns the resulting strings.
 * NOTE(review): the name suggests that digit subsequences are separated from
 * the other characters - confirm against the implementations.
 *
 * @param chunks the string chunks to re-tokenize
 * @return the re-tokenized strings
 */
List<String> retokenizeSubdigits(List<String> chunks);

/**
 * Re-tokenizes a list of string chunks and returns the result as layout tokens.
 * NOTE(review): presumably the layout-token counterpart of
 * {@code retokenizeSubdigits} - confirm against the implementations.
 *
 * @param chunks the string chunks to re-tokenize
 * @return the re-tokenized chunks as layout tokens
 */
List<LayoutToken> retokenizeSubdigitsWithLayoutToken(List<String> chunks);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,10 @@ public List<LayoutToken> tokenizeWithLayoutToken(String text) {
return tokenizeWithLayoutToken(text, null);
}

/**
 * Re-tokenizes the given layout tokens by handing them over to the shared
 * {@code GrobidDefaultAnalyzer} instance.
 *
 * @param tokens the already tokenized layout tokens
 * @return whatever the default analyzer returns for these tokens
 */
public List<LayoutToken> retokenizeFromLayoutToken(List<LayoutToken> tokens) {
    final GrobidDefaultAnalyzer defaultAnalyzer = GrobidDefaultAnalyzer.getInstance();
    return defaultAnalyzer.retokenizeFromLayoutToken(tokens);
}

public List<LayoutToken> tokenizeWithLayoutToken(String text, Language lang) {
text = UnicodeUtil.normaliseText(text);
List<String> tokens = tokenize(text, lang);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,44 @@ public List<LayoutToken> tokenizeWithLayoutToken(String text, Language language)
return result;
}

/**
 * Re-tokenizes an existing list of layout tokens with this analyzer. Only useful if
 * the input tokens have been tokenized with a non-default Grobid tokenizer.
 * <p>
 * Example: a token with text {@code 1/74} is replaced by the sub-tokens
 * "1", "/", "74".
 * <p>
 * Tokens whose text is null or blank are passed through unchanged (same instance).
 * Every other token is replaced by newly created tokens, one per sub-token: the
 * first sub-token takes the offset of the original token and each following one is
 * shifted by the length of the preceding sub-tokens.
 * <p>
 * Note: the coordinates of the sub-tokens are not recomputed here (at least for the
 * moment); each sub-token simply copies x, y, width, height and page from the
 * token it comes from.
 *
 * @param tokens the layout tokens to re-tokenize, may be null
 * @return a new list with the re-tokenized layout tokens; empty if {@code tokens}
 *         is null. Null entries of the input list are skipped.
 */
public List<LayoutToken> retokenizeFromLayoutToken(List<LayoutToken> tokens) {
    List<LayoutToken> result = new ArrayList<>();
    if (tokens == null) {
        return result;
    }

    for (LayoutToken token : tokens) {
        if (token == null) {
            // nothing to re-tokenize; a null entry would only fail further downstream
            continue;
        }

        String tokenText = token.getText();
        if (tokenText == null || tokenText.trim().isEmpty()) {
            // blank tokens (spaces, line breaks, ...) are kept as they are
            result.add(token);
            continue;
        }

        int offset = token.getOffset();
        for (String subtoken : tokenize(tokenText)) {
            LayoutToken layoutToken = new LayoutToken();
            layoutToken.setText(subtoken);
            layoutToken.setOffset(offset);

            // coordinates - TODO: refine the width/X for the sub token
            layoutToken.setX(token.getX());
            layoutToken.setY(token.getY());
            layoutToken.setHeight(token.getHeight());
            layoutToken.setWidth(token.getWidth());
            layoutToken.setPage(token.getPage());

            // the next sub-token starts right after the current one
            offset += subtoken.length();
            result.add(layoutToken);
        }
    }
    return result;
}

/**
* To tokenize a mixture of alphabetical and numerical characters by separating
* the alphabetical and numerical character subsequences. To be used
Expand Down

0 comments on commit 53f8c1d

Please sign in to comment.