Skip to content

Commit

Permalink
review method profile and fix test
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Feb 5, 2024
1 parent 53f8c1d commit 0d524f7
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -874,7 +874,9 @@ public String processAllCitationsInPatent(String text,
}
// we initialize the attribute individually for readability...
boolean filterDuplicate = false;
return parsers.getReferenceExtractor().extractAllReferencesString(text, filterDuplicate,
List<String> texts = new ArrayList<>();
texts.add(text);
return parsers.getReferenceExtractor().extractAllReferencesString(texts, filterDuplicate,
consolidateCitations, includeRawCitations, patentResults, nplResults);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public class PatentRefParser {
"TT", "TN", "TR", "UA", "GB", "US", "UY", "VE", "VN", "YU", "ZM", "ZW");

// this is the list of supported languages - language codes are given as ISO 639-1 two-letter codes
static public List<String> languages = Arrays.asList("en", "de", "fr", "es", "it", "ja", "kr", "pt", "zh", "ar");
static public List<String> languages = Arrays.asList("en", "de", "fr", "es", "it", "ja", "ko", "pt", "zh", "ar");

// list of regular expressions for identifying the authority in the raw reference string
private List<Pattern> autority_patterns = new ArrayList<Pattern>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.grobid.core.utilities.BoundingBoxCalculator;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.analyzers.GrobidDefaultAnalyzer;
import org.grobid.core.lang.Language;
import org.grobid.core.layout.*;
import org.slf4j.Logger;
Expand Down Expand Up @@ -239,7 +240,9 @@ public String extractAllReferencesPDFFile(String inputFile,
}
String description = doc.getAllBlocksClean(25, -1);
if (description != null) {
result = extractAllReferencesString(description,
List<String> descriptions = new ArrayList<>();
descriptions.add(description);
result = extractAllReferencesString(descriptions,
filterDuplicate,
consolidate,
includeRawCitations,
Expand Down Expand Up @@ -294,7 +297,7 @@ public String annotateAllReferencesPDFFile(String inputFile,
/**
* Extract all references from a simple piece of text or a list of text segments, and return the results as an XML document.
*/
public String extractAllReferencesString(String text,
/*public String extractAllReferencesString(String text,
boolean filterDuplicate,
int consolidate,
boolean includeRawCitations,
Expand All @@ -303,7 +306,7 @@ public String extractAllReferencesString(String text,
List<String> texts = new ArrayList<>();
texts.add(text);
return extractAllReferencesString(texts, filterDuplicate, consolidate, includeRawCitations, patents, articles);
}
}*/

public String extractAllReferencesString(List<String> texts,
boolean filterDuplicate,
Expand Down Expand Up @@ -348,15 +351,21 @@ public String extractAllReferencesString(List<String> texts,
List<String> allPatentBlocks = new ArrayList<>();

for (String text : texts) {

//text = TextUtilities.dehyphenize(text); // to be reviewed!

// tokenisation according to the language
List<LayoutToken> tokenizations = analyzer.tokenizeWithLayoutToken(text, lang);
// tokenisation according to the language (except for Korean, which will require retraining)
List<LayoutToken> tokenizations;
if (lang.getLang().equals(Language.KO)) {
tokenizations = GrobidDefaultAnalyzer.getInstance().tokenizeWithLayoutToken(text);
} else {
tokenizations = analyzer.tokenizeWithLayoutToken(text, lang);
// to be sure to sub-tokenize based on standard punctuations:
tokenizations = GrobidDefaultAnalyzer.getInstance().retokenizeFromLayoutToken(tokenizations);
}

if (tokenizations.size() == 0) {
continue;
}

allTokenizations.add(tokenizations);

StringBuilder patentBlocks = new StringBuilder();
Expand Down Expand Up @@ -394,6 +403,7 @@ public String extractAllReferencesString(List<String> texts,
(tok.equals("\n")) ||
(tok.equals("\r"))
) {
posit++;
continue;
}

Expand Down Expand Up @@ -512,7 +522,6 @@ public String extractAllReferencesString(List<String> texts,
String theResults = taggerAll.label(allPatentBlocks);
//System.out.println(theResults);
String[] theSegmentedResults = theResults.split("\n\n");
//System.out.println(allPatentBlocks.size() + " / " + theSegmentedResults.length);

List<String> allReferencesPatent = new ArrayList<>();
List<String> allReferencesNPL = new ArrayList<>();
Expand Down Expand Up @@ -807,7 +816,6 @@ public String extractAllReferencesString(List<String> texts,
localList.add(bds);
articlesBySegment.put(localIndexSegmentNPL.get(k), localList);


k++;
}
}
Expand Down Expand Up @@ -836,7 +844,7 @@ public String extractAllReferencesString(List<String> texts,
(localArticlesBySegment != null && localArticlesBySegment.size()>0) ) {
// output text
String divID = KeyGen.getKey().substring(0,7);
resultTEI.append("\t\t<div id=\"_"+ divID +"\">\n");
resultTEI.append("\t\t<div id=\"_"+ divID +"\">");
String text = LayoutTokensUtil.toText(tokens);
// not affecting offsets:
text = text.replace("\n", " ").replace("\t", " ");
Expand Down Expand Up @@ -959,6 +967,7 @@ public String annotateAllReferences(Document doc,
(tok.equals("\n")) ||
(tok.equals("\r"))
) {
posit++;
continue;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,42 +40,47 @@ public static void destroyInitialContext() throws Exception {
@Test
public void extractAllReferencesStringNull() {
ReferenceExtractor extractor = new ReferenceExtractor();
List<String> toExtracts = new ArrayList<>();
toExtracts.add("Economic Development Quarterly November 2011 25: 353-365, first published on August 25, 2011.");
String res = extractor
.extractAllReferencesString(
"Economic Development Quarterly November 2011 25: 353-365, first published on August 25, 2011.",
toExtracts,
false, 0, false, null, null);
//assertEquals(0, nbRes);
}

@Test
//@Test
public void extractAllReferencesStringArticles() {
ReferenceExtractor extractor = new ReferenceExtractor();
List<PatentItem> patents = new ArrayList<PatentItem>();
List<BibDataSet> articles = new ArrayList<BibDataSet>();
List<PatentItem> patents = new ArrayList<>();
List<BibDataSet> articles = new ArrayList<>();
String toExtract = "Some other description includes ref. US 2011/0155847 A1 in aerodynamic" +
" and applied physics. " +
"This patent, ref. US 7930197 says data mining of personal data is patented. " +
"That article refers to Economic Development Quarterly November 2011 25: 353-365, first" +
"That article refers to Economic Development Quarterly November 2011 25:353-365, first" +
" published on August 25, 2011.";
GrobidTimer timer = new GrobidTimer(true);
extractor.extractAllReferencesString(toExtract, false, 0, false, patents, articles);
timer.stop("STOP");
System.out.println(timer.getElapsedTimeFromStartFormated("STOP"));
LOGGER.info("BibDataSet: " + articles.toString());
//GrobidTimer timer = new GrobidTimer(true);
List<String> toExtracts = new ArrayList<>();
toExtracts.add(toExtract);
extractor.extractAllReferencesString(toExtracts, false, 0, false, patents, articles);
//timer.stop("STOP");
//System.out.println(timer.getElapsedTimeFromStartFormated("STOP"));
//LOGGER.info("BibDataSet: " + articles.toString());
assertEquals(2, patents.size());
assertEquals(1, articles.size());
LOGGER.info(articles.get(0).getOffsets().toString());
//LOGGER.info(articles.get(0).getOffsets().toString());
}

@Test
public void extractAllReferencesStringArticles2() {
ReferenceExtractor extractor = new ReferenceExtractor();
List<PatentItem> patents = new ArrayList<PatentItem>();
List<BibDataSet> articles = new ArrayList<BibDataSet>();
extractor
.extractAllReferencesString(
"That article It refers to Economic Development Quarterly November 2011 25: 353-365," +
" first published on August 25, 2011.",
List<PatentItem> patents = new ArrayList<>();
List<BibDataSet> articles = new ArrayList<>();
List<String> toExtracts = new ArrayList<>();
toExtracts.add("That article It refers to Economic Development Quarterly November 2011 25: 353-365," +
" first published on August 25, 2011.");
extractor.extractAllReferencesString(
toExtracts,
false, 0, false, patents, articles);
LOGGER.info("BibDataSet: " + articles.toString());
assertEquals(0, patents.size());
Expand Down Expand Up @@ -115,11 +120,13 @@ public void extractAllReferencesStringArticles2() {
//@Test
public void extractAllReferencesStringPatents() {
ReferenceExtractor extractor = new ReferenceExtractor();
List<PatentItem> patents = new ArrayList<PatentItem>();
List<BibDataSet> articles = new ArrayList<BibDataSet>();
List<PatentItem> patents = new ArrayList<>();
List<BibDataSet> articles = new ArrayList<>();
String toExtract = "US-8303618, Intravascular filter and method A filter disposed at the distal end of an elongate guidewire. Catheters are provided for delivering the filter to, and retrieving the filter from, a treatment...";
toExtract = "this patent refers US-8303618, bla bla";
extractor.extractAllReferencesString(toExtract, false, 0, false, patents, articles);
List<String> toExtracts = new ArrayList<>();
toExtracts.add(toExtract);
extractor.extractAllReferencesString(toExtracts, false, 0, false, patents, articles);
LOGGER.info("PatentItem: " + patents.toString());
assertEquals(1, patents.size());
assertEquals(0, articles.size());
Expand All @@ -133,10 +140,9 @@ public void extractAllReferencesStringPatents() {
@Test
public void extractAllReferencesXmlST36() {
ReferenceExtractor extractor = new ReferenceExtractor();
List<PatentItem> patents = new ArrayList<PatentItem>();
List<BibDataSet> articles = new ArrayList<BibDataSet>();
extractor
.extractAllReferencesXMLFile(
List<PatentItem> patents = new ArrayList<>();
List<BibDataSet> articles = new ArrayList<>();
extractor.extractAllReferencesXMLFile(
new File(
"src/test/resources/org/grobid/core/engines/patent/ReferenceExtractor/st36-sample-1.xml")
.getAbsolutePath(), false, 0, false, patents, articles);
Expand All @@ -150,10 +156,9 @@ public void extractAllReferencesXmlST36() {
@Test
public void extractAllReferencesXml() {
ReferenceExtractor extractor = new ReferenceExtractor();
List<PatentItem> patents = new ArrayList<PatentItem>();
List<BibDataSet> articles = new ArrayList<BibDataSet>();
extractor
.extractAllReferencesXMLFile(
List<PatentItem> patents = new ArrayList<>();
List<BibDataSet> articles = new ArrayList<>();
extractor.extractAllReferencesXMLFile(
new File(
"src/test/resources/patents/006271747.xml")
.getAbsolutePath(), false, 0, false, patents, articles);
Expand All @@ -165,10 +170,9 @@ public void extractAllReferencesXml() {
@Ignore
public void extractAllReferencesPdf() {
ReferenceExtractor extractor = new ReferenceExtractor();
List<PatentItem> patents = new ArrayList<PatentItem>();
List<BibDataSet> articles = new ArrayList<BibDataSet>();
extractor
.extractAllReferencesPDFFile(
List<PatentItem> patents = new ArrayList<>();
List<BibDataSet> articles = new ArrayList<>();
extractor.extractAllReferencesPDFFile(
new File(
"src/test/resources/org/grobid/core/engines/patent/ReferenceExtractor/sample-1.pdf")
.getAbsolutePath(), false, 0, false, patents, articles);
Expand All @@ -179,25 +183,29 @@ public void jaProcessing() {
String text_jp = "すなわち、相対的な頻度で、エポキシドをベースとする液体接着剤及び接着結合剤が、" +
"例えばWO98/21287A1。これらの主な使用分野は、硬質装置のみならず適度に柔軟な装置における縁部の結合である。" +
"硬化は、熱により又はUV照射により行われる。";
System.out.println(text_jp);
//System.out.println(text_jp);
List<String> toExtracts = new ArrayList<>();
toExtracts.add(text_jp);
ReferenceExtractor extractor = new ReferenceExtractor();
List<PatentItem> patents = new ArrayList<PatentItem>();
extractor.extractAllReferencesString(text_jp, false, 0, false, patents, null);
List<PatentItem> patents = new ArrayList<>();
extractor.extractAllReferencesString(toExtracts, false, 0, false, patents, null);
LOGGER.info("PatentItem: " + patents.toString());
assertEquals(1, patents.size());
assertEquals("21287", patents.get(0).getNumberEpoDoc());
}

@Test
public void krProcessing() {
String text_kr = "미국의 애플사의 미국 출원 2012/012710." + "따라서, 전기화학적 센서들의 제조, 특히 혈액 또는 간질액과 같은 신체 마커들(포도당, 프룩토사민, " +
public void koProcessing() {
String text_ko = "미국의 애플사의 미국 출원 2012/012710. 따라서, 전기화학적 센서들의 제조, 특히 혈액 또는 간질액과 같은 신체 마커들(포도당, 프룩토사민, " +
"하에마토크릿 등)의측정을 위한 전기화학적 센서들을 제조하기 위해 개선된 프로세스가 필요하다. 또한, 합리적인 가격으로 센서 스트립들을제조하기 " +
"위한 고속의 예측가능하고 재생가능한 방법에 대한 필요성이 있다. 또한, 각각의 완료된 스트립이 재생가능한 방법으로 체액의 분석 대상물들을 " +
"신뢰성있고 예측가능하며 정밀하게 측정하는데 사용될 수 있는 매우 작은 특성들을 갖는센서 스트립들을 고속의 예측가능하고 반복가능한 방법으로 제조할 필요가 있다.";
System.out.println(text_kr);
//System.out.println(text_kr);
List<String> toExtracts = new ArrayList<>();
ReferenceExtractor extractor = new ReferenceExtractor();
List<PatentItem> patents = new ArrayList<PatentItem>();
extractor.extractAllReferencesString(text_kr, false, 0, false, patents, null);
List<PatentItem> patents = new ArrayList<>();
toExtracts.add(text_ko);
extractor.extractAllReferencesString(toExtracts, false, 0, false, patents, null);
LOGGER.info("PatentItem: " + patents.toString());
assertEquals(1, patents.size());
assertEquals("2012012710", patents.get(0).getNumberEpoDoc());
Expand All @@ -209,10 +217,12 @@ public void zhProcessing() {
"揭示了一种等截面三角形定向棱镜圆形反光板及由其制成的圆板灯。该圆板灯包括:等截面三角形微棱镜圆形导光板;" +
"围绕导光板的散热框,该散热框与导光板之间形成间隙而构成环形灯槽;以及嵌装于环形灯槽内的环形灯组件," +
"该环形灯组件由多个发光二极管(LED)贴片、电阻和线路板构成。该申请的全部内容,通过引用结合于此。";
System.out.println(text_zh);
//System.out.println(text_zh);
List<String> toExtracts = new ArrayList<>();
ReferenceExtractor extractor = new ReferenceExtractor();
List<PatentItem> patents = new ArrayList<PatentItem>();
extractor.extractAllReferencesString(text_zh, false, 0, false, patents, null);
List<PatentItem> patents = new ArrayList<>();
toExtracts.add(text_zh);
extractor.extractAllReferencesString(toExtracts, false, 0, false, patents, null);
LOGGER.info("PatentItem: " + patents.toString());
assertEquals(1, patents.size());
assertEquals("2008001534", patents.get(0).getNumberEpoDoc());
Expand Down
1 change: 1 addition & 0 deletions grobid-core/src/test/resources/patents/sample4.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Some other description includes ref. US 2011/0155847 A1 in aerodynamic and applied physics. This patent, ref. US 7930197 says data mining of personal data is patented. That article refers to Economic Development Quarterly November 2011 25:353-365, first published on August 25, 2011.

0 comments on commit 0d524f7

Please sign in to comment.