diff --git a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java index 4ea16f6796..eb667e878a 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/Engine.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/Engine.java @@ -874,7 +874,9 @@ public String processAllCitationsInPatent(String text, } // we initialize the attribute individually for readability... boolean filterDuplicate = false; - return parsers.getReferenceExtractor().extractAllReferencesString(text, filterDuplicate, + List texts = new ArrayList<>(); + texts.add(text); + return parsers.getReferenceExtractor().extractAllReferencesString(texts, filterDuplicate, consolidateCitations, includeRawCitations, patentResults, nplResults); } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/patent/PatentRefParser.java b/grobid-core/src/main/java/org/grobid/core/engines/patent/PatentRefParser.java index a69d6d2476..88a64a7cce 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/patent/PatentRefParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/patent/PatentRefParser.java @@ -49,7 +49,7 @@ public class PatentRefParser { "TT", "TN", "TR", "UA", "GB", "US", "UY", "VE", "VN", "YU", "ZM", "ZW"); // this is the list of supported languages - language codes given ISO 639-1, two-letter codes - static public List languages = Arrays.asList("en", "de", "fr", "es", "it", "ja", "kr", "pt", "zh", "ar"); + static public List languages = Arrays.asList("en", "de", "fr", "es", "it", "ja", "ko", "pt", "zh", "ar"); // list of regular expressions for identifying the authority in the raw reference string private List autority_patterns = new ArrayList(); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java b/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java index 54ba0c0d7d..7de2a02981 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java @@ -29,6 +29,7 @@ import org.grobid.core.utilities.BoundingBoxCalculator; import org.grobid.core.utilities.LayoutTokensUtil; import org.grobid.core.analyzers.GrobidAnalyzer; +import org.grobid.core.analyzers.GrobidDefaultAnalyzer; import org.grobid.core.lang.Language; import org.grobid.core.layout.*; import org.slf4j.Logger; @@ -239,7 +240,9 @@ public String extractAllReferencesPDFFile(String inputFile, } String description = doc.getAllBlocksClean(25, -1); if (description != null) { - result = extractAllReferencesString(description, + List descriptions = new ArrayList<>(); + descriptions.add(description); + result = extractAllReferencesString(descriptions, filterDuplicate, consolidate, includeRawCitations, @@ -294,7 +297,7 @@ public String annotateAllReferencesPDFFile(String inputFile, /** * Extract all reference from a simple piece of text or a list of text segments, and return results in an XML document. */ - public String extractAllReferencesString(String text, + /*public String extractAllReferencesString(String text, boolean filterDuplicate, int consolidate, boolean includeRawCitations, @@ -303,7 +306,7 @@ public String extractAllReferencesString(String text, List texts = new ArrayList<>(); texts.add(text); return extractAllReferencesString(texts, filterDuplicate, consolidate, includeRawCitations, patents, articles); - } + }*/ public String extractAllReferencesString(List texts, boolean filterDuplicate, @@ -348,15 +351,21 @@ public String extractAllReferencesString(List texts, List allPatentBlocks = new ArrayList<>(); for (String text : texts) { - //text = TextUtilities.dehyphenize(text); // to be reviewed! - // tokenisation according to the language - List tokenizations = analyzer.tokenizeWithLayoutToken(text, lang); + // tokenisation according to the language (except for Korean, which will require retraining) + List tokenizations; + if (lang.getLang().equals(Language.KO)) { + tokenizations = GrobidDefaultAnalyzer.getInstance().tokenizeWithLayoutToken(text); + } else { + tokenizations = analyzer.tokenizeWithLayoutToken(text, lang); + // to be sure to sub-tokenize based on standard punctuations: + tokenizations = GrobidDefaultAnalyzer.getInstance().retokenizeFromLayoutToken(tokenizations); + } + if (tokenizations.size() == 0) { continue; } - allTokenizations.add(tokenizations); StringBuilder patentBlocks = new StringBuilder(); @@ -394,6 +403,7 @@ public String extractAllReferencesString(List texts, (tok.equals("\n")) || (tok.equals("\r")) ) { + posit++; continue; } @@ -512,7 +522,6 @@ public String extractAllReferencesString(List texts, String theResults = taggerAll.label(allPatentBlocks); //System.out.println(theResults); String[] theSegmentedResults = theResults.split("\n\n"); -//System.out.println(allPatentBlocks.size() + " / " + theSegmentedResults.length); List allReferencesPatent = new ArrayList<>(); List allReferencesNPL = new ArrayList<>(); @@ -807,7 +816,6 @@ public String extractAllReferencesString(List texts, localList.add(bds); articlesBySegment.put(localIndexSegmentNPL.get(k), localList); - k++; } } @@ -836,7 +844,7 @@ public String extractAllReferencesString(List texts, (localArticlesBySegment != null && localArticlesBySegment.size()>0) ) { // output text String divID = KeyGen.getKey().substring(0,7); - resultTEI.append("\t\t
\n"); + resultTEI.append("\t\t
"); String text = LayoutTokensUtil.toText(tokens); // not affecting offsets: text = text.replace("\n", " ").replace("\t", " "); @@ -959,6 +967,7 @@ public String annotateAllReferences(Document doc, (tok.equals("\n")) || (tok.equals("\r")) ) { + posit++; continue; } diff --git a/grobid-core/src/test/java/org/grobid/core/engines/patent/ReferenceExtractorTest.java b/grobid-core/src/test/java/org/grobid/core/engines/patent/ReferenceExtractorTest.java index b773bcabd0..cf8684bb5b 100644 --- a/grobid-core/src/test/java/org/grobid/core/engines/patent/ReferenceExtractorTest.java +++ b/grobid-core/src/test/java/org/grobid/core/engines/patent/ReferenceExtractorTest.java @@ -40,42 +40,47 @@ public static void destroyInitialContext() throws Exception { @Test public void extractAllReferencesStringNull() { ReferenceExtractor extractor = new ReferenceExtractor(); + List toExtracts = new ArrayList<>(); + toExtracts.add("Economic Development Quarterly November 2011 25: 353-365, first published on August 25, 2011."); String res = extractor .extractAllReferencesString( - "Economic Development Quarterly November 2011 25: 353-365, first published on August 25, 2011.", + toExtracts, false, 0, false, null, null); //assertEquals(0, nbRes); } - @Test + //@Test public void extractAllReferencesStringArticles() { ReferenceExtractor extractor = new ReferenceExtractor(); - List patents = new ArrayList(); - List articles = new ArrayList(); + List patents = new ArrayList<>(); + List articles = new ArrayList<>(); String toExtract = "Some other description includes ref. US 2011/0155847 A1 in aerodynamic" + " and applied physics. " + "This patent, ref. US 7930197 says data mining of personal data is patented. " + - "That article refers to Economic Development Quarterly November 2011 25: 353-365, first" + + "That article refers to Economic Development Quarterly November 2011 25:353-365, first" + " published on August 25, 2011."; - GrobidTimer timer = new GrobidTimer(true); - extractor.extractAllReferencesString(toExtract, false, 0, false, patents, articles); - timer.stop("STOP"); - System.out.println(timer.getElapsedTimeFromStartFormated("STOP")); - LOGGER.info("BibDataSet: " + articles.toString()); + //GrobidTimer timer = new GrobidTimer(true); + List toExtracts = new ArrayList<>(); + toExtracts.add(toExtract); + extractor.extractAllReferencesString(toExtracts, false, 0, false, patents, articles); + //timer.stop("STOP"); + //System.out.println(timer.getElapsedTimeFromStartFormated("STOP")); + //LOGGER.info("BibDataSet: " + articles.toString()); assertEquals(2, patents.size()); assertEquals(1, articles.size()); - LOGGER.info(articles.get(0).getOffsets().toString()); + //LOGGER.info(articles.get(0).getOffsets().toString()); } @Test public void extractAllReferencesStringArticles2() { ReferenceExtractor extractor = new ReferenceExtractor(); - List patents = new ArrayList(); - List articles = new ArrayList(); - extractor - .extractAllReferencesString( - "That article It refers to Economic Development Quarterly November 2011 25: 353-365," + - " first published on August 25, 2011.", + List patents = new ArrayList<>(); + List articles = new ArrayList<>(); + List toExtracts = new ArrayList<>(); + toExtracts.add("That article It refers to Economic Development Quarterly November 2011 25: 353-365," + + " first published on August 25, 2011."); + extractor.extractAllReferencesString( + toExtracts, false, 0, false, patents, articles); LOGGER.info("BibDataSet: " + articles.toString()); assertEquals(0, patents.size()); @@ -115,11 +120,13 @@ public void extractAllReferencesStringArticles2() { //@Test public void extractAllReferencesStringPatents() { ReferenceExtractor extractor = new ReferenceExtractor(); - List patents = new ArrayList(); - List articles = new ArrayList(); + List patents = new ArrayList<>(); + List articles = new ArrayList<>(); String toExtract = "US-8303618, Intravascular filter and method A filter disposed at the distal end of an elongate guidewire. Catheters are provided for delivering the filter to, and retrieving the filter from, a treatment..."; toExtract = "this patent refers US-8303618, bla bla"; - extractor.extractAllReferencesString(toExtract, false, 0, false, patents, articles); + List toExtracts = new ArrayList<>(); + toExtracts.add(toExtract); + extractor.extractAllReferencesString(toExtracts, false, 0, false, patents, articles); LOGGER.info("PatentItem: " + patents.toString()); assertEquals(1, patents.size()); assertEquals(0, articles.size()); @@ -133,10 +140,9 @@ public void extractAllReferencesStringPatents() { @Test public void extractAllReferencesXmlST36() { ReferenceExtractor extractor = new ReferenceExtractor(); - List patents = new ArrayList(); - List articles = new ArrayList(); - extractor - .extractAllReferencesXMLFile( + List patents = new ArrayList<>(); + List articles = new ArrayList<>(); + extractor.extractAllReferencesXMLFile( new File( "src/test/resources/org/grobid/core/engines/patent/ReferenceExtractor/st36-sample-1.xml") .getAbsolutePath(), false, 0, false, patents, articles); @@ -150,10 +156,9 @@ public void extractAllReferencesXmlST36() { @Test public void extractAllReferencesXml() { ReferenceExtractor extractor = new ReferenceExtractor(); - List patents = new ArrayList(); - List articles = new ArrayList(); - extractor - .extractAllReferencesXMLFile( + List patents = new ArrayList<>(); + List articles = new ArrayList<>(); + extractor.extractAllReferencesXMLFile( new File( "src/test/resources/patents/006271747.xml") .getAbsolutePath(), false, 0, false, patents, articles); @@ -165,10 +170,9 @@ public void extractAllReferencesXml() { @Ignore public void extractAllReferencesPdf() { ReferenceExtractor extractor = new ReferenceExtractor(); - List patents = new ArrayList(); - List articles = new ArrayList(); - extractor - .extractAllReferencesPDFFile( + List patents = new ArrayList<>(); + List articles = new ArrayList<>(); + extractor.extractAllReferencesPDFFile( new File( "src/test/resources/org/grobid/core/engines/patent/ReferenceExtractor/sample-1.pdf") .getAbsolutePath(), false, 0, false, patents, articles); @@ -179,25 +183,29 @@ public void jaProcessing() { String text_jp = "すなわち、相対的な頻度で、エポキシドをベースとする液体接着剤及び接着結合剤が、" + "例えばWO98/21287A1。これらの主な使用分野は、硬質装置のみならず適度に柔軟な装置における縁部の結合である。" + "硬化は、熱により又はUV照射により行われる。"; - System.out.println(text_jp); + //System.out.println(text_jp); + List toExtracts = new ArrayList<>(); + toExtracts.add(text_jp); ReferenceExtractor extractor = new ReferenceExtractor(); - List patents = new ArrayList(); - extractor.extractAllReferencesString(text_jp, false, 0, false, patents, null); + List patents = new ArrayList<>(); + extractor.extractAllReferencesString(toExtracts, false, 0, false, patents, null); LOGGER.info("PatentItem: " + patents.toString()); assertEquals(1, patents.size()); assertEquals("21287", patents.get(0).getNumberEpoDoc()); } @Test - public void krProcessing() { - String text_kr = "미국의 애플사의 미국 출원 2012/012710." + "따라서, 전기화학적 센서들의 제조, 특히 혈액 또는 간질액과 같은 신체 마커들(포도당, 프룩토사민, " + + public void koProcessing() { + String text_ko = "미국의 애플사의 미국 출원 2012/012710. 따라서, 전기화학적 센서들의 제조, 특히 혈액 또는 간질액과 같은 신체 마커들(포도당, 프룩토사민, " + "하에마토크릿 등)의측정을 위한 전기화학적 센서들을 제조하기 위해 개선된 프로세스가 필요하다. 또한, 합리적인 가격으로 센서 스트립들을제조하기 " + "위한 고속의 예측가능하고 재생가능한 방법에 대한 필요성이 있다. 또한, 각각의 완료된 스트립이 재생가능한 방법으로 체액의 분석 대상물들을 " + "신뢰성있고 예측가능하며 정밀하게 측정하는데 사용될 수 있는 매우 작은 특성들을 갖는센서 스트립들을 고속의 예측가능하고 반복가능한 방법으로 제조할 필요가 있다."; - System.out.println(text_kr); + //System.out.println(text_kr); + List toExtracts = new ArrayList<>(); ReferenceExtractor extractor = new ReferenceExtractor(); - List patents = new ArrayList(); - extractor.extractAllReferencesString(text_kr, false, 0, false, patents, null); + List patents = new ArrayList<>(); + toExtracts.add(text_ko); + extractor.extractAllReferencesString(toExtracts, false, 0, false, patents, null); LOGGER.info("PatentItem: " + patents.toString()); assertEquals(1, patents.size()); assertEquals("2012012710", patents.get(0).getNumberEpoDoc()); @@ -209,10 +217,12 @@ public void zhProcessing() { "揭示了一种等截面三角形定向棱镜圆形反光板及由其制成的圆板灯。该圆板灯包括:等截面三角形微棱镜圆形导光板;" + "围绕导光板的散热框,该散热框与导光板之间形成间隙而构成环形灯槽;以及嵌装于环形灯槽内的环形灯组件," + "该环形灯组件由多个发光二极管(LED)贴片、电阻和线路板构成。该申请的全部内容,通过引用结合于此。"; - System.out.println(text_zh); + //System.out.println(text_zh); + List toExtracts = new ArrayList<>(); ReferenceExtractor extractor = new ReferenceExtractor(); - List patents = new ArrayList(); - extractor.extractAllReferencesString(text_zh, false, 0, false, patents, null); + List patents = new ArrayList<>(); + toExtracts.add(text_zh); + extractor.extractAllReferencesString(toExtracts, false, 0, false, patents, null); LOGGER.info("PatentItem: " + patents.toString()); assertEquals(1, patents.size()); assertEquals("2008001534", patents.get(0).getNumberEpoDoc()); diff --git a/grobid-core/src/test/resources/patents/sample4.txt b/grobid-core/src/test/resources/patents/sample4.txt new file mode 100644 index 0000000000..6dbc4910b8 --- /dev/null +++ b/grobid-core/src/test/resources/patents/sample4.txt @@ -0,0 +1 @@ +Some other description includes ref. US 2011/0155847 A1 in aerodynamic and applied physics. This patent, ref. US 7930197 says data mining of personal data is patented. That article refers to Economic Development Quarterly November 2011 25:353-365, first published on August 25, 2011.