From fbb238cf7a4bd40850b8c8c99d217f8fc007c095 Mon Sep 17 00:00:00 2001 From: Patrice Lopez Date: Sun, 4 Feb 2024 22:50:22 +0100 Subject: [PATCH] refactor process for DL models batch --- .../engines/patent/ReferenceExtractor.java | 822 ++++++++++-------- .../org/grobid/core/sax/TextSaxParser.java | 38 +- 2 files changed, 476 insertions(+), 384 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java b/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java index 392c3ef608..fc2aa3731a 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java @@ -74,7 +74,7 @@ public class ReferenceExtractor implements Closeable { public Lexicon lexicon = Lexicon.getInstance(); public String currentPatentNumber = null; public OPSService ops = null; - private String description = null; + private List descriptionSegments = null; public ArrayList resBib = null; // identified current parsed // bibliographical items and related information @@ -109,8 +109,8 @@ public String extractAllReferencesOPS(boolean filterDuplicate, List patents, List articles) { try { - if (description != null) { - return extractAllReferencesString(description, + if (descriptionSegments != null && descriptionSegments.size()>0) { + return extractAllReferencesString(descriptionSegments, filterDuplicate, consolidate, includeRawCitations, @@ -157,6 +157,7 @@ public String extractAllReferencesXMLFile(String pathXML, sax.addFilter("description"); sax.addFilter("p"); sax.addFilter("heading"); + sax.addFilter("head"); // get a factory SAXParserFactory spf = SAXParserFactory.newInstance(); spf.setValidating(false); @@ -188,13 +189,18 @@ public InputSource resolveEntity(String publicId, String systemId) { reader.parse(input); - description = sax.getText(); + List descriptionSegments = sax.getTexts(); + +/*for(String text : descriptionSegments){ + System.out.println(text); +}*/ + currentPatentNumber = sax.currentPatentNumber; consolidate = 0; filterDuplicate = true; - if (description != null) { - return extractAllReferencesString(description, + if (descriptionSegments != null && descriptionSegments.size()>0) { + return extractAllReferencesString(descriptionSegments, filterDuplicate, consolidate, includeRawCitations, @@ -228,7 +234,7 @@ public String extractAllReferencesPDFFile(String inputFile, if (doc.getBlocks() == null) { return result; } - description = doc.getAllBlocksClean(25, -1); + String description = doc.getAllBlocksClean(25, -1); if (description != null) { result = extractAllReferencesString(description, filterDuplicate, @@ -283,7 +289,7 @@ public String annotateAllReferencesPDFFile(String inputFile, } /** - * Extract all reference from a simple piece of text and return results in an XML document. + * Extract all reference from a simple piece of text or a list of text segments, and return results in an XML document. */ public String extractAllReferencesString(String text, boolean filterDuplicate, @@ -291,411 +297,446 @@ public String extractAllReferencesString(String text, boolean includeRawCitations, List patents, List articles) { + List texts = new ArrayList<>(); + texts.add(text); + return extractAllReferencesString(texts, filterDuplicate, consolidate, includeRawCitations, patents, articles); + } + + public String extractAllReferencesString(List texts, + boolean filterDuplicate, + int consolidate, + boolean includeRawCitations, + List patents, + List articles) { try { // if parameters are null, these lists will only be valid in the method if (patents == null) { - patents = new ArrayList(); + patents = new ArrayList<>(); } if (articles == null) { - articles = new ArrayList(); + articles = new ArrayList<>(); } - // parser for patent references + // parser for patent and non patent references if (patentParser == null) { patentParser = new PatentRefParser(); } - // parser for non patent references - // tokenisation for the parser (with punctuation as tokens) - ArrayList patentBlocks = new ArrayList(); + // identify the language of the patent document, we use only the first 500 characters + // which is enough normally for a very safe language prediction + // the text here is the patent description, so normally strictly monolingual - //text = TextUtilities.dehyphenize(text); // to be reviewed! - text = text.replace("\n", " ").replace("\t", " "); - //text = text.replace(" ", " "); - - // identify the language of the patent document, we use only the first 500 characters - // which is enough normally for a very safe language prediction - // the text here is the patent description, so strictly monolingual - Language lang = languageUtilities.runLanguageId(text, 500); - List tokenizations = analyzer.tokenize(text, lang); - int offset = 0; - if (tokenizations.size() == 0) { - return null; + // create text buffer + StringBuilder localText = new StringBuilder(); + for(String text : texts) { + localText.append(text); + if (localText.length()> 500) + break; } + if (localText.length() == 0) + return null; + Language lang = languageUtilities.runLanguageId(localText.toString(), 500); - List journalPositions = null; - List abbrevJournalPositions = null; - List conferencePositions = null; - List publisherPositions = null; + List allPatentBlocks = new ArrayList<>(); + List> allTokenizations = new ArrayList<>(); + for (String text : texts) { - //if (articles != null) - { - journalPositions = lexicon.tokenPositionsJournalNames(text); - abbrevJournalPositions = lexicon.tokenPositionsAbbrevJournalNames(text); - conferencePositions = lexicon.tokenPositionsConferenceNames(text); - publisherPositions = lexicon.tokenPositionsPublisherNames(text); - } + //text = TextUtilities.dehyphenize(text); // to be reviewed! - boolean isJournalToken = false; - boolean isAbbrevJournalToken = false; - boolean isConferenceToken = false; - boolean isPublisherToken = false; - int currentJournalPositions = 0; - int currentAbbrevJournalPositions = 0; - int currentConferencePositions = 0; - int currentPublisherPositions = 0; - boolean skipTest = false; - //st = new StringTokenizer(text, " (["+ TextUtilities.punctuations, true); - //st = new StringTokenizer(text, delimiters, true); - int posit = 0; - //while (st.hasMoreTokens()) { - for(String tok : tokenizations) { - isJournalToken = false; - isAbbrevJournalToken = false; - isConferenceToken = false; - isPublisherToken = false; - skipTest = false; - //String tok = st.nextToken(); - if ( (tok.trim().length() == 0) || - (tok.equals(" ")) || - (tok.equals("\t")) || - (tok.equals("\n")) || - (tok.equals("\r")) - ) { + // tokenisation according to the language + List tokenizations = analyzer.tokenizeWithLayoutToken(text, lang); + if (tokenizations.size() == 0) { continue; } - // check the position of matches for journals - if (journalPositions != null) { - if (currentJournalPositions == journalPositions.size() - 1) { - if (journalPositions.get(currentJournalPositions).end < posit) { - skipTest = true; - } + allTokenizations.add(tokenizations); + + StringBuilder patentBlocks = new StringBuilder(); + + List journalPositions = null; + List abbrevJournalPositions = null; + List conferencePositions = null; + List publisherPositions = null; + + journalPositions = lexicon.tokenPositionsJournalNames(tokenizations); + abbrevJournalPositions = lexicon.tokenPositionsAbbrevJournalNames(tokenizations); + conferencePositions = lexicon.tokenPositionsConferenceNames(tokenizations); + publisherPositions = lexicon.tokenPositionsPublisherNames(tokenizations); + + boolean isJournalToken = false; + boolean isAbbrevJournalToken = false; + boolean isConferenceToken = false; + boolean isPublisherToken = false; + int currentJournalPositions = 0; + int currentAbbrevJournalPositions = 0; + int currentConferencePositions = 0; + int currentPublisherPositions = 0; + boolean skipTest = false; + //st = new StringTokenizer(text, " (["+ TextUtilities.punctuations, true); + //st = new StringTokenizer(text, delimiters, true); + int posit = 0; + //while (st.hasMoreTokens()) { + for(LayoutToken token : tokenizations) { + String tok = token.getText(); + isJournalToken = false; + isAbbrevJournalToken = false; + isConferenceToken = false; + isPublisherToken = false; + skipTest = false; + //String tok = st.nextToken(); + if ( (tok.trim().length() == 0) || + (tok.equals(" ")) || + (tok.equals("\t")) || + (tok.equals("\n")) || + (tok.equals("\r")) + ) { + continue; } - if (!skipTest) { - for (int i = currentJournalPositions; i < journalPositions.size(); i++) { - if ((journalPositions.get(i).start <= posit) && - (journalPositions.get(i).end >= posit)) { - isJournalToken = true; - currentJournalPositions = i; - break; - } else if (journalPositions.get(i).start > posit) { - isJournalToken = false; - currentJournalPositions = i; - break; + + // check the position of matches for journals + if (journalPositions != null) { + if (currentJournalPositions == journalPositions.size() - 1) { + if (journalPositions.get(currentJournalPositions).end < posit) { + skipTest = true; + } + } + if (!skipTest) { + for (int i = currentJournalPositions; i < journalPositions.size(); i++) { + if ((journalPositions.get(i).start <= posit) && + (journalPositions.get(i).end >= posit)) { + isJournalToken = true; + currentJournalPositions = i; + break; + } else if (journalPositions.get(i).start > posit) { + isJournalToken = false; + currentJournalPositions = i; + break; + } } } } - } - // check the position of matches for abbreviated journals - skipTest = false; - if (abbrevJournalPositions != null) { - if (currentAbbrevJournalPositions == abbrevJournalPositions.size() - 1) { - if (abbrevJournalPositions.get(currentAbbrevJournalPositions).end < posit) { - skipTest = true; + // check the position of matches for abbreviated journals + skipTest = false; + if (abbrevJournalPositions != null) { + if (currentAbbrevJournalPositions == abbrevJournalPositions.size() - 1) { + if (abbrevJournalPositions.get(currentAbbrevJournalPositions).end < posit) { + skipTest = true; + } } - } - if (!skipTest) { - for (int i = currentAbbrevJournalPositions; i < abbrevJournalPositions.size(); i++) { - if ((abbrevJournalPositions.get(i).start <= posit) && - (abbrevJournalPositions.get(i).end >= posit)) { - isAbbrevJournalToken = true; - currentAbbrevJournalPositions = i; - break; - } else if (abbrevJournalPositions.get(i).start > posit) { - isAbbrevJournalToken = false; - currentAbbrevJournalPositions = i; - break; + if (!skipTest) { + for (int i = currentAbbrevJournalPositions; i < abbrevJournalPositions.size(); i++) { + if ((abbrevJournalPositions.get(i).start <= posit) && + (abbrevJournalPositions.get(i).end >= posit)) { + isAbbrevJournalToken = true; + currentAbbrevJournalPositions = i; + break; + } else if (abbrevJournalPositions.get(i).start > posit) { + isAbbrevJournalToken = false; + currentAbbrevJournalPositions = i; + break; + } } } } - } - // check the position of matches for conference names - skipTest = false; - if (conferencePositions != null) { - if (currentConferencePositions == conferencePositions.size() - 1) { - if (conferencePositions.get(currentConferencePositions).end < posit) { - skipTest = true; + // check the position of matches for conference names + skipTest = false; + if (conferencePositions != null) { + if (currentConferencePositions == conferencePositions.size() - 1) { + if (conferencePositions.get(currentConferencePositions).end < posit) { + skipTest = true; + } } - } - if (!skipTest) { - for (int i = currentConferencePositions; i < conferencePositions.size(); i++) { - if ((conferencePositions.get(i).start <= posit) && - (conferencePositions.get(i).end >= posit)) { - isConferenceToken = true; - currentConferencePositions = i; - break; - } else if (conferencePositions.get(i).start > posit) { - isConferenceToken = false; - currentConferencePositions = i; - break; + if (!skipTest) { + for (int i = currentConferencePositions; i < conferencePositions.size(); i++) { + if ((conferencePositions.get(i).start <= posit) && + (conferencePositions.get(i).end >= posit)) { + isConferenceToken = true; + currentConferencePositions = i; + break; + } else if (conferencePositions.get(i).start > posit) { + isConferenceToken = false; + currentConferencePositions = i; + break; + } } } } - } - // check the position of matches for publisher names - skipTest = false; - if (publisherPositions != null) { - if (currentPublisherPositions == publisherPositions.size() - 1) { - if (publisherPositions.get(currentPublisherPositions).end < posit) { - skipTest = true; + // check the position of matches for publisher names + skipTest = false; + if (publisherPositions != null) { + if (currentPublisherPositions == publisherPositions.size() - 1) { + if (publisherPositions.get(currentPublisherPositions).end < posit) { + skipTest = true; + } } - } - if (!skipTest) { - for (int i = currentPublisherPositions; i < publisherPositions.size(); i++) { - if ((publisherPositions.get(i).start <= posit) && - (publisherPositions.get(i).end >= posit)) { - isPublisherToken = true; - currentPublisherPositions = i; - break; - } else if (publisherPositions.get(i).start > posit) { - isPublisherToken = false; - currentPublisherPositions = i; - break; + if (!skipTest) { + for (int i = currentPublisherPositions; i < publisherPositions.size(); i++) { + if ((publisherPositions.get(i).start <= posit) && + (publisherPositions.get(i).end >= posit)) { + isPublisherToken = true; + currentPublisherPositions = i; + break; + } else if (publisherPositions.get(i).start > posit) { + isPublisherToken = false; + currentPublisherPositions = i; + break; + } } } } + + FeaturesVectorReference featureVector = + FeaturesVectorReference.addFeaturesPatentReferences(new LayoutToken(tok), null, + tokenizations.size(), + posit, + isJournalToken, + isAbbrevJournalToken, + isConferenceToken, + isPublisherToken); + patentBlocks.append(featureVector.printVector()); + patentBlocks.append("\n"); + posit++; } - FeaturesVectorReference featureVector = - FeaturesVectorReference.addFeaturesPatentReferences(new LayoutToken(tok), null, - tokenizations.size(), - posit, - isJournalToken, - isAbbrevJournalToken, - isConferenceToken, - isPublisherToken); - patentBlocks.add(featureVector.printVector()); - posit++; + patentBlocks.append("\n\n"); + allPatentBlocks.add(patentBlocks.toString()); } - patentBlocks.add("\n"); - - String theResult = null; - theResult = taggerAll.label(patentBlocks); - //System.out.println(theResult); - - StringTokenizer stt = new StringTokenizer(theResult, "\n"); - - List referencesPatent = new ArrayList(); - List referencesNPL = new ArrayList(); - List offsets_patent = new ArrayList(); - List offsets_NPL = new ArrayList(); - List probPatent = new ArrayList(); - List probNPL = new ArrayList(); - - boolean currentPatent = true; // type of current reference - String reference = null; - double currentProb = 0.0; - offset = 0; - int currentOffset = 0; - int addedOffset = 0; - String label = null; // label - String actual = null; // token - int p = 0; // iterator for the tokenizations for restauring the original tokenization with - // respect to spaces - - while (stt.hasMoreTokens()) { - String line = stt.nextToken(); - if (line.trim().length() == 0) { - continue; - } - - StringTokenizer st2 = new StringTokenizer(line, "\t "); - boolean start = true; - String separator = ""; - label = null; - actual = null; - while (st2.hasMoreTokens()) { - if (start) { - actual = st2.nextToken().trim(); - start = false; + String theResults = taggerAll.label(allPatentBlocks); +//System.out.println(theResults); + String[] theSegmentedResults = theResults.split("\n\n"); +//System.out.println(allPatentBlocks.size() + " / " + theSegmentedResults.length); + + List allReferencesNPL = new ArrayList<>(); + //int offset = 0; + for(int index=0; index tokenizations = allTokenizations.get(index); + + List referencesPatent = new ArrayList(); + List referencesNPL = new ArrayList(); + List offsets_patent = new ArrayList(); + List offsets_NPL = new ArrayList(); + List probPatent = new ArrayList(); + List probNPL = new ArrayList(); + + boolean currentPatent = true; // type of current reference + String reference = null; + double currentProb = 0.0; + int offset = 0; + int currentOffset = 0; + int addedOffset = 0; + String label = null; // label + String actual = null; // token + int p = 0; // iterator for the tokenizations for restauring the original tokenization with + // respect to spaces + + while (stt.hasMoreTokens()) { + String line = stt.nextToken(); + if (line.trim().length() == 0) { + continue; + } - boolean strop = false; - while ((!strop) && (p < tokenizations.size())) { - String tokOriginal = tokenizations.get(p); - addedOffset += tokOriginal.length(); - if (tokOriginal.equals(" ")) { - separator += tokOriginal; - } else if (tokOriginal.equals(actual)) { - strop = true; + StringTokenizer st2 = new StringTokenizer(line, "\t "); + boolean start = true; + String separator = ""; + label = null; + actual = null; + while (st2.hasMoreTokens()) { + if (start) { + actual = st2.nextToken().trim(); + start = false; + + boolean strop = false; + while ((!strop) && (p < tokenizations.size())) { + String tokOriginal = tokenizations.get(p).getText(); + addedOffset += tokOriginal.length(); + if (tokOriginal.equals(" ")) { + separator += tokOriginal; + } else if (tokOriginal.equals(actual)) { + strop = true; + } + p++; } - p++; + } else { + label = st2.nextToken().trim(); } - } else { - label = st2.nextToken().trim(); } - } - - if (label == null) { - offset += addedOffset; - addedOffset = 0; - continue; - } - double prob = 0.0; - int segProb = label.lastIndexOf("/"); - if (segProb != -1) { - String probString = label.substring(segProb+1, label.length()); - //System.out.println("given prob: " + probString); - try { - prob = Double.parseDouble(probString); - //System.out.println("given prob: " + probString + ", parsed: " + prob); - } - catch(Exception e) { - LOGGER.debug(probString + " cannot be parsed."); - } - label = label.substring(0,segProb); - } - - // TBD: use TaggingTokenClusteror and TaggingLabel as for the other parsers - if (actual != null) { - if (label.endsWith("")) { - if (reference == null) { - reference = separator + actual; - currentOffset = offset; - currentPatent = true; - currentProb = prob; - } else { - if (currentPatent) { - if (label.equals("I-")) { - referencesPatent.add(reference); - offsets_patent.add(currentOffset); + if (label == null) { + offset += addedOffset; + addedOffset = 0; + continue; + } - probPatent.add(Double.valueOf(currentProb)); + double prob = 0.0; + int segProb = label.lastIndexOf("/"); + if (segProb != -1) { + String probString = label.substring(segProb+1, label.length()); + //System.out.println("given prob: " + probString); + try { + prob = Double.parseDouble(probString); + //System.out.println("given prob: " + probString + ", parsed: " + prob); + } + catch(Exception e) { + LOGGER.debug(probString + " cannot be parsed."); + } + label = label.substring(0,segProb); + } + + // TBD: use TaggingTokenClusteror and TaggingLabel as for the other parsers + if (actual != null) { + if (label.endsWith("")) { + if (reference == null) { + reference = separator + actual; + currentOffset = offset; + currentPatent = true; + currentProb = prob; + } else { + if (currentPatent) { + if (label.equals("I-")) { + referencesPatent.add(reference); + offsets_patent.add(currentOffset); + + probPatent.add(Double.valueOf(currentProb)); + + currentPatent = true; + reference = separator + actual; + currentOffset = offset; + currentProb = prob; + } else { + reference += separator + actual; + if (prob > currentProb) { + currentProb = prob; + } + } + } else { + referencesNPL.add(reference); + offsets_NPL.add(currentOffset); + probNPL.add(Double.valueOf(currentProb)); currentPatent = true; - reference = separator + actual; + reference = separator + actual; currentOffset = offset; - currentProb = prob; - } else { - reference += separator + actual; - if (prob > currentProb) { - currentProb = prob; - } + currentProb = prob; } - } else { - referencesNPL.add(reference); - offsets_NPL.add(currentOffset); - probNPL.add(Double.valueOf(currentProb)); - - currentPatent = true; - reference = separator + actual; - currentOffset = offset; - currentProb = prob; } - } - } else if (label.endsWith("")) { - if (reference == null) { - reference = separator + actual; - currentOffset = offset; - currentPatent = false; - currentProb = prob; - } else { - if (currentPatent) { - referencesPatent.add(reference); - offsets_patent.add(currentOffset); - probPatent.add(Double.valueOf(currentProb)); - - currentPatent = false; - reference = separator + actual; + } else if (label.endsWith("")) { + if (reference == null) { + reference = separator + actual; currentOffset = offset; - currentProb = prob; + currentPatent = false; + currentProb = prob; } else { - if (label.equals("I-")) { - referencesNPL.add(reference); - offsets_NPL.add(currentOffset); - probNPL.add(Double.valueOf(currentProb)); + if (currentPatent) { + referencesPatent.add(reference); + offsets_patent.add(currentOffset); + probPatent.add(Double.valueOf(currentProb)); currentPatent = false; - reference = separator + actual; + reference = separator + actual; currentOffset = offset; - currentProb = prob; + currentProb = prob; } else { - reference += separator + actual; - if (prob > currentProb) { - currentProb = prob; - } + if (label.equals("I-")) { + referencesNPL.add(reference); + offsets_NPL.add(currentOffset); + probNPL.add(Double.valueOf(currentProb)); + + currentPatent = false; + reference = separator + actual; + currentOffset = offset; + currentProb = prob; + } else { + reference += separator + actual; + if (prob > currentProb) { + currentProb = prob; + } + } } } - } - } else if (label.equals("")) { - if (reference != null) { - if (currentPatent) { - referencesPatent.add(reference); - offsets_patent.add(currentOffset); - probPatent.add(Double.valueOf(currentProb)); - } else { - referencesNPL.add(reference); - offsets_NPL.add(currentOffset); - probNPL.add(Double.valueOf(currentProb)); + } else if (label.equals("")) { + if (reference != null) { + if (currentPatent) { + referencesPatent.add(reference); + offsets_patent.add(currentOffset); + probPatent.add(Double.valueOf(currentProb)); + } else { + referencesNPL.add(reference); + offsets_NPL.add(currentOffset); + probNPL.add(Double.valueOf(currentProb)); + } + currentPatent = false; } - currentPatent = false; + reference = null; + currentProb = 0.0; } - reference = null; - currentProb = 0.0; } + offset += addedOffset; + addedOffset = 0; } - offset += addedOffset; - addedOffset = 0; - } - // run reference patent parser in isolation, and produce some traces - int j = 0; - for (String ref : referencesPatent) { - patentParser.setRawRefText(ref); - patentParser.setRawRefTextOffset(offsets_patent.get(j).intValue()); - List patents0 = patentParser.processRawRefText(); - for (PatentItem pat : patents0) { - pat.setContext(ref); - pat.setConf(probPatent.get(j).doubleValue()); - patents.add(pat); - /*if (pat.getApplication()) { - if (pat.getProvisional()) { - if (debug) { - System.out.println(pat.getAuthority() + " " + pat.getNumber() - + " P application " + pat.getOffsetBegin() - + ":" + pat.getOffsetEnd() + "\n"); + // run reference patent parser in isolation, and produce some traces + int j = 0; + for (String ref : referencesPatent) { + patentParser.setRawRefText(ref); + patentParser.setRawRefTextOffset(offsets_patent.get(j).intValue()); + List patents0 = patentParser.processRawRefText(); + for (PatentItem pat : patents0) { + pat.setContext(ref); + pat.setConf(probPatent.get(j).doubleValue()); + patents.add(pat); + /*if (pat.getApplication()) { + if (pat.getProvisional()) { + if (debug) { + System.out.println(pat.getAuthority() + " " + pat.getNumber() + + " P application " + pat.getOffsetBegin() + + ":" + pat.getOffsetEnd() + "\n"); + } + } else { + if (debug) { + System.out.println(pat.getAuthority() + " " + pat.getNumber() + + " application " + pat.getOffsetBegin() + + ":" + pat.getOffsetEnd() + "\n"); + } } - } else { - if (debug) { - System.out.println(pat.getAuthority() + " " + pat.getNumber() - + " application " + pat.getOffsetBegin() - + ":" + pat.getOffsetEnd() + "\n"); + } else if (pat.getReissued()) { + if (pat.getAuthority().equals("US")) { + if (debug) { + System.out.println(pat.getAuthority() + "RE" + pat.getNumber() + " E " + + pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n"); + } } - } - } else if (pat.getReissued()) { - if (pat.getAuthority().equals("US")) { - if (debug) { - System.out.println(pat.getAuthority() + "RE" + pat.getNumber() + " E " - + pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n"); + } else if (pat.getPlant()) { + if (pat.getAuthority().equals("US")) { + if (debug) + System.out.println(pat.getAuthority() + "PP" + pat.getNumber() + " " + + pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n"); } - } - } else if (pat.getPlant()) { - if (pat.getAuthority().equals("US")) { - if (debug) - System.out.println(pat.getAuthority() + "PP" + pat.getNumber() + " " + - pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n"); - } - } else { - if (debug) { - if (pat.getKindCode() != null) { - System.out.println(pat.getAuthority() + " " + pat.getNumber() + " " - + pat.getKindCode() + " " - + pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n"); - } else { - System.out.println(pat.getAuthority() + " " + pat.getNumber() + " " + - pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n"); + } else { + if (debug) { + if (pat.getKindCode() != null) { + System.out.println(pat.getAuthority() + " " + pat.getNumber() + " " + + pat.getKindCode() + " " + + pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n"); + } else { + System.out.println(pat.getAuthority() + " " + pat.getNumber() + " " + + pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n"); + } + System.out.println(pat.getContext()); } - System.out.println(pat.getContext()); - } - }*/ + }*/ + } + j++; } - j++; + + if (referencesNPL.size()>0) + allReferencesNPL.addAll(referencesNPL); } // list for filtering duplicates, if we want to ignore the duplicate numbers @@ -716,10 +757,10 @@ public String extractAllReferencesString(String text, } } - if (articles != null) { + if (articles != null && allReferencesNPL != null && allReferencesNPL.size()>0) { int k = 0; - List bibResults = parsers.getCitationParser().processingStringMultiple(referencesNPL, consolidate); - for (String ref : referencesNPL) { + List bibResults = parsers.getCitationParser().processingStringMultiple(allReferencesNPL, consolidate); + for (String ref : allReferencesNPL) { BiblioItem result = bibResults.get(k); if (result == null) { k++; @@ -729,7 +770,7 @@ public String extractAllReferencesString(String text, result.setReference(ref); bds.setResBib(result); bds.setRawBib(ref); - bds.addOffset(offsets_NPL.get(k).intValue()); + //bds.addOffset(offsets_NPL.get(k).intValue()); //bds.setConfidence(probNPL.get(k).doubleValue()); articles.add(bds); k++; @@ -749,33 +790,59 @@ public String extractAllReferencesString(String text, "\n"; - String divID = KeyGen.getKey().substring(0,7); - resultTEI += "\n"; - resultTEI += "\n"; - resultTEI += "
\n"; - resultTEI += TextUtilities.HTMLEncode(text); - resultTEI += "
\n"; - resultTEI += "
\n"; - if ( (patents != null) || (articles != null) ) { - resultTEI += "\n"; - } + resultTEI += "\n"; + resultTEI += "\n"; + + /*for(String text : texts) { + String divID = KeyGen.getKey().substring(0,7); + resultTEI += "
\n"; + text = text.replace("\n", " ").replace("\t", " "); + resultTEI += TextUtilities.HTMLEncode(text); + resultTEI += "
\n"; + resultTEI += "
\n"; + if ( (patents != null) || (articles != null) ) { + resultTEI += "\n"; + } + + if (patents != null) { + for(PatentItem patentCitation : patents) { + resultTEI += patentCitation.toTEI(true, divID) + "\n"; // with offsets + } + } + + if (articles != null) { + for(BibDataSet articleCitation : articles) { + resultTEI += articleCitation.toTEI(includeRawCitations) + "\n"; + } + } + if ( (patents != null) || (articles != null) ) { + resultTEI += "\n"; + } + resultTEI += "
\n"; + } + resultTEI += "
\n";*/ - if (patents != null) { - for(PatentItem patentCitation : patents) { - resultTEI += patentCitation.toTEI(true, divID) + "\n"; // with offsets - } - } + resultTEI += "
\n"; + if ( (patents != null) || (articles != null) ) { + resultTEI += "\n"; + } + + if (patents != null) { + for(PatentItem patentCitation : patents) { + resultTEI += patentCitation.toTEI(true, null) + "\n"; // with offsets + } + } + + if (articles != null) { + for(BibDataSet articleCitation : articles) { + resultTEI += articleCitation.toTEI(includeRawCitations) + "\n"; + } + } + if ( (patents != null) || (articles != null) ) { + resultTEI += "\n"; + } + resultTEI += "
\n"; - if (articles != null) { - for(BibDataSet articleCitation : articles) { - resultTEI += articleCitation.toTEI(includeRawCitations) + "\n"; - } - } - if ( (patents != null) || (articles != null) ) { - resultTEI += "
\n"; - } - resultTEI += "
\n"; - resultTEI += "
\n"; resultTEI += "
"; return resultTEI; @@ -1490,6 +1557,7 @@ public void generateTrainingData(String documentPath, String newTrainingPath) { sax.addFilter("description"); sax.addFilter("p"); sax.addFilter("heading"); + sax.addFilter("head"); // get a factory SAXParserFactory spf = SAXParserFactory.newInstance(); spf.setValidating(false); @@ -1521,15 +1589,15 @@ public InputSource resolveEntity(String publicId, String systemId) { reader.parse(input); - String description = sax.getText(); + List descriptionSegments = sax.getTexts(); String currentPatentNumber = sax.currentPatentNumber; ArrayList patents = new ArrayList(); ArrayList articles = new ArrayList(); // we process the patent description - if (description != null) { - extractAllReferencesString(description, false, 0, false, patents, articles); + if (descriptionSegments != null && descriptionSegments.size() > 0) { + extractAllReferencesString(descriptionSegments, false, 0, false, patents, articles); // second pass: we add annotations corresponding to identified citation chunks based on // stored offsets Writer writer = new OutputStreamWriter( @@ -1624,14 +1692,16 @@ public boolean getDocOPS(String number) { try { if (ops == null) ops = new OPSService(); - description = ops.descriptionRetrieval(number); - + String description = ops.descriptionRetrieval(number); if (description == null) return false; else if (description.length() < 600) return false; - else + else { + descriptionSegments = new ArrayList<>(); + descriptionSegments.add(description); return true; + } } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid.", e); } diff --git a/grobid-core/src/main/java/org/grobid/core/sax/TextSaxParser.java b/grobid-core/src/main/java/org/grobid/core/sax/TextSaxParser.java index 56c80f29f7..d79bc437a2 100755 --- a/grobid-core/src/main/java/org/grobid/core/sax/TextSaxParser.java +++ b/grobid-core/src/main/java/org/grobid/core/sax/TextSaxParser.java @@ -2,9 +2,10 @@ import org.xml.sax.*; import org.xml.sax.helpers.*; +import java.util.*; /** - * Stupid SAX parser which accumulate the textual content. + * Stupid SAX parser which accumulate the textual content for a patent document. *

* As an option, it is possible to accumulate only the content under a given * element name, for instance "description" for getting the description of a @@ -15,15 +16,20 @@ public class TextSaxParser extends DefaultHandler { StringBuffer accumulator = new StringBuffer(); // Accumulate parsed text - private String filter = null; // the name of an element for getting only the - // corresponding text + private List filters = null; // the name of elements for getting only the + // corresponding text, this will also be used + // for possible segmentations if more than one chunk of text is present under the + // filter element(s) private boolean accumule = true; public String currentPatentNumber = null; public String country = null; + private List texts = null; + public TextSaxParser() { + texts = new ArrayList<>(); } public void characters(char[] buffer, int start, int length) { @@ -32,8 +38,16 @@ public void characters(char[] buffer, int start, int length) { } } - public void setFilter(String filt) { - filter = filt; + public void setFilter(List filt) { + filters = filt; + accumule = false; + } + + public void addFilter(String filt) { + if (filters == null) + filters = new ArrayList<>(); + if (!filters.contains(filt)) + filters.add(filt); accumule = false; } @@ -45,13 +59,21 @@ public String getText() { return text; } + public List getTexts() { + return texts; + } + public void endElement(String uri, String localName, String qName) throws SAXException { - if (qName.equals(filter)) { + if (filters.contains(qName)) { + String localText = getText(); + if (localText.trim().length()>0) + texts.add(localText); + accumulator.setLength(0); accumule = false; } if (accumule) { - if (qName.equals("row") || qName.equals("p")) { + if (qName.equals("row") || qName.equals("p") || qName.equals("heading")) { accumulator.append(" "); } } @@ -99,7 +121,7 @@ else if (docID != null) { } } - if (qName.equals(filter)) { + if (filters.contains(qName)) { accumule = true; } }