From fbb238cf7a4bd40850b8c8c99d217f8fc007c095 Mon Sep 17 00:00:00 2001
From: Patrice Lopez <patrice.lopez@science-miner.com>
Date: Sun, 4 Feb 2024 22:50:22 +0100
Subject: [PATCH] refactor process for DL models batch

---
 .../engines/patent/ReferenceExtractor.java    | 822 ++++++++++--------
 .../org/grobid/core/sax/TextSaxParser.java    |  38 +-
 2 files changed, 476 insertions(+), 384 deletions(-)
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java b/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java
index 392c3ef608..fc2aa3731a 100755
--- a/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java
@@ -74,7 +74,7 @@ public class ReferenceExtractor implements Closeable {
     public Lexicon lexicon = Lexicon.getInstance();
     public String currentPatentNumber = null;
     public OPSService ops = null;
-    private String description = null;
+    private List<String> descriptionSegments = null;
 
     public ArrayList<org.grobid.core.data.BibDataSet> resBib = null; // identified current parsed
     // bibliographical items and related information
@@ -109,8 +109,8 @@ public String extractAllReferencesOPS(boolean filterDuplicate,
                                        List<PatentItem> patents,
                                        List<BibDataSet> articles) {
         try {
-            if (description != null) {
-                return extractAllReferencesString(description,
+            if (descriptionSegments != null && descriptionSegments.size()>0) {
+                return extractAllReferencesString(descriptionSegments,
                         filterDuplicate,
                         consolidate,
                         includeRawCitations,
@@ -157,6 +157,7 @@ public String extractAllReferencesXMLFile(String pathXML,
             sax.addFilter("description");
             sax.addFilter("p");
             sax.addFilter("heading");
+            sax.addFilter("head");
             // get a factory
             SAXParserFactory spf = SAXParserFactory.newInstance();
             spf.setValidating(false);
@@ -188,13 +189,18 @@ public InputSource resolveEntity(String publicId, String systemId) {
 
             reader.parse(input);
 
-            description = sax.getText();
+            List<String> descriptionSegments = sax.getTexts();
+
+/*for(String text : descriptionSegments){
+    System.out.println(text);
+}*/
+
             currentPatentNumber = sax.currentPatentNumber;
             consolidate = 0;
             filterDuplicate = true;
 
-            if (description != null) {
-                return extractAllReferencesString(description,
+            if (descriptionSegments != null && descriptionSegments.size()>0) {
+                return extractAllReferencesString(descriptionSegments,
                         filterDuplicate,
                         consolidate,
                         includeRawCitations,
@@ -228,7 +234,7 @@ public String extractAllReferencesPDFFile(String inputFile,
             if (doc.getBlocks() == null) {
                 return result;
             }
-            description = doc.getAllBlocksClean(25, -1);
+            String description = doc.getAllBlocksClean(25, -1);
             if (description != null) {
                 result = extractAllReferencesString(description,
                         filterDuplicate,
@@ -283,7 +289,7 @@ public String annotateAllReferencesPDFFile(String inputFile,
     }
 
     /**
-     * Extract all reference from a simple piece of text and return results in an XML document.
+     * Extract all reference from a simple piece of text or a list of text segments, and return results in an XML document.
      */
     public String extractAllReferencesString(String text,
                                           boolean filterDuplicate,
@@ -291,411 +297,446 @@ public String extractAllReferencesString(String text,
                                           boolean includeRawCitations,
                                           List<PatentItem> patents,
                                           List<BibDataSet> articles) {
+        List<String> texts = new ArrayList<>();
+        texts.add(text);
+        return extractAllReferencesString(texts, filterDuplicate, consolidate, includeRawCitations, patents, articles);
+    }
+
+    public String extractAllReferencesString(List<String> texts,
+                                          boolean filterDuplicate,
+                                          int consolidate,
+                                          boolean includeRawCitations,
+                                          List<PatentItem> patents,
+                                          List<BibDataSet> articles) {
         try {
             // if parameters are null, these lists will only be valid in the method
 			if (patents == null) {
-				patents = new ArrayList<PatentItem>();
+				patents = new ArrayList<>();
 			}
 
 			if (articles == null) {
-				articles = new ArrayList<BibDataSet>();
+				articles = new ArrayList<>();
 			}
 
-			// parser for patent references
+			// parser for patent and non patent references
             if (patentParser == null) {
                 patentParser = new PatentRefParser();
             }
-            // parser for non patent references
 
-            // tokenisation for the parser (with punctuation as tokens)
-            ArrayList<String> patentBlocks = new ArrayList<String>();
+            // identify the language of the patent document, we use only the first 500 characters
+            // which is enough normally for a very safe language prediction
+            // the text here is the patent description, so normally strictly monolingual
 
-            //text = TextUtilities.dehyphenize(text); // to be reviewed!
-            text = text.replace("\n", " ").replace("\t", " ");
-            //text = text.replace("  ", " ");
-
-			// identify the language of the patent document, we use only the first 500 characters
-			// which is enough normally for a very safe language prediction
-			// the text here is the patent description, so strictly monolingual
-            Language lang = languageUtilities.runLanguageId(text, 500);
-			List<String> tokenizations = analyzer.tokenize(text, lang);
-            int offset = 0;
-			if (tokenizations.size() == 0) {
-                return null;
+            // create text buffer
+            StringBuilder localText = new StringBuilder();
+            for(String text : texts) {
+                localText.append(text);
+                if (localText.length()> 500) 
+                    break;
             }
+            if (localText.length() == 0)
+                return null;
+            Language lang = languageUtilities.runLanguageId(localText.toString(), 500);            
 
-            List<OffsetPosition> journalPositions = null;
-            List<OffsetPosition> abbrevJournalPositions = null;
-            List<OffsetPosition> conferencePositions = null;
-            List<OffsetPosition> publisherPositions = null;
+            List<String> allPatentBlocks = new ArrayList<>();
+            List<List<LayoutToken>> allTokenizations = new ArrayList<>();
+            for (String text : texts) {
 
-            //if (articles != null)
-            {
-                journalPositions = lexicon.tokenPositionsJournalNames(text);
-                abbrevJournalPositions = lexicon.tokenPositionsAbbrevJournalNames(text);
-                conferencePositions = lexicon.tokenPositionsConferenceNames(text);
-                publisherPositions = lexicon.tokenPositionsPublisherNames(text);
-            }
+                //text = TextUtilities.dehyphenize(text); // to be reviewed!
 
-            boolean isJournalToken = false;
-            boolean isAbbrevJournalToken = false;
-            boolean isConferenceToken = false;
-            boolean isPublisherToken = false;
-            int currentJournalPositions = 0;
-            int currentAbbrevJournalPositions = 0;
-            int currentConferencePositions = 0;
-            int currentPublisherPositions = 0;
-            boolean skipTest = false;
-            //st = new StringTokenizer(text, " (["+ TextUtilities.punctuations, true);
-            //st = new StringTokenizer(text, delimiters, true);
-            int posit = 0;
-            //while (st.hasMoreTokens()) {
-			for(String tok : tokenizations)	{
-                isJournalToken = false;
-                isAbbrevJournalToken = false;
-                isConferenceToken = false;
-                isPublisherToken = false;
-                skipTest = false;
-                //String tok = st.nextToken();
-                if ( (tok.trim().length() == 0) ||
-					 (tok.equals(" ")) ||
-				     (tok.equals("\t")) ||
-					 (tok.equals("\n")) ||
-					 (tok.equals("\r"))
-					 ) {
+                // tokenisation according to the language
+                List<LayoutToken> tokenizations = analyzer.tokenizeWithLayoutToken(text, lang);
+                if (tokenizations.size() == 0) {
                     continue;
                 }
 
-                // check the position of matches for journals
-                if (journalPositions != null) {
-                    if (currentJournalPositions == journalPositions.size() - 1) {
-                        if (journalPositions.get(currentJournalPositions).end < posit) {
-                            skipTest = true;
-                        }
+                allTokenizations.add(tokenizations);
+
+                StringBuilder patentBlocks = new StringBuilder();
+
+                List<OffsetPosition> journalPositions = null;
+                List<OffsetPosition> abbrevJournalPositions = null;
+                List<OffsetPosition> conferencePositions = null;
+                List<OffsetPosition> publisherPositions = null;
+
+                journalPositions = lexicon.tokenPositionsJournalNames(tokenizations);
+                abbrevJournalPositions = lexicon.tokenPositionsAbbrevJournalNames(tokenizations);
+                conferencePositions = lexicon.tokenPositionsConferenceNames(tokenizations);
+                publisherPositions = lexicon.tokenPositionsPublisherNames(tokenizations);
+
+                boolean isJournalToken = false;
+                boolean isAbbrevJournalToken = false;
+                boolean isConferenceToken = false;
+                boolean isPublisherToken = false;
+                int currentJournalPositions = 0;
+                int currentAbbrevJournalPositions = 0;
+                int currentConferencePositions = 0;
+                int currentPublisherPositions = 0;
+                boolean skipTest = false;
+                //st = new StringTokenizer(text, " (["+ TextUtilities.punctuations, true);
+                //st = new StringTokenizer(text, delimiters, true);
+                int posit = 0;
+                //while (st.hasMoreTokens()) {
+    			for(LayoutToken token : tokenizations)	{
+                    String tok = token.getText();
+                    isJournalToken = false;
+                    isAbbrevJournalToken = false;
+                    isConferenceToken = false;
+                    isPublisherToken = false;
+                    skipTest = false;
+                    //String tok = st.nextToken();
+                    if ( (tok.trim().length() == 0) ||
+    					 (tok.equals(" ")) ||
+    				     (tok.equals("\t")) ||
+    					 (tok.equals("\n")) ||
+    					 (tok.equals("\r"))
+    					 ) {
+                        continue;
                     }
-                    if (!skipTest) {
-                        for (int i = currentJournalPositions; i < journalPositions.size(); i++) {
-                            if ((journalPositions.get(i).start <= posit) &&
-                                    (journalPositions.get(i).end >= posit)) {
-                                isJournalToken = true;
-                                currentJournalPositions = i;
-                                break;
-                            } else if (journalPositions.get(i).start > posit) {
-                                isJournalToken = false;
-                                currentJournalPositions = i;
-                                break;
+
+                    // check the position of matches for journals
+                    if (journalPositions != null) {
+                        if (currentJournalPositions == journalPositions.size() - 1) {
+                            if (journalPositions.get(currentJournalPositions).end < posit) {
+                                skipTest = true;
+                            }
+                        }
+                        if (!skipTest) {
+                            for (int i = currentJournalPositions; i < journalPositions.size(); i++) {
+                                if ((journalPositions.get(i).start <= posit) &&
+                                        (journalPositions.get(i).end >= posit)) {
+                                    isJournalToken = true;
+                                    currentJournalPositions = i;
+                                    break;
+                                } else if (journalPositions.get(i).start > posit) {
+                                    isJournalToken = false;
+                                    currentJournalPositions = i;
+                                    break;
+                                }
                             }
                         }
                     }
-                }
 
-                // check the position of matches for abbreviated journals
-                skipTest = false;
-                if (abbrevJournalPositions != null) {
-                    if (currentAbbrevJournalPositions == abbrevJournalPositions.size() - 1) {
-                        if (abbrevJournalPositions.get(currentAbbrevJournalPositions).end < posit) {
-                            skipTest = true;
+                    // check the position of matches for abbreviated journals
+                    skipTest = false;
+                    if (abbrevJournalPositions != null) {
+                        if (currentAbbrevJournalPositions == abbrevJournalPositions.size() - 1) {
+                            if (abbrevJournalPositions.get(currentAbbrevJournalPositions).end < posit) {
+                                skipTest = true;
+                            }
                         }
-                    }
-                    if (!skipTest) {
-                        for (int i = currentAbbrevJournalPositions; i < abbrevJournalPositions.size(); i++) {
-                            if ((abbrevJournalPositions.get(i).start <= posit) &&
-                                    (abbrevJournalPositions.get(i).end >= posit)) {
-                                isAbbrevJournalToken = true;
-                                currentAbbrevJournalPositions = i;
-                                break;
-                            } else if (abbrevJournalPositions.get(i).start > posit) {
-                                isAbbrevJournalToken = false;
-                                currentAbbrevJournalPositions = i;
-                                break;
+                        if (!skipTest) {
+                            for (int i = currentAbbrevJournalPositions; i < abbrevJournalPositions.size(); i++) {
+                                if ((abbrevJournalPositions.get(i).start <= posit) &&
+                                        (abbrevJournalPositions.get(i).end >= posit)) {
+                                    isAbbrevJournalToken = true;
+                                    currentAbbrevJournalPositions = i;
+                                    break;
+                                } else if (abbrevJournalPositions.get(i).start > posit) {
+                                    isAbbrevJournalToken = false;
+                                    currentAbbrevJournalPositions = i;
+                                    break;
+                                }
                             }
                         }
                     }
-                }
 
-                // check the position of matches for conference names
-                skipTest = false;
-                if (conferencePositions != null) {
-                    if (currentConferencePositions == conferencePositions.size() - 1) {
-                        if (conferencePositions.get(currentConferencePositions).end < posit) {
-                            skipTest = true;
+                    // check the position of matches for conference names
+                    skipTest = false;
+                    if (conferencePositions != null) {
+                        if (currentConferencePositions == conferencePositions.size() - 1) {
+                            if (conferencePositions.get(currentConferencePositions).end < posit) {
+                                skipTest = true;
+                            }
                         }
-                    }
-                    if (!skipTest) {
-                        for (int i = currentConferencePositions; i < conferencePositions.size(); i++) {
-                            if ((conferencePositions.get(i).start <= posit) &&
-                                    (conferencePositions.get(i).end >= posit)) {
-                                isConferenceToken = true;
-                                currentConferencePositions = i;
-                                break;
-                            } else if (conferencePositions.get(i).start > posit) {
-                                isConferenceToken = false;
-                                currentConferencePositions = i;
-                                break;
+                        if (!skipTest) {
+                            for (int i = currentConferencePositions; i < conferencePositions.size(); i++) {
+                                if ((conferencePositions.get(i).start <= posit) &&
+                                        (conferencePositions.get(i).end >= posit)) {
+                                    isConferenceToken = true;
+                                    currentConferencePositions = i;
+                                    break;
+                                } else if (conferencePositions.get(i).start > posit) {
+                                    isConferenceToken = false;
+                                    currentConferencePositions = i;
+                                    break;
+                                }
                             }
                         }
                     }
-                }
 
-                // check the position of matches for publisher names
-                skipTest = false;
-                if (publisherPositions != null) {
-                    if (currentPublisherPositions == publisherPositions.size() - 1) {
-                        if (publisherPositions.get(currentPublisherPositions).end < posit) {
-                            skipTest = true;
+                    // check the position of matches for publisher names
+                    skipTest = false;
+                    if (publisherPositions != null) {
+                        if (currentPublisherPositions == publisherPositions.size() - 1) {
+                            if (publisherPositions.get(currentPublisherPositions).end < posit) {
+                                skipTest = true;
+                            }
                         }
-                    }
-                    if (!skipTest) {
-                        for (int i = currentPublisherPositions; i < publisherPositions.size(); i++) {
-                            if ((publisherPositions.get(i).start <= posit) &&
-                                    (publisherPositions.get(i).end >= posit)) {
-                                isPublisherToken = true;
-                                currentPublisherPositions = i;
-                                break;
-                            } else if (publisherPositions.get(i).start > posit) {
-                                isPublisherToken = false;
-                                currentPublisherPositions = i;
-                                break;
+                        if (!skipTest) {
+                            for (int i = currentPublisherPositions; i < publisherPositions.size(); i++) {
+                                if ((publisherPositions.get(i).start <= posit) &&
+                                        (publisherPositions.get(i).end >= posit)) {
+                                    isPublisherToken = true;
+                                    currentPublisherPositions = i;
+                                    break;
+                                } else if (publisherPositions.get(i).start > posit) {
+                                    isPublisherToken = false;
+                                    currentPublisherPositions = i;
+                                    break;
+                                }
                             }
                         }
                     }
+
+                    FeaturesVectorReference featureVector =
+                            FeaturesVectorReference.addFeaturesPatentReferences(new LayoutToken(tok), null,
+                                    tokenizations.size(),
+                                    posit,
+                                    isJournalToken,
+                                    isAbbrevJournalToken,
+                                    isConferenceToken,
+                                    isPublisherToken);
+                    patentBlocks.append(featureVector.printVector());
+                    patentBlocks.append("\n");
+                    posit++;
                 }
 
-                FeaturesVectorReference featureVector =
-                        FeaturesVectorReference.addFeaturesPatentReferences(new LayoutToken(tok), null,
-                                tokenizations.size(),
-                                posit,
-                                isJournalToken,
-                                isAbbrevJournalToken,
-                                isConferenceToken,
-                                isPublisherToken);
-                patentBlocks.add(featureVector.printVector());
-                posit++;
+                patentBlocks.append("\n\n");
+                allPatentBlocks.add(patentBlocks.toString());
             }
 
-            patentBlocks.add("\n");
-
-            String theResult = null;
-            theResult = taggerAll.label(patentBlocks);
-            //System.out.println(theResult);
-
-            StringTokenizer stt = new StringTokenizer(theResult, "\n");
-
-            List<String> referencesPatent = new ArrayList<String>();
-            List<String> referencesNPL = new ArrayList<String>();
-            List<Integer> offsets_patent = new ArrayList<Integer>();
-            List<Integer> offsets_NPL = new ArrayList<Integer>();
-			List<Double> probPatent = new ArrayList<Double>();
-			List<Double> probNPL = new ArrayList<Double>();
-
-            boolean currentPatent = true; // type of current reference
-            String reference = null;
-			double currentProb = 0.0;
-            offset = 0;
-            int currentOffset = 0;
-			int addedOffset = 0;
-            String label = null; // label
-            String actual = null; // token
-            int p = 0; // iterator for the tokenizations for restauring the original tokenization with
-            // respect to spaces
-
-            while (stt.hasMoreTokens()) {
-                String line = stt.nextToken();
-                if (line.trim().length() == 0) {
-                    continue;
-                }
-
-                StringTokenizer st2 = new StringTokenizer(line, "\t ");
-                boolean start = true;
-				String separator = "";
-                label = null;
-                actual = null;
-                while (st2.hasMoreTokens()) {
-                    if (start) {
-                        actual = st2.nextToken().trim();
-                        start = false;
+            String theResults = taggerAll.label(allPatentBlocks);
+//System.out.println(theResults);
+            String[] theSegmentedResults = theResults.split("\n\n");
+//System.out.println(allPatentBlocks.size() + " / " + theSegmentedResults.length);
+
+            List<String> allReferencesNPL = new ArrayList<>();
+            //int offset = 0;
+            for(int index=0; index<theSegmentedResults.length; index++) {
+                String theResult = theSegmentedResults[index];
+                StringTokenizer stt = new StringTokenizer(theResult, "\n");
+                List<LayoutToken> tokenizations = allTokenizations.get(index);
+
+                List<String> referencesPatent = new ArrayList<String>();
+                List<String> referencesNPL = new ArrayList<String>();
+                List<Integer> offsets_patent = new ArrayList<Integer>();
+                List<Integer> offsets_NPL = new ArrayList<Integer>();
+    			List<Double> probPatent = new ArrayList<Double>();
+    			List<Double> probNPL = new ArrayList<Double>();
+
+                boolean currentPatent = true; // type of current reference
+                String reference = null;
+    			double currentProb = 0.0;
+                int offset = 0;
+                int currentOffset = 0;
+    			int addedOffset = 0;
+                String label = null; // label
+                String actual = null; // token
+                int p = 0; // iterator for the tokenizations for restauring the original tokenization with
+                // respect to spaces
+
+                while (stt.hasMoreTokens()) {
+                    String line = stt.nextToken();
+                    if (line.trim().length() == 0) {
+                        continue;
+                    }
 
-                        boolean strop = false;
-                        while ((!strop) && (p < tokenizations.size())) {
-                            String tokOriginal = tokenizations.get(p);
-							addedOffset += tokOriginal.length();
-                            if (tokOriginal.equals(" ")) {
-								separator += tokOriginal;
-                            } else if (tokOriginal.equals(actual)) {
-                                strop = true;
+                    StringTokenizer st2 = new StringTokenizer(line, "\t ");
+                    boolean start = true;
+    				String separator = "";
+                    label = null;
+                    actual = null;
+                    while (st2.hasMoreTokens()) {
+                        if (start) {
+                            actual = st2.nextToken().trim();
+                            start = false;
+
+                            boolean strop = false;
+                            while ((!strop) && (p < tokenizations.size())) {
+                                String tokOriginal = tokenizations.get(p).getText();
+    							addedOffset += tokOriginal.length();
+                                if (tokOriginal.equals(" ")) {
+    								separator += tokOriginal;
+                                } else if (tokOriginal.equals(actual)) {
+                                    strop = true;
+                                }
+                                p++;
                             }
-                            p++;
+                        } else {
+                            label = st2.nextToken().trim();
                         }
-                    } else {
-                        label = st2.nextToken().trim();
                     }
-                }
-
-                if (label == null) {
-					offset += addedOffset;
-					addedOffset = 0;
-                    continue;
-                }
 
-				double prob = 0.0;
-				int segProb = label.lastIndexOf("/");
-				if (segProb != -1) {
-					String probString = label.substring(segProb+1, label.length());
-					//System.out.println("given prob: " + probString);
-					try {
-						prob = Double.parseDouble(probString);
-						//System.out.println("given prob: " + probString + ", parsed: " + prob);
-					}
-					catch(Exception e) {
-						LOGGER.debug(probString + " cannot be parsed.");
-					}
-					label = label.substring(0,segProb);
-				}
-
-                // TBD: use TaggingTokenClusteror and TaggingLabel as for the other parsers
-                if (actual != null) {
-                    if (label.endsWith("<refPatent>")) {
-                        if (reference == null) {
-                            reference = separator + actual;
-                            currentOffset = offset;
-                            currentPatent = true;
-							currentProb = prob;
-                        } else {
-                            if (currentPatent) {
-                                if (label.equals("I-<refPatent>")) {
-                                    referencesPatent.add(reference);
-                                    offsets_patent.add(currentOffset);
+                    if (label == null) {
+    					offset += addedOffset;
+    					addedOffset = 0;
+                        continue;
+                    }
 
-									probPatent.add(Double.valueOf(currentProb));
+    				double prob = 0.0;
+    				int segProb = label.lastIndexOf("/");
+    				if (segProb != -1) {
+    					String probString = label.substring(segProb+1, label.length());
+    					//System.out.println("given prob: " + probString);
+    					try {
+    						prob = Double.parseDouble(probString);
+    						//System.out.println("given prob: " + probString + ", parsed: " + prob);
+    					}
+    					catch(Exception e) {
+    						LOGGER.debug(probString + " cannot be parsed.");
+    					}
+    					label = label.substring(0,segProb);
+    				}
+
+                    // TBD: use TaggingTokenClusteror and TaggingLabel as for the other parsers
+                    if (actual != null) {
+                        if (label.endsWith("<refPatent>")) {
+                            if (reference == null) {
+                                reference = separator + actual;
+                                currentOffset = offset;
+                                currentPatent = true;
+    							currentProb = prob;
+                            } else {
+                                if (currentPatent) {
+                                    if (label.equals("I-<refPatent>")) {
+                                        referencesPatent.add(reference);
+                                        offsets_patent.add(currentOffset);
+
+    									probPatent.add(Double.valueOf(currentProb));
+
+                                        currentPatent = true;
+    		                            reference = separator + actual;
+                                        currentOffset = offset;
+    									currentProb = prob;
+                                    } else {
+                                        reference += separator + actual;
+    									if (prob > currentProb) {
+    										currentProb = prob;
+    									}
+                                    }
+                                } else {
+                                    referencesNPL.add(reference);
+                                    offsets_NPL.add(currentOffset);
+    								probNPL.add(Double.valueOf(currentProb));
 
                                     currentPatent = true;
-		                            reference = separator + actual;
+    	                            reference = separator + actual;
                                     currentOffset = offset;
-									currentProb = prob;
-                                } else {
-                                    reference += separator + actual;
-									if (prob > currentProb) {
-										currentProb = prob;
-									}
+    								currentProb = prob;
                                 }
-                            } else {
-                                referencesNPL.add(reference);
-                                offsets_NPL.add(currentOffset);
-								probNPL.add(Double.valueOf(currentProb));
-
-                                currentPatent = true;
-	                            reference = separator + actual;
-                                currentOffset = offset;
-								currentProb = prob;
                             }
-                        }
-                    } else if (label.endsWith("<refNPL>")) {
-                        if (reference == null) {
-                            reference = separator + actual;
-                            currentOffset = offset;
-                            currentPatent = false;
-							currentProb = prob;
-                        } else {
-                            if (currentPatent) {
-                                referencesPatent.add(reference);
-                                offsets_patent.add(currentOffset);
-								probPatent.add(Double.valueOf(currentProb));
-
-                                currentPatent = false;
-	                            reference = separator + actual;
+                        } else if (label.endsWith("<refNPL>")) {
+                            if (reference == null) {
+                                reference = separator + actual;
                                 currentOffset = offset;
-								currentProb = prob;
+                                currentPatent = false;
+    							currentProb = prob;
                             } else {
-                                if (label.equals("I-<refNPL>")) {
-                                    referencesNPL.add(reference);
-                                    offsets_NPL.add(currentOffset);
-									probNPL.add(Double.valueOf(currentProb));
+                                if (currentPatent) {
+                                    referencesPatent.add(reference);
+                                    offsets_patent.add(currentOffset);
+    								probPatent.add(Double.valueOf(currentProb));
 
                                     currentPatent = false;
-		                            reference = separator + actual;
+    	                            reference = separator + actual;
                                     currentOffset = offset;
-									currentProb = prob;
+    								currentProb = prob;
                                 } else {
-                                    reference += separator + actual;
-									if (prob > currentProb) {
-										currentProb = prob;
-									}
+                                    if (label.equals("I-<refNPL>")) {
+                                        referencesNPL.add(reference);
+                                        offsets_NPL.add(currentOffset);
+    									probNPL.add(Double.valueOf(currentProb));
+
+                                        currentPatent = false;
+    		                            reference = separator + actual;
+                                        currentOffset = offset;
+    									currentProb = prob;
+                                    } else {
+                                        reference += separator + actual;
+    									if (prob > currentProb) {
+    										currentProb = prob;
+    									}
+                                    }
                                 }
                             }
-                        }
-                    } else if (label.equals("<other>")) {
-                        if (reference != null) {
-                            if (currentPatent) {
-                                referencesPatent.add(reference);
-                                offsets_patent.add(currentOffset);
-								probPatent.add(Double.valueOf(currentProb));
-                            } else {
-                                referencesNPL.add(reference);
-                                offsets_NPL.add(currentOffset);
-								probNPL.add(Double.valueOf(currentProb));
+                        } else if (label.equals("<other>")) {
+                            if (reference != null) {
+                                if (currentPatent) {
+                                    referencesPatent.add(reference);
+                                    offsets_patent.add(currentOffset);
+    								probPatent.add(Double.valueOf(currentProb));
+                                } else {
+                                    referencesNPL.add(reference);
+                                    offsets_NPL.add(currentOffset);
+    								probNPL.add(Double.valueOf(currentProb));
+                                }
+                                currentPatent = false;
                             }
-                            currentPatent = false;
+                            reference = null;
+    						currentProb	= 0.0;
                         }
-                        reference = null;
-						currentProb	= 0.0;
                     }
+    				offset += addedOffset;
+    				addedOffset = 0;
                 }
-				offset += addedOffset;
-				addedOffset = 0;
-            }
 
-            // run reference patent parser in isolation, and produce some traces
-            int j = 0;
-            for (String ref : referencesPatent) {
-                patentParser.setRawRefText(ref);
-                patentParser.setRawRefTextOffset(offsets_patent.get(j).intValue());
-                List<PatentItem> patents0 = patentParser.processRawRefText();
-                for (PatentItem pat : patents0) {
-                    pat.setContext(ref);
-					pat.setConf(probPatent.get(j).doubleValue());
-                    patents.add(pat);
-                    /*if (pat.getApplication()) {
-                        if (pat.getProvisional()) {
-                            if (debug) {
-                                System.out.println(pat.getAuthority() + " " + pat.getNumber()
-                                        + " P application " + pat.getOffsetBegin()
-                                        + ":" + pat.getOffsetEnd() + "\n");
+                // run reference patent parser in isolation, and produce some traces
+                int j = 0;
+                for (String ref : referencesPatent) {
+                    patentParser.setRawRefText(ref);
+                    patentParser.setRawRefTextOffset(offsets_patent.get(j).intValue());
+                    List<PatentItem> patents0 = patentParser.processRawRefText();
+                    for (PatentItem pat : patents0) {
+                        pat.setContext(ref);
+    					pat.setConf(probPatent.get(j).doubleValue());
+                        patents.add(pat);
+                        /*if (pat.getApplication()) {
+                            if (pat.getProvisional()) {
+                                if (debug) {
+                                    System.out.println(pat.getAuthority() + " " + pat.getNumber()
+                                            + " P application " + pat.getOffsetBegin()
+                                            + ":" + pat.getOffsetEnd() + "\n");
+                                }
+                            } else {
+                                if (debug) {
+                                    System.out.println(pat.getAuthority() + " " + pat.getNumber()
+                                            + " application " + pat.getOffsetBegin()
+                                            + ":" + pat.getOffsetEnd() + "\n");
+                                }
                             }
-                        } else {
-                            if (debug) {
-                                System.out.println(pat.getAuthority() + " " + pat.getNumber()
-                                        + " application " + pat.getOffsetBegin()
-                                        + ":" + pat.getOffsetEnd() + "\n");
+                        } else if (pat.getReissued()) {
+                            if (pat.getAuthority().equals("US")) {
+                                if (debug) {
+                                    System.out.println(pat.getAuthority() + "RE" + pat.getNumber() + " E "
+                                            + pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n");
+                                }
                             }
-                        }
-                    } else if (pat.getReissued()) {
-                        if (pat.getAuthority().equals("US")) {
-                            if (debug) {
-                                System.out.println(pat.getAuthority() + "RE" + pat.getNumber() + " E "
-                                        + pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n");
+                        } else if (pat.getPlant()) {
+                            if (pat.getAuthority().equals("US")) {
+                                if (debug)
+                                    System.out.println(pat.getAuthority() + "PP" + pat.getNumber() + " " +
+                                            pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n");
                             }
-                        }
-                    } else if (pat.getPlant()) {
-                        if (pat.getAuthority().equals("US")) {
-                            if (debug)
-                                System.out.println(pat.getAuthority() + "PP" + pat.getNumber() + " " +
-                                        pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n");
-                        }
-                    } else {
-                        if (debug) {
-                            if (pat.getKindCode() != null) {
-                                System.out.println(pat.getAuthority() + " " + pat.getNumber() + " "
-                                        + pat.getKindCode() + " "
-                                        + pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n");
-                            } else {
-                                System.out.println(pat.getAuthority() + " " + pat.getNumber() + " " +
-                                        pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n");
+                        } else {
+                            if (debug) {
+                                if (pat.getKindCode() != null) {
+                                    System.out.println(pat.getAuthority() + " " + pat.getNumber() + " "
+                                            + pat.getKindCode() + " "
+                                            + pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n");
+                                } else {
+                                    System.out.println(pat.getAuthority() + " " + pat.getNumber() + " " +
+                                            pat.getOffsetBegin() + ":" + pat.getOffsetEnd() + "\n");
+                                }
+                                System.out.println(pat.getContext());
                             }
-                            System.out.println(pat.getContext());
-                        }
-                    }*/
+                        }*/
+                    }
+                    j++;
                 }
-                j++;
+
+                if (referencesNPL.size()>0)
+                    allReferencesNPL.addAll(referencesNPL);
             }
 
             // list for filtering duplicates, if we want to ignore the duplicate numbers
@@ -716,10 +757,10 @@ public String extractAllReferencesString(String text,
                 }
             }
 
-            if (articles != null) {
+            if (articles != null && allReferencesNPL != null && allReferencesNPL.size()>0) {
                 int k = 0;
-                List<BiblioItem> bibResults = parsers.getCitationParser().processingStringMultiple(referencesNPL, consolidate);
-                for (String ref : referencesNPL) {
+                List<BiblioItem> bibResults = parsers.getCitationParser().processingStringMultiple(allReferencesNPL, consolidate);
+                for (String ref : allReferencesNPL) {
                     BiblioItem result = bibResults.get(k);
                     if (result == null) {
                         k++;
@@ -729,7 +770,7 @@ public String extractAllReferencesString(String text,
                     result.setReference(ref);
                     bds.setResBib(result);
                     bds.setRawBib(ref);
-                    bds.addOffset(offsets_NPL.get(k).intValue());
+                    //bds.addOffset(offsets_NPL.get(k).intValue());
                     //bds.setConfidence(probNPL.get(k).doubleValue());
                     articles.add(bds);
                     k++;
@@ -749,33 +790,59 @@ public String extractAllReferencesString(String text,
 						   "<TEI xml:space=\"preserve\" xmlns=\"http://www.tei-c.org/ns/1.0\" " +
 						   "xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n";
 
-		String divID = KeyGen.getKey().substring(0,7);
-		resultTEI += "<teiHeader />\n";
-		resultTEI += "<text>\n";
-		resultTEI += "<div id=\"_"+ divID +"\">\n";
-		resultTEI += TextUtilities.HTMLEncode(text);
-		resultTEI += "</div>\n";
-		resultTEI += "<div type=\"references\">\n";
-		if ( (patents != null) || (articles != null) ) {
-			resultTEI += "<listBibl>\n";
-		}
+        resultTEI += "<teiHeader/>\n";
+        resultTEI += "<text/>\n";
+
+        /*for(String text : texts) {
+    		String divID = KeyGen.getKey().substring(0,7);		
+    		resultTEI += "<div id=\"_"+ divID +"\">\n";
+            text = text.replace("\n", " ").replace("\t", " ");  
+    		resultTEI += TextUtilities.HTMLEncode(text);
+    		resultTEI += "</div>\n";
+    		resultTEI += "<div type=\"references\">\n";
+    		if ( (patents != null) || (articles != null) ) {
+    			resultTEI += "<listBibl>\n";
+    		}
+
+    		if (patents != null) {
+    			for(PatentItem patentCitation : patents) {
+    				resultTEI += patentCitation.toTEI(true, divID) + "\n"; // with offsets
+    			}
+    		}
+
+    		if (articles != null) {
+    			for(BibDataSet articleCitation : articles) {
+    				resultTEI += articleCitation.toTEI(includeRawCitations) + "\n";
+    			}
+    		}
+    		if ( (patents != null) || (articles != null) ) {
+    			resultTEI += "</listBibl>\n";
+    		}
+    		resultTEI += "</div>\n";
+        }
+		resultTEI += "</text>\n";*/
 
-		if (patents != null) {
-			for(PatentItem patentCitation : patents) {
-				resultTEI += patentCitation.toTEI(true, divID) + "\n"; // with offsets
-			}
-		}
+        resultTEI += "<div type=\"references\">\n";
+        if ( (patents != null) || (articles != null) ) {
+            resultTEI += "<listBibl>\n";
+        }
+
+        if (patents != null) {
+            for(PatentItem patentCitation : patents) {
+                resultTEI += patentCitation.toTEI(true, null) + "\n"; // with offsets
+            }
+        }
+
+        if (articles != null) {
+            for(BibDataSet articleCitation : articles) {
+                resultTEI += articleCitation.toTEI(includeRawCitations) + "\n";
+            }
+        }
+        if ( (patents != null) || (articles != null) ) {
+            resultTEI += "</listBibl>\n";
+        }
+        resultTEI += "</div>\n";
 
-		if (articles != null) {
-			for(BibDataSet articleCitation : articles) {
-				resultTEI += articleCitation.toTEI(includeRawCitations) + "\n";
-			}
-		}
-		if ( (patents != null) || (articles != null) ) {
-			resultTEI += "</listBibl>\n";
-		}
-		resultTEI += "</div>\n";
-		resultTEI += "</text>\n";
 		resultTEI += "</TEI>";
 
         return resultTEI;
@@ -1490,6 +1557,7 @@ public void generateTrainingData(String documentPath, String newTrainingPath) {
             sax.addFilter("description");
             sax.addFilter("p");
             sax.addFilter("heading");
+            sax.addFilter("head");
             // get a factory
             SAXParserFactory spf = SAXParserFactory.newInstance();
             spf.setValidating(false);
@@ -1521,15 +1589,15 @@ public InputSource resolveEntity(String publicId, String systemId) {
 
             reader.parse(input);
 
-            String description = sax.getText();
+            List<String> descriptionSegments = sax.getTexts();
             String currentPatentNumber = sax.currentPatentNumber;
 
             ArrayList<PatentItem> patents = new ArrayList<PatentItem>();
             ArrayList<BibDataSet> articles = new ArrayList<BibDataSet>();
 
             // we process the patent description
-            if (description != null) {
-                extractAllReferencesString(description, false, 0, false, patents, articles);
+            if (descriptionSegments != null && descriptionSegments.size() > 0) {
+                extractAllReferencesString(descriptionSegments, false, 0, false, patents, articles);
                 // second pass: we add annotations corresponding to identified citation chunks based on
                 // stored offsets
                 Writer writer = new OutputStreamWriter(
@@ -1624,14 +1692,16 @@ public boolean getDocOPS(String number) {
         try {
             if (ops == null)
                 ops = new OPSService();
-            description = ops.descriptionRetrieval(number);
-
+            String description = ops.descriptionRetrieval(number);
             if (description == null)
                 return false;
             else if (description.length() < 600)
                 return false;
-            else
+            else {
+                descriptionSegments = new ArrayList<>();
+                descriptionSegments.add(description);
                 return true;
+            }
         } catch (Exception e) {
             throw new GrobidException("An exception occured while running Grobid.", e);
         }
diff --git a/grobid-core/src/main/java/org/grobid/core/sax/TextSaxParser.java b/grobid-core/src/main/java/org/grobid/core/sax/TextSaxParser.java
index 56c80f29f7..d79bc437a2 100755
--- a/grobid-core/src/main/java/org/grobid/core/sax/TextSaxParser.java
+++ b/grobid-core/src/main/java/org/grobid/core/sax/TextSaxParser.java
@@ -2,9 +2,10 @@
 
 import org.xml.sax.*;
 import org.xml.sax.helpers.*;
+import java.util.*;
 
 /**
- * Stupid SAX parser which accumulate the textual content.
+ * Stupid SAX parser which accumulate the textual content for a patent document.
  * <p/>
  * As an option, it is possible to accumulate only the content under a given
  * element name, for instance "description" for getting the description of a
@@ -15,15 +16,20 @@ public class TextSaxParser extends DefaultHandler {
 
 	StringBuffer accumulator = new StringBuffer(); // Accumulate parsed text
 
-	private String filter = null; // the name of an element for getting only the
-									// corresponding text
+	private List<String> filters = null; // the name of elements for getting only the
+									    // corresponding text, this will also be used
+	// for possible segmentations if more than one chunk of text is present under the 
+	// filter element(s)
 
 	private boolean accumule = true;
 
 	public String currentPatentNumber = null;
 	public String country = null;
 
+	private List<String> texts = null;
+
 	public TextSaxParser() {
+		texts = new ArrayList<>();
 	}
 
 	public void characters(char[] buffer, int start, int length) {
@@ -32,8 +38,16 @@ public void characters(char[] buffer, int start, int length) {
 		}
 	}
 
-	public void setFilter(String filt) {
-		filter = filt;
+	public void setFilter(List<String> filt) {
+		filters = filt;
+		accumule = false;
+	}
+
+	public void addFilter(String filt) {
+		if (filters == null)
+			filters = new ArrayList<>();
+		if (!filters.contains(filt))
+			filters.add(filt);
 		accumule = false;
 	}
 
@@ -45,13 +59,21 @@ public String getText() {
 		return text;
 	}
 
+	public List<String> getTexts() {
+		return texts;
+	}
+
 	public void endElement(String uri, String localName, String qName)
 			throws SAXException {
-		if (qName.equals(filter)) {
+		if (filters.contains(qName)) {
+			String localText = getText();
+			if (localText.trim().length()>0)
+				texts.add(localText);
+			accumulator.setLength(0);
 			accumule = false;
 		}
 		if (accumule) {
-			if (qName.equals("row") || qName.equals("p")) {
+			if (qName.equals("row") || qName.equals("p") || qName.equals("heading")) {
 				accumulator.append(" ");
 			}
 		}
@@ -99,7 +121,7 @@ else if (docID != null) {
 			}
 		}
 
-		if (qName.equals(filter)) {
+		if (filters.contains(qName)) {
 			accumule = true;
 		}
 	}