From 1c52ad3a286fd3d1bb7e6e5b586f675bbb5ad7e3 Mon Sep 17 00:00:00 2001
From: Yuzhan Jiang <yjiang@progress.com>
Date: Mon, 28 Oct 2024 15:25:51 -0400
Subject: [PATCH 1/5] MLE-12358: (Technical) Apply fix on MLCP commons-csv to
 the latest 1.12 release

---
 pom.xml                                       |  3 +
 .../org/apache/commons/csv/CSVFormat.java     |  4 +
 .../org/apache/commons/csv/CSVParser.java     | 10 ++-
 .../org/apache/commons/csv/CSVRecord.java     | 26 ++++++
 .../commons/csv/ExtendedBufferedReader.java   | 79 +++++++++++++++++++
 .../java/org/apache/commons/csv/Lexer.java    |  9 +++
 .../org/apache/commons/csv/CSVParserTest.java | 79 +++++++++++++++++++
 .../apache/commons/csv/JiraCsv196Test.java    | 79 +++++++++++++++++++
 .../org/apache/commons/csv/CSV-196/emoji.csv  |  5 ++
 .../apache/commons/csv/CSV-196/japanese.csv   |  4 +
 10 files changed, 296 insertions(+), 2 deletions(-)
 create mode 100644 src/test/java/org/apache/commons/csv/JiraCsv196Test.java
 create mode 100644 src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv
 create mode 100644 src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv
diff --git a/pom.xml b/pom.xml
index a3e053a10..1e81e5f9d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -28,6 +28,7 @@
   <url>https://commons.apache.org/proper/commons-csv/</url>
   <inceptionYear>2005</inceptionYear>
   <description>The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types.</description>
+  <packaging>jar</packaging>
 
   <dependencies>
     <dependency>
@@ -231,6 +232,8 @@
               <exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
+              <exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
+              <exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>
diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java
index dd5416e11..800bd1efb 100644
--- a/src/main/java/org/apache/commons/csv/CSVFormat.java
+++ b/src/main/java/org/apache/commons/csv/CSVFormat.java
@@ -2074,6 +2074,10 @@ public CSVParser parse(final Reader reader) throws IOException {
         return new CSVParser(reader, this);
     }
 
+    public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException {
+        return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
+    }
+
     /**
      * Prints to the specified output.
      *
diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
index a2bc23070..b7a6fcf56 100644
--- a/src/main/java/org/apache/commons/csv/CSVParser.java
+++ b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -438,10 +438,15 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
     @SuppressWarnings("resource")
     public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
         throws IOException {
+            this(reader, format, characterOffset, recordNumber, null);
+        }
+
+    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset,final long recordNumber, 
+        String encoding) throws IOException {
         Objects.requireNonNull(reader, "reader");
         Objects.requireNonNull(format, "format");
         this.format = format.copy();
-        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
+        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
         this.csvRecordIterator = new CSVRecordIterator();
         this.headers = createHeaders();
         this.characterOffset = characterOffset;
@@ -768,6 +773,7 @@ CSVRecord nextRecord() throws IOException {
         recordList.clear();
         StringBuilder sb = null;
         final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
+        final long startCharByte = lexer.getBytesRead() + this.characterOffset;
         do {
             reusableToken.reset();
             lexer.nextToken(reusableToken);
@@ -805,7 +811,7 @@ CSVRecord nextRecord() throws IOException {
             recordNumber++;
             final String comment = Objects.toString(sb, null);
             result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
-                recordNumber, startCharPosition);
+                recordNumber, startCharPosition, startCharByte);
         }
         return result;
     }
diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java
index 1fac65843..a86d910c9 100644
--- a/src/main/java/org/apache/commons/csv/CSVRecord.java
+++ b/src/main/java/org/apache/commons/csv/CSVRecord.java
@@ -17,6 +17,8 @@
 
 package org.apache.commons.csv;
 
+import static org.apache.commons.csv.Constants.EMPTY_STRING_ARRAY;
+
 import java.io.Serializable;
 import java.util.Arrays;
 import java.util.Iterator;
@@ -48,6 +50,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
      */
     private final long characterPosition;
 
+    /**
+     * The byte offset in the source stream at which this record starts.
+     */
+    private final long characterByte;
+
     /** The accumulated comments (if any) */
     private final String comment;
 
@@ -67,8 +74,18 @@ public final class CSVRecord implements Serializable, Iterable<String> {
         this.parser = parser;
         this.comment = comment;
         this.characterPosition = characterPosition;
+        this.characterByte = 0L;
     }
 
+    CSVRecord(final CSVParser parser, final String[] values,  final String comment, final long recordNumber,
+            final long characterPosition, final long characterByte) {
+        this.recordNumber = recordNumber;
+        this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY;
+        this.parser = parser;
+        this.comment = comment;
+        this.characterPosition = characterPosition;
+        this.characterByte = characterByte;
+    }
     /**
      * Returns a value by {@link Enum}.
      *
@@ -144,6 +161,15 @@ public long getCharacterPosition() {
         return characterPosition;
     }
 
+    /**
+     * Returns the byte offset in the source stream at which this record starts.
+     *
+     * @return the byte offset in the source stream at which this record starts.
+     */
+    public long getCharacterByte() {
+        return characterByte;
+    }
+
     /**
      * Returns the comment for this record, if any.
      * Note that comments are attached to the following record.
diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
index 18c922a50..92654de3d 100644
--- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
+++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
@@ -24,6 +24,10 @@
 
 import java.io.IOException;
 import java.io.Reader;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.UnsynchronizedBufferedReader;
@@ -49,6 +53,16 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
     private long position;
     private long positionMark;
 
+    /** The number of bytes read so far */
+    private long bytesRead;
+    /** Encoder used to calculate the bytes of characters */
+    CharsetEncoder encoder;
+    
+    /**
+     * A flag to indicate if the read is a peek operation.
+     */
+    private boolean isReadPeek;
+
     /**
      * Constructs a new instance using the default buffer size.
      */
@@ -56,6 +70,14 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
         super(reader);
     }
 
+    ExtendedBufferedReader(final Reader reader, String encoding) {
+        super(reader);
+        if (encoding != null) {
+            encoder = Charset.forName(encoding).newEncoder();
+        }
+        isReadPeek = false;
+    }
+    
     /**
      * Closes the stream.
      *
@@ -118,11 +140,43 @@ public int read() throws IOException {
             current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
             lineNumber++;
         }
+        if (encoder != null && !isReadPeek) {
+            this.bytesRead += getCharBytes(current); 
+        }
         lastChar = current;
         position++;
         return lastChar;
     }
 
+    /**
+     *  In Java, the char data type is based on the original Unicode
+     *  specification, which defined characters as fixed-width 16-bit entities.
+     *   U+0000 to U+FFFF:
+     *     - BMP, represented using 1 16-bit char
+     *     - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars
+     *   U+10000 to U+10FFFF:
+     *     - Supplementary characters, represented as a pair of characters,
+     *     the first char from the high-surrogates range (\uD800-\uDBFF),
+     *     and the second char from the low-surrogates range (\uDC00-\uDFFF).
+     *     - Consists of UTF-8 some 3-byte chars and 4-byte chars
+     */
+    private long getCharBytes(int current) throws CharacterCodingException {
+        char cChar = (char)current;
+        char lChar = (char)lastChar;
+        if (!Character.isSurrogate(cChar)) {
+            return encoder.encode(
+                CharBuffer.wrap(new char[] {cChar})).limit();
+        } else {
+            if (Character.isHighSurrogate(cChar)) {
+                // Move on to the next char (low surrogate)
+                return 0;
+            } else if (Character.isSurrogatePair(lChar, cChar)) {
+                return encoder.encode(
+                    CharBuffer.wrap(new char[] {lChar, cChar})).limit();
+            } else throw new CharacterCodingException();
+        }
+    }
+
     @Override
     public int read(final char[] buf, final int offset, final int length) throws IOException {
         if (length == 0) {
@@ -190,4 +244,29 @@ public void reset() throws IOException {
         super.reset();
     }
 
+    /**
+     * Gets the number of bytes read by the reader.
+     *
+     * @return the number of bytes read by the reader
+     */
+    long getBytesRead() {
+        return this.bytesRead;
+    }
+
+    /**
+     * Returns the next character in the current reader without consuming it. So the next call to {@link #read()} will still return this value.
+     *
+     * @return the next character
+     * @throws IOException If an I/O error occurs
+     */
+    @Override
+    public int peek() throws IOException {
+        isReadPeek = true;
+        mark(1);
+        final int c = read();
+        reset();
+        isReadPeek = false;
+        return c;
+    }
+
 }
diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java
index 6d9c8a485..4eb0ca4eb 100644
--- a/src/main/java/org/apache/commons/csv/Lexer.java
+++ b/src/main/java/org/apache/commons/csv/Lexer.java
@@ -103,6 +103,15 @@ long getCharacterPosition() {
         return reader.getPosition();
     }
 
+    /**
+     * Returns the number of bytes read
+     *
+     * @return the number of bytes read
+     */
+    long getBytesRead() {
+        return reader.getBytesRead();
+    }
+    
     /**
      * Returns the current line number
      *
diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java
index 6a0637301..36ed8475b 100644
--- a/src/test/java/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@@ -693,6 +693,85 @@ public void testGetHeaderComment_NoComment3() throws IOException {
         }
     }
 
+    @Test
+    public void testGetRecordThreeBytesRead() throws Exception {
+        String code = "id,date,val5,val4\n"
+                + "11111111111111,'4017-09-01',きちんと節分近くには咲いてる～,v4\n"
+                + "22222222222222,'4017-01-01',おはよう私の友人～,v4\n"
+                + "33333333333333,'4017-01-01',きる自然の力ってすごいな～,v4\n";
+                
+        // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
+        final CSVFormat format = CSVFormat.Builder.create()
+                               .setDelimiter(',')
+                               .setQuote('\'')
+                               .build();
+        // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
+        
+        CSVParser parser =  format.parse(new StringReader(code), 0L, 1L, "UTF-8");
+
+        CSVRecord record;
+        assertEquals(0, parser.getRecordNumber());
+        assertNotNull(record = parser.nextRecord());
+        assertEquals(1, record.getRecordNumber());
+        assertEquals(code.indexOf('i'), record.getCharacterPosition());
+        assertEquals(record.getCharacterPosition(), record.getCharacterByte()); // nothing read yet: both are 0
+
+        assertNotNull(record = parser.nextRecord());
+        assertEquals(2, record.getRecordNumber());
+        assertEquals(code.indexOf('1'), record.getCharacterPosition());
+        assertEquals(record.getCharacterPosition(), record.getCharacterByte()); // ASCII-only header: 1 byte per char
+
+        assertNotNull(record = parser.nextRecord());
+        assertEquals(3, record.getRecordNumber());
+        assertEquals(code.indexOf('2'), record.getCharacterPosition());
+        assertEquals(95, record.getCharacterByte()); // 18 header bytes + 77 bytes in record 2 (15 chars x 3 bytes)
+
+        assertNotNull(record = parser.nextRecord());
+        assertEquals(4, record.getRecordNumber());
+        assertEquals(code.indexOf('3'), record.getCharacterPosition());
+        assertEquals(154, record.getCharacterByte()); // 95 + 59 bytes in record 3 (9 chars x 3 bytes)
+
+        parser.close();
+
+    }
+
+    @Test
+    public void testGetRecordFourBytesRead() throws Exception {
+        String code = "id,a,b,c\n"
+            + "1,😊,🤔,😂\n"
+            + "2,😊,🤔,😂\n"
+            + "3,😊,🤔,😂\n";
+        // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
+        final CSVFormat format = CSVFormat.Builder.create()
+            .setDelimiter(',')
+            .setQuote('\'')
+            .build();
+        
+        // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
+        CSVParser parser =  format.parse(new StringReader(code), 0L, 1L, "UTF-8");
+
+        CSVRecord record;
+        assertEquals(0, parser.getRecordNumber());
+        assertNotNull(record = parser.nextRecord());
+        assertEquals(1, record.getRecordNumber());
+        assertEquals(code.indexOf('i'), record.getCharacterPosition());
+        assertEquals(record.getCharacterPosition(), record.getCharacterByte()); // nothing read yet: both are 0
+
+        assertNotNull(record = parser.nextRecord());
+        assertEquals(2, record.getRecordNumber());
+        assertEquals(code.indexOf('1'), record.getCharacterPosition());
+        assertEquals(record.getCharacterPosition(), record.getCharacterByte()); // ASCII-only header: 1 byte per char
+        assertNotNull(record = parser.nextRecord());
+        assertEquals(3, record.getRecordNumber());
+        assertEquals(code.indexOf('2'), record.getCharacterPosition());
+        assertEquals(26, record.getCharacterByte()); // 9 header bytes + 17 bytes in record 2 (3 emoji x 4 bytes)
+        assertNotNull(record = parser.nextRecord());
+        assertEquals(4, record.getRecordNumber());
+        assertEquals(code.indexOf('3'), record.getCharacterPosition());
+        assertEquals(43, record.getCharacterByte()); // 26 + 17 bytes in record 3
+        parser.close();
+    }
+
     @Test
     public void testGetHeaderMap() throws Exception {
         try (final CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {
diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java
new file mode 100644
index 000000000..596b14f08
--- /dev/null
+++ b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.csv;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+
+
+import org.junit.jupiter.api.Test;
+
+
+public class JiraCsv196Test {
+    @Test
+    public void parseThreeBytes() throws IOException {
+
+        // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
+        final CSVFormat format = CSVFormat.Builder.create()
+                               .setDelimiter(',')
+                               .setQuote('\'')
+                               .build();
+        // CSVParser parser = new CSVParser(getTestInput(
+            // "org/apache/commons/csv/CSV-196/japanese.csv"), format, 0L, 1L, "UTF-8");
+        CSVParser parser =  format.parse(getTestInput(
+            "org/apache/commons/csv/CSV-196/japanese.csv"), 0L, 1L, "UTF-8");
+
+        
+            long[] charByteKey = {0, 89, 242, 395};
+        int idx = 0;
+        for (CSVRecord record : parser) {
+            assertEquals(charByteKey[idx++], record.getCharacterByte());
+        }
+        parser.close();
+    }
+
+
+    @Test
+    public void parseFourBytes() throws IOException {
+        // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
+        final CSVFormat format = CSVFormat.Builder.create()
+            .setDelimiter(',')
+            .setQuote('\'')
+            .build();
+        
+        // CSVParser parser = new CSVParser(getTestInput(
+        //     "org/apache/commons/csv/CSV-196/emoji.csv"), format, 0L, 1L, "UTF-8");
+        CSVParser parser =  format.parse(getTestInput(
+                "org/apache/commons/csv/CSV-196/emoji.csv"), 0L, 1L, "UTF-8");
+
+        long[] charByteKey = {0, 84, 701, 1318, 1935};
+        int idx = 0;
+        for (CSVRecord record : parser) {
+            assertEquals(charByteKey[idx++], record.getCharacterByte());
+        }
+        parser.close();
+    }
+
+
+    private Reader getTestInput(String path) {
+        return new InputStreamReader(
+            ClassLoader.getSystemClassLoader().getResourceAsStream(path));
+    }
+}
\ No newline at end of file
diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv b/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv
new file mode 100644
index 000000000..0bff7a44f
--- /dev/null
+++ b/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv
@@ -0,0 +1,5 @@
+id,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15
+1,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄
+2,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄
+3,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄
+4,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄
\ No newline at end of file
diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv b/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv
new file mode 100644
index 000000000..b06e04bd6
--- /dev/null
+++ b/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv
@@ -0,0 +1,4 @@
+id,date,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15
+00000000000001,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな～,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15
+00000000000002,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな～,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15
+00000000000003,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな～,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15
\ No newline at end of file

From b397c1668c91e68c7d79fe0e307da4faa7eb0291 Mon Sep 17 00:00:00 2001
From: Yuzhan Jiang <yjiang@progress.com>
Date: Mon, 28 Oct 2024 16:37:28 -0400
Subject: [PATCH 2/5] MLE-12358: (Technical) Apply fix on MLCP commons-csv to
 the latest 1.12 release

---
 .../org/apache/commons/csv/CSVFormat.java     | 15 +++++++++++++
 .../org/apache/commons/csv/CSVParser.java     |  2 +-
 .../org/apache/commons/csv/CSVRecord.java     |  2 --
 .../commons/csv/ExtendedBufferedReader.java   | 10 ++++-----
 .../java/org/apache/commons/csv/Lexer.java    |  2 +-
 .../org/apache/commons/csv/CSVParserTest.java | 22 +++++++++----------
 .../apache/commons/csv/JiraCsv196Test.java    | 10 +++------
 7 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java
index 800bd1efb..dd27391d9 100644
--- a/src/main/java/org/apache/commons/csv/CSVFormat.java
+++ b/src/main/java/org/apache/commons/csv/CSVFormat.java
@@ -2074,6 +2074,21 @@ public CSVParser parse(final Reader reader) throws IOException {
         return new CSVParser(reader, this);
     }
 
+    /**
+     * Parses the specified content.
+     *
+     * <p>
+     * See also the various static parse methods on {@link CSVParser}.
+     * </p>
+     *
+     * @param reader the input stream
+     * @param characterOffset the character offset
+     * @param recordNumber the record number
+     * @param encoding the charset name used to count the bytes read, or {@code null} to disable byte counting
+     * @return a parser over a stream of {@link CSVRecord}s.
+     * @throws IOException If an I/O error occurs
+     * @throws CSVException Thrown on invalid input.
+     */
     public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException {
         return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
     }
diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
index b7a6fcf56..18488deef 100644
--- a/src/main/java/org/apache/commons/csv/CSVParser.java
+++ b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -441,7 +441,7 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact
             this(reader, format, characterOffset, recordNumber, null);
         }
 
-    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset,final long recordNumber, 
+    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
         String encoding) throws IOException {
         Objects.requireNonNull(reader, "reader");
         Objects.requireNonNull(format, "format");
diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java
index a86d910c9..f0a0a6b81 100644
--- a/src/main/java/org/apache/commons/csv/CSVRecord.java
+++ b/src/main/java/org/apache/commons/csv/CSVRecord.java
@@ -17,8 +17,6 @@
 
 package org.apache.commons.csv;
 
-import static org.apache.commons.csv.Constants.EMPTY_STRING_ARRAY;
-
 import java.io.Serializable;
 import java.util.Arrays;
 import java.util.Iterator;
diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
index 92654de3d..ba14b1021 100644
--- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
+++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
@@ -57,7 +57,7 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
     private long bytesRead;
     /** Encoder used to calculate the bytes of characters */
     CharsetEncoder encoder;
-    
+
     /**
      * A flag to indicate if the read is a peek operation.
      */
@@ -77,7 +77,7 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
         }
         isReadPeek = false;
     }
-    
+
     /**
      * Closes the stream.
      *
@@ -141,7 +141,7 @@ public int read() throws IOException {
             lineNumber++;
         }
         if (encoder != null && !isReadPeek) {
-            this.bytesRead += getCharBytes(current); 
+            this.bytesRead += getCharBytes(current);
         }
         lastChar = current;
         position++;
@@ -161,8 +161,8 @@ public int read() throws IOException {
      *     - Consists of UTF-8 some 3-byte chars and 4-byte chars
      */
     private long getCharBytes(int current) throws CharacterCodingException {
-        char cChar = (char)current;
-        char lChar = (char)lastChar;
+        char cChar = (char) current;
+        char lChar = (char) lastChar;
         if (!Character.isSurrogate(cChar)) {
             return encoder.encode(
                 CharBuffer.wrap(new char[] {cChar})).limit();
diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java
index 4eb0ca4eb..afbba4d21 100644
--- a/src/main/java/org/apache/commons/csv/Lexer.java
+++ b/src/main/java/org/apache/commons/csv/Lexer.java
@@ -111,7 +111,7 @@ long getCharacterPosition() {
     long getBytesRead() {
         return reader.getBytesRead();
     }
-    
+
     /**
      * Returns the current line number
      *
diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java
index 36ed8475b..998e04cd9 100644
--- a/src/test/java/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@@ -695,21 +695,19 @@ public void testGetHeaderComment_NoComment3() throws IOException {
 
     @Test
     public void testGetRecordThreeBytesRead() throws Exception {
-        String code = "id,date,val5,val4\n"
-                + "11111111111111,'4017-09-01',きちんと節分近くには咲いてる～,v4\n"
-                + "22222222222222,'4017-01-01',おはよう私の友人～,v4\n"
-                + "33333333333333,'4017-01-01',きる自然の力ってすごいな～,v4\n";
-                
+        String code = "id,date,val5,val4\n" +
+            "11111111111111,'4017-09-01',きちんと節分近くには咲いてる～,v4\n" +
+            "22222222222222,'4017-01-01',おはよう私の友人～,v4\n" +
+            "33333333333333,'4017-01-01',きる自然の力ってすごいな～,v4\n";
         // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
         final CSVFormat format = CSVFormat.Builder.create()
                                .setDelimiter(',')
                                .setQuote('\'')
                                .build();
         // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
-        
         CSVParser parser =  format.parse(new StringReader(code), 0L, 1L, "UTF-8");
 
-        CSVRecord record;
+        CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
         assertEquals(0, parser.getRecordNumber());
         assertNotNull(record = parser.nextRecord());
         assertEquals(1, record.getRecordNumber());
@@ -737,16 +735,16 @@ public void testGetRecordThreeBytesRead() throws Exception {
 
     @Test
     public void testGetRecordFourBytesRead() throws Exception {
-        String code = "id,a,b,c\n"
-            + "1,😊,🤔,😂\n"
-            + "2,😊,🤔,😂\n"
-            + "3,😊,🤔,😂\n";
+        String code = "id,a,b,c\n" +
+            "1,😊,🤔,😂\n" +
+            "2,😊,🤔,😂\n" +
+            "3,😊,🤔,😂\n";
         // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
         final CSVFormat format = CSVFormat.Builder.create()
             .setDelimiter(',')
             .setQuote('\'')
             .build();
-        
+
         // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8");
         CSVParser parser =  format.parse(new StringReader(code), 0L, 1L, "UTF-8");
 
diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java
index 596b14f08..7dbc23caf 100644
--- a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java
+++ b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java
@@ -39,9 +39,7 @@ public void parseThreeBytes() throws IOException {
             // "org/apache/commons/csv/CSV-196/japanese.csv"), format, 0L, 1L, "UTF-8");
         CSVParser parser =  format.parse(getTestInput(
             "org/apache/commons/csv/CSV-196/japanese.csv"), 0L, 1L, "UTF-8");
-
-        
-            long[] charByteKey = {0, 89, 242, 395};
+        long[] charByteKey = {0, 89, 242, 395};
         int idx = 0;
         for (CSVRecord record : parser) {
             assertEquals(charByteKey[idx++], record.getCharacterByte());
@@ -57,9 +55,7 @@ public void parseFourBytes() throws IOException {
             .setDelimiter(',')
             .setQuote('\'')
             .build();
-        
-        // CSVParser parser = new CSVParser(getTestInput(
-        //     "org/apache/commons/csv/CSV-196/emoji.csv"), format, 0L, 1L, "UTF-8");
+
         CSVParser parser =  format.parse(getTestInput(
                 "org/apache/commons/csv/CSV-196/emoji.csv"), 0L, 1L, "UTF-8");
 
@@ -76,4 +72,4 @@ private Reader getTestInput(String path) {
         return new InputStreamReader(
             ClassLoader.getSystemClassLoader().getResourceAsStream(path));
     }
-}
\ No newline at end of file
+}

From 16756c6f1ae998e1fb87c130a238a95b8f2f8469 Mon Sep 17 00:00:00 2001
From: Yuzhan Jiang <yjiang@progress.com>
Date: Thu, 31 Oct 2024 10:01:59 -0400
Subject: [PATCH 3/5] MLE-12358: Mark and reset the byteRead

---
 .../commons/csv/ExtendedBufferedReader.java   | 28 ++++---------------
 .../org/apache/commons/csv/CSVParserTest.java |  1 +
 2 files changed, 6 insertions(+), 23 deletions(-)

diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
index ba14b1021..2a82d48a5 100644
--- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
+++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
@@ -55,14 +55,11 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
 
     /** The number of bytes read so far */
     private long bytesRead;
+    private long bytesReadMark;
+
     /** Encoder used to calculate the bytes of characters */
     CharsetEncoder encoder;
 
-    /**
-     * A flag to indicate if the read is a peek operation.
-     */
-    private boolean isReadPeek;
-
     /**
      * Constructs a new instance using the default buffer size.
      */
@@ -75,7 +72,6 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
         if (encoding != null) {
             encoder = Charset.forName(encoding).newEncoder();
         }
-        isReadPeek = false;
     }
 
     /**
@@ -130,6 +126,7 @@ public void mark(final int readAheadLimit) throws IOException {
         lineNumberMark = lineNumber;
         lastCharMark = lastChar;
         positionMark = position;
+        bytesReadMark = bytesRead;
         super.mark(readAheadLimit);
     }
 
@@ -140,7 +137,7 @@ public int read() throws IOException {
             current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
             lineNumber++;
         }
-        if (encoder != null && !isReadPeek) {
+        if (encoder != null) {
             this.bytesRead += getCharBytes(current);
         }
         lastChar = current;
@@ -241,6 +238,7 @@ public void reset() throws IOException {
         lineNumber = lineNumberMark;
         lastChar = lastCharMark;
         position = positionMark;
+        bytesRead = bytesReadMark;
         super.reset();
     }
 
@@ -253,20 +251,4 @@ long getBytesRead() {
         return this.bytesRead;
     }
 
-    /**
-     * Returns the next character in the current reader without consuming it. So the next call to {@link #read()} will still return this value.
-     *
-     * @return the next character
-     * @throws IOException If an I/O error occurs
-     */
-    @Override
-    public int peek() throws IOException {
-        isReadPeek = true;
-        mark(1);
-        final int c = read();
-        reset();
-        isReadPeek = false;
-        return c;
-    }
-
 }
diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java
index 998e04cd9..f871308e8 100644
--- a/src/test/java/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@@ -699,6 +699,7 @@ public void testGetRecordThreeBytesRead() throws Exception {
             "11111111111111,'4017-09-01',きちんと節分近くには咲いてる～,v4\n" +
             "22222222222222,'4017-01-01',おはよう私の友人～,v4\n" +
             "33333333333333,'4017-01-01',きる自然の力ってすごいな～,v4\n";
+        // String code = "'1',4";
         // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\'');
         final CSVFormat format = CSVFormat.Builder.create()
                                .setDelimiter(',')

From f24889f7e37953ead70894f0de92287ce8223b9c Mon Sep 17 00:00:00 2001
From: Yuzhan Jiang <yjiang@progress.com>
Date: Fri, 1 Nov 2024 11:11:57 -0400
Subject: [PATCH 4/5] Add support in Commons CSV for tracking byte positions
 during parsing

---
 .../org/apache/commons/csv/CSVFormat.java     | 13 +++++++---
 .../org/apache/commons/csv/CSVParser.java     | 25 +++++++++++++++++++
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java
index dd27391d9..773896183 100644
--- a/src/main/java/org/apache/commons/csv/CSVFormat.java
+++ b/src/main/java/org/apache/commons/csv/CSVFormat.java
@@ -2078,13 +2078,18 @@ public CSVParser parse(final Reader reader) throws IOException {
      * Parses the specified content.
      *
      * <p>
-     * See also the various static parse methods on {@link CSVParser}.
+     * This method parses CSV data using the specified encoding. The reader need not start at the beginning of the source: the character offset
+     * and record number seed the parser's counters. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s.
+     * </p>
+     *
+     * <p>
+     * For additional parsing options, see the various static parse methods available on {@link CSVParser}.
      * </p>
      *
      * @param reader the input stream
-     * @param characterOffset the character offset
-     * @param recordNumber the record number
-     * @param encoding the encoding
+     * @param characterOffset the lexer offset to use when the reader does not start at the beginning of the source
+     * @param recordNumber the number to assign to the next record read
+     * @param encoding the character encoding of the input, used to calculate the bytes of the characters read
      * @return a parser over a stream of {@link CSVRecord}s.
      * @throws IOException If an I/O error occurs
      * @throws CSVException Thrown on invalid input.
diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
index 18488deef..c8f30b672 100644
--- a/src/main/java/org/apache/commons/csv/CSVParser.java
+++ b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -441,6 +441,31 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact
             this(reader, format, characterOffset, recordNumber, null);
         }
 
+    /**
+     * Constructs a new instance using the given {@link CSVFormat}.
+     *
+     * <p>
+     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
+     * unless you close the {@code reader}.
+     * </p>
+     *
+     * @param reader
+     *            a Reader containing CSV-formatted input. Must not be null.
+     * @param format
+     *            the CSVFormat used for CSV parsing. Must not be null.
+     * @param characterOffset
+     *            Lexer offset when the parser does not start parsing at the beginning of the source.
+     * @param recordNumber
+     *            The next record number to assign
+     * @param encoding
+     *            The character encoding of the input, used to track byte positions during parsing
+     * @throws IllegalArgumentException
+     *             If the parameters of the format are inconsistent or if either the reader or format is null.
+     * @throws IOException
+     *             If there is a problem reading the header or skipping the first record
+     * @throws CSVException Thrown on invalid input.
+     * @since 1.12
+     */
     public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
         String encoding) throws IOException {
         Objects.requireNonNull(reader, "reader");

From 61087a6784908f816f1a695310c8cdbc18f5b324 Mon Sep 17 00:00:00 2001
From: Yuzhan Jiang <yjiang@progress.com>
Date: Tue, 5 Nov 2024 11:09:11 -0500
Subject: [PATCH 5/5] Add support in Commons CSV for tracking byte positions
 during parsing

---
 src/main/java/org/apache/commons/csv/CSVParser.java | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
index c8f30b672..761599a39 100644
--- a/src/main/java/org/apache/commons/csv/CSVParser.java
+++ b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -464,7 +464,6 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact
      * @throws IOException
      *             If there is a problem reading the header or skipping the first record
      * @throws CSVException Thrown on invalid input.
-     * @since 1.12
      */
     public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
         String encoding) throws IOException {