From 1c52ad3a286fd3d1bb7e6e5b586f675bbb5ad7e3 Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang Date: Mon, 28 Oct 2024 15:25:51 -0400 Subject: [PATCH 1/5] MLE-12358: (Technical) Apply fix on MLCP commons-csv to the latest 1.12 release --- pom.xml | 3 + .../org/apache/commons/csv/CSVFormat.java | 4 + .../org/apache/commons/csv/CSVParser.java | 10 ++- .../org/apache/commons/csv/CSVRecord.java | 26 ++++++ .../commons/csv/ExtendedBufferedReader.java | 79 +++++++++++++++++++ .../java/org/apache/commons/csv/Lexer.java | 9 +++ .../org/apache/commons/csv/CSVParserTest.java | 79 +++++++++++++++++++ .../apache/commons/csv/JiraCsv196Test.java | 79 +++++++++++++++++++ .../org/apache/commons/csv/CSV-196/emoji.csv | 5 ++ .../apache/commons/csv/CSV-196/japanese.csv | 4 + 10 files changed, 296 insertions(+), 2 deletions(-) create mode 100644 src/test/java/org/apache/commons/csv/JiraCsv196Test.java create mode 100644 src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv create mode 100644 src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv diff --git a/pom.xml b/pom.xml index a3e053a10..1e81e5f9d 100644 --- a/pom.xml +++ b/pom.xml @@ -28,6 +28,7 @@ https://commons.apache.org/proper/commons-csv/ 2005 The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types. 
+ jar @@ -231,6 +232,8 @@ src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv src/test/resources/org/apache/commons/csv/csv-167/sample1.csv src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv + src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv + src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index dd5416e11..800bd1efb 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -2074,6 +2074,10 @@ public CSVParser parse(final Reader reader) throws IOException { return new CSVParser(reader, this); } + public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException { + return new CSVParser(reader, this, characterOffset, recordNumber, encoding); + } + /** * Prints to the specified output. 
* diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index a2bc23070..b7a6fcf56 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -438,10 +438,15 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException @SuppressWarnings("resource") public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) throws IOException { + this(reader, format, characterOffset, recordNumber, null); + } + + public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset,final long recordNumber, + String encoding) throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(format, "format"); this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader)); + this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding)); this.csvRecordIterator = new CSVRecordIterator(); this.headers = createHeaders(); this.characterOffset = characterOffset; @@ -768,6 +773,7 @@ CSVRecord nextRecord() throws IOException { recordList.clear(); StringBuilder sb = null; final long startCharPosition = lexer.getCharacterPosition() + characterOffset; + final long startCharByte = lexer.getBytesRead() + this.characterOffset; do { reusableToken.reset(); lexer.nextToken(reusableToken); @@ -805,7 +811,7 @@ CSVRecord nextRecord() throws IOException { recordNumber++; final String comment = Objects.toString(sb, null); result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment, - recordNumber, startCharPosition); + recordNumber, startCharPosition, startCharByte); } return result; } diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index 1fac65843..a86d910c9 100644 --- 
a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -17,6 +17,8 @@ package org.apache.commons.csv; +import static org.apache.commons.csv.Constants.EMPTY_STRING_ARRAY; + import java.io.Serializable; import java.util.Arrays; import java.util.Iterator; @@ -48,6 +50,11 @@ public final class CSVRecord implements Serializable, Iterable { */ private final long characterPosition; + /** + * The start byte of this record as a character byte in the source stream. + */ + private final long characterByte; + /** The accumulated comments (if any) */ private final String comment; @@ -67,8 +74,18 @@ public final class CSVRecord implements Serializable, Iterable { this.parser = parser; this.comment = comment; this.characterPosition = characterPosition; + this.characterByte = 0L; } + CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber, + final long characterPosition, final long characterByte) { + this.recordNumber = recordNumber; + this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY; + this.parser = parser; + this.comment = comment; + this.characterPosition = characterPosition; + this.characterByte = characterByte; + } /** * Returns a value by {@link Enum}. * @@ -144,6 +161,15 @@ public long getCharacterPosition() { return characterPosition; } + /** + * Returns the start byte of this record as a character byte in the source stream. + * + * @return the start byte of this record as a character byte in the source stream. + */ + public long getCharacterByte() { + return characterByte; + } + /** * Returns the comment for this record, if any. * Note that comments are attached to the following record. 
diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 18c922a50..92654de3d 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -24,6 +24,10 @@ import java.io.IOException; import java.io.Reader; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.UnsynchronizedBufferedReader; @@ -49,6 +53,16 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { private long position; private long positionMark; + /** The number of bytes read so far */ + private long bytesRead; + /** Encoder used to calculate the bytes of characters */ + CharsetEncoder encoder; + + /** + * A flag to indicate if the read is a peek operation. + */ + private boolean isReadPeek; + /** * Constructs a new instance using the default buffer size. */ @@ -56,6 +70,14 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { super(reader); } + ExtendedBufferedReader(final Reader reader, String encoding) { + super(reader); + if (encoding != null) { + encoder = Charset.forName(encoding).newEncoder(); + } + isReadPeek = false; + } + /** * Closes the stream. * @@ -118,11 +140,43 @@ public int read() throws IOException { current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) { lineNumber++; } + if (encoder != null && !isReadPeek) { + this.bytesRead += getCharBytes(current); + } lastChar = current; position++; return lastChar; } + /** + * In Java, a char data type are based on the original Unicode + * specification, which defined characters as fixed-width 16-bit entities. 
+ * U+0000 to U+FFFF: + * - BMP, represented using 1 16-bit char + * - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars + * U+10000 to U+10FFFF: + * - Supplementary characters, represented as a pair of characters, + * the first char from the high-surrogates range (\uD800-\uDBFF), + * and the second char from the low-surrogates range (uDC00-\uDFFF). + * - Consists of UTF-8 some 3-byte chars and 4-byte chars + */ + private long getCharBytes(int current) throws CharacterCodingException { + char cChar = (char)current; + char lChar = (char)lastChar; + if (!Character.isSurrogate(cChar)) { + return encoder.encode( + CharBuffer.wrap(new char[] {cChar})).limit(); + } else { + if (Character.isHighSurrogate(cChar)) { + // Move on to the next char (low surrogate) + return 0; + } else if (Character.isSurrogatePair(lChar, cChar)) { + return encoder.encode( + CharBuffer.wrap(new char[] {lChar, cChar})).limit(); + } else throw new CharacterCodingException(); + } + } + @Override public int read(final char[] buf, final int offset, final int length) throws IOException { if (length == 0) { @@ -190,4 +244,29 @@ public void reset() throws IOException { super.reset(); } + /** + * Gets the number of bytes read by the reader. + * + * @return the number of bytes read by the read + */ + long getBytesRead() { + return this.bytesRead; + } + + /** + * Returns the next character in the current reader without consuming it. So the next call to {@link #read()} will still return this value. 
+ * + * @return the next character + * @throws IOException If an I/O error occurs + */ + @Override + public int peek() throws IOException { + isReadPeek = true; + mark(1); + final int c = read(); + reset(); + isReadPeek = false; + return c; + } + } diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 6d9c8a485..4eb0ca4eb 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -103,6 +103,15 @@ long getCharacterPosition() { return reader.getPosition(); } + /** + * Returns the number of bytes read + * + * @return the number of bytes read + */ + long getBytesRead() { + return reader.getBytesRead(); + } + /** * Returns the current line number * diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 6a0637301..36ed8475b 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -693,6 +693,85 @@ public void testGetHeaderComment_NoComment3() throws IOException { } } + @Test + public void testGetRecordThreeBytesRead() throws Exception { + String code = "id,date,val5,val4\n" + + "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" + + "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" + + "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; + + // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .build(); + // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8"); + + CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8"); + + CSVRecord record; + assertEquals(0, parser.getRecordNumber()); + assertNotNull(record = parser.nextRecord()); + assertEquals(1, record.getRecordNumber()); + assertEquals(code.indexOf('i'), record.getCharacterPosition()); + 
assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(2, record.getRecordNumber()); + assertEquals(code.indexOf('1'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(3, record.getRecordNumber()); + assertEquals(code.indexOf('2'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 95); + + assertNotNull(record = parser.nextRecord()); + assertEquals(4, record.getRecordNumber()); + assertEquals(code.indexOf('3'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 154); + + parser.close(); + + } + + @Test + public void testGetRecordFourBytesRead() throws Exception { + String code = "id,a,b,c\n" + + "1,😊,🤔,😂\n" + + "2,😊,🤔,😂\n" + + "3,😊,🤔,😂\n"; + // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .build(); + + // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8"); + CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8"); + + CSVRecord record; + assertEquals(0, parser.getRecordNumber()); + assertNotNull(record = parser.nextRecord()); + assertEquals(1, record.getRecordNumber()); + assertEquals(code.indexOf('i'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + + assertNotNull(record = parser.nextRecord()); + assertEquals(2, record.getRecordNumber()); + assertEquals(code.indexOf('1'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), record.getCharacterPosition()); + assertNotNull(record = parser.nextRecord()); + assertEquals(3, record.getRecordNumber()); + assertEquals(code.indexOf('2'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 26); + 
assertNotNull(record = parser.nextRecord()); + assertEquals(4, record.getRecordNumber()); + assertEquals(code.indexOf('3'), record.getCharacterPosition()); + assertEquals(record.getCharacterByte(), 43); + parser.close(); + } + @Test public void testGetHeaderMap() throws Exception { try (final CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) { diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java new file mode 100644 index 000000000..596b14f08 --- /dev/null +++ b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.csv; +import static org.junit.jupiter.api.Assertions.assertEquals; + + +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; + + +import org.junit.jupiter.api.Test; + + +public class JiraCsv196Test { + @Test + public void parseThreeBytes() throws IOException { + + // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .build(); + // CSVParser parser = new CSVParser(getTestInput( + // "org/apache/commons/csv/CSV-196/japanese.csv"), format, 0L, 1L, "UTF-8"); + CSVParser parser = format.parse(getTestInput( + "org/apache/commons/csv/CSV-196/japanese.csv"), 0L, 1L, "UTF-8"); + + + long[] charByteKey = {0, 89, 242, 395}; + int idx = 0; + for (CSVRecord record : parser) { + assertEquals(charByteKey[idx++], record.getCharacterByte()); + } + parser.close(); + } + + + @Test + public void parseFourBytes() throws IOException { + // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); + final CSVFormat format = CSVFormat.Builder.create() + .setDelimiter(',') + .setQuote('\'') + .build(); + + // CSVParser parser = new CSVParser(getTestInput( + // "org/apache/commons/csv/CSV-196/emoji.csv"), format, 0L, 1L, "UTF-8"); + CSVParser parser = format.parse(getTestInput( + "org/apache/commons/csv/CSV-196/emoji.csv"), 0L, 1L, "UTF-8"); + + long[] charByteKey = {0, 84, 701, 1318, 1935}; + int idx = 0; + for (CSVRecord record : parser) { + assertEquals(charByteKey[idx++], record.getCharacterByte()); + } + parser.close(); + } + + + private Reader getTestInput(String path) { + return new InputStreamReader( + ClassLoader.getSystemClassLoader().getResourceAsStream(path)); + } +} \ No newline at end of file diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv b/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv new file mode 100644 index 000000000..0bff7a44f --- /dev/null +++ 
b/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv @@ -0,0 +1,5 @@ +id,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15 +1,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 +2,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 +3,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 +4,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄 \ No newline at end of file diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv b/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv new file mode 100644 index 000000000..b06e04bd6 --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv @@ -0,0 +1,4 @@ +id,date,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15 +00000000000001,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15 +00000000000002,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15 +00000000000003,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15 \ No newline at end of file From b397c1668c91e68c7d79fe0e307da4faa7eb0291 Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang Date: Mon, 28 Oct 2024 16:37:28 -0400 Subject: [PATCH 2/5] MLE-12358: (Technical) Apply fix on MLCP commons-csv to the latest 1.12 release --- .../org/apache/commons/csv/CSVFormat.java | 15 +++++++++++++ .../org/apache/commons/csv/CSVParser.java | 2 +- .../org/apache/commons/csv/CSVRecord.java | 2 -- 
.../commons/csv/ExtendedBufferedReader.java | 10 ++++----- .../java/org/apache/commons/csv/Lexer.java | 2 +- .../org/apache/commons/csv/CSVParserTest.java | 22 +++++++++---------- .../apache/commons/csv/JiraCsv196Test.java | 10 +++------ 7 files changed, 35 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index 800bd1efb..dd27391d9 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -2074,6 +2074,21 @@ public CSVParser parse(final Reader reader) throws IOException { return new CSVParser(reader, this); } + /** + * Parses the specified content. + * + *

<p> + * See also the various static parse methods on {@link CSVParser}. + *
</p>
+ * + * @param reader the input stream + * @param characterOffset the character offset + * @param recordNumber the record number + * @param encoding the encoding + * @return a parser over a stream of {@link CSVRecord}s. + * @throws IOException If an I/O error occurs + * @throws CSVException Thrown on invalid input. + */ public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException { return new CSVParser(reader, this, characterOffset, recordNumber, encoding); } diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index b7a6fcf56..18488deef 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -441,7 +441,7 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact this(reader, format, characterOffset, recordNumber, null); } - public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset,final long recordNumber, + public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, String encoding) throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(format, "format"); diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index a86d910c9..f0a0a6b81 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -17,8 +17,6 @@ package org.apache.commons.csv; -import static org.apache.commons.csv.Constants.EMPTY_STRING_ARRAY; - import java.io.Serializable; import java.util.Arrays; import java.util.Iterator; diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 92654de3d..ba14b1021 100644 --- 
a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -57,7 +57,7 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { private long bytesRead; /** Encoder used to calculate the bytes of characters */ CharsetEncoder encoder; - + /** * A flag to indicate if the read is a peek operation. */ @@ -77,7 +77,7 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { } isReadPeek = false; } - + /** * Closes the stream. * @@ -141,7 +141,7 @@ public int read() throws IOException { lineNumber++; } if (encoder != null && !isReadPeek) { - this.bytesRead += getCharBytes(current); + this.bytesRead += getCharBytes(current); } lastChar = current; position++; @@ -161,8 +161,8 @@ public int read() throws IOException { * - Consists of UTF-8 some 3-byte chars and 4-byte chars */ private long getCharBytes(int current) throws CharacterCodingException { - char cChar = (char)current; - char lChar = (char)lastChar; + char cChar = (char) current; + char lChar = (char) lastChar; if (!Character.isSurrogate(cChar)) { return encoder.encode( CharBuffer.wrap(new char[] {cChar})).limit(); diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java index 4eb0ca4eb..afbba4d21 100644 --- a/src/main/java/org/apache/commons/csv/Lexer.java +++ b/src/main/java/org/apache/commons/csv/Lexer.java @@ -111,7 +111,7 @@ long getCharacterPosition() { long getBytesRead() { return reader.getBytesRead(); } - + /** * Returns the current line number * diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 36ed8475b..998e04cd9 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -695,21 +695,19 @@ public void testGetHeaderComment_NoComment3() throws IOException { @Test public void 
testGetRecordThreeBytesRead() throws Exception { - String code = "id,date,val5,val4\n" - + "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" - + "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" - + "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; - + String code = "id,date,val5,val4\n" + + "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" + + "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" + + "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); final CSVFormat format = CSVFormat.Builder.create() .setDelimiter(',') .setQuote('\'') .build(); // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8"); - CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8"); - CSVRecord record; + CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L); assertEquals(0, parser.getRecordNumber()); assertNotNull(record = parser.nextRecord()); assertEquals(1, record.getRecordNumber()); @@ -737,16 +735,16 @@ public void testGetRecordThreeBytesRead() throws Exception { @Test public void testGetRecordFourBytesRead() throws Exception { - String code = "id,a,b,c\n" - + "1,😊,🤔,😂\n" - + "2,😊,🤔,😂\n" - + "3,😊,🤔,😂\n"; + String code = "id,a,b,c\n" + + "1,😊,🤔,😂\n" + + "2,😊,🤔,😂\n" + + "3,😊,🤔,😂\n"; // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); final CSVFormat format = CSVFormat.Builder.create() .setDelimiter(',') .setQuote('\'') .build(); - + // CSVParser parser = new CSVParser(new StringReader(code), format, 0L, 1L, "UTF-8"); CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8"); diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java index 596b14f08..7dbc23caf 100644 --- a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java +++ b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java @@ -39,9 +39,7 @@ public void parseThreeBytes() throws IOException { // 
"org/apache/commons/csv/CSV-196/japanese.csv"), format, 0L, 1L, "UTF-8"); CSVParser parser = format.parse(getTestInput( "org/apache/commons/csv/CSV-196/japanese.csv"), 0L, 1L, "UTF-8"); - - - long[] charByteKey = {0, 89, 242, 395}; + long[] charByteKey = {0, 89, 242, 395}; int idx = 0; for (CSVRecord record : parser) { assertEquals(charByteKey[idx++], record.getCharacterByte()); @@ -57,9 +55,7 @@ public void parseFourBytes() throws IOException { .setDelimiter(',') .setQuote('\'') .build(); - - // CSVParser parser = new CSVParser(getTestInput( - // "org/apache/commons/csv/CSV-196/emoji.csv"), format, 0L, 1L, "UTF-8"); + CSVParser parser = format.parse(getTestInput( "org/apache/commons/csv/CSV-196/emoji.csv"), 0L, 1L, "UTF-8"); @@ -76,4 +72,4 @@ private Reader getTestInput(String path) { return new InputStreamReader( ClassLoader.getSystemClassLoader().getResourceAsStream(path)); } -} \ No newline at end of file +} From 16756c6f1ae998e1fb87c130a238a95b8f2f8469 Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang Date: Thu, 31 Oct 2024 10:01:59 -0400 Subject: [PATCH 3/5] MLE-12358: Mark and reset the byteRead --- .../commons/csv/ExtendedBufferedReader.java | 28 ++++--------------- .../org/apache/commons/csv/CSVParserTest.java | 1 + 2 files changed, 6 insertions(+), 23 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index ba14b1021..2a82d48a5 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -55,14 +55,11 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { /** The number of bytes read so far */ private long bytesRead; + private long bytesReadMark; + /** Encoder used to calculate the bytes of characters */ CharsetEncoder encoder; - /** - * A flag to indicate if the read is a peek operation. 
- */ - private boolean isReadPeek; - /** * Constructs a new instance using the default buffer size. */ @@ -75,7 +72,6 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { if (encoding != null) { encoder = Charset.forName(encoding).newEncoder(); } - isReadPeek = false; } /** @@ -130,6 +126,7 @@ public void mark(final int readAheadLimit) throws IOException { lineNumberMark = lineNumber; lastCharMark = lastChar; positionMark = position; + bytesReadMark = bytesRead; super.mark(readAheadLimit); } @@ -140,7 +137,7 @@ public int read() throws IOException { current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) { lineNumber++; } - if (encoder != null && !isReadPeek) { + if (encoder != null) { this.bytesRead += getCharBytes(current); } lastChar = current; @@ -241,6 +238,7 @@ public void reset() throws IOException { lineNumber = lineNumberMark; lastChar = lastCharMark; position = positionMark; + bytesRead = bytesReadMark; super.reset(); } @@ -253,20 +251,4 @@ long getBytesRead() { return this.bytesRead; } - /** - * Returns the next character in the current reader without consuming it. So the next call to {@link #read()} will still return this value. 
- * - * @return the next character - * @throws IOException If an I/O error occurs - */ - @Override - public int peek() throws IOException { - isReadPeek = true; - mark(1); - final int c = read(); - reset(); - isReadPeek = false; - return c; - } - } diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 998e04cd9..f871308e8 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -699,6 +699,7 @@ public void testGetRecordThreeBytesRead() throws Exception { "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" + "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" + "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n"; + // String code = "'1',4"; // final CSVFormat format = CSVFormat.newFormat(',').withQuote('\''); final CSVFormat format = CSVFormat.Builder.create() .setDelimiter(',') From f24889f7e37953ead70894f0de92287ce8223b9c Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang Date: Fri, 1 Nov 2024 11:11:57 -0400 Subject: [PATCH 4/5] Add support in Commons CSV for tracking byte positions during parsing --- .../org/apache/commons/csv/CSVFormat.java | 13 +++++++--- .../org/apache/commons/csv/CSVParser.java | 25 +++++++++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index dd27391d9..773896183 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -2078,13 +2078,18 @@ public CSVParser parse(final Reader reader) throws IOException { * Parses the specified content. * *

<p> - * See also the various static parse methods on {@link CSVParser}. + * This method provides a way to parse CSV data from an input stream, starting at a specified character offset and record number, + * using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s. + *
</p>
+ * + *

<p> + * For additional parsing options, see the various static parse methods available on {@link CSVParser}. *
</p>
* * @param reader the input stream - * @param characterOffset the character offset - * @param recordNumber the record number - * @param encoding the encoding + * @param characterOffset the character offset to start parsing from + * @param recordNumber the initial record number to start counting from + * @param encoding the character encoding of the input stream * @return a parser over a stream of {@link CSVRecord}s. * @throws IOException If an I/O error occurs * @throws CSVException Thrown on invalid input. diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index 18488deef..c8f30b672 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -441,6 +441,31 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact this(reader, format, characterOffset, recordNumber, null); } + /** + * Constructs a new instance using the given {@link CSVFormat} + * + *

<p> + * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, + * unless you close the {@code reader}. + *
</p>
+ * + * @param reader + * a Reader containing CSV-formatted input. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @param characterOffset + * Lexer offset when the parser does not start parsing at the beginning of the source. + * @param recordNumber + * The next record number to assign + * @param encoding + * The encoding to use for the reader + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either the reader or format is null. + * @throws IOException + * If there is a problem reading the header or skipping the first record + * @throws CSVException Thrown on invalid input. + * @since 1.12 + */ public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, String encoding) throws IOException { Objects.requireNonNull(reader, "reader"); From 61087a6784908f816f1a695310c8cdbc18f5b324 Mon Sep 17 00:00:00 2001 From: Yuzhan Jiang Date: Tue, 5 Nov 2024 11:09:11 -0500 Subject: [PATCH 5/5] Add support in Commons CSV for tracking byte positions during parsing --- src/main/java/org/apache/commons/csv/CSVParser.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index c8f30b672..761599a39 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -464,7 +464,6 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact * @throws IOException * If there is a problem reading the header or skipping the first record * @throws CSVException Thrown on invalid input. - * @since 1.12 */ public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, String encoding) throws IOException {