From 01e25e47c4173d76660b436b18fb06ee6c62b018 Mon Sep 17 00:00:00 2001
From: Harry Law <66720678+harryalaw@users.noreply.github.com>
Date: Wed, 4 Sep 2024 17:07:17 +0100
Subject: [PATCH] Remove to string for dictionary pages (#138)

# Problem

As reported in #121 and #131, there are issues with the parsing of `FIXED_LEN_BYTE_ARRAY` values. This appears to be caused by the fact that they are converted to strings as part of the decoding process for DICTIONARY_PAGE types. This should close both issues.

# Solution

Removing the `toString` conversion removes the error: byte array values are now returned as Buffers, and numeric types are returned as numbers rather than strings.

To check that the change only impacts things as expected, I also added regression tests for all of the reference files, validating the first record returned by each one. The process to get this data was to add the test, see the error message, and then fix the test. With these regression tests in place, I removed the `toString` call and observed how the tests failed. The failures were all in places where the schema returns numeric types or things like timestamps or byte arrays; previously these would have been stringified if the file used a dictionary page.

**Note: this is a breaking change to the way that dictionary page files are decoded. The values being returned will no longer be strings by default, which may create issues for some consumers because the runtime types will change.** (A sketch of the consumer-side adjustment is at the end of this description.)

A possible improvement would be to add something like snapshot testing for the reference-test files, so that a whole file can be compared without having to write it all out manually. I know this is natively supported in jest/vitest, but I'm not sure how to do it in mocha; a hand-rolled helper is sketched at the end of this description.

## Change summary:

- Add testing to validate the first record in each reference file
- Remove the `toString` conversion when decoding dictionary pages

## Steps to Verify:

It may be worth double-checking that the schemas for some of the reference files line up with the values that I've added. I've split the work into two commits so that the first commit adds the regression tests and the second removes the `toString`.

Additionally, I've run the test added in #121. It needs to be updated to either parse the age value from a buffer into a number, or to compare the ages as Buffers. The `id` is also now returned as a bigint, as the column is an int64.

```
assert.deepEqual(records, [
  {
    id: 10n,
    age: Buffer.from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 199, 101, 36, 0]),
    full_name: 'Jonathan Cohen',
  },
  {
    id: 11n,
    age: Buffer.from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 178, 208, 94, 0]),
  },
])
```

The data recorded in #131 is returned as a buffer, but I've not fully checked the contents of that buffer.
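For consumers affected by the breaking change, the adjustment is mechanical: convert Buffers and bigints at the call site instead of relying on the decoder's stringification. Below is a minimal sketch, assuming the package is consumed as `@dsnp/parquetjs` and using a hypothetical `example.parquet` with a byte-array `binary_field` and an int64 `long_field` (the file and column names are illustrative, not from this PR):

```ts
import parquet from '@dsnp/parquetjs';

async function main() {
  // 'example.parquet' is a placeholder for any file whose values previously
  // came back stringified because it uses a dictionary page.
  const reader = await parquet.ParquetReader.openFile('example.parquet');
  const cursor = reader.getCursor();

  let record: Record<string, unknown> | null;
  while ((record = await cursor.next())) {
    // Byte-array columns now arrive as Buffers, not strings.
    const binary = (record.binary_field as Buffer).toString('utf8');
    // int64 columns now arrive as bigints, not strings.
    const long = Number(record.long_field as bigint);
    console.log(binary, long);
  }

  await reader.close();
}

main().catch(console.error);
```

The key point is that `toString` now happens (or doesn't) at the consumer's discretion, rather than unconditionally inside the decoder.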
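On the snapshot-testing question: mocha has no built-in snapshot support, but a small hand-rolled helper would cover this use case. The sketch below is only an illustration, not part of this PR; the `snapshots` directory, the `UPDATE_SNAPSHOTS` variable, and the helper name are all invented for the example. The one subtlety is serialization: `JSON.stringify` throws on bigint, so those need a replacer, while Buffers already serialize deterministically via their built-in `toJSON`.

```ts
import fs from 'node:fs';
import path from 'node:path';
import { expect } from 'chai';

// JSON.stringify throws on bigint, so encode those explicitly; Buffers
// already serialize via their own toJSON as { type: 'Buffer', data: [...] }.
function serialize(records: unknown): string {
  return JSON.stringify(records, (_key, value) => (typeof value === 'bigint' ? `${value}n` : value), 2);
}

export function expectToMatchSnapshot(name: string, records: unknown) {
  const snapshotPath = path.join(__dirname, 'snapshots', `${name}.json`);

  // First run (or an explicit update): write the snapshot and pass.
  if (!fs.existsSync(snapshotPath) || process.env.UPDATE_SNAPSHOTS) {
    fs.mkdirSync(path.dirname(snapshotPath), { recursive: true });
    fs.writeFileSync(snapshotPath, serialize(records));
    return;
  }

  // Subsequent runs: compare against the stored snapshot.
  expect(serialize(records)).to.equal(fs.readFileSync(snapshotPath, 'utf8'));
}
```

A reference test could then call `expectToMatchSnapshot(filename, records)` with every row of the file, instead of maintaining a hand-written first-record table.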
---
 lib/reader.ts                        |   2 +-
 test/reference-test/first-record.ts  | 639 +++++++++++++++++++++++++++
 test/reference-test/read-all.test.ts |  10 +-
 test/test-files.js                   |  34 +-
 4 files changed, 671 insertions(+), 14 deletions(-)
 create mode 100644 test/reference-test/first-record.ts

diff --git a/lib/reader.ts b/lib/reader.ts
index 9ae6fb20..689a62d6 100644
--- a/lib/reader.ts
+++ b/lib/reader.ts
@@ -981,7 +981,7 @@ async function decodeDictionaryPage(cursor: Cursor, header: parquet_thrift.PageH
     dictCursor,
     header.dictionary_page_header!.num_values,
     opts
-  ).map((d: unknown[]) => d.toString());
+  );
 }
 
 async function decodeDataPage(cursor: Cursor, header: parquet_thrift.PageHeader, opts: Options) {
diff --git a/test/reference-test/first-record.ts b/test/reference-test/first-record.ts
new file mode 100644
index 00000000..acf9eafd
--- /dev/null
+++ b/test/reference-test/first-record.ts
@@ -0,0 +1,639 @@
+import { unsupported } from './read-all.test';
+
+/**
+ * Helper function to return the data for the first row of a parquet file
+ *
+ * If the first row has been manually parsed, returns [true, data]
+ * If the data has not been manually parsed returns [false, null]
+ */
+export function getFirstRecord(filename: string): [true, any] | [false, null] {
+  switch (filename) {
+    case 'alltypes_dictionary.parquet':
+      return [
+        true,
+        {
+          bigint_col: 0n,
+          bool_col: true,
+          date_string_col: Buffer.from('01/01/09'),
+          double_col: 0,
+          float_col: 0,
+          id: 0,
+          int_col: 0,
+          smallint_col: 0,
+          string_col: Buffer.from('0'),
+          timestamp_col: 0,
+          tinyint_col: 0,
+        },
+      ];
+    case 'alltypes_plain.parquet':
+      return [
+        true,
+        {
+          bigint_col: 0n,
+          bool_col: true,
+          date_string_col: Buffer.from('03/01/09'),
+          double_col: 0,
+          float_col: 0,
+          id: 4,
+          int_col: 0,
+          smallint_col: 0,
+          string_col: Buffer.from('0'),
+          timestamp_col: 0,
+          tinyint_col: 0,
+        },
+      ];
+    case 'alltypes_plain.snappy.parquet':
+      return [
+        true,
+        {
+          bigint_col: 0n,
+          bool_col: true,
+          date_string_col: Buffer.from('04/01/09'),
+          double_col: 0,
+          float_col: 0,
+          id: 6,
+          int_col: 0,
+          smallint_col: 0,
+          string_col: Buffer.from('0'),
+          timestamp_col: 0,
+          tinyint_col: 0,
+        },
+      ];
+    case 'nan_in_stats.parquet':
+      return [true, { x: 1 }];
+    case 'alltypes_tiny_pages.parquet':
+      return [
+        true,
+        {
+          bigint_col: 20n,
+          bool_col: true,
+          date_string_col: '01/13/09',
+          double_col: 20.2,
+          float_col: 2.200000047683716,
+          id: 122,
+          int_col: 2,
+          month: 1,
+          smallint_col: 2,
+          string_col: '2',
+          timestamp_col: 3725410000000,
+          tinyint_col: 2,
+          year: 2009,
+        },
+      ];
+    case 'alltypes_tiny_pages_plain.parquet':
+      return [
+        true,
+        {
+          bigint_col: 20n,
+          bool_col: true,
+          date_string_col: '01/13/09',
+          double_col: 20.2,
+          float_col: 2.200000047683716,
+          id: 122,
+          int_col: 2,
+          month: 1,
+          smallint_col: 2,
+          string_col: '2',
+          timestamp_col: 3725410000000,
+          tinyint_col: 2,
+          year: 2009,
+        },
+      ];
+    case 'binary.parquet':
+      return [
+        true,
+        {
+          foo: Buffer.from([0]),
+        },
+      ];
+    case 'byte_array_decimal.parquet':
+      return [
+        true,
+        {
+          value: Buffer.from([100]),
+        },
+      ];
+    case 'concatenated_gzip_members.parquet':
+      return [
+        true,
+        {
+          long_col: 1n,
+        },
+      ];
+    case 'data_index_bloom_encoding_stats.parquet':
+      return [
+        true,
+        {
+          String: 'Hello',
+        },
+      ];
+    case 'data_index_bloom_encoding_with_length.parquet':
+      return [
+        true,
+        {
+          String: 'Hello',
+        },
+      ];
+    case 'datapage_v1-corrupt-checksum.parquet':
+      return [true, { a: 50462976, b: 1734763876 }];
+    case 'datapage_v1-snappy-compressed-checksum.parquet':
+      return [true, { a: 50462976, b: 1734763876 }];
+    case 'datapage_v1-uncompressed-checksum.parquet':
+      return [true, { a: 50462976, b: 1734763876 }];
+    case 'dict-page-offset-zero.parquet':
+      return [
+        true,
+        {
+          l_partkey: 1552,
+        },
+      ];
+    case 'fixed_length_byte_array.parquet':
+      return [
+        true,
+        {
+          flba_field: Buffer.from([0, 0, 3, 232]),
+        },
+      ];
+    case 'fixed_length_decimal.parquet':
+      return [
+        true,
+        {
+          value: Buffer.from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100]),
+        },
+      ];
+    case 'fixed_length_decimal_legacy.parquet':
+      return [
+        true,
+        {
+          value: Buffer.from([0, 0, 0, 0, 0, 100]),
+        },
+      ];
+    case 'float16_nonzeros_and_nans.parquet':
+      return [
+        true,
+        {
+          x: null,
+        },
+      ];
+    case 'float16_zeros_and_nans.parquet':
+      return [
+        true,
+        {
+          x: null,
+        },
+      ];
+    case 'int32_decimal.parquet':
+      return [
+        true,
+        {
+          value: 1,
+        },
+      ];
+    case 'int32_with_null_pages.parquet':
+      return [
+        true,
+        {
+          int32_field: -654807448,
+        },
+      ];
+    case 'int64_decimal.parquet':
+      return [
+        true,
+        {
+          value: 1,
+        },
+      ];
+    case 'list_columns.parquet':
+      return [
+        true,
+        {
+          int64_list: {
+            list: [{ item: 1n }, { item: 2n }, { item: 3n }],
+          },
+          utf8_list: {
+            list: [{ item: 'abc' }, { item: 'efg' }, { item: 'hij' }],
+          },
+        },
+      ];
+    case 'nation.dict-malformed.parquet':
+      return [
+        true,
+        {
+          comment_col: Buffer.from(' haggle. carefully final deposits detect slyly agai'),
+          name: Buffer.from('ALGERIA'),
+          nation_key: 0,
+          region_key: 0,
+        },
+      ];
+    case 'nested_maps.snappy.parquet':
+      return [
+        true,
+        {
+          a: {
+            key_value: [
+              {
+                key: 'a',
+                value: {
+                  key_value: [
+                    {
+                      key: 1,
+                      value: true,
+                    },
+                    {
+                      key: 2,
+                      value: false,
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+          b: 1,
+          c: 1,
+        },
+      ];
+    case 'nested_lists.snappy.parquet':
+      return [
+        true,
+        {
+          a: {
+            list: [
+              {
+                element: {
+                  list: [
+                    {
+                      element: {
+                        list: [
+                          {
+                            element: 'a',
+                          },
+                          {
+                            element: 'b',
+                          },
+                        ],
+                      },
+                    },
+                    {
+                      element: {
+                        list: [
+                          {
+                            element: 'c',
+                          },
+                        ],
+                      },
+                    },
+                  ],
+                },
+              },
+              {
+                element: {
+                  list: [
+                    {
+                      element: null,
+                    },
+                    {
+                      element: {
+                        list: [
+                          {
+                            element: 'd',
+                          },
+                        ],
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+          b: 1,
+        },
+      ];
+    case 'nonnullable.impala.parquet':
+      return [
+        true,
+        {
+          ID: 8n,
+          Int_Array: {
+            list: [
+              {
+                element: -1,
+              },
+            ],
+          },
+          Int_Map: {
+            map: [
+              {
+                key: 'k1',
+                value: -1,
+              },
+            ],
+          },
+          int_array_array: {
+            list: [
+              {
+                element: {
+                  list: [
+                    {
+                      element: -1,
+                    },
+                    {
+                      element: -2,
+                    },
+                  ],
+                },
+              },
+              {
+                element: {
+                  list: null,
+                },
+              },
+            ],
+          },
+          int_map_array: {
+            list: [
+              {
+                element: {
+                  map: null,
+                },
+              },
+              {
+                element: {
+                  map: [
+                    {
+                      key: 'k1',
+                      value: 1,
+                    },
+                  ],
+                },
+              },
+              {
+                element: {
+                  map: null,
+                },
+              },
+              {
+                element: {
+                  map: null,
+                },
+              },
+            ],
+          },
+          nested_Struct: {
+            B: {
+              list: [
+                {
+                  element: -1,
+                },
+              ],
+            },
+            G: {
+              map: null,
+            },
+            a: -1,
+            c: {
+              D: {
+                list: [
+                  {
+                    element: {
+                      list: [
+                        {
+                          element: {
+                            e: -1,
+                            f: 'nonnullable',
+                          },
+                        },
+                      ],
+                    },
+                  },
+                ],
+              },
+            },
+          },
+        },
+      ];
+    case 'null_list.parquet':
+      return [
+        true,
+        {
+          emptylist: {
+            list: null,
+          },
+        },
+      ];
+    case 'nullable.impala.parquet':
+      return [
+        true,
+        {
+          id: 1n,
+          int_Map_Array: {
+            list: [
+              {
+                element: {
+                  map: [
+                    {
+                      key: 'k1',
+                      value: 1,
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+          int_array: {
+            list: [
+              {
+                element: 1,
+              },
+              {
+                element: 2,
+              },
+              {
+                element: 3,
+              },
+            ],
+          },
+          int_array_Array: {
+            list: [
+              {
+                element: {
+                  list: [
+                    {
+                      element: 1,
+                    },
+                    {
+                      element: 2,
+                    },
+                  ],
+                },
+              },
+              {
+                element: {
+                  list: [
+                    {
+                      element: 3,
+                    },
+                    {
+                      element: 4,
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+          int_map: {
+            map: [
+              {
+                key: 'k1',
+                value: 1,
+              },
+              {
+                key: 'k2',
+                value: 100,
+              },
+            ],
+          },
+          nested_struct: {
+            A: 1,
+            C: {
+              d: {
+                list: [
+                  {
+                    element: {
+                      list: [
+                        {
+                          element: {
+                            E: 10,
+                            F: 'aaa',
+                          },
+                        },
+                        {
+                          element: {
+                            E: -10,
+                            F: 'bbb',
+                          },
+                        },
+                      ],
+                    },
+                  },
+                  {
+                    element: {
+                      list: [
+                        {
+                          element: {
+                            E: 11,
+                            F: 'c',
+                          },
+                        },
+                      ],
+                    },
+                  },
+                ],
+              },
+            },
+            b: {
+              list: [
+                {
+                  element: 1,
+                },
+              ],
+            },
+            g: {
+              map: [
+                {
+                  key: 'foo',
+                  value: {
+                    H: {
+                      i: {
+                        list: [
+                          {
+                            element: 1.1,
+                          },
+                        ],
+                      },
+                    },
+                  },
+                },
+              ],
+            },
+          },
+        },
+      ];
+    case 'nulls.snappy.parquet':
+      return [
+        true,
+        {
+          b_struct: {
+            b_c_int: null,
+          },
+        },
+      ];
+    case 'overflow_i16_page_cnt.parquet':
+      return [
+        true,
+        {
+          inc: false,
+        },
+      ];
+    case 'plain-dict-uncompressed-checksum.parquet':
+      return [
+        true,
+        {
+          binary_field: Buffer.from('a655fd0e-9949-4059-bcae-fd6a002a4652'),
+          long_field: 0n,
+        },
+      ];
+    case 'repeated_no_annotation.parquet':
+      return [
+        true,
+        {
+          id: 1,
+          phoneNumbers: null,
+        },
+      ];
+    case 'rle-dict-snappy-checksum.parquet':
+      return [
+        true,
+        {
+          binary_field: Buffer.from('c95e263a-f5d4-401f-8107-5ca7146a1f98'),
+          long_field: 0n,
+        },
+      ];
+    case 'rle-dict-uncompressed-corrupt-checksum.parquet':
+      return [
+        true,
+        {
+          binary_field: Buffer.from('6325c32b-f417-41aa-9e02-9b8601542aff'),
+          long_field: 0n,
+        },
+      ];
+    case 'single_nan.parquet':
+      return [
+        true,
+        {
+          mycol: null,
+        },
+      ];
+    // Not supported - see read-all-test.ts
+    // case ("byte_stream_split.zstd.parquet"):
+    // case ("datapage_v2.snappy.parquet"):
+    // case ("delta_binary_packed.parquet"):
+    // case ("delta_byte_array.parquet"):
+    // case ("delta_encoding_optional_column.parquet"):
+    // case ("delta_encoding_required_column.parquet"):
+    // case ("delta_length_byte_array.parquet"):
+    // case ("hadoop_lz4_compressed.parquet"):
+    // case ("hadoop_lz4_compressed_larger.parquet"):
+    // case ("large_string_map.brotli.parquet"):
+    // case ("lz4_raw_compressed.parquet"):
+    // case ("lz4_raw_compressed_larger.parquet"):
+    // case ("nested_structs.rust.parquet"):
+    // case ("non_hadoop_lz4_compressed.parquet"):
+    // case ("rle_boolean_encoding.parquet"):
+    default: {
+      if (!unsupported.includes(filename)) {
+        return [true, 'Please provide the first row of the parquet file for comparison'];
+      }
+      return [false, null];
+    }
+  }
+}
diff --git a/test/reference-test/read-all.test.ts b/test/reference-test/read-all.test.ts
index 4170594b..90264c3f 100644
--- a/test/reference-test/read-all.test.ts
+++ b/test/reference-test/read-all.test.ts
@@ -4,13 +4,15 @@ import path from 'node:path';
 import fs from 'node:fs';
 
 import parquet from '../../parquet';
+import { getFirstRecord } from './first-record';
 
 // Used for testing a single file. Example:
 // const onlyTest = 'single_nan.parquet';
 const onlyTest = null;
 
 // Test files currently unsupported / needing separate test
-const unsupported = [
+// eslint-disable-next-line mocha/no-exports
+export const unsupported = [
   'byte_stream_split.zstd.parquet', // ZSTD unsupported
   'hadoop_lz4_compressed.parquet', // LZ4 unsupported
   'hadoop_lz4_compressed_larger.parquet', // LZ4 unsupported
@@ -44,6 +46,12 @@ describe('Read Test for all files', function () {
       // Expect the same keys as top-level fields
       const expectedRecordKeys = schema.fieldList.filter((x) => x.path.length === 1).map((x) => x.name);
       expect(Object.keys(record)).to.deep.equal(expectedRecordKeys);
+
+      // validate that the first record has the expected values
+      const [shouldTest, expectedFirstRow] = getFirstRecord(filename);
+      if (shouldTest) {
+        expect(record).to.deep.equal(expectedFirstRow);
+      }
     });
   }
 });
diff --git a/test/test-files.js b/test/test-files.js
index 1e1a5bd9..116a4bda 100644
--- a/test/test-files.js
+++ b/test/test-files.js
@@ -101,16 +101,16 @@ describe('test-files', function () {
   it('mr_times.parq loads', async function () {
     const data = await readData('mr_times.parq');
     assert.deepEqual(data, [
-      { id: '1', date_added: '83281000000000' },
-      { id: '2', date_added: '83282000000000' },
-      { id: '3', date_added: '83283000000000' },
-      { id: '4', date_added: '83284000000000' },
-      { id: '5', date_added: '83284000000000' },
-      { id: '6', date_added: '83285000000000' },
-      { id: '7', date_added: '83286000000000' },
-      { id: '8', date_added: '83287000000000' },
-      { id: '9', date_added: '83288000000000' },
-      { id: '10', date_added: '83289000000000' },
+      { id: '1', date_added: 83281000000000 },
+      { id: '2', date_added: 83282000000000 },
+      { id: '3', date_added: 83283000000000 },
+      { id: '4', date_added: 83284000000000 },
+      { id: '5', date_added: 83284000000000 },
+      { id: '6', date_added: 83285000000000 },
+      { id: '7', date_added: 83286000000000 },
+      { id: '8', date_added: 83287000000000 },
+      { id: '9', date_added: 83288000000000 },
+      { id: '10', date_added: 83289000000000 },
     ]);
   });
 
@@ -226,12 +226,22 @@ describe('test-files', function () {
     it('rle-dict-snappy-checksum.parquet loads', async function () {
       const data = await readData('rle/rle-dict-snappy-checksum.parquet');
-      assert.deepEqual(data[0], { binary_field: 'c95e263a-f5d4-401f-8107-5ca7146a1f98', long_field: '0' });
+      for (const item of data) {
+        assert.deepEqual(item, {
+          binary_field: Buffer.from('c95e263a-f5d4-401f-8107-5ca7146a1f98'),
+          long_field: 0n,
+        });
+      }
     });
 
     it('rle-dict-uncompressed-corrupt-checksum.parquet loads', async function () {
       const data = await readData('rle/rle-dict-uncompressed-corrupt-checksum.parquet');
-      assert.deepEqual(data[0], { binary_field: '6325c32b-f417-41aa-9e02-9b8601542aff', long_field: '0' });
+      for (const item of data) {
+        assert.deepEqual(item, {
+          binary_field: Buffer.from('6325c32b-f417-41aa-9e02-9b8601542aff'),
+          long_field: 0n,
+        });
+      }
    });
  });
});