From 01e25e47c4173d76660b436b18fb06ee6c62b018 Mon Sep 17 00:00:00 2001
From: Harry Law <66720678+harryalaw@users.noreply.github.com>
Date: Wed, 4 Sep 2024 17:07:17 +0100
Subject: [PATCH] Remove to string for dictionary pages (#138)

# Problem

As reported in #121 and #131, there are issues with the parsing of `FIXED_LEN_BYTE_ARRAY` values. This appears to be caused by the fact that they are converted to strings as part of the decoding process for DICTIONARY_PAGE types. This should close both issues.

# Solution

Removing the `toString` conversion removes the error: byte array values are now returned as Buffers, and numeric types are returned as numbers rather than strings.

To check that the change only impacts things as expected, I also added regression tests for all of the reference files, validating the first record returned by each one. The process to get this data was to add the test, see the error message, and then fix the test. With these regression tests in place, I removed the `toString` call and observed how the tests failed. The failures were all in places where the schema returns numeric types or things like timestamps or byte arrays; previously these would have been stringified if the file used a dictionary page.

**Note: this is a breaking change to the way that dictionary page files are decoded. The values being returned will no longer be strings by default, which may create issues for some consumers because the runtime types will change.** (A sketch of the consumer-side adjustment is at the end of this description.)

A possible improvement would be to add something like snapshot testing for the reference-test files, so that a whole file can be compared without having to write it all out manually. I know this is natively supported in jest/vitest, but I'm not sure how to do it in mocha; a hand-rolled helper is sketched at the end of this description.

## Change summary:

- Add testing to validate the first record in each reference file
- Remove the `toString` conversion when decoding dictionary pages

## Steps to Verify:

It may be worth double-checking that the schemas for some of the reference files line up with the values that I've added. I've split the work into two commits so that the first commit adds the regression tests and the second removes the `toString`.

Additionally, I've run the test added in #121. It needs to be updated to either parse the age value from a buffer into a number, or to compare the ages as Buffers. The `id` is also now returned as a bigint, as the column is an int64.

```
assert.deepEqual(records, [
  {
    id: 10n,
    age: Buffer.from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 199, 101, 36, 0]),
    full_name: 'Jonathan Cohen',
  },
  {
    id: 11n,
    age: Buffer.from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 178, 208, 94, 0]),
  },
])
```

The data recorded in #131 is returned as a buffer, but I've not fully checked the contents of that buffer.
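For consumers affected by the breaking change, the adjustment is mechanical: convert Buffers and bigints at the call site instead of relying on the decoder's stringification. Below is a minimal sketch, assuming the package is consumed as `@dsnp/parquetjs` and using a hypothetical `example.parquet` with a byte-array `binary_field` and an int64 `long_field` (the file and column names are illustrative, not from this PR):

```ts
import parquet from '@dsnp/parquetjs';

async function main() {
  // 'example.parquet' is a placeholder for any file whose values previously
  // came back stringified because it uses a dictionary page.
  const reader = await parquet.ParquetReader.openFile('example.parquet');
  const cursor = reader.getCursor();

  let record: Record<string, unknown> | null;
  while ((record = await cursor.next())) {
    // Byte-array columns now arrive as Buffers, not strings.
    const binary = (record.binary_field as Buffer).toString('utf8');
    // int64 columns now arrive as bigints, not strings.
    const long = Number(record.long_field as bigint);
    console.log(binary, long);
  }

  await reader.close();
}

main().catch(console.error);
```

The key point is that `toString` now happens (or doesn't) at the consumer's discretion, rather than unconditionally inside the decoder.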
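On the snapshot-testing question: mocha has no built-in snapshot support, but a small hand-rolled helper would cover this use case. The sketch below is only an illustration, not part of this PR; the `snapshots` directory, the `UPDATE_SNAPSHOTS` variable, and the helper name are all invented for the example. The one subtlety is serialization: `JSON.stringify` throws on bigint, so those need a replacer, while Buffers already serialize deterministically via their built-in `toJSON`.

```ts
import fs from 'node:fs';
import path from 'node:path';
import { expect } from 'chai';

// JSON.stringify throws on bigint, so encode those explicitly; Buffers
// already serialize via their own toJSON as { type: 'Buffer', data: [...] }.
function serialize(records: unknown): string {
  return JSON.stringify(records, (_key, value) => (typeof value === 'bigint' ? `${value}n` : value), 2);
}

export function expectToMatchSnapshot(name: string, records: unknown) {
  const snapshotPath = path.join(__dirname, 'snapshots', `${name}.json`);

  // First run (or an explicit update): write the snapshot and pass.
  if (!fs.existsSync(snapshotPath) || process.env.UPDATE_SNAPSHOTS) {
    fs.mkdirSync(path.dirname(snapshotPath), { recursive: true });
    fs.writeFileSync(snapshotPath, serialize(records));
    return;
  }

  // Subsequent runs: compare against the stored snapshot.
  expect(serialize(records)).to.equal(fs.readFileSync(snapshotPath, 'utf8'));
}
```

A reference test could then call `expectToMatchSnapshot(filename, records)` with every row of the file, instead of maintaining a hand-written first-record table.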
---
 lib/reader.ts                        |   2 +-
 test/reference-test/first-record.ts  | 639 +++++++++++++++++++++++++++
 test/reference-test/read-all.test.ts |  10 +-
 test/test-files.js                   |  34 +-
 4 files changed, 671 insertions(+), 14 deletions(-)
 create mode 100644 test/reference-test/first-record.ts

diff --git a/lib/reader.ts b/lib/reader.ts
index 9ae6fb20..689a62d6 100644
--- a/lib/reader.ts
+++ b/lib/reader.ts
@@ -981,7 +981,7 @@ async function decodeDictionaryPage(cursor: Cursor, header: parquet_thrift.PageH
     dictCursor,
     header.dictionary_page_header!.num_values,
     opts
-  ).map((d: unknown[]) => d.toString());
+  );
 }
 
 async function decodeDataPage(cursor: Cursor, header: parquet_thrift.PageHeader, opts: Options) {
diff --git a/test/reference-test/first-record.ts b/test/reference-test/first-record.ts
new file mode 100644
index 00000000..acf9eafd
--- /dev/null
+++ b/test/reference-test/first-record.ts
@@ -0,0 +1,639 @@
+import { unsupported } from './read-all.test';
+
+/**
+ * Helper function to return the data for the first row of a parquet file
+ *
+ * If the first row has been manually parsed, returns [true, data]
+ * If the data has not been manually parsed returns [false, null]
+ */
+export function getFirstRecord(filename: string): [true, any] | [false, null] {
+  switch (filename) {
+    case 'alltypes_dictionary.parquet':
+      return [
+        true,
+        {
+          bigint_col: 0n,
+          bool_col: true,
+          date_string_col: Buffer.from('01/01/09'),
+          double_col: 0,
+          float_col: 0,
+          id: 0,
+          int_col: 0,
+          smallint_col: 0,
+          string_col: Buffer.from('0'),
+          timestamp_col: 0,
+          tinyint_col: 0,
+        },
+      ];
+    case 'alltypes_plain.parquet':
+      return [
+        true,
+        {
+          bigint_col: 0n,
+          bool_col: true,
+          date_string_col: Buffer.from('03/01/09'),
+          double_col: 0,
+          float_col: 0,
+          id: 4,
+          int_col: 0,
+          smallint_col: 0,
+          string_col: Buffer.from('0'),
+          timestamp_col: 0,
+          tinyint_col: 0,
+        },
+      ];
+    case 'alltypes_plain.snappy.parquet':
+      return [
+        true,
+        {
+          bigint_col: 0n,
+          bool_col: true,
+          date_string_col: Buffer.from('04/01/09'),
+          double_col: 0,
+          float_col: 0,
+          id: 6,
+          int_col: 0,
+          smallint_col: 0,
+          string_col: Buffer.from('0'),
+          timestamp_col: 0,
+          tinyint_col: 0,
+        },
+      ];
+    case 'nan_in_stats.parquet':
+      return [true, { x: 1 }];
+    case 'alltypes_tiny_pages.parquet':
+      return [
+        true,
+        {
+          bigint_col: 20n,
+          bool_col: true,
+          date_string_col: '01/13/09',
+          double_col: 20.2,
+          float_col: 2.200000047683716,
+          id: 122,
+          int_col: 2,
+          month: 1,
+          smallint_col: 2,
+          string_col: '2',
+          timestamp_col: 3725410000000,
+          tinyint_col: 2,
+          year: 2009,
+        },
+      ];
+    case 'alltypes_tiny_pages_plain.parquet':
+      return [
+        true,
+        {
+          bigint_col: 20n,
+          bool_col: true,
+          date_string_col: '01/13/09',
+          double_col: 20.2,
+          float_col: 2.200000047683716,
+          id: 122,
+          int_col: 2,
+          month: 1,
+          smallint_col: 2,
+          string_col: '2',
+          timestamp_col: 3725410000000,
+          tinyint_col: 2,
+          year: 2009,
+        },
+      ];
+    case 'binary.parquet':
+      return [
+        true,
+        {
+          foo: Buffer.from([0]),
+        },
+      ];
+    case 'byte_array_decimal.parquet':
+      return [
+        true,
+        {
+          value: Buffer.from([100]),
+        },
+      ];
+    case 'concatenated_gzip_members.parquet':
+      return [
+        true,
+        {
+          long_col: 1n,
+        },
+      ];
+    case 'data_index_bloom_encoding_stats.parquet':
+      return [
+        true,
+        {
+          String: 'Hello',
+        },
+      ];
+    case 'data_index_bloom_encoding_with_length.parquet':
+      return [
+        true,
+        {
+          String: 'Hello',
+        },
+      ];
+    case 'datapage_v1-corrupt-checksum.parquet':
+      return [true, { a: 50462976, b: 1734763876 }];
+    case 'datapage_v1-snappy-compressed-checksum.parquet':
+      return [true, { a: 50462976, b: 1734763876 }];
+    case 'datapage_v1-uncompressed-checksum.parquet':
+      return [true, { a: 50462976, b: 1734763876 }];
+    case 'dict-page-offset-zero.parquet':
+      return [
+        true,
+        {
+          l_partkey: 1552,
+        },
+      ];
+    case 'fixed_length_byte_array.parquet':
+      return [
+        true,
+        {
+          flba_field: Buffer.from([0, 0, 3, 232]),
+        },
+      ];
+    case 'fixed_length_decimal.parquet':
+      return [
+        true,
+        {
+          value: Buffer.from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100]),
+        },
+      ];
+    case 'fixed_length_decimal_legacy.parquet':
+      return [
+        true,
+        {
+          value: Buffer.from([0, 0, 0, 0, 0, 100]),
+        },
+      ];
+    case 'float16_nonzeros_and_nans.parquet':
+      return [
+        true,
+        {
+          x: null,
+        },
+      ];
+    case 'float16_zeros_and_nans.parquet':
+      return [
+        true,
+        {
+          x: null,
+        },
+      ];
+    case 'int32_decimal.parquet':
+      return [
+        true,
+        {
+          value: 1,
+        },
+      ];
+    case 'int32_with_null_pages.parquet':
+      return [
+        true,
+        {
+          int32_field: -654807448,
+        },
+      ];
+    case 'int64_decimal.parquet':
+      return [
+        true,
+        {
+          value: 1,
+        },
+      ];
+    case 'list_columns.parquet':
+      return [
+        true,
+        {
+          int64_list: {
+            list: [{ item: 1n }, { item: 2n }, { item: 3n }],
+          },
+          utf8_list: {
+            list: [{ item: 'abc' }, { item: 'efg' }, { item: 'hij' }],
+          },
+        },
+      ];
+    case 'nation.dict-malformed.parquet':
+      return [
+        true,
+        {
+          comment_col: Buffer.from(' haggle. carefully final deposits detect slyly agai'),
+          name: Buffer.from('ALGERIA'),
+          nation_key: 0,
+          region_key: 0,
+        },
+      ];
+    case 'nested_maps.snappy.parquet':
+      return [
+        true,
+        {
+          a: {
+            key_value: [
+              {
+                key: 'a',
+                value: {
+                  key_value: [
+                    {
+                      key: 1,
+                      value: true,
+                    },
+                    {
+                      key: 2,
+                      value: false,
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+          b: 1,
+          c: 1,
+        },
+      ];
+    case 'nested_lists.snappy.parquet':
+      return [
+        true,
+        {
+          a: {
+            list: [
+              {
+                element: {
+                  list: [
+                    {
+                      element: {
+                        list: [
+                          {
+                            element: 'a',
+                          },
+                          {
+                            element: 'b',
+                          },
+                        ],
+                      },
+                    },
+                    {
+                      element: {
+                        list: [
+                          {
+                            element: 'c',
+                          },
+                        ],
+                      },
+                    },
+                  ],
+                },
+              },
+              {
+                element: {
+                  list: [
+                    {
+                      element: null,
+                    },
+                    {
+                      element: {
+                        list: [
+                          {
+                            element: 'd',
+                          },
+                        ],
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+          b: 1,
+        },
+      ];
+    case 'nonnullable.impala.parquet':
+      return [
+        true,
+        {
+          ID: 8n,
+          Int_Array: {
+            list: [
+              {
+                element: -1,
+              },
+            ],
+          },
+          Int_Map: {
+            map: [
+              {
+                key: 'k1',
+                value: -1,
+              },
+            ],
+          },
+          int_array_array: {
+            list: [
+              {
+                element: {
+                  list: [
+                    {
+                      element: -1,
+                    },
+                    {
+                      element: -2,
+                    },
+                  ],
+                },
+              },
+              {
+                element: {
+                  list: null,
+                },
+              },
+            ],
+          },
+          int_map_array: {
+            list: [
+              {
+                element: {
+                  map: null,
+                },
+              },
+              {
+                element: {
+                  map: [
+                    {
+                      key: 'k1',
+                      value: 1,
+                    },
+                  ],
+                },
+              },
+              {
+                element: {
+                  map: null,
+                },
+              },
+              {
+                element: {
+                  map: null,
+                },
+              },
+            ],
+          },
+          nested_Struct: {
+            B: {
+              list: [
+                {
+                  element: -1,
+                },
+              ],
+            },
+            G: {
+              map: null,
+            },
+            a: -1,
+            c: {
+              D: {
+                list: [
+                  {
+                    element: {
+                      list: [
+                        {
+                          element: {
+                            e: -1,
+                            f: 'nonnullable',
+                          },
+                        },
+                      ],
+                    },
+                  },
+                ],
+              },
+            },
+          },
+        },
+      ];
+    case 'null_list.parquet':
+      return [
+        true,
+        {
+          emptylist: {
+            list: null,
+          },
+        },
+      ];
+    case 'nullable.impala.parquet':
+      return [
+        true,
+        {
+          id: 1n,
+          int_Map_Array: {
+            list: [
+              {
+                element: {
+                  map: [
+                    {
+                      key: 'k1',
+                      value: 1,
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+          int_array: {
+            list: [
+              {
+                element: 1,
+              },
+              {
+                element: 2,
+              },
+              {
+                element: 3,
+              },
+            ],
+          },
+          int_array_Array: {
+            list: [
+              {
+                element: {
+                  list: [
+                    {
+                      element: 1,
+                    },
+                    {
+                      element: 2,
+                    },
+                  ],
+                },
+              },
+              {
+                element: {
+                  list: [
+                    {
+                      element: 3,
+                    },
+                    {
+                      element: 4,
+                    },
+                  ],
+                },
+              },
+            ],
+          },
+          int_map: {
+            map: [
+              {
+                key: 'k1',
+                value: 1,
+              },
+              {
+                key: 'k2',
+                value: 100,
+              },
+            ],
+          },
+          nested_struct: {
+            A: 1,
+            C: {
+              d: {
+                list: [
+                  {
+                    element: {
+                      list: [
+                        {
+                          element: {
+                            E: 10,
+                            F: 'aaa',
+                          },
+                        },
+                        {
+                          element: {
+                            E: -10,
+                            F: 'bbb',
+                          },
+                        },
+                      ],
+                    },
+                  },
+                  {
+                    element: {
+                      list: [
+                        {
+                          element: {
+                            E: 11,
+                            F: 'c',
+                          },
+                        },
+                      ],
+                    },
+                  },
+                ],
+              },
+            },
+            b: {
+              list: [
+                {
+                  element: 1,
+                },
+              ],
+            },
+            g: {
+              map: [
+                {
+                  key: 'foo',
+                  value: {
+                    H: {
+                      i: {
+                        list: [
+                          {
+                            element: 1.1,
+                          },
+                        ],
+                      },
+                    },
+                  },
+                },
+              ],
+            },
+          },
+        },
+      ];
+    case 'nulls.snappy.parquet':
+      return [
+        true,
+        {
+          b_struct: {
+            b_c_int: null,
+          },
+        },
+      ];
+    case 'overflow_i16_page_cnt.parquet':
+      return [
+        true,
+        {
+          inc: false,
+        },
+      ];
+    case 'plain-dict-uncompressed-checksum.parquet':
+      return [
+        true,
+        {
+          binary_field: Buffer.from('a655fd0e-9949-4059-bcae-fd6a002a4652'),
+          long_field: 0n,
+        },
+      ];
+    case 'repeated_no_annotation.parquet':
+      return [
+        true,
+        {
+          id: 1,
+          phoneNumbers: null,
+        },
+      ];
+    case 'rle-dict-snappy-checksum.parquet':
+      return [
+        true,
+        {
+          binary_field: Buffer.from('c95e263a-f5d4-401f-8107-5ca7146a1f98'),
+          long_field: 0n,
+        },
+      ];
+    case 'rle-dict-uncompressed-corrupt-checksum.parquet':
+      return [
+        true,
+        {
+          binary_field: Buffer.from('6325c32b-f417-41aa-9e02-9b8601542aff'),
+          long_field: 0n,
+        },
+      ];
+    case 'single_nan.parquet':
+      return [
+        true,
+        {
+          mycol: null,
+        },
+      ];
+    // Not supported - see read-all-test.ts
+    // case ("byte_stream_split.zstd.parquet"):
+    // case ("datapage_v2.snappy.parquet"):
+    // case ("delta_binary_packed.parquet"):
+    // case ("delta_byte_array.parquet"):
+    // case ("delta_encoding_optional_column.parquet"):
+    // case ("delta_encoding_required_column.parquet"):
+    // case ("delta_length_byte_array.parquet"):
+    // case ("hadoop_lz4_compressed.parquet"):
+    // case ("hadoop_lz4_compressed_larger.parquet"):
+    // case ("large_string_map.brotli.parquet"):
+    // case ("lz4_raw_compressed.parquet"):
+    // case ("lz4_raw_compressed_larger.parquet"):
+    // case ("nested_structs.rust.parquet"):
+    // case ("non_hadoop_lz4_compressed.parquet"):
+    // case ("rle_boolean_encoding.parquet"):
+    default: {
+      if (!unsupported.includes(filename)) {
+        return [true, 'Please provide the first row of the parquet file for comparison'];
+      }
+      return [false, null];
+    }
+  }
+}
diff --git a/test/reference-test/read-all.test.ts b/test/reference-test/read-all.test.ts
index 4170594b..90264c3f 100644
--- a/test/reference-test/read-all.test.ts
+++ b/test/reference-test/read-all.test.ts
@@ -4,13 +4,15 @@ import path from 'node:path';
 import fs from 'node:fs';
 
 import parquet from '../../parquet';
+import { getFirstRecord } from './first-record';
 
 // Used for testing a single file. Example:
 // const onlyTest = 'single_nan.parquet';
 const onlyTest = null;
 
 // Test files currently unsupported / needing separate test
-const unsupported = [
+// eslint-disable-next-line mocha/no-exports
+export const unsupported = [
   'byte_stream_split.zstd.parquet', // ZSTD unsupported
   'hadoop_lz4_compressed.parquet', // LZ4 unsupported
   'hadoop_lz4_compressed_larger.parquet', // LZ4 unsupported
@@ -44,6 +46,12 @@ describe('Read Test for all files', function () {
       // Expect the same keys as top-level fields
       const expectedRecordKeys = schema.fieldList.filter((x) => x.path.length === 1).map((x) => x.name);
       expect(Object.keys(record)).to.deep.equal(expectedRecordKeys);
+
+      // validate that the first record has the expected values
+      const [shouldTest, expectedFirstRow] = getFirstRecord(filename);
+      if (shouldTest) {
+        expect(record).to.deep.equal(expectedFirstRow);
+      }
     });
   }
 });
diff --git a/test/test-files.js b/test/test-files.js
index 1e1a5bd9..116a4bda 100644
--- a/test/test-files.js
+++ b/test/test-files.js
@@ -101,16 +101,16 @@ describe('test-files', function () {
   it('mr_times.parq loads', async function () {
     const data = await readData('mr_times.parq');
     assert.deepEqual(data, [
-      { id: '1', date_added: '83281000000000' },
-      { id: '2', date_added: '83282000000000' },
-      { id: '3', date_added: '83283000000000' },
-      { id: '4', date_added: '83284000000000' },
-      { id: '5', date_added: '83284000000000' },
-      { id: '6', date_added: '83285000000000' },
-      { id: '7', date_added: '83286000000000' },
-      { id: '8', date_added: '83287000000000' },
-      { id: '9', date_added: '83288000000000' },
-      { id: '10', date_added: '83289000000000' },
+      { id: '1', date_added: 83281000000000 },
+      { id: '2', date_added: 83282000000000 },
+      { id: '3', date_added: 83283000000000 },
+      { id: '4', date_added: 83284000000000 },
+      { id: '5', date_added: 83284000000000 },
+      { id: '6', date_added: 83285000000000 },
+      { id: '7', date_added: 83286000000000 },
+      { id: '8', date_added: 83287000000000 },
+      { id: '9', date_added: 83288000000000 },
+      { id: '10', date_added: 83289000000000 },
     ]);
   });
 
@@ -226,12 +226,22 @@ describe('test-files', function () {
     it('rle-dict-snappy-checksum.parquet loads', async function () {
       const data = await readData('rle/rle-dict-snappy-checksum.parquet');
-      assert.deepEqual(data[0], { binary_field: 'c95e263a-f5d4-401f-8107-5ca7146a1f98', long_field: '0' });
+      for (const item of data) {
+        assert.deepEqual(item, {
+          binary_field: Buffer.from('c95e263a-f5d4-401f-8107-5ca7146a1f98'),
+          long_field: 0n,
+        });
+      }
     });
 
     it('rle-dict-uncompressed-corrupt-checksum.parquet loads', async function () {
       const data = await readData('rle/rle-dict-uncompressed-corrupt-checksum.parquet');
-      assert.deepEqual(data[0], { binary_field: '6325c32b-f417-41aa-9e02-9b8601542aff', long_field: '0' });
+      for (const item of data) {
+        assert.deepEqual(item, {
+          binary_field: Buffer.from('6325c32b-f417-41aa-9e02-9b8601542aff'),
+          long_field: 0n,
+        });
+      }
    });
  });
});