Skip to content

Commit

Permalink
Add null pages and boundary order (Fixes #92) (#94)
Browse files Browse the repository at this point in the history
Problem
=======

Parquet file column indexes are required to have `null_pages` and
`boundary_order`, but they were missing from Parquetjs generated files.


https://github.com/apache/parquet-format/blob/1603152f8991809e8ad29659dffa224b4284f31b/src/main/thrift/parquet.thrift#L955


Closes #92 

Solution
========

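The writer now populates `null_pages` (one entry per page) and sets
`boundary_order` to `0` (unordered) on every `ColumnIndex` it emits.

Note: Although the Parquet format marks these fields as required, how
strictly that is enforced depends on the library reading the file.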

Steps to Verify:
----------------
1. Checkout the branch 
2. `npm i && npm run build && npm pack `
3. Install parquet cli tools (macOS brew: `brew install parquet-cli`)
4. Checkout the bug repo from #92
https://github.com/noxify/parquetjs_bug/
5. `cd parquetjs_bug/parquetjs && npm i`
6. `node index.js && parquet column-index
../generated_files/parquetjs/change.parquet` will FAIL
7. `npm i ../parquetjs/dsnp-parquetjs-0.0.0.tgz`
8. `node index.js && parquet column-index
../generated_files/parquetjs/change.parquet` will PASS!
  • Loading branch information
wilwade authored Jul 13, 2023
1 parent 19f3ffa commit 43732c5
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 0 deletions.
5 changes: 5 additions & 0 deletions lib/writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -634,8 +634,11 @@ async function encodeColumnChunk(pages: Page[], opts: {column: ParquetField, bas

/* compile statistics ColumnIndex and OffsetIndex*/
let columnIndex = new parquet_thrift.ColumnIndex();
columnIndex.null_pages = [];
columnIndex.max_values = [];
columnIndex.min_values = [];
// Default to unordered
columnIndex.boundary_order = 0;
let offsetIndex = new parquet_thrift.OffsetIndex();
offsetIndex.page_locations = [];

Expand All @@ -659,6 +662,8 @@ async function encodeColumnChunk(pages: Page[], opts: {column: ParquetField, bas
statistics.null_count.setValue(statistics.null_count.valueOf() + (page.statistics.null_count?.valueOf() || 0));
page.distinct_values.forEach((value: unknown) => distinct_values.add(value));

// If the number of values and the count of nulls are the same, this is a null page
columnIndex.null_pages.push( page.num_values === statistics.null_count.valueOf() );
columnIndex.max_values.push( encodeStatisticsValue(page.statistics.max_value, opts.column) );
columnIndex.min_values.push( encodeStatisticsValue(page.statistics.min_value, opts.column) );
}
Expand Down
16 changes: 16 additions & 0 deletions test/statistics.js
Original file line number Diff line number Diff line change
Expand Up @@ -150,34 +150,50 @@ describe('statistics', async function() {
const name = await reader.envelopeReader.readColumnIndex('name', row);
assert.deepEqual(name.min_values, ['apples','banana']);
assert.deepEqual(name.max_values, ['oranges','banana']);
assert.deepEqual(name.null_pages, [false, false]);
assert.deepEqual(name.boundary_order, 0);

const quantity = await reader.envelopeReader.readColumnIndex('quantity', row);
assert.deepEqual(quantity.min_values, [10n, undefined]);
assert.deepEqual(quantity.max_values, [20n, undefined]);
assert.deepEqual(quantity.null_pages, [false, false]);
assert.deepEqual(quantity.boundary_order, 0);

const price = await reader.envelopeReader.readColumnIndex('price', row);
assert.deepEqual(price.min_values, [2.6, 3.2]);
assert.deepEqual(price.max_values, [4.2, 3.2]);
assert.deepEqual(price.null_pages, [false, false]);
assert.deepEqual(price.boundary_order, 0)

const day = await reader.envelopeReader.readColumnIndex('day', row);
assert.deepEqual(day.min_values, [ new Date('2008-11-26'), new Date('2017-11-26') ]);
assert.deepEqual(day.max_values, [ new Date('2018-03-03'), new Date('2017-11-26') ]);
assert.deepEqual(day.null_pages, [false, false]);
assert.deepEqual(day.boundary_order, 0)

const finger = await reader.envelopeReader.readColumnIndex('finger', row);
assert.deepEqual(finger.min_values, [ Buffer.from('ABCDE'), Buffer.from('FNORD') ]);
assert.deepEqual(finger.max_values, [ Buffer.from('XCVBN'), Buffer.from('FNORD')]);
assert.deepEqual(finger.null_pages, [false, false]);
assert.deepEqual(finger.boundary_order, 0)

const stockQuantity = await reader.envelopeReader.readColumnIndex('stock,quantity', row);
assert.deepEqual(stockQuantity.min_values, [ 10n, undefined ]);
assert.deepEqual(stockQuantity.max_values, [ 50n, undefined ]);
assert.deepEqual(stockQuantity.null_pages, [false, false]);
assert.deepEqual(stockQuantity.boundary_order, 0)

const stockWarehouse = await reader.envelopeReader.readColumnIndex('stock,warehouse', row);
assert.deepEqual(stockWarehouse.min_values, [ 'A', undefined ]);
assert.deepEqual(stockWarehouse.max_values, [ 'x', undefined ]);
assert.deepEqual(stockWarehouse.null_pages, [false, false]);
assert.deepEqual(stockWarehouse.boundary_order, 0)

const colour = await reader.envelopeReader.readColumnIndex('colour', row);
assert.deepEqual(colour.min_values, [ 'brown', 'yellow' ]);
assert.deepEqual(colour.max_values, [ 'yellow', 'yellow' ]);
assert.deepEqual(colour.null_pages, [false, false]);
assert.deepEqual(colour.boundary_order, 0)

const inter = await reader.envelopeReader.readColumnIndex('inter', row).catch(e => e);
assert.equal(inter.message,'Column Index Missing');
Expand Down

0 comments on commit 43732c5

Please sign in to comment.