Skip to content

Commit

Permalink
Reference Tests and Breaking Change: Optional nullable fields are now…
Browse files Browse the repository at this point in the history
… null instead of undefined (#114)

Problem
=======

We wanted to add tests covering all of the reference test files in
https://github.com/apache/parquet-testing

### Discovered Bugs
- We treated null values as `undefined`, but other Parquet implementations return them as `null`
- We incorrectly treated `dictionary_page_offset >= 0` as meaning a dictionary page exists, instead of only `> 0`


Solution
========

- Added new test that automatically tests all files:
`test/reference-test/read-all.test.ts`
- Fixed found bugs

with @shannonwells 

Steps to Verify:
----------------
1. Run the tests
1. Comment out the bug fixes and see reference test files fail
  • Loading branch information
wilwade authored Jan 19, 2024
1 parent 6fdb9da commit 8d34ac1
Show file tree
Hide file tree
Showing 60 changed files with 92 additions and 13 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ npm-debug.log
.nyc_output
dist
!test/test-files/*.parquet
!test/reference-test/files/*.parquet
examples/server/package-lock.json
test/browser/*.js
test/browser/*.js
5 changes: 3 additions & 2 deletions lib/reader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -672,8 +672,9 @@ export class ParquetEnvelopeReader {
num_values: metadata.num_values
});

if (metadata.dictionary_page_offset) {
const offset = +metadata.dictionary_page_offset;
// If this exists and is greater than zero then we need to have an offset
if (metadata.dictionary_page_offset && +metadata.dictionary_page_offset > 0) {
const offset: number = +metadata.dictionary_page_offset;
const size = Math.min(+this.fileSize - offset, this.default_dictionary_size);

await this.read(offset, size, colChunk.file_path).then(async (buffer: Buffer) => {
Expand Down
2 changes: 2 additions & 0 deletions lib/shred.ts
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,8 @@ function materializeRecordField(record: Record<string, unknown>, branch: Array<P
const node = branch[0];

if (dLevel < node.dLevelMax) {
// This ensures that nulls are correctly processed
record[node.name] = value;
return;
}

Expand Down
13 changes: 9 additions & 4 deletions test/integration.js
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,8 @@ async function readTestFile() {
{ quantity: [10n], warehouse: "A" },
{ quantity: [20n], warehouse: "B" }
],
colour: [ 'green', 'red' ]
colour: [ 'green', 'red' ],
meta_json: null,
});

assert.deepEqual(await cursor.next(), {
Expand All @@ -317,11 +318,13 @@ async function readTestFile() {
stock: [
{ quantity: [50n, 33n], warehouse: "X" }
],
colour: [ 'orange' ]
colour: [ 'orange' ],
meta_json: null,
});

assert.deepEqual(await cursor.next(), {
name: 'kiwi',
quantity: null,
price: 4.2,
day: new Date('2017-11-26'),
date: new Date(TEST_VTIME + 8000 * i),
Expand All @@ -337,11 +340,13 @@ async function readTestFile() {

assert.deepEqual(await cursor.next(), {
name: 'banana',
quantity: null,
price: 3.2,
day: new Date('2017-11-26'),
date: new Date(TEST_VTIME + 6000 * i),
finger: Buffer.from("FNORD"),
inter: { months: 42, days: 23, milliseconds: 777 },
stock: null,
colour: [ 'yellow' ],
meta_json: { shape: 'curved' }
});
Expand All @@ -366,8 +371,8 @@ async function readTestFile() {
for (let i = 0; i < TEST_NUM_ROWS; ++i) {
assert.deepEqual(await cursor.next(), { name: 'apples', quantity: 10n });
assert.deepEqual(await cursor.next(), { name: 'oranges', quantity: 20n });
assert.deepEqual(await cursor.next(), { name: 'kiwi' });
assert.deepEqual(await cursor.next(), { name: 'banana' });
assert.deepEqual(await cursor.next(), { name: 'kiwi', quantity: null });
assert.deepEqual(await cursor.next(), { name: 'banana', quantity: null });
}

assert.equal(await cursor.next(), null);
Expand Down
12 changes: 12 additions & 0 deletions test/reference-test/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Reference Tests

This is a set of tests that use the reference files from https://github.com/apache/parquet-testing/.

## Updating the Reference Files

This assumes that parquetjs is in the same folder as the clone of parquet-testing.

1. `git clone [email protected]:apache/parquet-testing.git`
1. `cd ../parquetjs`
1. `cp ../parquet-testing/data/*.parquet ./test/reference-test/files/`

Binary file not shown.
Binary file added test/reference-test/files/alltypes_plain.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/binary.parquet
Binary file not shown.
Binary file added test/reference-test/files/byte_array_decimal.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/int32_decimal.parquet
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/int64_decimal.parquet
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/list_columns.parquet
Binary file not shown.
Binary file added test/reference-test/files/lz4_raw_compressed.parquet
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/nan_in_stats.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/null_list.parquet
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/nulls.snappy.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added test/reference-test/files/single_nan.parquet
Binary file not shown.
50 changes: 50 additions & 0 deletions test/reference-test/read-all.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import { expect } from "chai";
import path from "node:path";
import fs from "node:fs";

import parquet from '../../parquet';

// Used for testing a single file. Example:
// const onlyTest = 'single_nan.parquet';
// Annotated as `string | null`: a bare `null` initializer infers the literal
// type `null`, which makes the `onlyTest !== filename` comparison below an
// error under strict TypeScript (no type overlap with `string`).
const onlyTest: string | null = null;

// Test files currently unsupported / needing separate test.
// Entries are exact filenames, matched against the contents of
// test/reference-test/files/ by the discovery loop below; each carries a
// note of the codec/encoding (or suspected bug) that blocks support.
const unsupported = [
'byte_stream_split.zstd.parquet', // ZSTD unsupported
'hadoop_lz4_compressed.parquet', // LZ4 unsupported
'hadoop_lz4_compressed_larger.parquet', // LZ4 unsupported
'lz4_raw_compressed.parquet', // LZ4_RAW unsupported
'lz4_raw_compressed_larger.parquet', // LZ4_RAW unsupported
'nested_structs.rust.parquet', // ZSTD unsupported
'non_hadoop_lz4_compressed.parquet', // ZSTD unsupported
'rle_boolean_encoding.parquet', // BUG?: https://github.com/LibertyDSNP/parquetjs/issues/113
'datapage_v2.snappy.parquet', // DELTA_BINARY_PACKED unsupported
'delta_binary_packed.parquet', // DELTA_BINARY_PACKED unsupported
'delta_byte_array.parquet', // DELTA_BYTE_ARRAY unsupported
'delta_encoding_optional_column.parquet', // DELTA_BINARY_PACKED unsupported
'delta_encoding_required_column.parquet', // DELTA_BINARY_PACKED unsupported
'delta_length_byte_array.parquet', // ZSTD unsupported, DELTA_BINARY_PACKED unsupported
'float16_nonzeros_and_nans.parquet', // missing option: typeLength (required for FIXED_LEN_BYTE_ARRAY)
'float16_zeros_and_nans.parquet', // missing option: typeLength (required for FIXED_LEN_BYTE_ARRAY)
'large_string_map.brotli.parquet', // BUG?
];

describe("Read Test for all files", function () {

  // Discover every .parquet reference file on disk, skipping the files
  // listed as unsupported above.
  const listOfFiles = fs.readdirSync(path.join(__dirname, 'files'))
    .filter(x => x.endsWith(".parquet") && !unsupported.includes(x));

  for (const filename of listOfFiles) {
    // `onlyTest` (defined above) lets a developer focus on one file.
    if (onlyTest && onlyTest !== filename) continue;
    // FIX: the title must interpolate with `${filename}` — `$( )` is not
    // template-literal syntax and would print literally in test output.
    it(`Reading ${filename}`, async function () {
      const reader = await parquet.ParquetReader.openFile(path.join(__dirname, 'files', filename));
      try {
        const schema = reader.getSchema();
        expect(schema.fieldList).to.have.length.greaterThan(0);
        const cursor = reader.getCursor();
        const record = await cursor.next() as any;
        // The first record must expose exactly the schema's top-level fields:
        // after the null-handling fix, optional nullable fields appear as
        // `null` keys rather than being omitted.
        const expectedRecordKeys = schema.fieldList.filter(x => x.path.length === 1).map(x => x.name);
        expect(Object.keys(record)).to.deep.equal(expectedRecordKeys);
      } finally {
        // FIX: always release the underlying file handle, even when an
        // assertion throws — the original leaked the reader on failure.
        await reader.close();
      }
    })
  }
});
6 changes: 3 additions & 3 deletions test/shred.js
Original file line number Diff line number Diff line change
Expand Up @@ -498,11 +498,11 @@ describe('ParquetShredder', function() {

assert.deepEqual(
records[2],
{ name: "kiwi", price: 99.0 });
{ name: "kiwi", price: 99.0, stock: null });

assert.deepEqual(
records[3],
{ name: "banana", stock: [{ warehouse: "C" }], price: 42.0 });
{ name: "banana", stock: [{ quantity: null, warehouse: "C" }], price: 42.0 });
});

it('should materialize a static nested record with blank optional value', function() {
Expand Down Expand Up @@ -549,7 +549,7 @@ describe('ParquetShredder', function() {

assert.deepEqual(
records[0],
{ fruit: { name: "apple" } });
{ fruit: { name: "apple", colour: null } });

});

Expand Down
14 changes: 11 additions & 3 deletions test/test-files.js
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ describe('test-files', function() {

it('test-converted-type-null.parquet loads', async function() {
const data = await readData('test-converted-type-null.parquet');
assert.deepEqual(data,[{foo: 'bar'},{}]);
assert.deepEqual(data,[{foo: 'bar'},{foo: null}]);
});

it('test-enum-type.parquet loads', async function() {
Expand All @@ -119,12 +119,20 @@ describe('test-files', function() {

it('test-null-dictionary.parquet loads', async function() {
const data = await readData('test-null-dictionary.parquet');
assert.deepEqual(data,[].concat.apply([{}],[...Array(3)].map( () => ([{foo: 'bar'}, {foo: 'baz'}]))));
assert.deepEqual(
data,
[
{ foo: null },
{ foo: 'bar' }, { foo: 'baz' },
{ foo: 'bar' }, { foo: 'baz' },
{ foo: 'bar' }, { foo: 'baz' }
]
);
});

it('test-null.parquet loads', async function() {
const data = await readData('test-null.parquet');
assert.deepEqual(data,[{foo: 1, bar: 2},{foo: 1}]);
assert.deepEqual(data,[{foo: 1, bar: 2},{foo: 1, bar: null}]);
});

it('test.parquet loads', async function() {
Expand Down

0 comments on commit 8d34ac1

Please sign in to comment.