From fb4b59a9d0dd0bc62021095250b4950f56972be7 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Thu, 5 Sep 2024 12:54:28 -0500 Subject: [PATCH 01/29] minimal changes, passing test --- lib/declare.ts | 3 ++- lib/types.ts | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/lib/declare.ts b/lib/declare.ts index 049813e9..d1d2357b 100644 --- a/lib/declare.ts +++ b/lib/declare.ts @@ -54,7 +54,8 @@ export type OriginalType = | 'INT_64' // 18 | 'JSON' // 19 | 'BSON' // 20 - | 'INTERVAL'; // 21 + | 'INTERVAL' // 21 + | 'TIME'; // 22 export type SchemaDefinition = Record; diff --git a/lib/types.ts b/lib/types.ts index a2bb9a98..46cb3a1c 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -3,6 +3,7 @@ import { PrimitiveType, OriginalType, ParquetType, FieldDefinition, ParquetField } from './declare'; import { Options } from './codec/types'; import type { Document as BsonDocument } from 'bson'; +import { TimeType, TimeUnit } from '../gen-nodejs/parquet_types'; // BSON uses top level awaits, so use require for now const bsonSerialize = require('bson').serialize; const bsonDeserialize = require('bson').deserialize; @@ -21,6 +22,11 @@ interface INTERVAL { milliseconds: number; } +interface TIME { + type: TimeType; + value: number | bigint; +} + export function getParquetTypeDataObject( type: ParquetType, field?: ParquetField | Options | FieldDefinition @@ -82,6 +88,7 @@ const PARQUET_LOGICAL_TYPES = new Set([ 'INTERVAL', 'MAP', 'LIST', + 'TIME', ] satisfies ParquetType[]); const PARQUET_LOGICAL_TYPE_DATA: Record = { @@ -225,6 +232,10 @@ const PARQUET_LOGICAL_TYPE_DATA: Record = { originalType: 'LIST', toPrimitive: toPrimitive_LIST, }, + TIME: { + originalType: 'TIME', + toPrimitive: toPrimitive_TIME, + }, }; /** @@ -560,3 +571,59 @@ function checkValidValue(lowerRange: number | bigint, upperRange: number | bigin throw 'invalid value'; } } + +/** + * Convert a TIME value to its internal representation. + * @param value The TIME object containing the value and the time type information. + * @returns The converted time value as bigint or number based on the unit. + */ +function toPrimitive_TIME(value: TIME): bigint | number { + const { type, value: timeValue } = value; + const { unit, isAdjustedToUTC } = type; + + let epochTime: number | bigint = 0; + + if (typeof timeValue === 'number') { + if (unit.MILLIS) { + epochTime = timeValue; + } else if (unit.MICROS) { + epochTime = BigInt(timeValue) * 1000n; + } else if (unit.NANOS) { + epochTime = BigInt(timeValue) * 1000000n; + } + } else if (typeof timeValue === 'bigint') { + epochTime = timeValue; + } else { + throw new Error('Invalid value for TIME type'); + } + + if (!isAdjustedToUTC) { + return adjustToLocalTimestamp(epochTime, unit); + } + + return epochTime; +} + +/** + * Adjust the timestamp to local time. + * @param timestamp The timestamp to adjust. + * @param unit The unit of the timestamp (MILLIS, MICROS, NANOS). + * @returns The adjusted timestamp. + */ +function adjustToLocalTimestamp(timestamp: bigint | number, unit: TimeUnit): bigint { + try { + const localOffset = BigInt(new Date().getTimezoneOffset()) * 60n * 1000n; + timestamp = BigInt(timestamp); + if (unit.MILLIS) { + return timestamp - localOffset; + } else if (unit.MICROS) { + return timestamp - localOffset * 1000n; + } else if (unit.NANOS) { + return timestamp - localOffset * 1000000n; + } + + throw new Error('Unsupported unit for TIME'); + } catch (e) { + throw new Error('Invalid timestamp for TIME'); + } +} From 1ca98b8ee2b598007467d6cba6cd2fbbda037a07 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Thu, 5 Sep 2024 13:10:32 -0500 Subject: [PATCH 02/29] handle possible exceptions on conversion --- lib/types.ts | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/types.ts b/lib/types.ts index 46cb3a1c..ec2235b4 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -571,9 +571,9 @@ function checkValidValue(lowerRange: number | bigint, upperRange: number | bigin throw 'invalid value'; } } - /** * Convert a TIME value to its internal representation. + * This handles both `isAdjustedToUTC` and the correct `unit` (MILLIS, MICROS, NANOS). * @param value The TIME object containing the value and the time type information. * @returns The converted time value as bigint or number based on the unit. */ @@ -581,15 +581,17 @@ function toPrimitive_TIME(value: TIME): bigint | number { const { type, value: timeValue } = value; const { unit, isAdjustedToUTC } = type; - let epochTime: number | bigint = 0; + let epochTime: number | bigint; if (typeof timeValue === 'number') { if (unit.MILLIS) { epochTime = timeValue; } else if (unit.MICROS) { - epochTime = BigInt(timeValue) * 1000n; + epochTime = BigInt(timeValue) * 1000n; // Convert from microseconds to nanoseconds } else if (unit.NANOS) { - epochTime = BigInt(timeValue) * 1000000n; + epochTime = BigInt(timeValue) * 1000000n; // Convert from nanoseconds + } else { + throw new Error('Unsupported time unit'); } } else if (typeof timeValue === 'bigint') { epochTime = timeValue; @@ -605,7 +607,7 @@ function toPrimitive_TIME(value: TIME): bigint | number { } /** - * Adjust the timestamp to local time. + * Adjust the timestamp to local time if the time is not adjusted to UTC. * @param timestamp The timestamp to adjust. * @param unit The unit of the timestamp (MILLIS, MICROS, NANOS). * @returns The adjusted timestamp. @@ -621,9 +623,8 @@ function adjustToLocalTimestamp(timestamp: bigint | number, unit: TimeUnit): big } else if (unit.NANOS) { return timestamp - localOffset * 1000000n; } - - throw new Error('Unsupported unit for TIME'); + throw new Error('Unsupported time unit'); } catch (e) { - throw new Error('Invalid timestamp for TIME'); + throw new Error('Invalid value for TIME type'); } } From 8fcafe284da3f5e8d52061642fa7229b9bab5fee Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Thu, 5 Sep 2024 13:13:32 -0500 Subject: [PATCH 03/29] set the primitive type for TIME as INT64 --- lib/types.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/types.ts b/lib/types.ts index ec2235b4..74a627b7 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -234,6 +234,7 @@ const PARQUET_LOGICAL_TYPE_DATA: Record = { }, TIME: { originalType: 'TIME', + primitiveType: 'INT64', toPrimitive: toPrimitive_TIME, }, }; From 9949fd42802714aa19f112090b4cc4fafff4daac Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Fri, 6 Sep 2024 08:12:48 -0500 Subject: [PATCH 04/29] define field definition for time support and helper function --- lib/declare.ts | 2 ++ lib/fields.ts | 15 +++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/lib/declare.ts b/lib/declare.ts index d1d2357b..76e99d8f 100644 --- a/lib/declare.ts +++ b/lib/declare.ts @@ -72,6 +72,8 @@ export interface FieldDefinition { num_children?: NumChildrenField; precision?: number; scale?: number; + unit?: 'MILLIS' | 'MICROS' | 'NANOS'; + isAdjustedToUTC?: boolean; } export interface ParquetField { diff --git a/lib/fields.ts b/lib/fields.ts index cc54cb1e..3d1cc986 100644 --- a/lib/fields.ts +++ b/lib/fields.ts @@ -80,3 +80,18 @@ export function createListField( }, }; } + +export function createTimeField( + unit: 'MILLIS' | 'MICROS' | 'NANOS', + isAdjustedToUTC = true, + optional = true, + fieldOptions: FieldDefinition = {} +): FieldDefinition { + return { + ...fieldOptions, + optional, + type: 'TIME', + unit, + isAdjustedToUTC, + }; +} From 679c96c1586f8d50c319aacb830ed4b5c5ea13a0 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Fri, 6 Sep 2024 09:44:41 -0500 Subject: [PATCH 05/29] declare additional fields for TIME --- lib/declare.ts | 2 ++ lib/schema.ts | 10 ++++++++++ test/decodeSchema.js | 17 ++++++++++++++++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/lib/declare.ts b/lib/declare.ts index 76e99d8f..f698b69e 100644 --- a/lib/declare.ts +++ b/lib/declare.ts @@ -94,6 +94,8 @@ export interface ParquetField { fieldCount?: number; fields?: Record; disableEnvelope?: boolean; + unit?: 'MILLIS' | 'MICROS' | 'NANOS'; + isAdjustedToUTC?: boolean; } interface ParentField { diff --git a/lib/schema.ts b/lib/schema.ts index f33165dd..23ba2547 100644 --- a/lib/schema.ts +++ b/lib/schema.ts @@ -126,6 +126,8 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP statistics: opts.statistics, fieldCount: Object.keys(opts.fields).length, fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat(name)), + unit: opts.unit, + isAdjustedToUTC: opts.isAdjustedToUTC, }; if (opts.type == 'LIST' || opts.type == 'MAP') fieldList[name].originalType = opts.type; @@ -169,6 +171,12 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP ); } + if (typeDef.originalType === 'TIME') { + if (!opts.unit) { + fieldErrors.push(`Time type requires a unit, for Column: ${nameWithPath}`); + } + } + /* add to schema */ fieldList[name] = { name: name, @@ -184,6 +192,8 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP typeLength: opts.typeLength || typeDef.typeLength, rLevelMax: rLevelMax, dLevelMax: dLevelMax, + unit: opts.unit, + isAdjustedToUTC: opts.isAdjustedToUTC, }; } diff --git a/test/decodeSchema.js b/test/decodeSchema.js index fecb9ec0..c786c4c8 100644 --- a/test/decodeSchema.js +++ b/test/decodeSchema.js @@ -222,6 +222,21 @@ describe('ParquetSchema', function () { }; const reader = new parquet.ParquetReader(metadata, {}); - assert.deepEqual(reader.schema.fields, expected); + const removeUndefinedFields = (obj) => { + Object.keys(obj).forEach((key) => { + if (obj[key] === undefined || obj[key] === null || (Array.isArray(obj[key]) && obj[key].length === 0)) { + // eslint-disable-next-line @typescript-eslint/no-dynamic-delete + delete obj[key]; + } else if (typeof obj[key] === 'object') { + removeUndefinedFields(obj[key]); + } + }); + return obj; + }; + + const sanitizedExpected = removeUndefinedFields(expected); + const sanitizedActual = removeUndefinedFields(reader.schema.fields); + + assert.deepEqual(sanitizedActual, sanitizedExpected); }); }); From d3a1382783fe8846a46325ad7dfec1f62d006f74 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Fri, 6 Sep 2024 10:57:59 -0500 Subject: [PATCH 06/29] update jsonSchema to support TIME --- lib/jsonSchema.ts | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/lib/jsonSchema.ts b/lib/jsonSchema.ts index 5466e1c3..0c2c62e6 100644 --- a/lib/jsonSchema.ts +++ b/lib/jsonSchema.ts @@ -113,6 +113,21 @@ const fromJsonSchemaField = case 'array': return fromJsonSchemaArray(fieldValue, optional); case 'object': + if (fieldValue.properties && fieldValue.properties.type && fieldValue.properties.value) { + const unit = fieldValue.properties.type.properties?.unit?.default?.toString(); + let defaultUnit: 'MILLIS' | 'MICROS' | 'NANOS' = 'MILLIS'; // Restrict to allowed values + + if (unit === 'MICROS') { + defaultUnit = 'MICROS'; + } else if (unit === 'NANOS') { + defaultUnit = 'NANOS'; + } else if (unit !== 'MILLIS') { + throw new UnsupportedJsonSchemaError(`Unit type ${unit} is unsupported.`); + } + + const isAdjustedToUTC = !!fieldValue.properties.type.properties?.isAdjustedToUTC?.default; + return fields.createTimeField(defaultUnit, isAdjustedToUTC, optional); + } return fields.createStructField(fromJsonSchema(fieldValue), optional); default: throw new UnsupportedJsonSchemaError( From e22fbe431cd5f94583429521d1906d0f0f10e7e2 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Fri, 6 Sep 2024 11:04:44 -0500 Subject: [PATCH 07/29] add field tests for time --- test/fields.test.ts | 60 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/test/fields.test.ts b/test/fields.test.ts index dc80a53a..5a87a59c 100644 --- a/test/fields.test.ts +++ b/test/fields.test.ts @@ -209,6 +209,66 @@ describe('Field Builders: Structs and Struct List', function () { assert.equal(!!c.isNested, true); assert.equal(c.fieldCount, 1); }); + + it('Can use primitive field types: Time with default MILLIS', function () { + const schema = new ParquetSchema({ + timeField: fields.createTimeField('MILLIS'), + }); + const c = schema.fields.timeField; + assert.equal(c.name, 'timeField'); + assert.equal(c.primitiveType, 'INT64'); + assert.equal(c.originalType, 'TIME'); + assert.equal(c.unit, 'MILLIS'); + assert.equal(c.isAdjustedToUTC, true); + assert.deepEqual(c.path, ['timeField']); + assert.equal(c.repetitionType, 'OPTIONAL'); + assert.equal(c.encoding, 'PLAIN'); + assert.equal(c.compression, 'UNCOMPRESSED'); + assert.equal(c.rLevelMax, 0); + assert.equal(c.dLevelMax, 1); + assert.equal(!!c.isNested, false); + assert.equal(c.fieldCount, undefined); + }); + + it('Can use primitive field types: Time with MICROS', function () { + const schema = new ParquetSchema({ + timeField: fields.createTimeField('MICROS', false), + }); + const c = schema.fields.timeField; + assert.equal(c.name, 'timeField'); + assert.equal(c.primitiveType, 'INT64'); + assert.equal(c.originalType, 'TIME'); + assert.equal(c.unit, 'MICROS'); + assert.equal(c.isAdjustedToUTC, false); + assert.deepEqual(c.path, ['timeField']); + assert.equal(c.repetitionType, 'OPTIONAL'); + assert.equal(c.encoding, 'PLAIN'); + assert.equal(c.compression, 'UNCOMPRESSED'); + assert.equal(c.rLevelMax, 0); + assert.equal(c.dLevelMax, 1); + assert.equal(!!c.isNested, false); + assert.equal(c.fieldCount, undefined); + }); + + it('Can use primitive field types: Time with NANOS', function () { + const schema = new ParquetSchema({ + timeField: fields.createTimeField('NANOS', true, true, { compression: 'GZIP' }), + }); + const c = schema.fields.timeField; + assert.equal(c.name, 'timeField'); + assert.equal(c.primitiveType, 'INT64'); + assert.equal(c.originalType, 'TIME'); + assert.equal(c.unit, 'NANOS'); + assert.equal(c.isAdjustedToUTC, true); + assert.equal(c.compression, 'GZIP'); + assert.deepEqual(c.path, ['timeField']); + assert.equal(c.repetitionType, 'OPTIONAL'); + assert.equal(c.encoding, 'PLAIN'); + assert.equal(c.rLevelMax, 0); + assert.equal(c.dLevelMax, 1); + assert.equal(!!c.isNested, false); + assert.equal(c.fieldCount, undefined); + }); }); describe('Field Builders: Lists', function () { From ddfcc0efebe21538d30121cac1eaa182d1bd33af Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Fri, 6 Sep 2024 11:32:48 -0500 Subject: [PATCH 08/29] annotate as int32 for MILLIS and int64 for rest --- lib/types.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/types.ts b/lib/types.ts index 74a627b7..d0011917 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -588,9 +588,9 @@ function toPrimitive_TIME(value: TIME): bigint | number { if (unit.MILLIS) { epochTime = timeValue; } else if (unit.MICROS) { - epochTime = BigInt(timeValue) * 1000n; // Convert from microseconds to nanoseconds + epochTime = BigInt(timeValue) * 1000n; } else if (unit.NANOS) { - epochTime = BigInt(timeValue) * 1000000n; // Convert from nanoseconds + epochTime = BigInt(timeValue) * 1000000n; } else { throw new Error('Unsupported time unit'); } From 05e24142469fdb5c5c11e072e881e40c96c2eec4 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Fri, 6 Sep 2024 11:37:38 -0500 Subject: [PATCH 09/29] spec says int32 for milli, bigint for micro and nano --- lib/types.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/types.ts b/lib/types.ts index d0011917..8c24c495 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -588,9 +588,9 @@ function toPrimitive_TIME(value: TIME): bigint | number { if (unit.MILLIS) { epochTime = timeValue; } else if (unit.MICROS) { - epochTime = BigInt(timeValue) * 1000n; + epochTime = BigInt(timeValue); } else if (unit.NANOS) { - epochTime = BigInt(timeValue) * 1000000n; + epochTime = BigInt(timeValue); } else { throw new Error('Unsupported time unit'); } From 38a1d57e30ea2985bef38c465685cb93e9515883 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Mon, 9 Sep 2024 12:05:16 -0500 Subject: [PATCH 10/29] redo based on generated types --- lib/codec/types.ts | 3 +- lib/declare.ts | 8 ++-- lib/fields.ts | 7 ++-- lib/jsonSchema.ts | 17 ++++---- lib/schema.ts | 12 +----- lib/types.ts | 96 +++++++++++++-------------------------------- test/fields.test.ts | 25 +++++++----- 7 files changed, 62 insertions(+), 106 deletions(-) diff --git a/lib/codec/types.ts b/lib/codec/types.ts index 7334fd4d..49ff5667 100644 --- a/lib/codec/types.ts +++ b/lib/codec/types.ts @@ -1,6 +1,6 @@ import { PrimitiveType } from '../declare'; import { ParquetCodec, OriginalType, ParquetField } from '../declare'; -import { Statistics } from '../../gen-nodejs/parquet_types'; +import { LogicalType, Statistics } from '../../gen-nodejs/parquet_types'; export interface Options { typeLength: number; @@ -8,6 +8,7 @@ export interface Options { disableEnvelope?: boolean; primitiveType?: PrimitiveType; originalType?: OriginalType; + logicalType?: LogicalType; encoding?: ParquetCodec; compression?: string; column?: ParquetField; diff --git a/lib/declare.ts b/lib/declare.ts index f698b69e..3c6f2f67 100644 --- a/lib/declare.ts +++ b/lib/declare.ts @@ -1,6 +1,6 @@ // Thanks to https://github.com/kbajalc/parquets -import parquet_thrift from '../gen-nodejs/parquet_types'; +import parquet_thrift, { LogicalType } from '../gen-nodejs/parquet_types'; import { Statistics, OffsetIndex, @@ -62,6 +62,7 @@ export type SchemaDefinition = Record; export interface FieldDefinition { type?: ParquetType; typeLength?: number; + logicalType?: LogicalType; encoding?: ParquetCodec; compression?: ParquetCompression; optional?: boolean; @@ -72,8 +73,6 @@ export interface FieldDefinition { num_children?: NumChildrenField; precision?: number; scale?: number; - unit?: 'MILLIS' | 'MICROS' | 'NANOS'; - isAdjustedToUTC?: boolean; } export interface ParquetField { @@ -83,6 +82,7 @@ export interface ParquetField { primitiveType?: PrimitiveType; originalType?: OriginalType; repetitionType: RepetitionType; + logicalType?: LogicalType; typeLength?: number; encoding?: ParquetCodec; compression?: ParquetCompression; @@ -94,8 +94,6 @@ export interface ParquetField { fieldCount?: number; fields?: Record; disableEnvelope?: boolean; - unit?: 'MILLIS' | 'MICROS' | 'NANOS'; - isAdjustedToUTC?: boolean; } interface ParentField { diff --git a/lib/fields.ts b/lib/fields.ts index 3d1cc986..fffba576 100644 --- a/lib/fields.ts +++ b/lib/fields.ts @@ -1,5 +1,6 @@ // Helper functions for creating fields +import { LogicalType, TimeType } from '../gen-nodejs/parquet_types'; import { FieldDefinition, ParquetType, SchemaDefinition } from './declare'; export function createStringField(optional = true, fieldOptions: FieldDefinition = {}): FieldDefinition { @@ -82,8 +83,7 @@ export function createListField( } export function createTimeField( - unit: 'MILLIS' | 'MICROS' | 'NANOS', - isAdjustedToUTC = true, + logicalType: TimeType, optional = true, fieldOptions: FieldDefinition = {} ): FieldDefinition { @@ -91,7 +91,6 @@ export function createTimeField( ...fieldOptions, optional, type: 'TIME', - unit, - isAdjustedToUTC, + logicalType: new LogicalType({ TIME: logicalType }), }; } diff --git a/lib/jsonSchema.ts b/lib/jsonSchema.ts index 0c2c62e6..6c9f02f0 100644 --- a/lib/jsonSchema.ts +++ b/lib/jsonSchema.ts @@ -2,6 +2,9 @@ import { JSONSchema4 } from 'json-schema'; import { FieldDefinition, SchemaDefinition } from './declare'; import * as fields from './fields'; +import { MilliSeconds, TimeUnit } from '../gen-nodejs/parquet_types'; +import { TimeType } from '../gen-nodejs/parquet_types'; +import { MicroSeconds } from '../gen-nodejs/parquet_types'; type SupportedJSONSchema4 = Omit< JSONSchema4, @@ -115,18 +118,18 @@ const fromJsonSchemaField = case 'object': if (fieldValue.properties && fieldValue.properties.type && fieldValue.properties.value) { const unit = fieldValue.properties.type.properties?.unit?.default?.toString(); - let defaultUnit: 'MILLIS' | 'MICROS' | 'NANOS' = 'MILLIS'; // Restrict to allowed values + let defaultUnit: TimeUnit = new TimeUnit({ MILLIS: MilliSeconds }); if (unit === 'MICROS') { - defaultUnit = 'MICROS'; + defaultUnit = new TimeUnit({ MILLIS: MilliSeconds }); } else if (unit === 'NANOS') { - defaultUnit = 'NANOS'; - } else if (unit !== 'MILLIS') { - throw new UnsupportedJsonSchemaError(`Unit type ${unit} is unsupported.`); + defaultUnit = new TimeUnit({ MICROS: MicroSeconds }); + } else { + defaultUnit = new TimeUnit({ MILLIS: MilliSeconds }); } - const isAdjustedToUTC = !!fieldValue.properties.type.properties?.isAdjustedToUTC?.default; - return fields.createTimeField(defaultUnit, isAdjustedToUTC, optional); + const timeLogicalType = new TimeType({ isAdjustedToUTC, unit: defaultUnit }); + return fields.createTimeField(timeLogicalType, optional); } return fields.createStructField(fromJsonSchema(fieldValue), optional); default: diff --git a/lib/schema.ts b/lib/schema.ts index 23ba2547..38e5adc6 100644 --- a/lib/schema.ts +++ b/lib/schema.ts @@ -126,8 +126,7 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP statistics: opts.statistics, fieldCount: Object.keys(opts.fields).length, fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat(name)), - unit: opts.unit, - isAdjustedToUTC: opts.isAdjustedToUTC, + logicalType: opts.logicalType, }; if (opts.type == 'LIST' || opts.type == 'MAP') fieldList[name].originalType = opts.type; @@ -171,17 +170,12 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP ); } - if (typeDef.originalType === 'TIME') { - if (!opts.unit) { - fieldErrors.push(`Time type requires a unit, for Column: ${nameWithPath}`); - } - } - /* add to schema */ fieldList[name] = { name: name, primitiveType: typeDef.primitiveType, originalType: typeDef.originalType, + logicalType: opts.logicalType, path: path.concat([name]), repetitionType: repetitionType, encoding: opts.encoding, @@ -192,8 +186,6 @@ function buildFields(schema: SchemaDefinition, rLevelParentMax?: number, dLevelP typeLength: opts.typeLength || typeDef.typeLength, rLevelMax: rLevelMax, dLevelMax: dLevelMax, - unit: opts.unit, - isAdjustedToUTC: opts.isAdjustedToUTC, }; } diff --git a/lib/types.ts b/lib/types.ts index 8c24c495..a324e453 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -3,7 +3,6 @@ import { PrimitiveType, OriginalType, ParquetType, FieldDefinition, ParquetField } from './declare'; import { Options } from './codec/types'; import type { Document as BsonDocument } from 'bson'; -import { TimeType, TimeUnit } from '../gen-nodejs/parquet_types'; // BSON uses top level awaits, so use require for now const bsonSerialize = require('bson').serialize; const bsonDeserialize = require('bson').deserialize; @@ -22,11 +21,6 @@ interface INTERVAL { milliseconds: number; } -interface TIME { - type: TimeType; - value: number | bigint; -} - export function getParquetTypeDataObject( type: ParquetType, field?: ParquetField | Options | FieldDefinition @@ -53,6 +47,33 @@ export function getParquetTypeDataObject( toPrimitive: toPrimitive_INT64, }; } + } else if (type === 'TIME') { + if (field?.logicalType?.TIME) { + if (field.logicalType.TIME.unit.MILLIS) { + return { + primitiveType: 'INT32', + originalType: 'TIME', + toPrimitive: field.logicalType.TIME.isAdjustedToUTC? toPrimitive_TIME_UTC : toPrimitive_TIME_LOCAL, + }; + } + if (field.logicalType.TIME.unit.MICROS) { + return { + primitiveType: 'INT64', + originalType: 'TIME', + toPrimitive: field.logicalType.TIME.isAdjustedToUTC? toPrimitive_TIME_UTC : toPrimitive_TIME_LOCAL, + }; + } + if (field.logicalType.TIME.unit.NANOS) { + return { + primitiveType: 'INT64', + originalType: 'TIME', + toPrimitive: field.logicalType.TIME.isAdjustedToUTC? toPrimitive_TIME_UTC : toPrimitive_TIME_LOCAL, + }; + } + throw new Error('TIME type must have a unit'); + } else { + throw new Error('TIME type must have a logical type'); + } } else { return PARQUET_LOGICAL_TYPE_DATA[type]; } @@ -88,7 +109,6 @@ const PARQUET_LOGICAL_TYPES = new Set([ 'INTERVAL', 'MAP', 'LIST', - 'TIME', ] satisfies ParquetType[]); const PARQUET_LOGICAL_TYPE_DATA: Record = { @@ -232,11 +252,6 @@ const PARQUET_LOGICAL_TYPE_DATA: Record = { originalType: 'LIST', toPrimitive: toPrimitive_LIST, }, - TIME: { - originalType: 'TIME', - primitiveType: 'INT64', - toPrimitive: toPrimitive_TIME, - }, }; /** @@ -572,60 +587,3 @@ function checkValidValue(lowerRange: number | bigint, upperRange: number | bigin throw 'invalid value'; } } -/** - * Convert a TIME value to its internal representation. - * This handles both `isAdjustedToUTC` and the correct `unit` (MILLIS, MICROS, NANOS). - * @param value The TIME object containing the value and the time type information. - * @returns The converted time value as bigint or number based on the unit. - */ -function toPrimitive_TIME(value: TIME): bigint | number { - const { type, value: timeValue } = value; - const { unit, isAdjustedToUTC } = type; - - let epochTime: number | bigint; - - if (typeof timeValue === 'number') { - if (unit.MILLIS) { - epochTime = timeValue; - } else if (unit.MICROS) { - epochTime = BigInt(timeValue); - } else if (unit.NANOS) { - epochTime = BigInt(timeValue); - } else { - throw new Error('Unsupported time unit'); - } - } else if (typeof timeValue === 'bigint') { - epochTime = timeValue; - } else { - throw new Error('Invalid value for TIME type'); - } - - if (!isAdjustedToUTC) { - return adjustToLocalTimestamp(epochTime, unit); - } - - return epochTime; -} - -/** - * Adjust the timestamp to local time if the time is not adjusted to UTC. - * @param timestamp The timestamp to adjust. - * @param unit The unit of the timestamp (MILLIS, MICROS, NANOS). - * @returns The adjusted timestamp. - */ -function adjustToLocalTimestamp(timestamp: bigint | number, unit: TimeUnit): bigint { - try { - const localOffset = BigInt(new Date().getTimezoneOffset()) * 60n * 1000n; - timestamp = BigInt(timestamp); - if (unit.MILLIS) { - return timestamp - localOffset; - } else if (unit.MICROS) { - return timestamp - localOffset * 1000n; - } else if (unit.NANOS) { - return timestamp - localOffset * 1000000n; - } - throw new Error('Unsupported time unit'); - } catch (e) { - throw new Error('Invalid value for TIME type'); - } -} diff --git a/test/fields.test.ts b/test/fields.test.ts index 5a87a59c..07bc09e3 100644 --- a/test/fields.test.ts +++ b/test/fields.test.ts @@ -1,6 +1,7 @@ import { assert } from 'chai'; import { ParquetSchema } from '../parquet'; import * as fields from '../lib/fields'; +import { MicroSeconds, MilliSeconds, NanoSeconds, TimeType, TimeUnit } from '../gen-nodejs/parquet_types'; describe('Field Builders: Primitive Types', function () { it('Can use primitive field types: String', function () { @@ -212,14 +213,15 @@ describe('Field Builders: Structs and Struct List', function () { it('Can use primitive field types: Time with default MILLIS', function () { const schema = new ParquetSchema({ - timeField: fields.createTimeField('MILLIS'), + timeField: fields.createTimeField( + new TimeType({ isAdjustedToUTC: true, unit: new TimeUnit(new MilliSeconds()) }), + true + ), }); const c = schema.fields.timeField; assert.equal(c.name, 'timeField'); - assert.equal(c.primitiveType, 'INT64'); + assert.equal(c.primitiveType, 'INT32'); assert.equal(c.originalType, 'TIME'); - assert.equal(c.unit, 'MILLIS'); - assert.equal(c.isAdjustedToUTC, true); assert.deepEqual(c.path, ['timeField']); assert.equal(c.repetitionType, 'OPTIONAL'); assert.equal(c.encoding, 'PLAIN'); @@ -232,14 +234,15 @@ describe('Field Builders: Structs and Struct List', function () { it('Can use primitive field types: Time with MICROS', function () { const schema = new ParquetSchema({ - timeField: fields.createTimeField('MICROS', false), + timeField: fields.createTimeField( + new TimeType({ isAdjustedToUTC: false, unit: new TimeUnit(new MicroSeconds()) }), + true + ), }); const c = schema.fields.timeField; assert.equal(c.name, 'timeField'); assert.equal(c.primitiveType, 'INT64'); assert.equal(c.originalType, 'TIME'); - assert.equal(c.unit, 'MICROS'); - assert.equal(c.isAdjustedToUTC, false); assert.deepEqual(c.path, ['timeField']); assert.equal(c.repetitionType, 'OPTIONAL'); assert.equal(c.encoding, 'PLAIN'); @@ -252,14 +255,16 @@ describe('Field Builders: Structs and Struct List', function () { it('Can use primitive field types: Time with NANOS', function () { const schema = new ParquetSchema({ - timeField: fields.createTimeField('NANOS', true, true, { compression: 'GZIP' }), + timeField: fields.createTimeField( + new TimeType({ isAdjustedToUTC: true, unit: new TimeUnit(new NanoSeconds()) }), + true, + { compression: 'GZIP' } + ), }); const c = schema.fields.timeField; assert.equal(c.name, 'timeField'); assert.equal(c.primitiveType, 'INT64'); assert.equal(c.originalType, 'TIME'); - assert.equal(c.unit, 'NANOS'); - assert.equal(c.isAdjustedToUTC, true); assert.equal(c.compression, 'GZIP'); assert.deepEqual(c.path, ['timeField']); assert.equal(c.repetitionType, 'OPTIONAL'); From dddb09a9defa3d2e91a294bf39629794accc44a4 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Mon, 9 Sep 2024 16:12:42 -0500 Subject: [PATCH 11/29] add TIME related primitive converters --- lib/types.ts | 98 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 11 deletions(-) diff --git a/lib/types.ts b/lib/types.ts index a324e453..4393cee5 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -26,18 +26,17 @@ export function getParquetTypeDataObject( field?: ParquetField | Options | FieldDefinition ): ParquetTypeDataObject { if (type === 'DECIMAL') { - if (field?.typeLength !== undefined && field?.typeLength !== null) { + if (field?.typeLength !== undefined) { return { primitiveType: 'FIXED_LEN_BYTE_ARRAY', originalType: 'DECIMAL', typeLength: field.typeLength, toPrimitive: toPrimitive_FIXED_LEN_BYTE_ARRAY_DECIMAL, }; - } else if (field?.precision !== undefined && field?.precision !== null && field.precision > 18) { + } else if (field?.precision && field.precision > 18) { return { primitiveType: 'BYTE_ARRAY', originalType: 'DECIMAL', - typeLength: field.typeLength, toPrimitive: toPrimitive_BYTE_ARRAY_DECIMAL, }; } else { @@ -49,30 +48,33 @@ export function getParquetTypeDataObject( } } else if (type === 'TIME') { if (field?.logicalType?.TIME) { - if (field.logicalType.TIME.unit.MILLIS) { + const isAdjustedToUTC = field.logicalType.TIME.isAdjustedToUTC; + const unit = field.logicalType.TIME.unit; + + if (unit.MILLIS) { return { primitiveType: 'INT32', originalType: 'TIME', - toPrimitive: field.logicalType.TIME.isAdjustedToUTC? toPrimitive_TIME_UTC : toPrimitive_TIME_LOCAL, + toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_MILLIS_UTC : toPrimitive_TIME_MILLIS_LOCAL, }; } - if (field.logicalType.TIME.unit.MICROS) { + if (unit.MICROS) { return { primitiveType: 'INT64', originalType: 'TIME', - toPrimitive: field.logicalType.TIME.isAdjustedToUTC? toPrimitive_TIME_UTC : toPrimitive_TIME_LOCAL, + toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_MICROS_UTC : toPrimitive_TIME_MICROS_LOCAL, }; } - if (field.logicalType.TIME.unit.NANOS) { + if (unit.NANOS) { return { primitiveType: 'INT64', originalType: 'TIME', - toPrimitive: field.logicalType.TIME.isAdjustedToUTC? toPrimitive_TIME_UTC : toPrimitive_TIME_LOCAL, + toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_NANOS_UTC : toPrimitive_TIME_NANOS_LOCAL, }; } - throw new Error('TIME type must have a unit'); + throw new Error('TIME type must have a valid unit (MILLIS, MICROS, NANOS).'); } else { - throw new Error('TIME type must have a logical type'); + throw new Error('TIME type must have a logical type.'); } } else { return PARQUET_LOGICAL_TYPE_DATA[type]; @@ -587,3 +589,77 @@ function checkValidValue(lowerRange: number | bigint, upperRange: number | bigin throw 'invalid value'; } } + +/** + * Convert a TIME value in MILLIS to its UTC representation. + * @param value The time value. + */ +function toPrimitive_TIME_MILLIS_UTC(value: number | string): number { + return typeof value === 'string' ? Number(value) : value; +} + +/** + * Convert a TIME value in MILLIS to its local time representation. + * @param value The time value. + */ +function toPrimitive_TIME_MILLIS_LOCAL(value: number | string): number { + const millis = typeof value === 'string' ? Number(value) : value; + return Number(adjustToLocalTimestamp(BigInt(millis), { MILLIS: true })); +} + +/** + * Convert a TIME value in MICROS to its UTC representation. + * @param value The time value. + */ +function toPrimitive_TIME_MICROS_UTC(value: bigint | string): bigint { + return BigInt(value); +} + +/** + * Convert a TIME value in MICROS to its local time representation. + * @param value The time value. + */ +function toPrimitive_TIME_MICROS_LOCAL(value: bigint | string): bigint { + const micros = BigInt(value); + return adjustToLocalTimestamp(micros, { MICROS: true }); +} + +/** + * Convert a TIME value in NANOS to its UTC representation. + * @param value The time value. + */ +function toPrimitive_TIME_NANOS_UTC(value: bigint | string): bigint { + return BigInt(value); +} + +/** + * Convert a TIME value in NANOS to its local time representation. + * @param value The time value. + */ +function toPrimitive_TIME_NANOS_LOCAL(value: bigint | string): bigint { + const nanos = BigInt(value); + return adjustToLocalTimestamp(nanos, { NANOS: true }); +} + +/** + * Adjust the timestamp to local time based on the unit (MILLIS, MICROS, NANOS). + * @param timestamp The timestamp to adjust. + * @param unit The unit of the timestamp. + * @returns The adjusted timestamp. + */ +function adjustToLocalTimestamp( + timestamp: bigint, + unit: { MILLIS?: boolean; MICROS?: boolean; NANOS?: boolean } +): bigint { + const localOffset = BigInt(new Date().getTimezoneOffset()) * 60n * 1000n; // Offset in milliseconds + + if (unit.MILLIS) { + return timestamp - localOffset; + } else if (unit.MICROS) { + return timestamp - localOffset * 1000n; + } else if (unit.NANOS) { + return timestamp - localOffset * 1000000n; + } + + throw new Error('Unsupported time unit'); +} From 9401f676427d3c1351af57ad6709b3b1cf637b65 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Mon, 9 Sep 2024 16:24:44 -0500 Subject: [PATCH 12/29] support time field for json schema --- lib/jsonSchema.ts | 74 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 16 deletions(-) diff --git a/lib/jsonSchema.ts b/lib/jsonSchema.ts index 6c9f02f0..590f19f7 100644 --- a/lib/jsonSchema.ts +++ b/lib/jsonSchema.ts @@ -2,9 +2,8 @@ import { JSONSchema4 } from 'json-schema'; import { FieldDefinition, SchemaDefinition } from './declare'; import * as fields from './fields'; -import { MilliSeconds, TimeUnit } from '../gen-nodejs/parquet_types'; +import { TimeUnit } from '../gen-nodejs/parquet_types'; import { TimeType } from '../gen-nodejs/parquet_types'; -import { MicroSeconds } from '../gen-nodejs/parquet_types'; type SupportedJSONSchema4 = Omit< JSONSchema4, @@ -73,18 +72,49 @@ const fromJsonSchemaArray = (fieldValue: SupportedJSONSchema4, optionalFieldList switch (fieldValue.items.type) { case 'string': - if (fieldValue.items.format && fieldValue.items.format == 'date-time') { + if (fieldValue.items.format && fieldValue.items.format === 'date-time') { return fields.createListField('TIMESTAMP_MILLIS', optionalFieldList); } return fields.createListField('UTF8', optionalFieldList); + case 'integer': return fields.createListField('INT64', optionalFieldList); + case 'number': return fields.createListField('DOUBLE', optionalFieldList); + case 'boolean': return fields.createListField('BOOLEAN', optionalFieldList); + case 'object': + // Handle array of time fields + if ( + fieldValue.items.properties && + fieldValue.items.properties.unit && + fieldValue.items.properties.isAdjustedToUTC + ) { + const unit = fieldValue.items.properties.unit.default?.toString() || 'MILLIS'; + const isAdjustedToUTC = !!fieldValue.items.properties.isAdjustedToUTC.default; + let timeUnit: TimeUnit; + + switch (unit) { + case 'MICROS': + timeUnit = new TimeUnit({ MICROS: true }); + break; + case 'NANOS': + timeUnit = new TimeUnit({ NANOS: true }); + break; + default: + timeUnit = new TimeUnit({ MILLIS: true }); + break; + } + + const timeLogicalType = new TimeType({ isAdjustedToUTC, unit: timeUnit }); + return fields.createTimeField(timeLogicalType, optionalFieldList); + } + return fields.createStructListField(fromJsonSchema(fieldValue.items), optionalFieldList); + default: throw new UnsupportedJsonSchemaError(`Array field type ${JSON.stringify(fieldValue.items)} is unsupported.`); } @@ -103,35 +133,47 @@ const fromJsonSchemaField = switch (fieldValue.type) { case 'string': - if (fieldValue.format && fieldValue.format == 'date-time') { + if (fieldValue.format && fieldValue.format === 'date-time') { return fields.createTimestampField(optional); } return fields.createStringField(optional); + case 'integer': return fields.createIntField(64, optional); + case 'number': return fields.createDoubleField(optional); + case 'boolean': return fields.createBooleanField(optional); + case 'array': return fromJsonSchemaArray(fieldValue, optional); + case 'object': - if (fieldValue.properties && fieldValue.properties.type && fieldValue.properties.value) { - const unit = fieldValue.properties.type.properties?.unit?.default?.toString(); - let defaultUnit: TimeUnit = new TimeUnit({ MILLIS: MilliSeconds }); - - if (unit === 'MICROS') { - defaultUnit = new TimeUnit({ MILLIS: MilliSeconds }); - } else if (unit === 'NANOS') { - defaultUnit = new TimeUnit({ MICROS: MicroSeconds }); - } else { - defaultUnit = new TimeUnit({ MILLIS: MilliSeconds }); + if (fieldValue.properties && fieldValue.properties.unit && fieldValue.properties.isAdjustedToUTC) { + const unit = fieldValue.properties.unit.default?.toString() || 'MILLIS'; + const isAdjustedToUTC = !!fieldValue.properties.isAdjustedToUTC.default; + let timeUnit: TimeUnit; + + switch (unit) { + case 'MICROS': + timeUnit = new TimeUnit({ MICROS: true }); + break; + case 'NANOS': + timeUnit = new TimeUnit({ NANOS: true }); + break; + default: + timeUnit = new TimeUnit({ MILLIS: true }); + break; } - const isAdjustedToUTC = !!fieldValue.properties.type.properties?.isAdjustedToUTC?.default; - const timeLogicalType = new TimeType({ isAdjustedToUTC, unit: defaultUnit }); + + const timeLogicalType = new TimeType({ isAdjustedToUTC, unit: timeUnit }); return fields.createTimeField(timeLogicalType, optional); } + return fields.createStructField(fromJsonSchema(fieldValue), optional); + default: throw new UnsupportedJsonSchemaError( `Unable to convert "${fieldName}" with JSON Schema type "${fieldValue.type}" to a Parquet Schema.` From 13007a593b98fcc4da044c682c92292c72003fb2 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Mon, 9 Sep 2024 16:52:02 -0500 Subject: [PATCH 13/29] finalize implementation side of TIME logical type --- lib/declare.ts | 3 +-- lib/fields.ts | 1 - lib/types.ts | 51 ++++++++++++++++++++++---------------------------- 3 files changed, 23 insertions(+), 32 deletions(-) diff --git a/lib/declare.ts b/lib/declare.ts index 3c6f2f67..ca347737 100644 --- a/lib/declare.ts +++ b/lib/declare.ts @@ -54,8 +54,7 @@ export type OriginalType = | 'INT_64' // 18 | 'JSON' // 19 | 'BSON' // 20 - | 'INTERVAL' // 21 - | 'TIME'; // 22 + | 'INTERVAL'; // 21 export type SchemaDefinition = Record; diff --git a/lib/fields.ts b/lib/fields.ts index fffba576..f0f8ee85 100644 --- a/lib/fields.ts +++ b/lib/fields.ts @@ -90,7 +90,6 @@ export function createTimeField( return { ...fieldOptions, optional, - type: 'TIME', logicalType: new LogicalType({ TIME: logicalType }), }; } diff --git a/lib/types.ts b/lib/types.ts index 4393cee5..11f79d2c 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -46,36 +46,29 @@ export function getParquetTypeDataObject( toPrimitive: toPrimitive_INT64, }; } - } else if (type === 'TIME') { - if (field?.logicalType?.TIME) { - const isAdjustedToUTC = field.logicalType.TIME.isAdjustedToUTC; - const unit = field.logicalType.TIME.unit; - - if (unit.MILLIS) { - return { - primitiveType: 'INT32', - originalType: 'TIME', - toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_MILLIS_UTC : toPrimitive_TIME_MILLIS_LOCAL, - }; - } - if (unit.MICROS) { - return { - primitiveType: 'INT64', - originalType: 'TIME', - toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_MICROS_UTC : toPrimitive_TIME_MICROS_LOCAL, - }; - } - if (unit.NANOS) { - return { - primitiveType: 'INT64', - originalType: 'TIME', - toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_NANOS_UTC : toPrimitive_TIME_NANOS_LOCAL, - }; - } - throw new Error('TIME type must have a valid unit (MILLIS, MICROS, NANOS).'); - } else { - throw new Error('TIME type must have a logical type.'); + } else if (field?.logicalType?.TIME) { + const isAdjustedToUTC = field.logicalType.TIME.isAdjustedToUTC; + const unit = field.logicalType.TIME.unit; + + if (unit.MILLIS) { + return { + primitiveType: 'INT32', + toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_MILLIS_UTC : toPrimitive_TIME_MILLIS_LOCAL, + }; + } + if (unit.MICROS) { + return { + primitiveType: 'INT64', + toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_MICROS_UTC : toPrimitive_TIME_MICROS_LOCAL, + }; + } + if (unit.NANOS) { + return { + primitiveType: 'INT64', + toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_NANOS_UTC : toPrimitive_TIME_NANOS_LOCAL, + }; } + throw new Error('TIME type must have a valid unit (MILLIS, MICROS, NANOS).'); } else { return PARQUET_LOGICAL_TYPE_DATA[type]; } From ae58c9612a7d6d7a33bb7feba24d3ab1fd55a97a Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Mon, 9 Sep 2024 16:53:48 -0500 Subject: [PATCH 14/29] set converted types for backward compatibility --- lib/types.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/types.ts b/lib/types.ts index 11f79d2c..64f3dd93 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -52,12 +52,14 @@ export function getParquetTypeDataObject( if (unit.MILLIS) { return { + originalType: 'TIME_MILLIS', primitiveType: 'INT32', toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_MILLIS_UTC : toPrimitive_TIME_MILLIS_LOCAL, }; } if (unit.MICROS) { return { + originalType: 'TIME_MICROS', primitiveType: 'INT64', toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_MICROS_UTC : toPrimitive_TIME_MICROS_LOCAL, }; From 42d72ad3ac0dab90224ae76340a44b3c5dc5fe3f Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Mon, 9 Sep 2024 18:15:31 -0500 Subject: [PATCH 15/29] set example field tests --- lib/fields.ts | 11 ++++++++++- lib/types.ts | 2 +- test/fields.test.ts | 12 ++++++------ 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/lib/fields.ts b/lib/fields.ts index f0f8ee85..b80cc3c6 100644 --- a/lib/fields.ts +++ b/lib/fields.ts @@ -1,7 +1,7 @@ // Helper functions for creating fields import { LogicalType, TimeType } from '../gen-nodejs/parquet_types'; -import { FieldDefinition, ParquetType, SchemaDefinition } from './declare'; +import { FieldDefinition, ParquetType, PrimitiveType, SchemaDefinition } from './declare'; export function createStringField(optional = true, fieldOptions: FieldDefinition = {}): FieldDefinition { return { ...fieldOptions, optional, type: 'UTF8' }; @@ -87,9 +87,18 @@ export function createTimeField( optional = true, fieldOptions: FieldDefinition = {} ): FieldDefinition { + let primitiveType: PrimitiveType; + if (logicalType.unit.MILLIS) { + primitiveType = 'INT32'; // TIME_MILLIS uses INT32 + } else if (logicalType.unit.MICROS || logicalType.unit.NANOS) { + primitiveType = 'INT64'; // TIME_MICROS and TIME_NANOS use INT64 + } else { + throw new Error('Unsupported time unit in logicalType'); + } return { ...fieldOptions, optional, + type: primitiveType, logicalType: new LogicalType({ TIME: logicalType }), }; } diff --git a/lib/types.ts b/lib/types.ts index 64f3dd93..90b5dc83 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -46,7 +46,7 @@ export function getParquetTypeDataObject( toPrimitive: toPrimitive_INT64, }; } - } else if (field?.logicalType?.TIME) { + } else if (field?.logicalType?.TIME && (type === 'INT64' || type === 'INT32')) { const isAdjustedToUTC = field.logicalType.TIME.isAdjustedToUTC; const unit = field.logicalType.TIME.unit; diff --git a/test/fields.test.ts b/test/fields.test.ts index 07bc09e3..4760f715 100644 --- a/test/fields.test.ts +++ b/test/fields.test.ts @@ -214,14 +214,14 @@ describe('Field Builders: Structs and Struct List', function () { it('Can use primitive field types: Time with default MILLIS', function () { const schema = new ParquetSchema({ timeField: fields.createTimeField( - new TimeType({ isAdjustedToUTC: true, unit: new TimeUnit(new MilliSeconds()) }), + new TimeType({ isAdjustedToUTC: true, unit: new TimeUnit({ MILLIS: new MilliSeconds() }) }), true ), }); const c = schema.fields.timeField; assert.equal(c.name, 'timeField'); assert.equal(c.primitiveType, 'INT32'); - assert.equal(c.originalType, 'TIME'); + assert.equal(c.originalType, 'TIME_MILLIS'); assert.deepEqual(c.path, ['timeField']); assert.equal(c.repetitionType, 'OPTIONAL'); assert.equal(c.encoding, 'PLAIN'); @@ -235,14 +235,14 @@ describe('Field Builders: Structs and Struct List', function () { it('Can use primitive field types: Time with MICROS', function () { const schema = new ParquetSchema({ timeField: fields.createTimeField( - new TimeType({ isAdjustedToUTC: false, unit: new TimeUnit(new MicroSeconds()) }), + new TimeType({ isAdjustedToUTC: false, unit: new TimeUnit({ MICROS: new MicroSeconds() }) }), true ), }); const c = schema.fields.timeField; assert.equal(c.name, 'timeField'); assert.equal(c.primitiveType, 'INT64'); - assert.equal(c.originalType, 'TIME'); + assert.equal(c.originalType, 'TIME_MICROS'); assert.deepEqual(c.path, ['timeField']); assert.equal(c.repetitionType, 'OPTIONAL'); assert.equal(c.encoding, 'PLAIN'); @@ -256,7 +256,7 @@ describe('Field Builders: Structs and Struct List', function () { it('Can use primitive field types: Time with NANOS', function () { const schema = new ParquetSchema({ timeField: fields.createTimeField( - new TimeType({ isAdjustedToUTC: true, unit: new TimeUnit(new NanoSeconds()) }), + new TimeType({ isAdjustedToUTC: true, unit: new TimeUnit({ NANOS: new NanoSeconds() }) }), true, { compression: 'GZIP' } ), @@ -264,7 +264,7 @@ describe('Field Builders: Structs and Struct List', function () { const c = schema.fields.timeField; assert.equal(c.name, 'timeField'); assert.equal(c.primitiveType, 'INT64'); - assert.equal(c.originalType, 'TIME'); + assert.equal(c.originalType, undefined); assert.equal(c.compression, 'GZIP'); assert.deepEqual(c.path, ['timeField']); assert.equal(c.repetitionType, 'OPTIONAL'); From 510f402072de87253e5041ea38a7acfc20bc3af3 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Tue, 10 Sep 2024 11:22:08 -0500 Subject: [PATCH 16/29] set an example time schema --- test/test-files/time.schema.json | 69 ++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 test/test-files/time.schema.json diff --git a/test/test-files/time.schema.json b/test/test-files/time.schema.json new file mode 100644 index 00000000..ffc34b5c --- /dev/null +++ b/test/test-files/time.schema.json @@ -0,0 +1,69 @@ +{ + "$id": "https://example.com/time.schema.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "A schema to test the TIME logical types in Parquet", + "type": "object", + "properties": { + "time_millis_field": { + "type": "object", + "properties": { + "value": { + "type": "string", + "format": "date-time", + "description": "Time field represented as a string with date-time format" + }, + "unit": { + "type": "string", + "enum": ["MILLIS", "MICROS", "NANOS"], + "description": "The unit for the time value" + }, + "isAdjustedToUTC": { + "type": "boolean", + "description": "Whether the time is adjusted to UTC" + } + }, + "required": ["value", "unit", "isAdjustedToUTC"] + }, + "time_micros_field": { + "type": "object", + "properties": { + "value": { + "type": "string", + "format": "date-time", + "description": "Time field represented as a string with date-time format" + }, + "unit": { + "type": "string", + "enum": ["MILLIS", "MICROS", "NANOS"], + "description": "The unit for the time value" + }, + "isAdjustedToUTC": { + "type": "boolean", + "description": "Whether the time is adjusted to UTC" + } + }, + "required": ["value", "unit", "isAdjustedToUTC"] + }, + "time_nanos_field": { + "type": "object", + "properties": { + "value": { + "type": "string", + "format": "date-time", + "description": "Time field represented as a string with date-time format" + }, + "unit": { + "type": "string", + "enum": ["MILLIS", "MICROS", "NANOS"], + "description": "The unit for the time value" + }, + "isAdjustedToUTC": { + "type": "boolean", + "description": "Whether the time is adjusted to UTC" + } + }, + "required": ["value", "unit", "isAdjustedToUTC"] + } + }, + "required": ["time_millis_field", "time_micros_field", "time_nanos_field"] +} From bc9e11c873f0579b4026bcb9ee0f6d4e69b479ac Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Tue, 10 Sep 2024 14:17:30 -0500 Subject: [PATCH 17/29] set schema test --- test/jsonSchema.test.ts | 7 + test/test-files/time.schema.result.json | 290 ++++++++++++++++++++++++ 2 files changed, 297 insertions(+) create mode 100644 test/test-files/time.schema.result.json diff --git a/test/jsonSchema.test.ts b/test/jsonSchema.test.ts index d0f29a0a..2b4c1877 100644 --- a/test/jsonSchema.test.ts +++ b/test/jsonSchema.test.ts @@ -6,6 +6,7 @@ import addressSchema from './test-files/address.schema.json'; import arraySchema from './test-files/array.schema.json'; import objectSchema from './test-files/object.schema.json'; import objectNestedSchema from './test-files/object-nested.schema.json'; +import timeSchema from './test-files/time.schema.json'; import { ParquetSchema, ParquetWriter, ParquetReader } from '../parquet'; @@ -52,6 +53,12 @@ describe('Json Schema Conversion', function () { const ps = ParquetSchema.fromJsonSchema(js); checkSnapshot(ps, './test-files/object-nested.schema.result.json', update); }); + + it('Time Schema', function () { + const js = timeSchema as JSONSchema4; + const ps = ParquetSchema.fromJsonSchema(js); + checkSnapshot(ps, './test-files/time.schema.result.json', update); + }); }); const parquetSchema = ParquetSchema.fromJsonSchema({ diff --git a/test/test-files/time.schema.result.json b/test/test-files/time.schema.result.json new file mode 100644 index 00000000..1ca47edf --- /dev/null +++ b/test/test-files/time.schema.result.json @@ -0,0 +1,290 @@ +{ + "schema": { + "time_millis_field": { + "optional": false, + "type": "INT32", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" + }, + "time_micros_field": { + "optional": false, + "type": "INT32", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" + }, + "time_nanos_field": { + "optional": false, + "type": "INT32", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" + } + }, + "fields": { + "time_millis_field": { + "name": "time_millis_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_millis_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + }, + "time_micros_field": { + "name": "time_micros_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_micros_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + }, + "time_nanos_field": { + "name": "time_nanos_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_nanos_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + }, + "fieldList": [ + { + "name": "time_millis_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_millis_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + }, + { + "name": "time_micros_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_micros_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + }, + { + "name": "time_nanos_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_nanos_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + ] +} From 44f64aaf4f3f954ae14ab03f37f6d46c179f6d4a Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Tue, 10 Sep 2024 16:35:39 -0500 Subject: [PATCH 18/29] update schema files for testing --- test/test-files/time.schema.json | 40 ----- test/test-files/time.schema.result.json | 188 ------------------------ 2 files changed, 228 deletions(-) diff --git a/test/test-files/time.schema.json b/test/test-files/time.schema.json index ffc34b5c..55df8b9f 100644 --- a/test/test-files/time.schema.json +++ b/test/test-files/time.schema.json @@ -23,46 +23,6 @@ } }, "required": ["value", "unit", "isAdjustedToUTC"] - }, - "time_micros_field": { - "type": "object", - "properties": { - "value": { - "type": "string", - "format": "date-time", - "description": "Time field represented as a string with date-time format" - }, - "unit": { - "type": "string", - "enum": ["MILLIS", "MICROS", "NANOS"], - "description": "The unit for the time value" - }, - "isAdjustedToUTC": { - "type": "boolean", - "description": "Whether the time is adjusted to UTC" - } - }, - "required": ["value", "unit", "isAdjustedToUTC"] - }, - "time_nanos_field": { - "type": "object", - "properties": { - "value": { - "type": "string", - "format": "date-time", - "description": "Time field represented as a string with date-time format" - }, - "unit": { - "type": "string", - "enum": ["MILLIS", "MICROS", "NANOS"], - "description": "The unit for the time value" - }, - "isAdjustedToUTC": { - "type": "boolean", - "description": "Whether the time is adjusted to UTC" - } - }, - "required": ["value", "unit", "isAdjustedToUTC"] } }, "required": ["time_millis_field", "time_micros_field", "time_nanos_field"] diff --git a/test/test-files/time.schema.result.json b/test/test-files/time.schema.result.json index 1ca47edf..1d4a94b3 100644 --- a/test/test-files/time.schema.result.json +++ b/test/test-files/time.schema.result.json @@ -27,62 +27,6 @@ }, "encoding": "PLAIN", "compression": "UNCOMPRESSED" - }, - "time_micros_field": { - "optional": false, - "type": "INT32", - "logicalType": { - "STRING": null, - "MAP": null, - "LIST": null, - "ENUM": null, - "DECIMAL": null, - "DATE": null, - "TIME": { - "isAdjustedToUTC": false, - "unit": { - "MILLIS": {}, - "MICROS": null, - "NANOS": null - } - }, - "TIMESTAMP": null, - "INTEGER": null, - "UNKNOWN": null, - "JSON": null, - "BSON": null, - "UUID": null - }, - "encoding": "PLAIN", - "compression": "UNCOMPRESSED" - }, - "time_nanos_field": { - "optional": false, - "type": "INT32", - "logicalType": { - "STRING": null, - "MAP": null, - "LIST": null, - "ENUM": null, - "DECIMAL": null, - "DATE": null, - "TIME": { - "isAdjustedToUTC": false, - "unit": { - "MILLIS": {}, - "MICROS": null, - "NANOS": null - } - }, - "TIMESTAMP": null, - "INTEGER": null, - "UNKNOWN": null, - "JSON": null, - "BSON": null, - "UUID": null - }, - "encoding": "PLAIN", - "compression": "UNCOMPRESSED" } }, "fields": { @@ -118,72 +62,6 @@ "compression": "UNCOMPRESSED", "rLevelMax": 0, "dLevelMax": 0 - }, - "time_micros_field": { - "name": "time_micros_field", - "primitiveType": "INT32", - "originalType": "TIME_MILLIS", - "logicalType": { - "STRING": null, - "MAP": null, - "LIST": null, - "ENUM": null, - "DECIMAL": null, - "DATE": null, - "TIME": { - "isAdjustedToUTC": false, - "unit": { - "MILLIS": {}, - "MICROS": null, - "NANOS": null - } - }, - "TIMESTAMP": null, - "INTEGER": null, - "UNKNOWN": null, - "JSON": null, - "BSON": null, - "UUID": null - }, - "path": ["time_micros_field"], - "repetitionType": "REQUIRED", - "encoding": "PLAIN", - "compression": "UNCOMPRESSED", - "rLevelMax": 0, - "dLevelMax": 0 - }, - "time_nanos_field": { - "name": "time_nanos_field", - "primitiveType": "INT32", - "originalType": "TIME_MILLIS", - "logicalType": { - "STRING": null, - "MAP": null, - "LIST": null, - "ENUM": null, - "DECIMAL": null, - "DATE": null, - "TIME": { - "isAdjustedToUTC": false, - "unit": { - "MILLIS": {}, - "MICROS": null, - "NANOS": null - } - }, - "TIMESTAMP": null, - "INTEGER": null, - "UNKNOWN": null, - "JSON": null, - "BSON": null, - "UUID": null - }, - "path": ["time_nanos_field"], - "repetitionType": "REQUIRED", - "encoding": "PLAIN", - "compression": "UNCOMPRESSED", - "rLevelMax": 0, - "dLevelMax": 0 } }, "fieldList": [ @@ -219,72 +97,6 @@ "compression": "UNCOMPRESSED", "rLevelMax": 0, "dLevelMax": 0 - }, - { - "name": "time_micros_field", - "primitiveType": "INT32", - "originalType": "TIME_MILLIS", - "logicalType": { - "STRING": null, - "MAP": null, - "LIST": null, - "ENUM": null, - "DECIMAL": null, - "DATE": null, - "TIME": { - "isAdjustedToUTC": false, - "unit": { - "MILLIS": {}, - "MICROS": null, - "NANOS": null - } - }, - "TIMESTAMP": null, - "INTEGER": null, - "UNKNOWN": null, - "JSON": null, - "BSON": null, - "UUID": null - }, - "path": ["time_micros_field"], - "repetitionType": "REQUIRED", - "encoding": "PLAIN", - "compression": "UNCOMPRESSED", - "rLevelMax": 0, - "dLevelMax": 0 - }, - { - "name": "time_nanos_field", - "primitiveType": "INT32", - "originalType": "TIME_MILLIS", - "logicalType": { - "STRING": null, - "MAP": null, - "LIST": null, - "ENUM": null, - "DECIMAL": null, - "DATE": null, - "TIME": { - "isAdjustedToUTC": false, - "unit": { - "MILLIS": {}, - "MICROS": null, - "NANOS": null - } - }, - "TIMESTAMP": null, - "INTEGER": null, - "UNKNOWN": null, - "JSON": null, - "BSON": null, - "UUID": null - }, - "path": ["time_nanos_field"], - "repetitionType": "REQUIRED", - "encoding": "PLAIN", - "compression": "UNCOMPRESSED", - "rLevelMax": 0, - "dLevelMax": 0 } ] } From 97b1475d62ad7a447f1b2bccefbd7bf08f1e8851 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Tue, 10 Sep 2024 16:41:47 -0500 Subject: [PATCH 19/29] schema test result file --- test/test-files/time.schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test-files/time.schema.json b/test/test-files/time.schema.json index 55df8b9f..f8c0c82f 100644 --- a/test/test-files/time.schema.json +++ b/test/test-files/time.schema.json @@ -25,5 +25,5 @@ "required": ["value", "unit", "isAdjustedToUTC"] } }, - "required": ["time_millis_field", "time_micros_field", "time_nanos_field"] + "required": ["time_millis_field"] } From 64857538d1b795837e4c143f4dc38beea7f8fc1f Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Wed, 11 Sep 2024 12:13:06 -0500 Subject: [PATCH 20/29] adjust types based on ongoing file tests --- lib/types.ts | 86 +++++++++++++++++----------------------------------- 1 file changed, 27 insertions(+), 59 deletions(-) diff --git a/lib/types.ts b/lib/types.ts index 90b5dc83..ab8f4a62 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -21,6 +21,12 @@ interface INTERVAL { milliseconds: number; } +interface TIME { + value: string | bigint | number; + unit: 'MILLIS' | 'MICROS' | 'NANOS'; + isAdjustedToUTC: boolean; +} + export function getParquetTypeDataObject( type: ParquetType, field?: ParquetField | Options | FieldDefinition @@ -46,28 +52,26 @@ export function getParquetTypeDataObject( toPrimitive: toPrimitive_INT64, }; } - } else if (field?.logicalType?.TIME && (type === 'INT64' || type === 'INT32')) { - const isAdjustedToUTC = field.logicalType.TIME.isAdjustedToUTC; + } else if (field?.logicalType?.TIME) { const unit = field.logicalType.TIME.unit; - if (unit.MILLIS) { return { originalType: 'TIME_MILLIS', primitiveType: 'INT32', - toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_MILLIS_UTC : toPrimitive_TIME_MILLIS_LOCAL, + toPrimitive: toPrimitive_TIME, }; } if (unit.MICROS) { return { originalType: 'TIME_MICROS', primitiveType: 'INT64', - toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_MICROS_UTC : toPrimitive_TIME_MICROS_LOCAL, + toPrimitive: toPrimitive_TIME, }; } if (unit.NANOS) { return { primitiveType: 'INT64', - toPrimitive: isAdjustedToUTC ? toPrimitive_TIME_NANOS_UTC : toPrimitive_TIME_NANOS_LOCAL, + toPrimitive: toPrimitive_TIME, }; } throw new Error('TIME type must have a valid unit (MILLIS, MICROS, NANOS).'); @@ -585,63 +589,27 @@ function checkValidValue(lowerRange: number | bigint, upperRange: number | bigin } } -/** - * Convert a TIME value in MILLIS to its UTC representation. - * @param value The time value. - */ -function toPrimitive_TIME_MILLIS_UTC(value: number | string): number { - return typeof value === 'string' ? Number(value) : value; -} - -/** - * Convert a TIME value in MILLIS to its local time representation. - * @param value The time value. - */ -function toPrimitive_TIME_MILLIS_LOCAL(value: number | string): number { - const millis = typeof value === 'string' ? Number(value) : value; - return Number(adjustToLocalTimestamp(BigInt(millis), { MILLIS: true })); -} - -/** - * Convert a TIME value in MICROS to its UTC representation. - * @param value The time value. - */ -function toPrimitive_TIME_MICROS_UTC(value: bigint | string): bigint { - return BigInt(value); -} - -/** - * Convert a TIME value in MICROS to its local time representation. - * @param value The time value. - */ -function toPrimitive_TIME_MICROS_LOCAL(value: bigint | string): bigint { - const micros = BigInt(value); - return adjustToLocalTimestamp(micros, { MICROS: true }); -} +function toPrimitive_TIME(time: TIME): bigint | number { + const { value, unit, isAdjustedToUTC } = time; -/** - * Convert a TIME value in NANOS to its UTC representation. - * @param value The time value. - */ -function toPrimitive_TIME_NANOS_UTC(value: bigint | string): bigint { - return BigInt(value); -} + const timeValue = typeof value === 'string' ? BigInt(value) : BigInt(value); -/** - * Convert a TIME value in NANOS to its local time representation. - * @param value The time value. - */ -function toPrimitive_TIME_NANOS_LOCAL(value: bigint | string): bigint { - const nanos = BigInt(value); - return adjustToLocalTimestamp(nanos, { NANOS: true }); + if (isAdjustedToUTC) { + return unit === 'MILLIS' ? Number(timeValue) : timeValue; + } else { + switch (unit) { + case 'MILLIS': + return Number(adjustToLocalTimestamp(timeValue, { MILLIS: true })); + case 'MICROS': + return adjustToLocalTimestamp(timeValue, { MICROS: true }); + case 'NANOS': + return adjustToLocalTimestamp(timeValue, { NANOS: true }); + default: + throw new Error(`Unsupported time unit: ${unit}`); + } + } } -/** - * Adjust the timestamp to local time based on the unit (MILLIS, MICROS, NANOS). - * @param timestamp The timestamp to adjust. - * @param unit The unit of the timestamp. - * @returns The adjusted timestamp. - */ function adjustToLocalTimestamp( timestamp: bigint, unit: { MILLIS?: boolean; MICROS?: boolean; NANOS?: boolean } From 6927f31bfd96c5998c9a94194b78ffcb7598cc45 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Wed, 11 Sep 2024 13:15:47 -0500 Subject: [PATCH 21/29] logical types from parquet file cannot be reconstructed back without schema --- lib/types.ts | 1 + test/jsonSchema.test.ts | 59 +++++++++++- .../json-schema-test-file.result.json | 14 ++- .../json-schema-test-file.schema.result.json | 94 +++++++++++++++++++ 4 files changed, 166 insertions(+), 2 deletions(-) diff --git a/lib/types.ts b/lib/types.ts index ab8f4a62..1cfcb7ba 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -590,6 +590,7 @@ function checkValidValue(lowerRange: number | bigint, upperRange: number | bigin } function toPrimitive_TIME(time: TIME): bigint | number { + console.log('time', time); const { value, unit, isAdjustedToUTC } = time; const timeValue = typeof value === 'string' ? BigInt(value) : BigInt(value); diff --git a/test/jsonSchema.test.ts b/test/jsonSchema.test.ts index 2b4c1877..93447cb0 100644 --- a/test/jsonSchema.test.ts +++ b/test/jsonSchema.test.ts @@ -120,6 +120,21 @@ const parquetSchema = ParquetSchema.fromJsonSchema({ }, additionalItems: false, }, + time_field: { + type: 'object', + properties: { + value: { + type: 'number', + }, + unit: { + type: 'string', + }, + isAdjustedToUTC: { + type: 'boolean', + }, + }, + additionalProperties: false, + }, }, additionalProperties: false, }); @@ -159,6 +174,48 @@ describe('Json Schema Conversion Test File', function () { }, ], }, + time_field: { + value: 1726067527, + unit: 'MILLIS', + isAdjustedToUTC: true, + }, + }; + + const row1FromParquetFile = { + string_field: 'string value', + int_field: 10n, + number_field: 2.5, + timestamp_array_field: { list: [{ element: new Date('2023-01-01 GMT') }] }, + + timestamp_field: new Date('2023-01-01 GMT'), + + array_field: { + list: [{ element: 'array_field val1' }, { element: 'array_field val2' }], + }, + + obj_field: { + sub1: 'obj_field_sub1 val', + sub2: 'obj_field_sub2 val', + }, + + struct_field: { + list: [ + { + element: { + sub8: { + list: [{ element: 'val1' }, { element: 'val2' }], + }, + sub3: 'struct_field_string val', + sub4: 'struct_field_string val', + sub5: { + sub6: 'struct_field_struct_string1 val', + sub7: 'struct_field_struct_string2 val', + }, + }, + }, + ], + }, + time_field: 1726067527, }; let reader: ParquetReader; @@ -185,7 +242,7 @@ describe('Json Schema Conversion Test File', function () { const cursor = reader.getCursor(); const row = await cursor.next(); const rowData = { - ...row1, + ...row1FromParquetFile, }; assert.deepEqual(row, rowData); }); diff --git a/test/test-files/json-schema-test-file.result.json b/test/test-files/json-schema-test-file.result.json index 3fd11733..40635142 100644 --- a/test/test-files/json-schema-test-file.result.json +++ b/test/test-files/json-schema-test-file.result.json @@ -4,7 +4,7 @@ "type_length": null, "repetition_type": null, "name": "root", - "num_children": 8, + "num_children": 9, "converted_type": null, "scale": null, "precision": null, @@ -298,5 +298,17 @@ "precision": null, "field_id": null, "logicalType": null + }, + { + "type": 1, + "type_length": null, + "repetition_type": 1, + "name": "time_field", + "num_children": null, + "converted_type": 7, + "scale": null, + "precision": null, + "field_id": null, + "logicalType": null } ] diff --git a/test/test-files/json-schema-test-file.schema.result.json b/test/test-files/json-schema-test-file.schema.result.json index f0fe1883..32c43ec5 100644 --- a/test/test-files/json-schema-test-file.schema.result.json +++ b/test/test-files/json-schema-test-file.schema.result.json @@ -135,6 +135,34 @@ } } } + }, + "time_field": { + "optional": true, + "type": "INT32", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" } }, "fields": { @@ -403,6 +431,39 @@ } }, "originalType": "LIST" + }, + "time_field": { + "name": "time_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "OPTIONAL", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 1 } }, "fieldList": [ @@ -1114,6 +1175,39 @@ "compression": "UNCOMPRESSED", "rLevelMax": 2, "dLevelMax": 5 + }, + { + "name": "time_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "OPTIONAL", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 1 } ] } From abf7f0b7de93c0700ed6bd6c5abee3924a76f997 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Wed, 11 Sep 2024 15:12:36 -0500 Subject: [PATCH 22/29] remove debug logs --- lib/types.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/types.ts b/lib/types.ts index 1cfcb7ba..ab8f4a62 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -590,7 +590,6 @@ function checkValidValue(lowerRange: number | bigint, upperRange: number | bigin } function toPrimitive_TIME(time: TIME): bigint | number { - console.log('time', time); const { value, unit, isAdjustedToUTC } = time; const timeValue = typeof value === 'string' ? BigInt(value) : BigInt(value); From 0a5040f0cb31f3f99a691272f85c1e0e438d3964 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Thu, 12 Sep 2024 09:51:59 -0500 Subject: [PATCH 23/29] add schema test for millis, macros and nanos --- lib/jsonSchema.ts | 11 +- test/jsonSchema.test.ts | 25 ++++- test/test-files/time.schema_micros.json | 29 +++++ .../test-files/time.schema_micros.result.json | 102 ++++++++++++++++++ test/test-files/time.schema_millis.json | 29 +++++ .../test-files/time.schema_millis.result.json | 102 ++++++++++++++++++ test/test-files/time.schema_nanos.json | 29 +++++ test/test-files/time.schema_nanos.result.json | 100 +++++++++++++++++ 8 files changed, 422 insertions(+), 5 deletions(-) create mode 100644 test/test-files/time.schema_micros.json create mode 100644 test/test-files/time.schema_micros.result.json create mode 100644 test/test-files/time.schema_millis.json create mode 100644 test/test-files/time.schema_millis.result.json create mode 100644 test/test-files/time.schema_nanos.json create mode 100644 test/test-files/time.schema_nanos.result.json diff --git a/lib/jsonSchema.ts b/lib/jsonSchema.ts index 590f19f7..34d1071c 100644 --- a/lib/jsonSchema.ts +++ b/lib/jsonSchema.ts @@ -93,7 +93,10 @@ const fromJsonSchemaArray = (fieldValue: SupportedJSONSchema4, optionalFieldList fieldValue.items.properties.unit && fieldValue.items.properties.isAdjustedToUTC ) { - const unit = fieldValue.items.properties.unit.default?.toString() || 'MILLIS'; + if (!fieldValue.items.properties.unit.enum) { + throw new UnsupportedJsonSchemaError('Unit enum is not defined'); + } + const unit = fieldValue.items.properties.unit.enum[0]; const isAdjustedToUTC = !!fieldValue.items.properties.isAdjustedToUTC.default; let timeUnit: TimeUnit; @@ -152,10 +155,12 @@ const fromJsonSchemaField = case 'object': if (fieldValue.properties && fieldValue.properties.unit && fieldValue.properties.isAdjustedToUTC) { - const unit = fieldValue.properties.unit.default?.toString() || 'MILLIS'; + if (!fieldValue.properties.unit.enum) { + throw new UnsupportedJsonSchemaError('Unit enum is not defined'); + } + const unit = fieldValue.properties.unit.enum[0]; const isAdjustedToUTC = !!fieldValue.properties.isAdjustedToUTC.default; let timeUnit: TimeUnit; - switch (unit) { case 'MICROS': timeUnit = new TimeUnit({ MICROS: true }); diff --git a/test/jsonSchema.test.ts b/test/jsonSchema.test.ts index 93447cb0..e7ef4251 100644 --- a/test/jsonSchema.test.ts +++ b/test/jsonSchema.test.ts @@ -7,7 +7,9 @@ import arraySchema from './test-files/array.schema.json'; import objectSchema from './test-files/object.schema.json'; import objectNestedSchema from './test-files/object-nested.schema.json'; import timeSchema from './test-files/time.schema.json'; - +import timeSchemaMillis from './test-files/time.schema_millis.json'; +import timeSchemaMicros from './test-files/time.schema_micros.json'; +import timeSchemaNanos from './test-files/time.schema_nanos.json'; import { ParquetSchema, ParquetWriter, ParquetReader } from '../parquet'; const update = false; @@ -54,11 +56,29 @@ describe('Json Schema Conversion', function () { checkSnapshot(ps, './test-files/object-nested.schema.result.json', update); }); - it('Time Schema', function () { + it('Time Schema Generic', function () { const js = timeSchema as JSONSchema4; const ps = ParquetSchema.fromJsonSchema(js); checkSnapshot(ps, './test-files/time.schema.result.json', update); }); + + it('Time Schema MILLIS', function () { + const js = timeSchemaMillis as JSONSchema4; + const ps = ParquetSchema.fromJsonSchema(js); + checkSnapshot(ps, './test-files/time.schema_millis.result.json', update); + }); + + it('Time Schema MICROS', function () { + const js = timeSchemaMicros as JSONSchema4; + const ps = ParquetSchema.fromJsonSchema(js); + checkSnapshot(ps, './test-files/time.schema_micros.result.json', update); + }); + + it('Time Schema NANOS', function () { + const js = timeSchemaNanos as JSONSchema4; + const ps = ParquetSchema.fromJsonSchema(js); + checkSnapshot(ps, './test-files/time.schema_nanos.result.json', update); + }); }); const parquetSchema = ParquetSchema.fromJsonSchema({ @@ -128,6 +148,7 @@ const parquetSchema = ParquetSchema.fromJsonSchema({ }, unit: { type: 'string', + enum: ['MILLIS', 'MICROS', 'NANOS'], // Define enum for time units }, isAdjustedToUTC: { type: 'boolean', diff --git a/test/test-files/time.schema_micros.json b/test/test-files/time.schema_micros.json new file mode 100644 index 00000000..37283f73 --- /dev/null +++ b/test/test-files/time.schema_micros.json @@ -0,0 +1,29 @@ +{ + "$id": "https://example.com/time-micros.schema.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "A schema to test the TIME logical type in Parquet with MICROS unit", + "type": "object", + "properties": { + "time_field": { + "type": "object", + "properties": { + "value": { + "type": "number", + "description": "Time value in MICROS" + }, + "unit": { + "type": "string", + "enum": ["MICROS"], + "description": "The unit is fixed to MICROS" + }, + "isAdjustedToUTC": { + "type": "boolean", + "description": "Whether the time is adjusted to UTC" + } + }, + "required": ["value", "isAdjustedToUTC"], + "additionalProperties": false + } + }, + "required": ["time_field"] +} diff --git a/test/test-files/time.schema_micros.result.json b/test/test-files/time.schema_micros.result.json new file mode 100644 index 00000000..889c98ef --- /dev/null +++ b/test/test-files/time.schema_micros.result.json @@ -0,0 +1,102 @@ +{ + "schema": { + "time_field": { + "optional": false, + "type": "INT64", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": null, + "MICROS": {}, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" + } + }, + "fields": { + "time_field": { + "name": "time_field", + "primitiveType": "INT64", + "originalType": "TIME_MICROS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": null, + "MICROS": {}, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + }, + "fieldList": [ + { + "name": "time_field", + "primitiveType": "INT64", + "originalType": "TIME_MICROS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": null, + "MICROS": {}, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + ] +} diff --git a/test/test-files/time.schema_millis.json b/test/test-files/time.schema_millis.json new file mode 100644 index 00000000..2a856d0c --- /dev/null +++ b/test/test-files/time.schema_millis.json @@ -0,0 +1,29 @@ +{ + "$id": "https://example.com/time-millis.schema.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "A schema to test the TIME logical type in Parquet with MILLIS unit", + "type": "object", + "properties": { + "time_field": { + "type": "object", + "properties": { + "value": { + "type": "number", + "description": "Time value in MILLIS" + }, + "unit": { + "type": "string", + "enum": ["MILLIS"], + "description": "The unit is fixed to MILLIS" + }, + "isAdjustedToUTC": { + "type": "boolean", + "description": "Whether the time is adjusted to UTC" + } + }, + "required": ["value", "isAdjustedToUTC"], + "additionalProperties": false + } + }, + "required": ["time_field"] +} diff --git a/test/test-files/time.schema_millis.result.json b/test/test-files/time.schema_millis.result.json new file mode 100644 index 00000000..ddb9f433 --- /dev/null +++ b/test/test-files/time.schema_millis.result.json @@ -0,0 +1,102 @@ +{ + "schema": { + "time_field": { + "optional": false, + "type": "INT32", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" + } + }, + "fields": { + "time_field": { + "name": "time_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + }, + "fieldList": [ + { + "name": "time_field", + "primitiveType": "INT32", + "originalType": "TIME_MILLIS", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": {}, + "MICROS": null, + "NANOS": null + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + ] +} diff --git a/test/test-files/time.schema_nanos.json b/test/test-files/time.schema_nanos.json new file mode 100644 index 00000000..f7855ad2 --- /dev/null +++ b/test/test-files/time.schema_nanos.json @@ -0,0 +1,29 @@ +{ + "$id": "https://example.com/time-nanos.schema.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "A schema to test the TIME logical type in Parquet with NANOS unit", + "type": "object", + "properties": { + "time_field": { + "type": "object", + "properties": { + "value": { + "type": "number", + "description": "Time value in NANOS" + }, + "unit": { + "type": "string", + "enum": ["NANOS"], + "description": "The unit is fixed to NANOS" + }, + "isAdjustedToUTC": { + "type": "boolean", + "description": "Whether the time is adjusted to UTC" + } + }, + "required": ["value", "isAdjustedToUTC"], + "additionalProperties": false + } + }, + "required": ["time_field"] +} diff --git a/test/test-files/time.schema_nanos.result.json b/test/test-files/time.schema_nanos.result.json new file mode 100644 index 00000000..44e653bb --- /dev/null +++ b/test/test-files/time.schema_nanos.result.json @@ -0,0 +1,100 @@ +{ + "schema": { + "time_field": { + "optional": false, + "type": "INT64", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": null, + "MICROS": null, + "NANOS": {} + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" + } + }, + "fields": { + "time_field": { + "name": "time_field", + "primitiveType": "INT64", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": null, + "MICROS": null, + "NANOS": {} + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + }, + "fieldList": [ + { + "name": "time_field", + "primitiveType": "INT64", + "logicalType": { + "STRING": null, + "MAP": null, + "LIST": null, + "ENUM": null, + "DECIMAL": null, + "DATE": null, + "TIME": { + "isAdjustedToUTC": false, + "unit": { + "MILLIS": null, + "MICROS": null, + "NANOS": {} + } + }, + "TIMESTAMP": null, + "INTEGER": null, + "UNKNOWN": null, + "JSON": null, + "BSON": null, + "UUID": null + }, + "path": ["time_field"], + "repetitionType": "REQUIRED", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 0, + "dLevelMax": 0 + } + ] +} From 119bc703ff8743deba49bfd636348cacf7cb3c9d Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Thu, 12 Sep 2024 10:08:39 -0500 Subject: [PATCH 24/29] update existing test to include logicalType --- test/decodeSchema.js | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/test/decodeSchema.js b/test/decodeSchema.js index c786c4c8..3476824f 100644 --- a/test/decodeSchema.js +++ b/test/decodeSchema.js @@ -4,7 +4,7 @@ const assert = chai.assert; const parquet = require('../parquet'); describe('ParquetSchema', function () { - it('should handle complex nesting', function () { + it('should handle complex nesting with logicalType as undefined', function () { var metadata = { version: 1, schema: [ @@ -120,6 +120,7 @@ describe('ParquetSchema', function () { dLevelMax: 0, isNested: true, fieldCount: 2, + logicalType: undefined, fields: { b: { name: 'b', @@ -130,6 +131,7 @@ describe('ParquetSchema', function () { dLevelMax: 0, isNested: true, fieldCount: 2, + logicalType: undefined, fields: { c: { name: 'c', @@ -140,6 +142,7 @@ describe('ParquetSchema', function () { dLevelMax: 0, isNested: true, fieldCount: 1, + logicalType: undefined, fields: { d: { name: 'd', @@ -150,6 +153,7 @@ describe('ParquetSchema', function () { statistics: undefined, typeLength: undefined, encoding: 'PLAIN', + logicalType: undefined, compression: 'UNCOMPRESSED', rLevelMax: 0, dLevelMax: 0, @@ -167,6 +171,7 @@ describe('ParquetSchema', function () { dLevelMax: 0, isNested: true, fieldCount: 2, + logicalType: undefined, fields: { f: { name: 'f', @@ -177,6 +182,7 @@ describe('ParquetSchema', function () { statistics: undefined, typeLength: undefined, encoding: 'PLAIN', + logicalType: undefined, compression: 'UNCOMPRESSED', rLevelMax: 0, dLevelMax: 0, @@ -192,6 +198,7 @@ describe('ParquetSchema', function () { statistics: undefined, typeLength: undefined, encoding: 'PLAIN', + logicalType: undefined, compression: 'UNCOMPRESSED', rLevelMax: 0, dLevelMax: 0, @@ -211,6 +218,7 @@ describe('ParquetSchema', function () { statistics: undefined, typeLength: undefined, encoding: 'PLAIN', + logicalType: undefined, compression: 'UNCOMPRESSED', rLevelMax: 0, dLevelMax: 0, @@ -222,21 +230,6 @@ describe('ParquetSchema', function () { }; const reader = new parquet.ParquetReader(metadata, {}); - const removeUndefinedFields = (obj) => { - Object.keys(obj).forEach((key) => { - if (obj[key] === undefined || obj[key] === null || (Array.isArray(obj[key]) && obj[key].length === 0)) { - // eslint-disable-next-line @typescript-eslint/no-dynamic-delete - delete obj[key]; - } else if (typeof obj[key] === 'object') { - removeUndefinedFields(obj[key]); - } - }); - return obj; - }; - - const sanitizedExpected = removeUndefinedFields(expected); - const sanitizedActual = removeUndefinedFields(reader.schema.fields); - - assert.deepEqual(sanitizedActual, sanitizedExpected); + assert.deepEqual(reader.schema.fields, expected); }); }); From 6958dab98de73631afeca7c4e5f55cfcc4ec5dce Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Thu, 12 Sep 2024 10:11:55 -0500 Subject: [PATCH 25/29] revert and fix decodeSchema.js test inline with recent changes --- test/decodeSchema.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/decodeSchema.js b/test/decodeSchema.js index 3476824f..4eb0b823 100644 --- a/test/decodeSchema.js +++ b/test/decodeSchema.js @@ -4,7 +4,7 @@ const assert = chai.assert; const parquet = require('../parquet'); describe('ParquetSchema', function () { - it('should handle complex nesting with logicalType as undefined', function () { + it('should handle complex nesting', function () { var metadata = { version: 1, schema: [ From 6ec10b887e51dd243358ce21c27fbd262345271a Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Thu, 12 Sep 2024 10:14:51 -0500 Subject: [PATCH 26/29] pick default or 0 whichever --- lib/jsonSchema.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/jsonSchema.ts b/lib/jsonSchema.ts index 34d1071c..cdc72559 100644 --- a/lib/jsonSchema.ts +++ b/lib/jsonSchema.ts @@ -96,7 +96,7 @@ const fromJsonSchemaArray = (fieldValue: SupportedJSONSchema4, optionalFieldList if (!fieldValue.items.properties.unit.enum) { throw new UnsupportedJsonSchemaError('Unit enum is not defined'); } - const unit = fieldValue.items.properties.unit.enum[0]; + const unit = fieldValue.items.properties.unit.default || fieldValue.items.properties.unit.enum[0]; const isAdjustedToUTC = !!fieldValue.items.properties.isAdjustedToUTC.default; let timeUnit: TimeUnit; From 96ec9f7c8238603b6c11e2c7a0fee04f72e6f754 Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Thu, 12 Sep 2024 10:18:01 -0500 Subject: [PATCH 27/29] pick default unit for now, --- lib/jsonSchema.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/jsonSchema.ts b/lib/jsonSchema.ts index cdc72559..f84c8098 100644 --- a/lib/jsonSchema.ts +++ b/lib/jsonSchema.ts @@ -158,7 +158,7 @@ const fromJsonSchemaField = if (!fieldValue.properties.unit.enum) { throw new UnsupportedJsonSchemaError('Unit enum is not defined'); } - const unit = fieldValue.properties.unit.enum[0]; + const unit = fieldValue.properties.unit.default || fieldValue.properties.unit.enum[0]; const isAdjustedToUTC = !!fieldValue.properties.isAdjustedToUTC.default; let timeUnit: TimeUnit; switch (unit) { From 06b7441c6e23d9d2f2b00972c52fddb73aac21aa Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Thu, 12 Sep 2024 11:41:57 -0500 Subject: [PATCH 28/29] fix typo --- test/test-files/time.schema.json | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/test-files/time.schema.json b/test/test-files/time.schema.json index f8c0c82f..eec63301 100644 --- a/test/test-files/time.schema.json +++ b/test/test-files/time.schema.json @@ -8,9 +8,7 @@ "type": "object", "properties": { "value": { - "type": "string", - "format": "date-time", - "description": "Time field represented as a string with date-time format" + "type": "number | string" }, "unit": { "type": "string", From 8ca323aafddf7dec5d8b9b407c759367a61798ba Mon Sep 17 00:00:00 2001 From: Puneet Saraswat Date: Thu, 12 Sep 2024 11:53:01 -0500 Subject: [PATCH 29/29] better schema --- test/test-files/time.schema_micros.json | 3 ++- test/test-files/time.schema_millis.json | 3 ++- test/test-files/time.schema_nanos.json | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/test/test-files/time.schema_micros.json b/test/test-files/time.schema_micros.json index 37283f73..652417df 100644 --- a/test/test-files/time.schema_micros.json +++ b/test/test-files/time.schema_micros.json @@ -13,7 +13,8 @@ }, "unit": { "type": "string", - "enum": ["MICROS"], + "enum": ["MILLIS", "MICROS", "NANOS"], + "default": "MICROS", "description": "The unit is fixed to MICROS" }, "isAdjustedToUTC": { diff --git a/test/test-files/time.schema_millis.json b/test/test-files/time.schema_millis.json index 2a856d0c..5cb0c4a8 100644 --- a/test/test-files/time.schema_millis.json +++ b/test/test-files/time.schema_millis.json @@ -13,7 +13,8 @@ }, "unit": { "type": "string", - "enum": ["MILLIS"], + "enum": ["MILLIS", "MICROS", "NANOS"], + "default": "MILLIS", "description": "The unit is fixed to MILLIS" }, "isAdjustedToUTC": { diff --git a/test/test-files/time.schema_nanos.json b/test/test-files/time.schema_nanos.json index f7855ad2..ceb12c8c 100644 --- a/test/test-files/time.schema_nanos.json +++ b/test/test-files/time.schema_nanos.json @@ -13,7 +13,8 @@ }, "unit": { "type": "string", - "enum": ["NANOS"], + "enum": ["MILLIS", "MICROS", "NANOS"], + "default": "NANOS", "description": "The unit is fixed to NANOS" }, "isAdjustedToUTC": {