Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Logical type : TIME #143

Merged
merged 30 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
fb4b59a
minimal changes, passing test
saraswatpuneet Sep 5, 2024
1ca98b8
handle possible exceptions on conversion
saraswatpuneet Sep 5, 2024
8fcafe2
set the primitive type for TIME as INT64
saraswatpuneet Sep 5, 2024
9949fd4
define field definition for time support and helper function
saraswatpuneet Sep 6, 2024
679c96c
declare additional fields for TIME
saraswatpuneet Sep 6, 2024
d3a1382
update jsonSchema to support TIME
saraswatpuneet Sep 6, 2024
e22fbe4
add field tests for time
saraswatpuneet Sep 6, 2024
ddfcc0e
annotate as int32 for MILLIS and int64 for rest
saraswatpuneet Sep 6, 2024
05e2414
spec says int32 for milli, bigint for micro and nano
saraswatpuneet Sep 6, 2024
38a1d57
redo based on generated types
saraswatpuneet Sep 9, 2024
dddb09a
add TIME related primitive converters
saraswatpuneet Sep 9, 2024
9401f67
support time field for json schema
saraswatpuneet Sep 9, 2024
13007a5
finalize implementation side of TIME logical type
saraswatpuneet Sep 9, 2024
ae58c96
set converted types for backward compatibility
saraswatpuneet Sep 9, 2024
42d72ad
set example field tests
saraswatpuneet Sep 9, 2024
510f402
set an example time schema
saraswatpuneet Sep 10, 2024
bc9e11c
set schema test
saraswatpuneet Sep 10, 2024
44f64aa
update schema files for testing
saraswatpuneet Sep 10, 2024
97b1475
schema test result file
saraswatpuneet Sep 10, 2024
6485753
adjust types based on ongoing file tests
saraswatpuneet Sep 11, 2024
6927f31
logical types from parquet file cannot be reconstructed back without …
saraswatpuneet Sep 11, 2024
bd1cb5d
Merge branch 'main' into logical_type_time
saraswatpuneet Sep 11, 2024
abf7f0b
remove debug logs
saraswatpuneet Sep 11, 2024
0a5040f
add schema test for millis, micros and nanos
saraswatpuneet Sep 12, 2024
119bc70
update existing test to include logicalType
saraswatpuneet Sep 12, 2024
6958dab
revert and fix decodeSchema.js test in line with recent changes
saraswatpuneet Sep 12, 2024
6ec10b8
pick default or 0 whichever
saraswatpuneet Sep 12, 2024
96ec9f7
pick default unit for now
saraswatpuneet Sep 12, 2024
06b7441
fix typo
saraswatpuneet Sep 12, 2024
8ca323a
better schema
saraswatpuneet Sep 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions lib/jsonSchema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,10 @@ const fromJsonSchemaArray = (fieldValue: SupportedJSONSchema4, optionalFieldList
fieldValue.items.properties.unit &&
fieldValue.items.properties.isAdjustedToUTC
) {
const unit = fieldValue.items.properties.unit.default?.toString() || 'MILLIS';
if (!fieldValue.items.properties.unit.enum) {
throw new UnsupportedJsonSchemaError('Unit enum is not defined');
}
const unit = fieldValue.items.properties.unit.enum[0];
saraswatpuneet marked this conversation as resolved.
Show resolved Hide resolved
const isAdjustedToUTC = !!fieldValue.items.properties.isAdjustedToUTC.default;
let timeUnit: TimeUnit;

Expand Down Expand Up @@ -152,10 +155,12 @@ const fromJsonSchemaField =

case 'object':
if (fieldValue.properties && fieldValue.properties.unit && fieldValue.properties.isAdjustedToUTC) {
const unit = fieldValue.properties.unit.default?.toString() || 'MILLIS';
if (!fieldValue.properties.unit.enum) {
throw new UnsupportedJsonSchemaError('Unit enum is not defined');
}
const unit = fieldValue.properties.unit.enum[0];
const isAdjustedToUTC = !!fieldValue.properties.isAdjustedToUTC.default;
let timeUnit: TimeUnit;

switch (unit) {
case 'MICROS':
timeUnit = new TimeUnit({ MICROS: true });
Expand Down
25 changes: 23 additions & 2 deletions test/jsonSchema.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ import arraySchema from './test-files/array.schema.json';
import objectSchema from './test-files/object.schema.json';
import objectNestedSchema from './test-files/object-nested.schema.json';
import timeSchema from './test-files/time.schema.json';

import timeSchemaMillis from './test-files/time.schema_millis.json';
import timeSchemaMicros from './test-files/time.schema_micros.json';
import timeSchemaNanos from './test-files/time.schema_nanos.json';
import { ParquetSchema, ParquetWriter, ParquetReader } from '../parquet';

const update = false;
Expand Down Expand Up @@ -54,11 +56,29 @@ describe('Json Schema Conversion', function () {
checkSnapshot(ps, './test-files/object-nested.schema.result.json', update);
});

it('Time Schema', function () {
it('Time Schema Generic', function () {
const js = timeSchema as JSONSchema4;
const ps = ParquetSchema.fromJsonSchema(js);
checkSnapshot(ps, './test-files/time.schema.result.json', update);
});

it('Time Schema MILLIS', function () {
const js = timeSchemaMillis as JSONSchema4;
const ps = ParquetSchema.fromJsonSchema(js);
checkSnapshot(ps, './test-files/time.schema_millis.result.json', update);
});

it('Time Schema MICROS', function () {
const js = timeSchemaMicros as JSONSchema4;
const ps = ParquetSchema.fromJsonSchema(js);
checkSnapshot(ps, './test-files/time.schema_micros.result.json', update);
});

it('Time Schema NANOS', function () {
const js = timeSchemaNanos as JSONSchema4;
const ps = ParquetSchema.fromJsonSchema(js);
checkSnapshot(ps, './test-files/time.schema_nanos.result.json', update);
});
});

const parquetSchema = ParquetSchema.fromJsonSchema({
Expand Down Expand Up @@ -128,6 +148,7 @@ const parquetSchema = ParquetSchema.fromJsonSchema({
},
unit: {
type: 'string',
enum: ['MILLIS', 'MICROS', 'NANOS'], // Define enum for time units
},
isAdjustedToUTC: {
type: 'boolean',
Expand Down
29 changes: 29 additions & 0 deletions test/test-files/time.schema_micros.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"$id": "https://example.com/time-micros.schema.json",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"description": "A schema to test the TIME logical type in Parquet with MICROS unit",
"type": "object",
"properties": {
"time_field": {
"type": "object",
"properties": {
"value": {
"type": "number",
"description": "Time value in MICROS"
},
"unit": {
"type": "string",
"enum": ["MICROS"],
"description": "The unit is fixed to MICROS"
},
"isAdjustedToUTC": {
"type": "boolean",
"description": "Whether the time is adjusted to UTC"
}
},
"required": ["value", "isAdjustedToUTC"],
"additionalProperties": false
}
},
"required": ["time_field"]
}
102 changes: 102 additions & 0 deletions test/test-files/time.schema_micros.result.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
{
"schema": {
"time_field": {
"optional": false,
"type": "INT64",
"logicalType": {
"STRING": null,
"MAP": null,
"LIST": null,
"ENUM": null,
"DECIMAL": null,
"DATE": null,
"TIME": {
"isAdjustedToUTC": false,
"unit": {
"MILLIS": null,
"MICROS": {},
"NANOS": null
}
},
"TIMESTAMP": null,
"INTEGER": null,
"UNKNOWN": null,
"JSON": null,
"BSON": null,
"UUID": null
},
"encoding": "PLAIN",
"compression": "UNCOMPRESSED"
}
},
"fields": {
"time_field": {
"name": "time_field",
"primitiveType": "INT64",
"originalType": "TIME_MICROS",
"logicalType": {
"STRING": null,
"MAP": null,
"LIST": null,
"ENUM": null,
"DECIMAL": null,
"DATE": null,
"TIME": {
"isAdjustedToUTC": false,
"unit": {
"MILLIS": null,
"MICROS": {},
"NANOS": null
}
},
"TIMESTAMP": null,
"INTEGER": null,
"UNKNOWN": null,
"JSON": null,
"BSON": null,
"UUID": null
},
"path": ["time_field"],
"repetitionType": "REQUIRED",
"encoding": "PLAIN",
"compression": "UNCOMPRESSED",
"rLevelMax": 0,
"dLevelMax": 0
}
},
"fieldList": [
{
"name": "time_field",
"primitiveType": "INT64",
"originalType": "TIME_MICROS",
"logicalType": {
"STRING": null,
"MAP": null,
"LIST": null,
"ENUM": null,
"DECIMAL": null,
"DATE": null,
"TIME": {
"isAdjustedToUTC": false,
"unit": {
"MILLIS": null,
"MICROS": {},
"NANOS": null
}
},
"TIMESTAMP": null,
"INTEGER": null,
"UNKNOWN": null,
"JSON": null,
"BSON": null,
"UUID": null
},
"path": ["time_field"],
"repetitionType": "REQUIRED",
"encoding": "PLAIN",
"compression": "UNCOMPRESSED",
"rLevelMax": 0,
"dLevelMax": 0
}
]
}
29 changes: 29 additions & 0 deletions test/test-files/time.schema_millis.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"$id": "https://example.com/time-millis.schema.json",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"description": "A schema to test the TIME logical type in Parquet with MILLIS unit",
"type": "object",
"properties": {
"time_field": {
"type": "object",
"properties": {
"value": {
"type": "number",
"description": "Time value in MILLIS"
},
"unit": {
"type": "string",
"enum": ["MILLIS"],
"description": "The unit is fixed to MILLIS"
},
"isAdjustedToUTC": {
"type": "boolean",
"description": "Whether the time is adjusted to UTC"
}
},
"required": ["value", "isAdjustedToUTC"],
"additionalProperties": false
}
},
"required": ["time_field"]
}
102 changes: 102 additions & 0 deletions test/test-files/time.schema_millis.result.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
{
"schema": {
"time_field": {
"optional": false,
"type": "INT32",
"logicalType": {
"STRING": null,
"MAP": null,
"LIST": null,
"ENUM": null,
"DECIMAL": null,
"DATE": null,
"TIME": {
"isAdjustedToUTC": false,
"unit": {
"MILLIS": {},
"MICROS": null,
"NANOS": null
}
},
"TIMESTAMP": null,
"INTEGER": null,
"UNKNOWN": null,
"JSON": null,
"BSON": null,
"UUID": null
},
"encoding": "PLAIN",
"compression": "UNCOMPRESSED"
}
},
"fields": {
"time_field": {
"name": "time_field",
"primitiveType": "INT32",
"originalType": "TIME_MILLIS",
"logicalType": {
"STRING": null,
"MAP": null,
"LIST": null,
"ENUM": null,
"DECIMAL": null,
"DATE": null,
"TIME": {
"isAdjustedToUTC": false,
"unit": {
"MILLIS": {},
"MICROS": null,
"NANOS": null
}
},
"TIMESTAMP": null,
"INTEGER": null,
"UNKNOWN": null,
"JSON": null,
"BSON": null,
"UUID": null
},
"path": ["time_field"],
"repetitionType": "REQUIRED",
"encoding": "PLAIN",
"compression": "UNCOMPRESSED",
"rLevelMax": 0,
"dLevelMax": 0
}
},
"fieldList": [
{
"name": "time_field",
"primitiveType": "INT32",
"originalType": "TIME_MILLIS",
"logicalType": {
"STRING": null,
"MAP": null,
"LIST": null,
"ENUM": null,
"DECIMAL": null,
"DATE": null,
"TIME": {
"isAdjustedToUTC": false,
"unit": {
"MILLIS": {},
"MICROS": null,
"NANOS": null
}
},
"TIMESTAMP": null,
"INTEGER": null,
"UNKNOWN": null,
"JSON": null,
"BSON": null,
"UUID": null
},
"path": ["time_field"],
"repetitionType": "REQUIRED",
"encoding": "PLAIN",
"compression": "UNCOMPRESSED",
"rLevelMax": 0,
"dLevelMax": 0
}
]
}
29 changes: 29 additions & 0 deletions test/test-files/time.schema_nanos.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"$id": "https://example.com/time-nanos.schema.json",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"description": "A schema to test the TIME logical type in Parquet with NANOS unit",
"type": "object",
"properties": {
"time_field": {
"type": "object",
"properties": {
"value": {
"type": "number",
"description": "Time value in NANOS"
},
"unit": {
"type": "string",
"enum": ["NANOS"],
"description": "The unit is fixed to NANOS"
},
"isAdjustedToUTC": {
"type": "boolean",
"description": "Whether the time is adjusted to UTC"
}
},
"required": ["value", "isAdjustedToUTC"],
"additionalProperties": false
}
},
"required": ["time_field"]
}
Loading
Loading