From ac5257d283d23547b32338fb72eb3119fb60e7c7 Mon Sep 17 00:00:00 2001 From: Marcus Reinhardt Date: Tue, 25 Jul 2023 19:10:20 +0200 Subject: [PATCH] Feature: Timestamp support for JSONSchema generated schemas (#95) Problem ======= Closes: #93 Solution ======== Add checks for `format == 'date-time'` inside the string type check for `string` and `array of string` Change summary: --------------- * Added format check for string fields (and string arrays) * Check for JSONSchema property `format` with value `date-time` * Updated jsonschema tests and updated the snapshots Steps to Verify: ---------------- 1. Generate a JSON Schema or use an existing one 2. Add `"format":"date-time"` to the field which should have a Date/Time value 3. Ensure that the value is a valid `Date` object 4. Enjoy --- README.md | 3 +- lib/jsonSchema.ts | 6 + test/jsonSchema.test.ts | 18 +- .../json-schema-test-file.result.json | 42 ++++- .../json-schema-test-file.schema.result.json | 157 +++++++++++++++++- 5 files changed, 214 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 5f8b6bb6..00b94d35 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,8 @@ var schema = new parquet.ParquetSchema.fromJsonSchema({ "type": "number" }, "date": { - "type": "string" + "type": "string", + "format": "date-time" }, "in_stock": { "type": "boolean" diff --git a/lib/jsonSchema.ts b/lib/jsonSchema.ts index 21a34fa9..da2c3608 100644 --- a/lib/jsonSchema.ts +++ b/lib/jsonSchema.ts @@ -64,6 +64,9 @@ const fromJsonSchemaArray = (fieldValue: SupportedJSONSchema4, optionalFieldList switch (fieldValue.items.type) { case 'string': + if (fieldValue.items.format && fieldValue.items.format == 'date-time') { + return fields.createListField('TIMESTAMP_MILLIS', optionalFieldList); + } return fields.createListField('UTF8', optionalFieldList); case 'integer': case 'number': @@ -88,6 +91,9 @@ const fromJsonSchemaField = (jsonSchema: JSONSchema4) => (fieldName: string, fie switch (fieldValue.type) { case 
'string': + if (fieldValue.format && fieldValue.format == 'date-time') { + return fields.createTimestampField(optional); + } return fields.createStringField(optional); case 'integer': case 'number': diff --git a/test/jsonSchema.test.ts b/test/jsonSchema.test.ts index dd3c8ef3..c85b71d6 100644 --- a/test/jsonSchema.test.ts +++ b/test/jsonSchema.test.ts @@ -63,7 +63,18 @@ describe("Json Schema Conversion Test File", async function () { "items": { "type": "string" }, "additionalItems": false }, - "timestamp_field": { "type": "string" }, + "timestamp_array_field": { + "type": "array", + "items": { + "type": "string", + "format": "date-time" + }, + "additionalItems": false, + }, + "timestamp_field": { + "type": "string", + "format": "date-time" + }, "obj_field": { "type": "object", "properties": { @@ -107,7 +118,9 @@ describe("Json Schema Conversion Test File", async function () { const row1 = { string_field: 'string value', int_field: 10n, - timestamp_field: new Date("2023-01-01 GMT").toUTCString(), + timestamp_array_field: { list: [{ element: new Date("2023-01-01 GMT") }] }, + + timestamp_field: new Date("2023-01-01 GMT"), array_field: { list: [{ element: 'array_field val1' }, { element: 'array_field val2' }], @@ -162,7 +175,6 @@ describe("Json Schema Conversion Test File", async function () { const row = await cursor.next(); const rowData = { ...row1, - timestamp_field: "Sun, 01 Jan 2023 00:00:00 GMT", }; assert.deepEqual(row, rowData); }); diff --git a/test/test-files/json-schema-test-file.result.json b/test/test-files/json-schema-test-file.result.json index a409b33d..26cf2a01 100644 --- a/test/test-files/json-schema-test-file.result.json +++ b/test/test-files/json-schema-test-file.result.json @@ -4,7 +4,7 @@ "type_length": null, "repetition_type": null, "name": "root", - "num_children": 6, + "num_children": 7, "converted_type": null, "scale": null, "precision": null, @@ -72,12 +72,48 @@ "logicalType": null }, { - "type": 6, + "type": null, + "type_length": null, + 
"repetition_type": 1, + "name": "timestamp_array_field", + "num_children": 1, + "converted_type": 3, + "scale": null, + "precision": null, + "field_id": null, + "logicalType": null + }, + { + "type": null, + "type_length": null, + "repetition_type": 2, + "name": "list", + "num_children": 1, + "converted_type": null, + "scale": null, + "precision": null, + "field_id": null, + "logicalType": null + }, + { + "type": 2, + "type_length": null, + "repetition_type": 1, + "name": "element", + "num_children": null, + "converted_type": 9, + "scale": null, + "precision": null, + "field_id": null, + "logicalType": null + }, + { + "type": 2, "type_length": null, "repetition_type": 1, "name": "timestamp_field", "num_children": null, - "converted_type": 0, + "converted_type": 9, "scale": null, "precision": null, "field_id": null, diff --git a/test/test-files/json-schema-test-file.schema.result.json b/test/test-files/json-schema-test-file.schema.result.json index e7289cfe..9c0cf31b 100644 --- a/test/test-files/json-schema-test-file.schema.result.json +++ b/test/test-files/json-schema-test-file.schema.result.json @@ -29,9 +29,26 @@ } } }, + "timestamp_array_field": { + "type": "LIST", + "optional": true, + "fields": { + "list": { + "repeated": true, + "fields": { + "element": { + "optional": true, + "type": "TIMESTAMP_MILLIS", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED" + } + } + } + } + }, "timestamp_field": { "optional": true, - "type": "UTF8", + "type": "TIMESTAMP_MILLIS", "encoding": "PLAIN", "compression": "UNCOMPRESSED" }, @@ -183,10 +200,53 @@ }, "originalType": "LIST" }, + "timestamp_array_field": { + "name": "timestamp_array_field", + "path": [ + "timestamp_array_field" + ], + "repetitionType": "OPTIONAL", + "rLevelMax": 0, + "dLevelMax": 1, + "isNested": true, + "fieldCount": 1, + "fields": { + "list": { + "name": "list", + "path": [ + "timestamp_array_field", + "list" + ], + "repetitionType": "REPEATED", + "rLevelMax": 1, + "dLevelMax": 2, + "isNested": true, 
+ "fieldCount": 1, + "fields": { + "element": { + "name": "element", + "primitiveType": "INT64", + "originalType": "TIMESTAMP_MILLIS", + "path": [ + "timestamp_array_field", + "list", + "element" + ], + "repetitionType": "OPTIONAL", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 1, + "dLevelMax": 3 + } + } + } + }, + "originalType": "LIST" + }, "timestamp_field": { "name": "timestamp_field", - "primitiveType": "BYTE_ARRAY", - "originalType": "UTF8", + "primitiveType": "INT64", + "originalType": "TIMESTAMP_MILLIS", "path": [ "timestamp_field" ], @@ -528,10 +588,97 @@ "rLevelMax": 1, "dLevelMax": 3 }, + { + "name": "timestamp_array_field", + "path": [ + "timestamp_array_field" + ], + "repetitionType": "OPTIONAL", + "rLevelMax": 0, + "dLevelMax": 1, + "isNested": true, + "fieldCount": 1, + "fields": { + "list": { + "name": "list", + "path": [ + "timestamp_array_field", + "list" + ], + "repetitionType": "REPEATED", + "rLevelMax": 1, + "dLevelMax": 2, + "isNested": true, + "fieldCount": 1, + "fields": { + "element": { + "name": "element", + "primitiveType": "INT64", + "originalType": "TIMESTAMP_MILLIS", + "path": [ + "timestamp_array_field", + "list", + "element" + ], + "repetitionType": "OPTIONAL", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 1, + "dLevelMax": 3 + } + } + } + }, + "originalType": "LIST" + }, + { + "name": "list", + "path": [ + "timestamp_array_field", + "list" + ], + "repetitionType": "REPEATED", + "rLevelMax": 1, + "dLevelMax": 2, + "isNested": true, + "fieldCount": 1, + "fields": { + "element": { + "name": "element", + "primitiveType": "INT64", + "originalType": "TIMESTAMP_MILLIS", + "path": [ + "timestamp_array_field", + "list", + "element" + ], + "repetitionType": "OPTIONAL", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 1, + "dLevelMax": 3 + } + } + }, + { + "name": "element", + "primitiveType": "INT64", + "originalType": "TIMESTAMP_MILLIS", + "path": [ + 
"timestamp_array_field", + "list", + "element" + ], + "repetitionType": "OPTIONAL", + "encoding": "PLAIN", + "compression": "UNCOMPRESSED", + "rLevelMax": 1, + "dLevelMax": 3 + }, { "name": "timestamp_field", - "primitiveType": "BYTE_ARRAY", - "originalType": "UTF8", + "primitiveType": "INT64", + "originalType": "TIMESTAMP_MILLIS", "path": [ "timestamp_field" ],