From f4d24d552fd2009e63cd71607f6edf2cb98d82ac Mon Sep 17 00:00:00 2001
From: "Costa, Carlos" <->
Date: Tue, 25 Aug 2020 13:55:01 +0100
Subject: [PATCH] introduction of new algorithms (including DeltaLakeLoad using Delta Lake IO); refactored project structure; full load performance improvements; bug fixes
---
 .gitignore | 23 +
 .scalafmt.conf | 189 ++++
 Dockerfile | 13 +-
 Jenkinsfile | 7 +
 README.md | 6 +-
 build.sbt | 53 +-
 common.sh | 4 +-
 dev-env.sh | 118 ++--
 project/assembly.sbt | 2 +-
 project/build.properties | 2 +-
 project/plugins.sbt | 4 +-
 .../adidas/analytics/AlgorithmFactory.scala | 55 +-
 .../analytics/algo/AlgorithmTemplate.scala | 39 --
 .../adidas/analytics/algo/AppendLoad.scala | 234 -------
 .../algo/FixedSizeStringExtractor.scala | 41 +-
 .../com/adidas/analytics/algo/FullLoad.scala | 210 ------
 .../analytics/algo/GzipDecompressor.scala | 117 ++--
 .../analytics/algo/Materialization.scala | 156 +++++
 .../analytics/algo/NestedFlattener.scala | 226 ++++---
 .../algo/PartitionMaterialization.scala | 52 --
 .../com/adidas/analytics/algo/SQLRunner.scala | 26 +
 .../com/adidas/analytics/algo/Transpose.scala | 82 +++
 .../analytics/algo/core/Algorithm.scala | 134 ++--
 .../analytics/algo/core/JobRunner.scala | 8 +-
 .../adidas/analytics/algo/core/Metadata.scala | 5 +-
 .../algo/core/PartitionHelpers.scala | 20 +-
 .../analytics/algo/core/TableStatistics.scala | 126 +++-
 .../analytics/algo/loads/AppendLoad.scala | 301 +++++++++
 .../analytics/algo/loads/DeltaLakeLoad.scala | 360 +++++++++++
 .../algo/{ => loads}/DeltaLoad.scala | 74 ++-
 .../analytics/algo/loads/FullLoad.scala | 98 +++
 .../algo/shared/CustomDateFormatters.scala | 29 +-
 .../algo/shared/DataReshapingTask.scala | 104 +++
 .../algo/shared/DateComponentDerivation.scala | 211 +++---
 .../algo/templates/AlgorithmTemplate.scala | 43 ++
 .../AlgorithmTemplateConfiguration.scala | 98 ---
 .../config/AppendLoadConfiguration.scala | 78 ---
 .../config/DeltaLoadConfiguration.scala | 75 ---
 ...ixedSizeStringExtractorConfiguration.scala | 115 ++--
 .../config/FullLoadConfiguration.scala | 42 --
 .../GzipDecompressorConfiguration.scala | 5 +-
 .../config/MaterializationConfiguration.scala | 171 +++++
 .../config/NestedFlattenerConfiguration.scala | 64 +-
 ...artitionMaterializationConfiguration.scala | 121 ----
 .../config/TransposeConfiguration.scala | 55 ++
 .../loads/AppendLoadConfiguration.scala | 99 +++
 .../loads/DeltaLakeLoadConfiguration.scala | 162 +++++
 .../config/loads/DeltaLoadConfiguration.scala | 97 +++
 .../config/loads/FullLoadConfiguration.scala | 90 +++
 .../{shared => loads}/LoadConfiguration.scala | 42 +-
 .../config/shared/ConfigurationContext.scala | 1 -
 .../shared/DataReshapingTaskConfig.scala | 45 ++
 ...DateComponentDerivationConfiguration.scala | 2 -
 .../shared/MetadataUpdateStrategy.scala | 15 +-
 .../AlgorithmTemplateConfiguration.scala | 98 +++
 .../analytics/util/CatalogTableManager.scala | 183 ++++++
 .../adidas/analytics/util/ConfigReader.scala | 46 +-
 .../adidas/analytics/util/DFSWrapper.scala | 60 +-
 .../adidas/analytics/util/DataFormat.scala | 16 +-
 .../analytics/util/DataFrameUtils.scala | 79 ++-
 .../analytics/util/DistCpLoadHelper.scala | 144 -----
 .../adidas/analytics/util/DistCpWrapper.scala | 11 +-
 .../analytics/util/HadoopLoadHelper.scala | 217 +++--
 .../util/HiveTableAttributeReader.scala | 39 --
 .../adidas/analytics/util/InputReader.scala | 76 ++-
 .../analytics/util/JavaConsumable.scala | 8 +-
 .../com/adidas/analytics/util/LoadMode.scala | 3 -
 .../adidas/analytics/util/OutputWriter.scala | 329 ++++++----
.../util/RecoverPartitionsCustom.scala | 28 +- .../util/RecoverPartitionsNative.scala | 6 +- .../util/RemoteIteratorWrapper.scala | 12 +- .../util/SparkRecoverPartitionsCustom.scala | 49 -- .../util/SparkRecoverPartitionsNative.scala | 11 - .../params.json | 9 - .../data_20180719111849_data_3-3.gz | Bin 182 -> 0 bytes .../data_20180101-part-00000.psv | 4 - .../data_20180101-part-00001.psv | 3 - .../expected_partitions.txt | 6 - .../expected_partitions_schema.json | 10 - .../multiple_source_files/params.json | 19 - .../data_20180101-part-00000.psv | 4 - .../data_20180101-part-00001.psv | 3 - .../expected_partitions.txt | 6 - .../expected_partitions_schema.json | 10 - .../multiple_source_files/lake_data_post.psv | 25 - .../multiple_source_files/params.json | 19 - .../matched_schema/lake_data_post.psv | 0 .../matched_schema/params.json | 0 .../matched_schema/source_data.psv | 0 .../matched_schema/source_schema.json | 0 .../matched_schema/target_schema.json | 0 .../lake_data_post.psv | 0 .../matched_schema_partitioned/params.json | 0 .../source_data.psv | 0 .../source_schema.json | 0 .../target_schema.json | 0 .../non_matched_schema1/params.json | 0 .../non_matched_schema1/source_schema.json | 0 .../non_matched_schema1/target_schema.json | 0 .../non_matched_schema2/params.json | 0 .../non_matched_schema2/source_schema.json | 0 .../non_matched_schema2/target_schema.json | 0 .../data_20180719111849_data_1-3 | 0 .../data_20180719111849_data_1-3.gz | Bin .../data_20180719111849_data_2-3 | 0 .../data_20180719111849_data_2-3.gz | Bin .../data_20180719111849_data_3-3 | 0 .../data_20180719111849_data_3-3.zip | Bin 0 -> 341 bytes .../GzipDecompressorTestRes}/params.json | 0 .../full_materialization/expected_data.psv} | 6 +- .../no_partitions/params.json | 7 + .../full_materialization}/params.json | 4 +- .../full_materialization}/schema.json | 0 .../full_materialization}/source_data.psv | 0 .../query_materialization}/expected_data.psv | 0 .../query_materialization}/initial_data.psv | 0 .../output_files_5/params.json | 0 .../query_materialization}/params.json | 0 .../query_materialization}/schema.json | 0 .../query_materialization/source_data.psv} | 6 +- .../year_month/expected_data.psv | 0 .../year_month/params.json | 0 .../year_month/schema.json | 0 .../year_month/source_data.psv | 0 .../expected_data.psv | 0 .../year_month_day_multiple_days/params.json | 0 .../year_month_day_multiple_days}/schema.json | 0 .../source_data.psv | 0 .../expected_data.psv | 0 .../initial_data.psv | 0 .../year_month_day_single_day/params.json | 0 .../year_month_day_single_day/schema.json} | 6 +- .../year_month_day_single_day/source_data.psv | 0 .../year_week/expected_data.psv | 0 .../year_week/initial_data.psv | 0 .../year_week/params.json | 0 .../year_week/schema.json | 0 .../year_week/source_data.psv | 0 .../year_week_day/params.json | 0 .../year_week_day/schema.json | 0 .../year_week_day/source_data.psv | 0 .../expected_target_data.psv | 0 ...47e0-8460-578de41057cc-c000.snappy.parquet | Bin .../scenario1/params.json | 0 .../scenario2/params.json | 0 .../target_schema.json | 0 .../feature/SQLRunnerTestRes/params.json | 1 + .../SQLRunnerTestRes/sql_runner_dataset.psv} | 15 +- .../TransposeTestRes/expected_target_data.psv | 1 + .../feature/TransposeTestRes/input_data.psv | 6 + .../feature/TransposeTestRes/params.json | 16 + .../TransposeTestRes/source_schema.json | 23 + .../TransposeTestRes/target_schema.json | 47 ++ .../different_schemas/20180101_schema.json | 0 .../data_20180101-part-00000.psv | 0 
.../data_20180105-part-00000.psv | 0 .../different_schemas/lake_data_post.psv | 0 .../lake_data_post_append.psv | 0 .../different_schemas/lake_data_pre.psv | 0 .../different_schemas/params.json | 0 .../different_schemas/target_schema.json | 0 .../duplicate_values/20180101_schema.json | 0 .../data_20180101-part-00000.psv | 0 .../data_20180105-part-00000.psv | 0 .../duplicate_values/lake_data_post.psv | 0 .../lake_data_post_append.psv | 0 .../duplicate_values/lake_data_pre.psv | 0 .../duplicate_values/params.json | 0 .../duplicate_values/target_schema.json | 0 .../hierarchical_load/20180101_schema.json | 0 .../hierarchical_load/lake_data_post.psv | 0 .../hierarchical_load/lake_data_pre.psv | 0 .../hierarchical_load/params.json | 0 .../hierarchical_load/target_schema.json | 0 .../day=1/data_20180101-part-00000.psv | 0 .../day=5/data_20180105-part-00000.psv | 0 .../main_test/data_20180422-00001.psv | 0 .../main_test/lake_data_post.psv | 0 .../main_test/lake_data_pre.psv | 0 .../AppendLoadTestRes}/main_test/params.json | 0 .../main_test/target_schema.json | 0 .../missing_columns/data_20180422-00001.psv | 0 .../missing_columns/lake_data_post.psv | 0 .../missing_columns/lake_data_pre.psv | 0 .../missing_columns/params.json | 0 .../missing_columns/target_schema.json | 0 .../data_20180101-part-00000.psv | 0 .../data_20180101-part-00001.psv | 0 .../multiple_source_files/lake_data_post.psv | 0 .../multiple_source_files/lake_data_pre.psv | 0 .../multiple_source_files/params.json | 0 .../multiple_source_files/target_schema.json | 0 .../parquet_test/data_20180422-00001.parquet | Bin .../parquet_test/lake_data_post.psv | 0 .../parquet_test/lake_data_pre.psv | 0 .../parquet_test/params.json | 0 .../parquet_test/target_schema.json | 0 .../data-nodate-part-00000.psv | 0 .../data-nodate-part-00001.psv | 0 .../lake_data_post.psv | 0 .../lake_data_pre.psv | 0 .../partition_from_full_path/params.json | 0 .../target_schema.json | 0 .../lake_data_post.psv | 14 + .../lake_data_pre.psv | 10 + .../partitioned_and_date_columns/new_data.psv | 4 + .../partitioned_and_date_columns/params.json | 13 + .../target_schema.json | 37 +- .../data_20180422-00001.psv | 0 .../lake_data_post.psv | 0 .../lake_data_pre.psv | 0 .../reader_mode_specification/params.json | 0 .../params_failfast_mode.json | 0 .../params_invalid_reader_mode.json | 0 .../params_no_reader_mode.json | 0 .../params_permissive_mode.json | 0 .../target_schema.json | 0 .../wrong_data_20180422-00001.psv | 0 .../similar_schemas/20180101_schema.json | 0 .../data_20180101-part-00000.psv | 0 .../data_20180105-part-00000.psv | 0 .../similar_schemas/lake_data_post.psv | 0 .../similar_schemas/lake_data_post_append.psv | 0 .../similar_schemas/lake_data_pre.psv | 0 .../similar_schemas/params.json | 0 .../similar_schemas/target_schema.json | 0 .../control_data.psv | 18 + .../init_data.psv | 22 + .../lake_schema_final.json | 65 ++ .../lake_schema_initial.json | 59 ++ .../new_data.psv | 21 + .../nonpartitioned/control_data.psv | 18 + .../nonpartitioned/init_data.psv | 20 + .../nonpartitioned/lake_schema.json | 41 ++ .../nonpartitioned/new_data.psv | 21 + .../nonpartitioned/params.json | 37 ++ .../loads/DeltaLakeLoadTestRes/params.json | 40 ++ .../removed_columns/control_data.psv | 18 + .../removed_columns/init_data.psv | 20 + .../removed_columns/lake_schema.json | 59 ++ .../removed_columns/new_data.psv | 21 + .../control_data.psv | 18 + .../init_data.psv | 20 + .../lake_schema.json | 59 ++ .../new_data.psv | 21 + .../params.json | 41 ++ .../control_data.psv | 18 + 
.../init_data.psv | 20 + .../lake_schema.json | 59 ++ .../new_data.psv | 21 + .../params.json | 41 ++ .../csv_test/active_data_post.psv | 0 .../csv_test/active_data_pre.psv | 0 .../DeltaLoadTestRes}/csv_test/delta_data.psv | 2 +- .../DeltaLoadTestRes}/csv_test/params.json | 2 +- .../csv_test/params_part.json | 2 +- .../active_data_post.psv | 0 .../active_data_schema.json | 0 .../parquet_test_delta_init/delta_data.psv | 2 +- .../delta_data_schema.json | 2 +- .../parquet_test_delta_init/params.json | 4 +- .../active_data_post.psv | 0 .../active_data_pre.psv | 0 .../active_data_schema.json | 0 .../delta_data.psv | 2 +- .../delta_data_schema.json | 2 +- .../params.json | 4 +- .../active_data_post.psv | 0 .../active_data_pre.psv | 0 .../active_data_schema.json | 0 .../delta_data.psv | 2 +- .../delta_data_schema.json | 2 +- .../params.json | 4 +- .../active_data_post.psv | 0 .../active_data_pre.psv | 0 .../active_data_schema.json | 0 .../delta_data.psv | 2 +- .../delta_data_schema.json | 2 +- .../params.json | 9 + .../active_data_post.psv | 0 .../active_data_pre.psv | 0 .../active_data_schema.json | 0 .../delta_data.psv | 2 +- .../delta_data_schema.json | 2 +- .../params.json | 4 +- .../failfast_option}/lake_data_post.psv | 0 .../failfast_option}/lake_data_pre.psv | 0 .../failfast_option/new_data_wrong.psv | 0 .../failfast_option/params.json | 1 + .../params_dropmalformed_mode.json | 1 + .../params_invalid_reader_mode.json | 1 + .../params_no_reader_mode_set.json | 1 + .../params_permissive_mode.json | 1 + .../failfast_option}/target_schema.json | 0 .../FullLoadTestRes/landing}/new_data.psv | 0 .../landing/new_data_weekly.psv | 0 .../nested_flattener/data_normal_test.json | 1 + .../nested_flattener/data_transpose_test.json | 282 +++++++++ .../expected_target_data_extend.psv | 14 + .../expected_target_data_tranpose.psv | 1 + .../params_normal_scenario.json | 42 ++ .../params_transpose_scenario.json | 267 ++++++++ .../target_schema_extend.json | 35 + .../target_schema_transpose_scenario.json | 41 ++ .../non_partitioned}/lake_data_post.psv | 0 .../non_partitioned}/lake_data_pre.psv | 0 .../non_partitioned/params.json | 1 + .../non_partitioned}/target_schema.json | 0 .../partitioned/lake_data_post.psv | 0 .../partitioned/lake_data_pre.psv | 0 .../FullLoadTestRes}/partitioned/params.json | 1 + .../partitioned/target_schema.json | 0 .../lake_data_post.psv | 0 .../lake_data_pre.psv | 0 .../partitioned_date_format_wrong/params.json | 1 + .../target_schema.json | 0 .../partitioned_not_exist_dir/params.json | 1 + .../params.json | 1 + .../partitioned_weekly/lake_data_post.psv | 0 .../partitioned_weekly/lake_data_pre.psv | 0 .../partitioned_weekly/params.json | 1 + .../partitioned_weekly/target_schema.json | 0 .../data-nodate-part-00001.txt | 0 .../lake_data_post.txt | 0 .../lake_data_pre.txt | 0 .../semistructured_json_load/params.json | 0 .../target_schema.json | 0 .../data-nodate-part-00001.txt | 0 .../data-nodate-part-00002.txt | 0 .../data-nodate-part-00003.txt | 0 .../lake_data_post.txt | 0 .../lake_data_pre.txt | 0 .../params.json | 0 .../params_column_dropped.json | 0 .../target_schema.json | 0 .../target_schema_column_dropped.json | 0 .../data-nodate-part-00001.txt | 0 .../data-nodate-part-00002.txt | 0 .../lake_data_post.txt | 0 .../lake_data_pre.txt | 0 .../params.json | 0 .../params_evolved.json | 0 .../target_schema.json | 0 .../target_schema_evolved.json | 0 .../data-nodate-part-00001.txt | 0 .../lake_data_post.txt | 0 .../lake_data_pre.txt | 0 .../params.json | 0 .../target_schema.json | 0 
.../data-nodate-part-00001.txt | 0 .../lake_data_pre.txt | 0 .../params.json | 0 .../target_schema.json | 0 .../20180101_schema.json | 0 .../data-nodate-part-00001.txt | 0 .../data-nodate-part-00002.txt | 0 .../lake_data_post.txt | 0 .../lake_data_pre.txt | 0 .../params.json | 0 .../target_schema.json | 0 .../data-nodate-part-00001.txt | 0 .../lake_data_post.txt | 0 .../lake_data_pre.txt | 0 .../params.json | 0 .../target_schema.json | 0 .../data_20180422-00001.parquet | Bin .../lake_data_post.txt | 0 .../lake_data_pre.txt | 0 .../semistructured_parquet_test/params.json | 0 .../semistructured_parquet_test/sales.parquet | Bin .../target_schema.json | 0 .../algorithm_template_params.json | 1 + .../lake_data_post.psv | 0 .../lake_data_pre.psv | 0 .../AlgorithmTemplateTestRes}/new_data.psv | 0 .../target_schema.json | 0 .../landing/new_data.psv | 0 .../landing/new_data_wrong_format.psv | 0 .../partitioned/expected_partitions.txt | 0 .../expected_partitions_schema.json | 0 .../partitioned/lake_data_post.psv | 0 .../partitioned/params.json | 1 + .../partitioned/target_schema.json | 0 .../data_20180101-part-00000.psv | 0 .../data_20180101-part-00001.psv | 0 .../expected_partitions.txt | 0 .../expected_partitions_schema.json | 0 .../multiple_source_files/lake_data_post.psv | 0 .../multiple_source_files/lake_data_pre.psv | 0 .../multiple_source_files/params.json | 0 .../multiple_source_files}/target_schema.json | 0 .../data_20180101-part-00000.psv | 0 .../data_20180101-part-00001.psv | 0 .../expected_partitions.txt | 0 .../expected_partitions_schema.json | 0 .../multiple_source_files/lake_data_post.psv | 0 .../multiple_source_files/lake_data_pre.psv | 0 .../multiple_source_files/params.json | 0 .../multiple_source_files}/target_schema.json | 0 .../analytics/feature/DeltaLoadTest.scala | 217 ------- .../FixedSizeStringExtractorTest.scala | 71 ++- .../analytics/feature/FullLoadTest.scala | 445 ------------- .../feature/GzipDecompressorTest.scala | 121 ++-- .../feature/MaterializationTest.scala | 559 ++++++++++++++++ .../feature/NestedFlattenerTest.scala | 40 +- .../PartitionMaterializationTest.scala | 377 ----------- .../analytics/feature/SQLRunnerTest.scala | 81 +++ .../analytics/feature/TransposeTest.scala | 83 +++ .../feature/{ => loads}/AppendLoadTest.scala | 320 +++++++--- .../feature/loads/DeltaLakeLoadTest.scala | 285 +++++++++ .../feature/loads/DeltaLoadTest.scala | 311 +++++++++ .../feature/loads/FullLoadTest.scala | 599 ++++++++++++++++++ .../{ => loads}/SemiStructuredLoadTest.scala | 400 ++++++++---- .../AlgorithmTemplateTest.scala | 56 +- .../integration/BaseIntegrationTest.scala | 54 +- .../integration/FailFastIntegrationTest.scala | 101 +-- ...coverPartitionsCustomIntegrationTest.scala | 58 +- ...coverPartitionsNativeIntegrationTest.scala | 58 +- ...coverPartitionsCustomIntegrationTest.scala | 70 -- ...coverPartitionsNativeIntegrationTest.scala | 70 -- .../unit/DateComponentDerivationTest.scala | 145 ++--- .../unit/RecoverPartitionsCustomTest.scala | 58 +- .../SparkRecoverPartitionsCustomTest.scala | 99 --- .../com/adidas/utils/BaseAlgorithmTest.scala | 58 +- .../scala/com/adidas/utils/FileReader.scala | 41 +- .../scala/com/adidas/utils/HDFSSupport.scala | 7 +- .../adidas/utils/SparkSessionWrapper.scala | 15 +- .../scala/com/adidas/utils/SparkSupport.scala | 22 +- src/test/scala/com/adidas/utils/Table.scala | 107 ++-- .../scala/com/adidas/utils/TestUtils.scala | 3 +- static/images/m3d_logo.png | Bin 0 -> 12043 bytes 431 files changed, 8495 insertions(+), 4279 deletions(-) create mode 
100644 .gitignore create mode 100644 .scalafmt.conf delete mode 100644 src/main/scala/com/adidas/analytics/algo/AlgorithmTemplate.scala delete mode 100644 src/main/scala/com/adidas/analytics/algo/AppendLoad.scala delete mode 100644 src/main/scala/com/adidas/analytics/algo/FullLoad.scala create mode 100644 src/main/scala/com/adidas/analytics/algo/Materialization.scala delete mode 100644 src/main/scala/com/adidas/analytics/algo/PartitionMaterialization.scala create mode 100644 src/main/scala/com/adidas/analytics/algo/SQLRunner.scala create mode 100644 src/main/scala/com/adidas/analytics/algo/Transpose.scala create mode 100644 src/main/scala/com/adidas/analytics/algo/loads/AppendLoad.scala create mode 100644 src/main/scala/com/adidas/analytics/algo/loads/DeltaLakeLoad.scala rename src/main/scala/com/adidas/analytics/algo/{ => loads}/DeltaLoad.scala (53%) create mode 100644 src/main/scala/com/adidas/analytics/algo/loads/FullLoad.scala create mode 100644 src/main/scala/com/adidas/analytics/algo/shared/DataReshapingTask.scala create mode 100644 src/main/scala/com/adidas/analytics/algo/templates/AlgorithmTemplate.scala delete mode 100644 src/main/scala/com/adidas/analytics/config/AlgorithmTemplateConfiguration.scala delete mode 100644 src/main/scala/com/adidas/analytics/config/AppendLoadConfiguration.scala delete mode 100644 src/main/scala/com/adidas/analytics/config/DeltaLoadConfiguration.scala delete mode 100644 src/main/scala/com/adidas/analytics/config/FullLoadConfiguration.scala create mode 100644 src/main/scala/com/adidas/analytics/config/MaterializationConfiguration.scala delete mode 100644 src/main/scala/com/adidas/analytics/config/PartitionMaterializationConfiguration.scala create mode 100644 src/main/scala/com/adidas/analytics/config/TransposeConfiguration.scala create mode 100644 src/main/scala/com/adidas/analytics/config/loads/AppendLoadConfiguration.scala create mode 100644 src/main/scala/com/adidas/analytics/config/loads/DeltaLakeLoadConfiguration.scala create mode 100644 src/main/scala/com/adidas/analytics/config/loads/DeltaLoadConfiguration.scala create mode 100644 src/main/scala/com/adidas/analytics/config/loads/FullLoadConfiguration.scala rename src/main/scala/com/adidas/analytics/config/{shared => loads}/LoadConfiguration.scala (58%) create mode 100644 src/main/scala/com/adidas/analytics/config/shared/DataReshapingTaskConfig.scala create mode 100644 src/main/scala/com/adidas/analytics/config/templates/AlgorithmTemplateConfiguration.scala create mode 100644 src/main/scala/com/adidas/analytics/util/CatalogTableManager.scala delete mode 100644 src/main/scala/com/adidas/analytics/util/DistCpLoadHelper.scala delete mode 100644 src/main/scala/com/adidas/analytics/util/HiveTableAttributeReader.scala delete mode 100644 src/main/scala/com/adidas/analytics/util/SparkRecoverPartitionsCustom.scala delete mode 100644 src/main/scala/com/adidas/analytics/util/SparkRecoverPartitionsNative.scala delete mode 100644 src/test/resources/DeltaLoadTest/parquet_test_delta_merge_unpartitioned/params.json delete mode 100644 src/test/resources/GzipDecompressorTest/data_20180719111849_data_3-3.gz delete mode 100644 src/test/resources/SparkRecoverPartitionsCustomIntegrationTest/multiple_source_files/data_20180101-part-00000.psv delete mode 100644 src/test/resources/SparkRecoverPartitionsCustomIntegrationTest/multiple_source_files/data_20180101-part-00001.psv delete mode 100644 src/test/resources/SparkRecoverPartitionsCustomIntegrationTest/multiple_source_files/expected_partitions.txt delete mode 100644 
src/test/resources/SparkRecoverPartitionsCustomIntegrationTest/multiple_source_files/expected_partitions_schema.json delete mode 100644 src/test/resources/SparkRecoverPartitionsCustomIntegrationTest/multiple_source_files/params.json delete mode 100644 src/test/resources/SparkRecoverPartitionsNativeIntegrationTest/multiple_source_files/data_20180101-part-00000.psv delete mode 100644 src/test/resources/SparkRecoverPartitionsNativeIntegrationTest/multiple_source_files/data_20180101-part-00001.psv delete mode 100644 src/test/resources/SparkRecoverPartitionsNativeIntegrationTest/multiple_source_files/expected_partitions.txt delete mode 100644 src/test/resources/SparkRecoverPartitionsNativeIntegrationTest/multiple_source_files/expected_partitions_schema.json delete mode 100644 src/test/resources/SparkRecoverPartitionsNativeIntegrationTest/multiple_source_files/lake_data_post.psv delete mode 100644 src/test/resources/SparkRecoverPartitionsNativeIntegrationTest/multiple_source_files/params.json rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/matched_schema/lake_data_post.psv (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/matched_schema/params.json (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/matched_schema/source_data.psv (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/matched_schema/source_schema.json (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/matched_schema/target_schema.json (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/matched_schema_partitioned/lake_data_post.psv (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/matched_schema_partitioned/params.json (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/matched_schema_partitioned/source_data.psv (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/matched_schema_partitioned/source_schema.json (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/matched_schema_partitioned/target_schema.json (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/non_matched_schema1/params.json (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/non_matched_schema1/source_schema.json (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/non_matched_schema1/target_schema.json (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/non_matched_schema2/params.json (100%) rename src/test/resources/{FixedSizeStringExtractorTest => com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/non_matched_schema2/source_schema.json (100%) rename src/test/resources/{FixedSizeStringExtractorTest => 
com/adidas/analytics/feature/FixedSizeStringExtractorTestRes}/non_matched_schema2/target_schema.json (100%) rename src/test/resources/{GzipDecompressorTest => com/adidas/analytics/feature/GzipDecompressorTestRes}/data_20180719111849_data_1-3 (100%) rename src/test/resources/{GzipDecompressorTest => com/adidas/analytics/feature/GzipDecompressorTestRes}/data_20180719111849_data_1-3.gz (100%) rename src/test/resources/{GzipDecompressorTest => com/adidas/analytics/feature/GzipDecompressorTestRes}/data_20180719111849_data_2-3 (100%) rename src/test/resources/{GzipDecompressorTest => com/adidas/analytics/feature/GzipDecompressorTestRes}/data_20180719111849_data_2-3.gz (100%) rename src/test/resources/{GzipDecompressorTest => com/adidas/analytics/feature/GzipDecompressorTestRes}/data_20180719111849_data_3-3 (100%) create mode 100644 src/test/resources/com/adidas/analytics/feature/GzipDecompressorTestRes/data_20180719111849_data_3-3.zip rename src/test/resources/{GzipDecompressorTest => com/adidas/analytics/feature/GzipDecompressorTestRes}/params.json (100%) rename src/test/resources/{SparkRecoverPartitionsNativeIntegrationTest/multiple_source_files/lake_data_pre.psv => com/adidas/analytics/feature/MaterializationTestRes/full_materialization/expected_data.psv} (84%) create mode 100644 src/test/resources/com/adidas/analytics/feature/MaterializationTestRes/full_materialization/no_partitions/params.json rename src/test/resources/{PartitionMaterializationTest/output_files_3 => com/adidas/analytics/feature/MaterializationTestRes/full_materialization}/params.json (61%) rename src/test/resources/{PartitionMaterializationTest/condition_materialization => com/adidas/analytics/feature/MaterializationTestRes/full_materialization}/schema.json (100%) rename src/test/resources/{PartitionMaterializationTest/condition_materialization => com/adidas/analytics/feature/MaterializationTestRes/full_materialization}/source_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest/condition_materialization => com/adidas/analytics/feature/MaterializationTestRes/query_materialization}/expected_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest/condition_materialization => com/adidas/analytics/feature/MaterializationTestRes/query_materialization}/initial_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes/query_materialization}/output_files_5/params.json (100%) rename src/test/resources/{PartitionMaterializationTest/condition_materialization => com/adidas/analytics/feature/MaterializationTestRes/query_materialization}/params.json (100%) rename src/test/resources/{PartitionMaterializationTest/range_materialization/year_month_day_multiple_days => com/adidas/analytics/feature/MaterializationTestRes/query_materialization}/schema.json (100%) rename src/test/resources/{SparkRecoverPartitionsCustomIntegrationTest/multiple_source_files/lake_data_pre.psv => com/adidas/analytics/feature/MaterializationTestRes/query_materialization/source_data.psv} (84%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_month/expected_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_month/params.json (100%) rename src/test/resources/{PartitionMaterializationTest => 
com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_month/schema.json (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_month/source_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_month_day_multiple_days/expected_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_month_day_multiple_days/params.json (100%) rename src/test/resources/{PartitionMaterializationTest/range_materialization/year_month_day_single_day => com/adidas/analytics/feature/MaterializationTestRes/range_materialization/year_month_day_multiple_days}/schema.json (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_month_day_multiple_days/source_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_month_day_single_day/expected_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_month_day_single_day/initial_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_month_day_single_day/params.json (100%) rename src/test/resources/{SparkRecoverPartitionsCustomIntegrationTest/multiple_source_files/target_schema.json => com/adidas/analytics/feature/MaterializationTestRes/range_materialization/year_month_day_single_day/schema.json} (92%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_month_day_single_day/source_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_week/expected_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_week/initial_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_week/params.json (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_week/schema.json (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_week/source_data.psv (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_week_day/params.json (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_week_day/schema.json (100%) rename src/test/resources/{PartitionMaterializationTest => com/adidas/analytics/feature/MaterializationTestRes}/range_materialization/year_week_day/source_data.psv (100%) rename src/test/resources/{NestedFlattenerTest => com/adidas/analytics/feature/NestedFlattenerTestRes}/expected_target_data.psv (100%) rename src/test/resources/{NestedFlattenerTest => 
com/adidas/analytics/feature/NestedFlattenerTestRes}/nest_test/data/part-00000-3924d987-4115-47e0-8460-578de41057cc-c000.snappy.parquet (100%) rename src/test/resources/{NestedFlattenerTest => com/adidas/analytics/feature/NestedFlattenerTestRes}/scenario1/params.json (100%) rename src/test/resources/{NestedFlattenerTest => com/adidas/analytics/feature/NestedFlattenerTestRes}/scenario2/params.json (100%) rename src/test/resources/{NestedFlattenerTest => com/adidas/analytics/feature/NestedFlattenerTestRes}/target_schema.json (100%) create mode 100644 src/test/resources/com/adidas/analytics/feature/SQLRunnerTestRes/params.json rename src/test/resources/{SparkRecoverPartitionsCustomIntegrationTest/multiple_source_files/lake_data_post.psv => com/adidas/analytics/feature/SQLRunnerTestRes/sql_runner_dataset.psv} (60%) create mode 100644 src/test/resources/com/adidas/analytics/feature/TransposeTestRes/expected_target_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/TransposeTestRes/input_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/TransposeTestRes/params.json create mode 100644 src/test/resources/com/adidas/analytics/feature/TransposeTestRes/source_schema.json create mode 100644 src/test/resources/com/adidas/analytics/feature/TransposeTestRes/target_schema.json rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/different_schemas/20180101_schema.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/different_schemas/data_20180101-part-00000.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/different_schemas/data_20180105-part-00000.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/different_schemas/lake_data_post.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/different_schemas/lake_data_post_append.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/different_schemas/lake_data_pre.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/different_schemas/params.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/different_schemas/target_schema.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/duplicate_values/20180101_schema.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/duplicate_values/data_20180101-part-00000.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/duplicate_values/data_20180105-part-00000.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/duplicate_values/lake_data_post.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/duplicate_values/lake_data_post_append.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/duplicate_values/lake_data_pre.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/duplicate_values/params.json (100%) rename src/test/resources/{AppendLoadTest 
=> com/adidas/analytics/feature/loads/AppendLoadTestRes}/duplicate_values/target_schema.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/hierarchical_load/20180101_schema.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/hierarchical_load/lake_data_post.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/hierarchical_load/lake_data_pre.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/hierarchical_load/params.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/hierarchical_load/target_schema.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/hierarchical_load/year=2018/month=1/day=1/data_20180101-part-00000.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/hierarchical_load/year=2018/month=1/day=5/data_20180105-part-00000.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/main_test/data_20180422-00001.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/main_test/lake_data_post.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/main_test/lake_data_pre.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/main_test/params.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/main_test/target_schema.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/missing_columns/data_20180422-00001.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/missing_columns/lake_data_post.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/missing_columns/lake_data_pre.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/missing_columns/params.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/missing_columns/target_schema.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/multiple_source_files/data_20180101-part-00000.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/multiple_source_files/data_20180101-part-00001.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/multiple_source_files/lake_data_post.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/multiple_source_files/lake_data_pre.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/multiple_source_files/params.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/multiple_source_files/target_schema.json (100%) rename src/test/resources/{AppendLoadTest => 
com/adidas/analytics/feature/loads/AppendLoadTestRes}/parquet_test/data_20180422-00001.parquet (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/parquet_test/lake_data_post.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/parquet_test/lake_data_pre.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/parquet_test/params.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/parquet_test/target_schema.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/partition_from_full_path/data-nodate-part-00000.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/partition_from_full_path/data-nodate-part-00001.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/partition_from_full_path/lake_data_post.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/partition_from_full_path/lake_data_pre.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/partition_from_full_path/params.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/partition_from_full_path/target_schema.json (100%) create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/AppendLoadTestRes/partitioned_and_date_columns/lake_data_post.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/AppendLoadTestRes/partitioned_and_date_columns/lake_data_pre.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/AppendLoadTestRes/partitioned_and_date_columns/new_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/AppendLoadTestRes/partitioned_and_date_columns/params.json rename src/test/resources/{SparkRecoverPartitionsNativeIntegrationTest/multiple_source_files => com/adidas/analytics/feature/loads/AppendLoadTestRes/partitioned_and_date_columns}/target_schema.json (59%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/reader_mode_specification/data_20180422-00001.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/reader_mode_specification/lake_data_post.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/reader_mode_specification/lake_data_pre.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/reader_mode_specification/params.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/reader_mode_specification/params_failfast_mode.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/reader_mode_specification/params_invalid_reader_mode.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/reader_mode_specification/params_no_reader_mode.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/reader_mode_specification/params_permissive_mode.json (100%) rename 
src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/reader_mode_specification/target_schema.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/reader_mode_specification/wrong_data_20180422-00001.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/similar_schemas/20180101_schema.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/similar_schemas/data_20180101-part-00000.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/similar_schemas/data_20180105-part-00000.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/similar_schemas/lake_data_post.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/similar_schemas/lake_data_post_append.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/similar_schemas/lake_data_pre.psv (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/similar_schemas/params.json (100%) rename src/test/resources/{AppendLoadTest => com/adidas/analytics/feature/loads/AppendLoadTestRes}/similar_schemas/target_schema.json (100%) create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/added_columns_and_duplicates_in_init/control_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/added_columns_and_duplicates_in_init/init_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/added_columns_and_duplicates_in_init/lake_schema_final.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/added_columns_and_duplicates_in_init/lake_schema_initial.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/added_columns_and_duplicates_in_init/new_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/nonpartitioned/control_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/nonpartitioned/init_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/nonpartitioned/lake_schema.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/nonpartitioned/new_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/nonpartitioned/params.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/params.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/removed_columns/control_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/removed_columns/init_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/removed_columns/lake_schema.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/removed_columns/new_data.psv create mode 100644 
src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/unstable_partitions_right_params/control_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/unstable_partitions_right_params/init_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/unstable_partitions_right_params/lake_schema.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/unstable_partitions_right_params/new_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/unstable_partitions_right_params/params.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/unstable_partitions_wrong_params/control_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/unstable_partitions_wrong_params/init_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/unstable_partitions_wrong_params/lake_schema.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/unstable_partitions_wrong_params/new_data.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLakeLoadTestRes/unstable_partitions_wrong_params/params.json rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/csv_test/active_data_post.psv (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/csv_test/active_data_pre.psv (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/csv_test/delta_data.psv (92%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/csv_test/params.json (79%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/csv_test/params_part.json (77%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_init/active_data_post.psv (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_init/active_data_schema.json (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_init/delta_data.psv (91%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_init/delta_data_schema.json (97%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_init/params.json (63%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_additional_columns/active_data_post.psv (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_additional_columns/active_data_pre.psv (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_additional_columns/active_data_schema.json (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_additional_columns/delta_data.psv (92%) rename src/test/resources/{DeltaLoadTest/parquet_test_delta_merge_missing_columns => 
com/adidas/analytics/feature/loads/DeltaLoadTestRes/parquet_test_delta_merge_additional_columns}/delta_data_schema.json (97%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_additional_columns/params.json (63%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_missing_columns/active_data_post.psv (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_missing_columns/active_data_pre.psv (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_missing_columns/active_data_schema.json (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_missing_columns/delta_data.psv (92%) rename src/test/resources/{DeltaLoadTest/parquet_test_delta_merge_additional_columns => com/adidas/analytics/feature/loads/DeltaLoadTestRes/parquet_test_delta_merge_missing_columns}/delta_data_schema.json (97%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_missing_columns/params.json (63%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_partitioned/active_data_post.psv (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_partitioned/active_data_pre.psv (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_partitioned/active_data_schema.json (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_partitioned/delta_data.psv (92%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_partitioned/delta_data_schema.json (97%) create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/DeltaLoadTestRes/parquet_test_delta_merge_partitioned/params.json rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_unpartitioned/active_data_post.psv (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_unpartitioned/active_data_pre.psv (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_unpartitioned/active_data_schema.json (100%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_unpartitioned/delta_data.psv (92%) rename src/test/resources/{DeltaLoadTest => com/adidas/analytics/feature/loads/DeltaLoadTestRes}/parquet_test_delta_merge_unpartitioned/delta_data_schema.json (96%) rename src/test/resources/{DeltaLoadTest/parquet_test_delta_merge_partitioned => com/adidas/analytics/feature/loads/DeltaLoadTestRes/parquet_test_delta_merge_unpartitioned}/params.json (63%) rename src/test/resources/{AlgorithmTemplateTest => com/adidas/analytics/feature/loads/FullLoadTestRes/failfast_option}/lake_data_post.psv (100%) rename src/test/resources/{AlgorithmTemplateTest => com/adidas/analytics/feature/loads/FullLoadTestRes/failfast_option}/lake_data_pre.psv (100%) 
rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/failfast_option/new_data_wrong.psv (100%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/failfast_option/params.json (93%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/failfast_option/params_dropmalformed_mode.json (93%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/failfast_option/params_invalid_reader_mode.json (93%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/failfast_option/params_no_reader_mode_set.json (93%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/failfast_option/params_permissive_mode.json (93%) rename src/test/resources/{AlgorithmTemplateTest => com/adidas/analytics/feature/loads/FullLoadTestRes/failfast_option}/target_schema.json (100%) rename src/test/resources/{AlgorithmTemplateTest => com/adidas/analytics/feature/loads/FullLoadTestRes/landing}/new_data.psv (100%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/landing/new_data_weekly.psv (100%) create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/FullLoadTestRes/nested_flattener/data_normal_test.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/FullLoadTestRes/nested_flattener/data_transpose_test.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/FullLoadTestRes/nested_flattener/expected_target_data_extend.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/FullLoadTestRes/nested_flattener/expected_target_data_tranpose.psv create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/FullLoadTestRes/nested_flattener/params_normal_scenario.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/FullLoadTestRes/nested_flattener/params_transpose_scenario.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/FullLoadTestRes/nested_flattener/target_schema_extend.json create mode 100644 src/test/resources/com/adidas/analytics/feature/loads/FullLoadTestRes/nested_flattener/target_schema_transpose_scenario.json rename src/test/resources/{FullLoadTest/failfast_option => com/adidas/analytics/feature/loads/FullLoadTestRes/non_partitioned}/lake_data_post.psv (100%) rename src/test/resources/{FullLoadTest/failfast_option => com/adidas/analytics/feature/loads/FullLoadTestRes/non_partitioned}/lake_data_pre.psv (100%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/non_partitioned/params.json (93%) rename src/test/resources/{FullLoadTest/failfast_option => com/adidas/analytics/feature/loads/FullLoadTestRes/non_partitioned}/target_schema.json (100%) rename src/test/resources/{FailFastIntegrationTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned/lake_data_post.psv (100%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned/lake_data_pre.psv (100%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned/params.json (94%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned/target_schema.json (100%) rename src/test/resources/{FullLoadTest => 
com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned_date_format_wrong/lake_data_post.psv (100%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned_date_format_wrong/lake_data_pre.psv (100%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned_date_format_wrong/params.json (94%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned_date_format_wrong/target_schema.json (100%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned_not_exist_dir/params.json (94%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned_partition_column_wrong/params.json (94%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned_weekly/lake_data_post.psv (100%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned_weekly/lake_data_pre.psv (100%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned_weekly/params.json (93%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/feature/loads/FullLoadTestRes}/partitioned_weekly/target_schema.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load/data-nodate-part-00001.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load/lake_data_post.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load/lake_data_pre.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load/params.json (100%) rename src/test/resources/{RecoverPartitionsCustomIntegrationTest/multiple_source_files => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes/semistructured_json_load}/target_schema.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_dropping_column/data-nodate-part-00001.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_dropping_column/data-nodate-part-00002.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_dropping_column/data-nodate-part-00003.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_dropping_column/lake_data_post.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_dropping_column/lake_data_pre.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_dropping_column/params.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_dropping_column/params_column_dropped.json 
(100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_dropping_column/target_schema.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_dropping_column/target_schema_column_dropped.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_evolving_schema/data-nodate-part-00001.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_evolving_schema/data-nodate-part-00002.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_evolving_schema/lake_data_post.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_evolving_schema/lake_data_pre.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_evolving_schema/params.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_evolving_schema/params_evolved.json (100%) rename src/test/resources/{RecoverPartitionsNativeIntegrationTest/multiple_source_files => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes/semistructured_json_load_evolving_schema}/target_schema.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_evolving_schema/target_schema_evolved.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_mismatching_schema/data-nodate-part-00001.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_mismatching_schema/lake_data_post.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_mismatching_schema/lake_data_pre.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_mismatching_schema/params.json (100%) rename src/test/resources/{SemiStructuredLoadTest/semistructured_json_load => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes/semistructured_json_load_mismatching_schema}/target_schema.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_wrong_configuration/data-nodate-part-00001.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_wrong_configuration/lake_data_pre.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_json_load_wrong_configuration/params.json (100%) rename src/test/resources/{SemiStructuredLoadTest/semistructured_json_load_evolving_schema => 
com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes/semistructured_json_load_wrong_configuration}/target_schema.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_load_with_existing_header/20180101_schema.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_load_with_existing_header/data-nodate-part-00001.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_load_with_existing_header/data-nodate-part-00002.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_load_with_existing_header/lake_data_post.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_load_with_existing_header/lake_data_pre.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_load_with_existing_header/params.json (100%) rename src/test/resources/{SemiStructuredLoadTest/semistructured_json_load_mismatching_schema => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes/semistructured_load_with_existing_header}/target_schema.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_nested_json_load/data-nodate-part-00001.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_nested_json_load/lake_data_post.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_nested_json_load/lake_data_pre.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_nested_json_load/params.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_nested_json_load/target_schema.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_parquet_test/data_20180422-00001.parquet (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_parquet_test/lake_data_post.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_parquet_test/lake_data_pre.txt (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_parquet_test/params.json (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_parquet_test/sales.parquet (100%) rename src/test/resources/{SemiStructuredLoadTest => com/adidas/analytics/feature/loads/SemiStructuredLoadTestRes}/semistructured_parquet_test/target_schema.json (100%) rename src/test/resources/{AlgorithmTemplateTest => com/adidas/analytics/feature/templates/AlgorithmTemplateTestRes}/algorithm_template_params.json (93%) rename src/test/resources/{FullLoadTest/non_partitioned => 
com/adidas/analytics/feature/templates/AlgorithmTemplateTestRes}/lake_data_post.psv (100%) rename src/test/resources/{FullLoadTest/non_partitioned => com/adidas/analytics/feature/templates/AlgorithmTemplateTestRes}/lake_data_pre.psv (100%) rename src/test/resources/{FailFastIntegrationTest/landing => com/adidas/analytics/feature/templates/AlgorithmTemplateTestRes}/new_data.psv (100%) rename src/test/resources/{FullLoadTest/non_partitioned => com/adidas/analytics/feature/templates/AlgorithmTemplateTestRes}/target_schema.json (100%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/integration/FailFastIntegrationTestRes}/landing/new_data.psv (100%) rename src/test/resources/{FailFastIntegrationTest => com/adidas/analytics/integration/FailFastIntegrationTestRes}/landing/new_data_wrong_format.psv (100%) rename src/test/resources/{FailFastIntegrationTest => com/adidas/analytics/integration/FailFastIntegrationTestRes}/partitioned/expected_partitions.txt (100%) rename src/test/resources/{FailFastIntegrationTest => com/adidas/analytics/integration/FailFastIntegrationTestRes}/partitioned/expected_partitions_schema.json (100%) rename src/test/resources/{FullLoadTest => com/adidas/analytics/integration/FailFastIntegrationTestRes}/partitioned/lake_data_post.psv (100%) rename src/test/resources/{FailFastIntegrationTest => com/adidas/analytics/integration/FailFastIntegrationTestRes}/partitioned/params.json (94%) rename src/test/resources/{FailFastIntegrationTest => com/adidas/analytics/integration/FailFastIntegrationTestRes}/partitioned/target_schema.json (100%) rename src/test/resources/{RecoverPartitionsCustomIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsCustomIntegrationTestRes}/multiple_source_files/data_20180101-part-00000.psv (100%) rename src/test/resources/{RecoverPartitionsCustomIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsCustomIntegrationTestRes}/multiple_source_files/data_20180101-part-00001.psv (100%) rename src/test/resources/{RecoverPartitionsCustomIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsCustomIntegrationTestRes}/multiple_source_files/expected_partitions.txt (100%) rename src/test/resources/{RecoverPartitionsCustomIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsCustomIntegrationTestRes}/multiple_source_files/expected_partitions_schema.json (100%) rename src/test/resources/{RecoverPartitionsCustomIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsCustomIntegrationTestRes}/multiple_source_files/lake_data_post.psv (100%) rename src/test/resources/{RecoverPartitionsCustomIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsCustomIntegrationTestRes}/multiple_source_files/lake_data_pre.psv (100%) rename src/test/resources/{RecoverPartitionsCustomIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsCustomIntegrationTestRes}/multiple_source_files/params.json (100%) rename src/test/resources/{SemiStructuredLoadTest/semistructured_json_load_wrong_configuration => com/adidas/analytics/integration/RecoverPartitionsCustomIntegrationTestRes/multiple_source_files}/target_schema.json (100%) rename src/test/resources/{RecoverPartitionsNativeIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsNativeIntegrationTestRes}/multiple_source_files/data_20180101-part-00000.psv (100%) rename src/test/resources/{RecoverPartitionsNativeIntegrationTest => 
com/adidas/analytics/integration/RecoverPartitionsNativeIntegrationTestRes}/multiple_source_files/data_20180101-part-00001.psv (100%) rename src/test/resources/{RecoverPartitionsNativeIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsNativeIntegrationTestRes}/multiple_source_files/expected_partitions.txt (100%) rename src/test/resources/{RecoverPartitionsNativeIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsNativeIntegrationTestRes}/multiple_source_files/expected_partitions_schema.json (100%) rename src/test/resources/{RecoverPartitionsNativeIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsNativeIntegrationTestRes}/multiple_source_files/lake_data_post.psv (100%) rename src/test/resources/{RecoverPartitionsNativeIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsNativeIntegrationTestRes}/multiple_source_files/lake_data_pre.psv (100%) rename src/test/resources/{RecoverPartitionsNativeIntegrationTest => com/adidas/analytics/integration/RecoverPartitionsNativeIntegrationTestRes}/multiple_source_files/params.json (100%) rename src/test/resources/{SemiStructuredLoadTest/semistructured_load_with_existing_header => com/adidas/analytics/integration/RecoverPartitionsNativeIntegrationTestRes/multiple_source_files}/target_schema.json (100%) delete mode 100644 src/test/scala/com/adidas/analytics/feature/DeltaLoadTest.scala delete mode 100644 src/test/scala/com/adidas/analytics/feature/FullLoadTest.scala create mode 100644 src/test/scala/com/adidas/analytics/feature/MaterializationTest.scala delete mode 100644 src/test/scala/com/adidas/analytics/feature/PartitionMaterializationTest.scala create mode 100644 src/test/scala/com/adidas/analytics/feature/SQLRunnerTest.scala create mode 100644 src/test/scala/com/adidas/analytics/feature/TransposeTest.scala rename src/test/scala/com/adidas/analytics/feature/{ => loads}/AppendLoadTest.scala (63%) create mode 100644 src/test/scala/com/adidas/analytics/feature/loads/DeltaLakeLoadTest.scala create mode 100644 src/test/scala/com/adidas/analytics/feature/loads/DeltaLoadTest.scala create mode 100644 src/test/scala/com/adidas/analytics/feature/loads/FullLoadTest.scala rename src/test/scala/com/adidas/analytics/feature/{ => loads}/SemiStructuredLoadTest.scala (52%) rename src/test/scala/com/adidas/analytics/feature/{ => templates}/AlgorithmTemplateTest.scala (57%) delete mode 100644 src/test/scala/com/adidas/analytics/integration/SparkRecoverPartitionsCustomIntegrationTest.scala delete mode 100644 src/test/scala/com/adidas/analytics/integration/SparkRecoverPartitionsNativeIntegrationTest.scala delete mode 100644 src/test/scala/com/adidas/analytics/unit/SparkRecoverPartitionsCustomTest.scala create mode 100644 static/images/m3d_logo.png diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..decf5d1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +*.class +*.log +*.iml +.idea/* + +# sbt specific +.cache/ +.history/ +.lib/ +dist/* +project/project/ +project/target/ +src/main/java/ +src/main/resources/ +src/main/scala-2.10/ +src/test/java/ +src/test/scala-2.10/ +target/ +metastore_db/ + +# Scala-IDE specific +.scala_dependencies +.worksheet diff --git a/.scalafmt.conf b/.scalafmt.conf new file mode 100644 index 0000000..1717851 --- /dev/null +++ b/.scalafmt.conf @@ -0,0 +1,189 @@ +version = "2.6.4" +maxColumn = 100 +docstrings.wrap = yes +docstrings.style = SpaceAsterisk +comments.wrap = trailing +comments.wrapStandaloneSlcAsSlc = false +optIn.configStyleArguments = true 
+optIn.breaksInsideChains = false +optIn.breakChainOnFirstMethodDot = true +optIn.encloseClassicChains = false +optIn.selfAnnotationNewline = true +optIn.annotationNewlines = true +optIn.forceBlankLineBeforeDocstring = true +optIn.blankLineBeforeDocstring = false +binPack.unsafeCallSite = false +binPack.unsafeDefnSite = false +binPack.parentConstructors = never +binPack.literalArgumentLists = true +binPack.literalsIncludeSimpleExpr = false +binPack.literalsSingleLine = false +binPack.literalsMinArgCount = 5 +binPack.literalsInclude = [ + ".*" +] +binPack.literalsExclude = [ + String + "Term.Name" +] +continuationIndent.callSite = 2 +continuationIndent.defnSite = 4 +continuationIndent.ctorSite = null +continuationIndent.extendSite = 4 +continuationIndent.withSiteRelativeToExtends = 0 +align.multiline = false +align.stripMargin = true +align.openParenCallSite = false +align.openParenDefnSite = false +align.tokens = [ + { + code = "=>" + owner = Case + } +] +align.arrowEnumeratorGenerator = false +align.ifWhileOpenParen = false +align.treeCategory."Defn.Trait" = "class/object/trait" +align.treeCategory."Defn.Object" = "class/object/trait" +align.treeCategory."Defn.Val" = "val/var/def" +align.treeCategory."Defn.Def" = "val/var/def" +align.treeCategory."Defn.Var" = "val/var/def" +align.treeCategory."Enumerator.Generator" = for +align.treeCategory."Enumerator.Val" = for +align.treeCategory."Defn.Class" = "class/object/trait" +spaces.beforeContextBoundColon = Never +spaces.afterTripleEquals = false +spaces.inImportCurlyBraces = false +spaces.inParentheses = false +spaces.neverAroundInfixTypes = [] +spaces.afterKeywordBeforeParen = true +spaces.inByNameTypes = true +spaces.afterSymbolicDefs = false +literals.long = Upper +literals.float = Lower +literals.double = Lower +literals.hexDigits = Lower +literals.hexPrefix = Lower +literals.scientific = Lower +lineEndings = unix +rewrite.rules = [RedundantBraces] +rewrite.redundantBraces.methodBodies = true +rewrite.redundantBraces.includeUnitMethods = true +rewrite.redundantBraces.maxLines = 100 +rewrite.redundantBraces.stringInterpolation = true +rewrite.redundantBraces.parensForOneLineApply = null +rewrite.redundantBraces.generalExpressions = true +rewrite.sortModifiers.order = [ + "`implicit`" + "`final`" + "`sealed`" + "`abstract`" + "`override`" + "`private`" + "`protected`" + "`lazy`" +] +rewrite.neverInfix.includeFilters = [ + """[\w\d_]+""" +] +rewrite.neverInfix.excludeFilters = [ + until + to + by + eq + ne + "should.*" + "contain.*" + "must.*" + in + ignore + be + taggedAs + thrownBy + synchronized + have + when + size + only + noneOf + oneElementOf + noElementsOf + atLeastOneElementOf + atMostOneElementOf + allElementsOf + inOrderElementsOf + theSameElementsAs +] +indentOperator.include = ".*" +indentOperator.exclude = """^(&&|\|\|)$""" +newlines.neverInResultType = false +newlines.neverBeforeJsNative = false +newlines.sometimesBeforeColonInMethodReturnType = true +newlines.penalizeSingleSelectMultiArgList = true +newlines.alwaysBeforeCurlyBraceLambdaParams = false +newlines.topLevelStatementsMinBreaks = 1 +newlines.topLevelStatements = [before] +newlines.alwaysBeforeTopLevelStatements = false +newlines.implicitParamListModifierForce = [] +newlines.implicitParamListModifierPrefer = null +newlines.alwaysBeforeElseAfterCurlyIf = false +newlines.alwaysBeforeMultilineDef = true +newlines.afterInfix = null +newlines.afterInfixBreakOnNested = false +newlines.afterInfixMaxCountPerExprForSome = 10 +newlines.afterInfixMaxCountPerFile = 500 
+newlines.afterCurlyLambda = squash +newlines.avoidForSimpleOverflow = [] +newlines.avoidAfterYield = true +runner.debug = false +runner.eventCallback = " Unit>" +runner.optimizer.dequeueOnNewStatements = true +runner.optimizer.escapeInPathologicalCases = true +runner.optimizer.maxVisitsPerToken = 10000 +runner.optimizer.maxEscapes = 16 +runner.optimizer.maxDepth = 100 +runner.optimizer.acceptOptimalAtHints = true +runner.optimizer.disableOptimizationsInsideSensitiveAreas = true +runner.optimizer.pruneSlowStates = true +runner.optimizer.recurseOnBlocks = true +runner.optimizer.forceConfigStyleOnOffset = 150 +runner.optimizer.forceConfigStyleMinArgCount = 2 +runner.maxStateVisits = 1000000 +runner.dialect = "scala211" +runner.ignoreWarnings = false +runner.fatalWarnings = false +indentYieldKeyword = true +importSelectors = noBinPack +unindentTopLevelOperators = false +includeCurlyBraceInSelectChains = true +includeNoParensInSelectChains = false +assumeStandardLibraryStripMargin = false +danglingParentheses.callSite = true +danglingParentheses.defnSite = true +danglingParentheses.ctrlSite = true +danglingParentheses.exclude = [] +poorMansTrailingCommasInConfigStyle = false +trailingCommas = never +verticalMultiline.atDefnSite = false +verticalMultiline.arityThreshold = 100 +verticalMultiline.newlineBeforeImplicitKW = false +verticalMultiline.newlineAfterImplicitKW = false +verticalMultiline.newlineAfterOpenParen = false +verticalMultiline.excludeDanglingParens = [ + "`class`" + "`trait`" +] +verticalAlignMultilineOperators = false +onTestFailure = "" +encoding = "UTF-8" +project.git = true +project.files = [] +project.includeFilters = [ + """.*\.scala$""" + """.*\.sbt$""" + """.*\.sc$""" +] +project.excludeFilters = [ + "target" +] +xmlLiterals.assumeFormatted = false \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 3610c7c..1fee3de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM amazonlinux:2.0.20190508 +FROM amazonlinux:2.0.20200722.0 ARG JAVA_VERSION=1.8.0 @@ -7,7 +7,14 @@ RUN curl https://bintray.com/sbt/rpm/rpm | tee /etc/yum.repos.d/bintray-sbt-rpm. # Installing system dependencies RUN yum update -y && \ - yum install -y java-${JAVA_VERSION}-openjdk java-${JAVA_VERSION}-openjdk-devel sbt && \ - yum clean all + yum install -y java-${JAVA_VERSION}-openjdk java-${JAVA_VERSION}-openjdk-devel sbt shadow-utils && \ + yum clean all && \ + rm -rf /var/cache/yum + +RUN groupadd -r m3d && \ + useradd -r -g m3d m3d && \ + mkdir -p /home/m3d && \ + chown m3d:m3d /home/m3d +USER m3d CMD ["/bin/bash"] diff --git a/Jenkinsfile b/Jenkinsfile index 69f2049..bbc1391 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,4 +1,5 @@ pipeline { + options { ansiColor('xterm') disableConcurrentBuilds() @@ -35,6 +36,12 @@ pipeline { } } + stage('lint code') { + steps { + sh "./dev-env.sh project-lint -w ${workspace}" + } + } + stage('run tests') { steps { sh "./dev-env.sh project-test -w ${workspace}" diff --git a/README.md b/README.md index f8a7ad1..5debf9a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ M3D Engine ======= +![M3D logo](/static/images/m3d_logo.png) + **M3D** stands for _Metadata Driven Development_ and is a cloud and platform agnostic framework for the automated creation, management and governance of metadata and data flows from multiple source to multiple target systems. 
The main features and design goals of M3D are: * Cloud and platform agnostic @@ -46,7 +48,7 @@ M3D Engine supports: * Loading structured and semi-structured data in Full mode * Loading structured and semi-structured data in Append mode -* Loading structured and semi-structured data in Delta mode +* Loading structured and semi-structured data in Delta mode (DeltaLoad - in memory, by comparing new data and target table partitions; DeltaLakeLoad - using [Delta Lake IO](https://delta.io) capabilities) * Decompression of compressed data * Extraction from parquet file format * Extraction from delimiter separated files (CSV,TSV,etc.) @@ -102,7 +104,7 @@ The parameter file for the full load algorithm for example has the following con * `delimiter` delimiter used in the case of `dsv` format * `has_header` flag defining whether the input files have a header * `partition_column` column that contains the partitioning information -* `partition_column_format` format of the partitioning column in the case of of time/date columns +* `partition_column_format` format of the partitioning column in the case of time/date columns * `target_partitions` partitioning columns in the target * `target_table` target table where the data will be available for querying after loading diff --git a/build.sbt b/build.sbt index f48e5a9..189aaf4 100644 --- a/build.sbt +++ b/build.sbt @@ -1,27 +1,25 @@ import sbt.ExclusionRule name := "m3d-engine" - version := "1.0" scalaVersion := "2.11.12" +semanticdbEnabled := true +semanticdbVersion := scalafixSemanticdb.revision +scalacOptions += "-Ywarn-unused-import" -val sparkVersion = "2.4.0" +val sparkVersion = "2.4.4" val hadoopVersion = "2.8.5" -conflictManager := sbt.ConflictManager.latestTime +conflictManager := sbt.ConflictManager.latestRevision mainClass in Compile := Some("com.adidas.analytics.AlgorithmFactory") -// TODO: should be deleted as it exists in the Spark distribution -libraryDependencies += "org.scala-lang" % "scala-library" % scalaVersion.value - -libraryDependencies += "joda-time" % "joda-time" % "2.9.3" % Provided -libraryDependencies += "org.joda" % "joda-convert" % "2.1.1" - -libraryDependencies += "org.slf4j" % "slf4j-log4j12" % "1.7.16" +/* ===================== + * Dependencies + * ===================== */ -libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion % Provided withExclusions Vector( +libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion % Provided withExclusions Vector( ExclusionRule("org.apache.hadoop", "hadoop-common"), ExclusionRule("org.apache.hadoop", "hadoop-hdfs"), ExclusionRule("com.google.guava", "guava") @@ -32,30 +30,37 @@ libraryDependencies += "org.apache.spark" %% "spark-hive" % sparkVersion % Provi ) libraryDependencies += "org.apache.hadoop" % "hadoop-common" % hadoopVersion % Provided withExclusions Vector( - ExclusionRule("io.netty", "netty-all") + ExclusionRule("io.netty", "netty-all") ) libraryDependencies += "org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion % Provided libraryDependencies += "org.apache.hadoop" % "hadoop-distcp" % hadoopVersion % Provided -// TODO: replace exiting configuration with pureconfig -//libraryDependencies += "com.github.pureconfig" %% "pureconfig" % "0.9.2" +libraryDependencies += "joda-time" % "joda-time" % "2.9.3" % Provided +libraryDependencies += "org.joda" % "joda-convert" % "2.2.1" + +libraryDependencies += "org.slf4j" % "slf4j-log4j12" % "1.7.30" -// Dependencies for test +libraryDependencies += "io.delta" %% "delta-core" % "0.6.1" -libraryDependencies 
+= "org.scalatest" %% "scalatest" % "3.0.5" % Test +/* ===================== + * Dependencies for test + * ===================== */ -libraryDependencies += "org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion % Test classifier "tests" withExclusions Vector( - ExclusionRule("io.netty", "netty-all") -) -libraryDependencies += "org.apache.hadoop" % "hadoop-common" % hadoopVersion % Test classifier "tests" withExclusions Vector( - ExclusionRule("io.netty", "netty-all") -) +libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.1" % Test +libraryDependencies += + "org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion % Test classifier "tests" withExclusions Vector( + ExclusionRule("io.netty", "netty-all") + ) +libraryDependencies += + "org.apache.hadoop" % "hadoop-common" % hadoopVersion % Test classifier "tests" withExclusions Vector( + ExclusionRule("io.netty", "netty-all") + ) fork in Test := true // disable parallel execution parallelExecution in Test := false -// skipping tests when running assembly -test in assembly := {} \ No newline at end of file +// skipping tests when running assembly +test in assembly := {} diff --git a/common.sh b/common.sh index 95ff249..915c71f 100644 --- a/common.sh +++ b/common.sh @@ -81,10 +81,10 @@ function exec_command_within_container() { if [[ -z "$LOCAL_IS_INTERACTIVE" ]]; then echo "Executing command within container: $LOCAL_CMD" - docker exec "$LOCAL_CONTAINER_INSTANCE_NAME" bash -c "cd /root/workspace/${LOCAL_PROJECT_NAME} && ${LOCAL_CMD}" + docker exec "$LOCAL_CONTAINER_INSTANCE_NAME" bash -c "cd /m3d/workspace/${LOCAL_PROJECT_NAME} && ${LOCAL_CMD}" else echo "Executing command within container in interactive mode: $LOCAL_CMD" - docker exec -it "$LOCAL_CONTAINER_INSTANCE_NAME" bash -c "cd /root/workspace/${LOCAL_PROJECT_NAME} && ${LOCAL_CMD}" + docker exec -it "$LOCAL_CONTAINER_INSTANCE_NAME" bash -c "cd /m3d/workspace/${LOCAL_PROJECT_NAME} && ${LOCAL_CMD}" fi } diff --git a/dev-env.sh b/dev-env.sh index 7dec435..8c6f3ff 100755 --- a/dev-env.sh +++ b/dev-env.sh @@ -6,31 +6,35 @@ SCRIPT_NAME="dev-env.sh" PROJECT_NAME="m3d-engine" CONTAINER_IMAGE_NAME="$PROJECT_NAME" -PARAM_WORKSPACE=( "workspace" "w" "m3d-engine code directory (must be the same within the container life-cycle)") -PARAM_TEST_FILTER=( "test-filter" "f" "filter string for selecting specific tests when testing the code with sbt") -OPTION_HELP=( "help" "h" "show help message for the command") -OPTION_DEBUG=( "debug" "d" "start containers or run tests in debug mode (must be the same within the container life-cycle)") -OPTION_INTERACTIVE=( "interactive" "i" "use interactive mode and allocate pseudo-TTY when executing a command inside the container") - -ARG_ACTION_IMAGE_BUILD=( "image-build" "build the docker image") -ARG_ACTION_CONTAINER_RUN=( "container-run" "run a container from the docker image") -ARG_ACTION_CONTAINER_EXECUTE=( "container-execute" "execute an external command within the container") -ARG_ACTION_CONTAINER_STOP=( "container-stop" "stop the container") -ARG_ACTION_CONTAINER_DELETE=( "container-delete" "delete the container") -ARG_ACTION_PROJECT_ASSEMBLE=( "project-assemble" "build the code and create a jar-file") -ARG_ACTION_PROJECT_TEST=( "project-test" "run tests within the container") -ARG_ACTION_PROJECT_CLEAN=( "project-clean" "clean pyc-files in the project directory") -ARG_COMMAND=( "command" "command to execute within the container") - -AVAILABLE_ACTIONS=(\ - "$ARG_ACTION_IMAGE_BUILD" \ - "$ARG_ACTION_CONTAINER_RUN" \ - "$ARG_ACTION_CONTAINER_EXECUTE" \ - 
"$ARG_ACTION_CONTAINER_STOP" \ - "$ARG_ACTION_CONTAINER_DELETE" \ - "$ARG_ACTION_PROJECT_ASSEMBLE" \ - "$ARG_ACTION_PROJECT_TEST" \ - "$ARG_ACTION_PROJECT_CLEAN" \ +PARAM_WORKSPACE=("workspace" "w" "m3d-engine code directory (must be the same within the container life-cycle)") +PARAM_TEST_FILTER=("test-filter" "f" "filter string for selecting specific tests when testing the code with sbt") +OPTION_HELP=("help" "h" "show help message for the command") +OPTION_DEBUG=("debug" "d" "start containers or run tests in debug mode (must be the same within the container life-cycle)") +OPTION_INTERACTIVE=("interactive" "i" "use interactive mode and allocate pseudo-TTY when executing a command inside the container") + +ARG_ACTION_IMAGE_BUILD=("image-build" "build the docker image") +ARG_ACTION_CONTAINER_RUN=("container-run" "run a container from the docker image") +ARG_ACTION_CONTAINER_EXECUTE=("container-execute" "execute an external command within the container") +ARG_ACTION_CONTAINER_STOP=("container-stop" "stop the container") +ARG_ACTION_CONTAINER_DELETE=("container-delete" "delete the container") +ARG_ACTION_PROJECT_FORMAT=("project-format" "format the code") +ARG_ACTION_PROJECT_LINT=("project-lint" "lint code") +ARG_ACTION_PROJECT_ASSEMBLE=("project-assemble" "build the code and create a jar-file") +ARG_ACTION_PROJECT_TEST=("project-test" "run tests within the container") +ARG_ACTION_PROJECT_CLEAN=("project-clean" "clean pyc-files in the project directory") +ARG_COMMAND=("command" "command to execute within the container") + +AVAILABLE_ACTIONS=( + "$ARG_ACTION_IMAGE_BUILD" + "$ARG_ACTION_CONTAINER_RUN" + "$ARG_ACTION_CONTAINER_EXECUTE" + "$ARG_ACTION_CONTAINER_STOP" + "$ARG_ACTION_CONTAINER_DELETE" + "$ARG_ACTION_PROJECT_FORMAT" + "$ARG_ACTION_PROJECT_LINT" + "$ARG_ACTION_PROJECT_ASSEMBLE" + "$ARG_ACTION_PROJECT_TEST" + "$ARG_ACTION_PROJECT_CLEAN" ) source "./common.sh" @@ -112,24 +116,30 @@ fi OTHER_ARGS=() while [[ $# -gt 0 ]]; do case $1 in - -w|--workspace) - shift - validate_args_non_empty "$HELP_STRING" "$@" - WORKSPACE="$1";; - -f|--test-filter) - shift - validate_args_non_empty "$HELP_STRING" "$@" - validate_possible_values "$HELP_STRING" "$PARAM_TEST_FILTER" "${ACTION_AVAILABLE_ARGS[@]}" - TEST_FILTER="$1";; - -i|--interactive) - validate_possible_values "$HELP_STRING" "$OPTION_INTERACTIVE" "${ACTION_AVAILABLE_ARGS[@]}" - INTERACTIVE=1;; - -d|--debug) - DEBUG=1;; - -h|--help) - exit_with_messages "$HELP_STRING";; - *) - OTHER_ARGS+=("$1") + -w | --workspace) + shift + validate_args_non_empty "$HELP_STRING" "$@" + WORKSPACE="$1" + ;; + -f | --test-filter) + shift + validate_args_non_empty "$HELP_STRING" "$@" + validate_possible_values "$HELP_STRING" "$PARAM_TEST_FILTER" "${ACTION_AVAILABLE_ARGS[@]}" + TEST_FILTER="$1" + ;; + -i | --interactive) + validate_possible_values "$HELP_STRING" "$OPTION_INTERACTIVE" "${ACTION_AVAILABLE_ARGS[@]}" + INTERACTIVE=1 + ;; + -d | --debug) + DEBUG=1 + ;; + -h | --help) + exit_with_messages "$HELP_STRING" + ;; + *) + OTHER_ARGS+=("$1") + ;; esac shift done @@ -159,10 +169,10 @@ elif [[ "$ACTION" == "$ARG_ACTION_CONTAINER_RUN" ]]; then validate_args_are_empty "$HELP_STRING" "${OTHER_ARGS[@]}" if [[ -z "$DEBUG" ]]; then - docker run -t -d --name "$CONTAINER_INSTANCE_NAME" -v "${WORKSPACE}:/root/workspace/${PROJECT_NAME}" "$CONTAINER_IMAGE_NAME" + docker run -t -d --name "$CONTAINER_INSTANCE_NAME" -v "${WORKSPACE}:/m3d/workspace/${PROJECT_NAME}" "$CONTAINER_IMAGE_NAME" else echo "Debugging is enabled" - docker run -t -d --name "$CONTAINER_INSTANCE_NAME" -v 
"${WORKSPACE}:/root/workspace/${PROJECT_NAME}" -p 5005:5005 "$CONTAINER_IMAGE_NAME" + docker run -t -d --name "$CONTAINER_INSTANCE_NAME" -v "${WORKSPACE}:/m3d/workspace/${PROJECT_NAME}" -p 5005:5005 "$CONTAINER_IMAGE_NAME" fi # cleanup files generated by SBT @@ -181,6 +191,22 @@ elif [[ "$ACTION" == "$ARG_ACTION_CONTAINER_EXECUTE" ]]; then EXTERNAL_CMD="${OTHER_ARGS[0]}" exec_command_within_container "$CONTAINER_INSTANCE_NAME" "$PROJECT_NAME" "$EXTERNAL_CMD" "$INTERACTIVE" +# format the code +elif [[ "$ACTION" == "$ARG_ACTION_PROJECT_FORMAT" ]]; then + echo "Formatting code ..." + validate_args_are_empty "$HELP_STRING" "${OTHER_ARGS[@]}" + + SBT_CMD='sbt scalafmtAll "scalafix RemoveUnused" "test:scalafix RemoveUnused"' + exec_command_within_container "$CONTAINER_INSTANCE_NAME" "$PROJECT_NAME" "$SBT_CMD" "$INTERACTIVE" + +# lint code +elif [[ "$ACTION" == "$ARG_ACTION_PROJECT_LINT" ]]; then + echo "linting code ..." + validate_args_are_empty "$HELP_STRING" "${OTHER_ARGS[@]}" + + SBT_CMD='sbt scalafmtCheckAll' + exec_command_within_container "$CONTAINER_INSTANCE_NAME" "$PROJECT_NAME" "$SBT_CMD" "$INTERACTIVE" + # build the code and assembly a jar-file elif [[ "$ACTION" == "$ARG_ACTION_PROJECT_ASSEMBLE" ]]; then echo "Creating a jar-file ..." @@ -196,7 +222,7 @@ elif [[ "$ACTION" == "$ARG_ACTION_PROJECT_TEST" ]]; then if [[ -z "$DEBUG" ]]; then SBT_OPTS="-Xms512M -Xmx512M" - SBT_CMD="SBT_OPTS=\"${SBT_OPTS}\" sbt \"test\"" + SBT_CMD="SBT_OPTS=\"${SBT_OPTS}\" sbt \"test:testOnly ${TEST_FILTER}\"" exec_command_within_container "$CONTAINER_INSTANCE_NAME" "$PROJECT_NAME" "$SBT_CMD" "$INTERACTIVE" else echo "Debugging is enabled" diff --git a/project/assembly.sbt b/project/assembly.sbt index d95475f..72477a2 100644 --- a/project/assembly.sbt +++ b/project/assembly.sbt @@ -1 +1 @@ -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.7") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0") diff --git a/project/build.properties b/project/build.properties index 8db5ca2..302b6be 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version = 1.2.1 \ No newline at end of file +sbt.version = 1.3.13 \ No newline at end of file diff --git a/project/plugins.sbt b/project/plugins.sbt index 14a6ca1..a4684b7 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1 +1,3 @@ -logLevel := Level.Warn \ No newline at end of file +logLevel := Level.Warn +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.2") +addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.19") diff --git a/src/main/scala/com/adidas/analytics/AlgorithmFactory.scala b/src/main/scala/com/adidas/analytics/AlgorithmFactory.scala index 07eb338..e29e1ec 100644 --- a/src/main/scala/com/adidas/analytics/AlgorithmFactory.scala +++ b/src/main/scala/com/adidas/analytics/AlgorithmFactory.scala @@ -2,11 +2,11 @@ package com.adidas.analytics import com.adidas.analytics.algo._ import com.adidas.analytics.algo.core.JobRunner +import com.adidas.analytics.algo.loads.{AppendLoad, DeltaLakeLoad, DeltaLoad, FullLoad} import com.adidas.analytics.util.DFSWrapper import org.apache.spark.sql.SparkSession -/** - * Driver class to initiate execution +/** Driver class to initiate execution */ object AlgorithmFactory { @@ -27,18 +27,15 @@ object AlgorithmFactory { // get Spark session val spark = createSparkSession(appClassName) - try { - createAlgorithmInstance(spark, appClassName, appParamFile).run() - } finally { - spark.close() - } + try createAlgorithmInstance(spark, appClassName, appParamFile).run() + finally 
spark.close() } - /** - * Create an instance of SparkSession with all the necessary parameters + /** Create an instance of SparkSession with all the necessary parameters */ - private def createSparkSession(appClassName: String): SparkSession = { - SparkSession.builder() + private def createSparkSession(appClassName: String): SparkSession = + SparkSession + .builder() .appName(appClassName) .config("hive.cbo.enable", "true") .config("hive.compute.query.using.stats", "true") @@ -47,30 +44,38 @@ object AlgorithmFactory { .config("hive.stats.fetch.column.stats", "true") .config("hive.stats.fetch.partition.stats", "true") .config("spark.sql.parquet.compression.codec", "snappy") - .config("spark.sql.parquet.writeLegacyFormat","true") + .config("spark.sql.parquet.writeLegacyFormat", "true") .config("spark.sql.sources.partitionColumnTypeInference.enabled", "false") .config("spark.sql.csv.parser.columnPruning.enabled", "false") .enableHiveSupport() .getOrCreate() - } - /** - * Select algorithm to execute and create an instance of it + /** Select algorithm to execute and create an instance of it */ - private def createAlgorithmInstance(spark: SparkSession, className: String, configLocation: String): JobRunner = { + private def createAlgorithmInstance( + spark: SparkSession, + className: String, + configLocation: String + ): JobRunner = { val dfs = DFSWrapper(spark.sparkContext.hadoopConfiguration) className match { - case "AppendLoad" => AppendLoad(spark, dfs, configLocation) - case "DeltaLoad" => DeltaLoad(spark, dfs, configLocation) - case "FullLoad" => FullLoad(spark, dfs, configLocation) - case "GzipDecompressorBytes" => GzipDecompressor(spark, dfs, configLocation) - case "PartitionFullMaterialization" => PartitionMaterialization.newFullMaterialization(spark, dfs, configLocation) - case "PartitionRangeMaterialization" => PartitionMaterialization.newRangeMaterialization(spark, dfs, configLocation) - case "PartitionQueryMaterialization" => PartitionMaterialization.newQueryMaterialization(spark, dfs, configLocation) + case "AppendLoad" => AppendLoad(spark, dfs, configLocation) + case "DeltaLoad" => DeltaLoad(spark, dfs, configLocation) + case "DeltaLakeLoad" => DeltaLakeLoad(spark, dfs, configLocation) case "FixedSizeStringExtractor" => FixedSizeStringExtractor(spark, dfs, configLocation) - case "NestedFlattener" => NestedFlattener(spark, dfs, configLocation) - case _ => throw new RuntimeException(s"Unable to find algorithm corresponding to $className") + case "FullLoad" => FullLoad(spark, dfs, configLocation) + case "FullMaterialization" => + Materialization.newFullMaterialization(spark, dfs, configLocation) + case "GzipDecompressorBytes" => GzipDecompressor(spark, dfs, configLocation) + case "SQLRunner" => SQLRunner(spark, configLocation) + case "NestedFlattener" => NestedFlattener(spark, dfs, configLocation) + case "QueryMaterialization" => + Materialization.newQueryMaterialization(spark, dfs, configLocation) + case "RangeMaterialization" => + Materialization.newRangeMaterialization(spark, dfs, configLocation) + case "Transpose" => Transpose(spark, dfs, configLocation) + case _ => throw new RuntimeException(s"Unable to find algorithm corresponding to $className") } } } diff --git a/src/main/scala/com/adidas/analytics/algo/AlgorithmTemplate.scala b/src/main/scala/com/adidas/analytics/algo/AlgorithmTemplate.scala deleted file mode 100644 index 806316f..0000000 --- a/src/main/scala/com/adidas/analytics/algo/AlgorithmTemplate.scala +++ /dev/null @@ -1,39 +0,0 @@ -package com.adidas.analytics.algo - 
-import com.adidas.analytics.algo.core.Algorithm -import com.adidas.analytics.config.AlgorithmTemplateConfiguration -import com.adidas.analytics.util.{DFSWrapper} -import org.apache.spark.sql._ - -final class AlgorithmTemplate protected(val spark: SparkSession, val dfs: DFSWrapper, val configLocation: String) - extends Algorithm with AlgorithmTemplateConfiguration { - - override protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { - /** - * In this method perform all the operations required to obtain the desired dataframe. - * For example, adding new columns, calculating values for columns, exploding, etc. - * - * @param dataFrames this would be a 3-D array, where each cell of the Vector has a 2-D spark dataframe - */ - - throw new NotImplementedError("This class is not meant to be used. Please, considering implementing your own class based on this template") - } -} - - -object AlgorithmTemplate { - - /** - * Additionally, one can define a companion object, with different attributes and methods. - * These methods could be helpers for the transform method. - * In this case, an instantiation of AlgorithmTemplate occurs in the companion object. - * - * @param spark instance of SparkSession class. - * @param dfs instance of DFSWrapper class for FS operations helper. - * @param configLocation path of configuration file for the algorithm. - * @return - */ - def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): AlgorithmTemplate = { - new AlgorithmTemplate(spark, dfs, configLocation) - } -} diff --git a/src/main/scala/com/adidas/analytics/algo/AppendLoad.scala b/src/main/scala/com/adidas/analytics/algo/AppendLoad.scala deleted file mode 100644 index c19451f..0000000 --- a/src/main/scala/com/adidas/analytics/algo/AppendLoad.scala +++ /dev/null @@ -1,234 +0,0 @@ -package com.adidas.analytics.algo - -import java.util.regex.Pattern - -import com.adidas.analytics.algo.AppendLoad.{logger, _} -import com.adidas.analytics.algo.core.{Algorithm, TableStatistics} -import com.adidas.analytics.config.AppendLoadConfiguration -import com.adidas.analytics.util.DFSWrapper._ -import com.adidas.analytics.util.DataFormat.{DSVFormat, JSONFormat, ParquetFormat} -import com.adidas.analytics.util.DataFrameUtils._ -import com.adidas.analytics.util._ -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{StructType, _} -import org.slf4j.{Logger, LoggerFactory} - -import scala.collection.immutable - -/** - * Performs append load of new records to an existing table. 
- */ -final class AppendLoad protected(val spark: SparkSession, val dfs: DFSWrapper, val configLocation: String) - extends Algorithm with AppendLoadConfiguration with TableStatistics { - - override protected def read(): Vector[DataFrame] = { - readInputData(targetSchema, spark, dfs) - } - - override protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { - dataFrames.map(df => df.transform(addtargetPartitions(columnToRegexPairs, targetSchema))) - } - - override protected def write(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { - writeHeaders(dataFrames, targetPartitions, headerDir, dfs) - super.write(dataFrames) - } - - override protected def updateStatistics(dataFrames: Vector[DataFrame]): Unit = { - if (computeTableStatistics && dataType == STRUCTURED && targetTable.isDefined) { - if (targetPartitions.nonEmpty) { - dataFrames.foreach(df => computeStatisticsForTablePartitions(df, targetTable.get, targetPartitions)) - } - computeStatisticsForTable(targetTable) - } - - } - - private def readInputData(targetSchema: StructType, spark: SparkSession, dfs: DFSWrapper): Vector[DataFrame] = { - val inputDirPath = new Path(inputDir) - val headerDirPath = new Path(headerDir) - - val fs = dfs.getFileSystem(inputDirPath) - val sources = listSources(inputDirPath, headerDirPath, fs, targetSchema) - readSources(sources, fs, spark) - } - - private def listSources(inputDirPath: Path, headerDirPath: Path, fs: FileSystem, targetSchema: StructType): Seq[Source] = { - val targetSchemaWithouttargetPartitions = getSchemaWithouttargetPartitions(targetSchema, targetPartitions.toSet) - - logger.info(s"Looking for input files in $inputDirPath") - val groupedHeaderPathAndSourcePaths = fs.ls(inputDirPath, recursive = true).groupBy { inputPath => - buildHeaderFilePath(columnToRegexPairs, targetSchema, extractPathWithoutServerAndProtocol(inputPath.toString), headerDirPath) - } - - def getMapSchemaStructToPath: immutable.Iterable[Source] = { - val mapSchemaStructToPath = groupedHeaderPathAndSourcePaths.toSeq.map { case (headerPath, sourcePaths) => - getSchemaFromHeaderOrSource(fs, headerPath, sourcePaths, targetSchemaWithouttargetPartitions) - }.groupBy(_._1).map { case (k, v) => (k, v.flatMap(_._2)) } - - val filteredMapSchemaStructToPath = mapSchemaStructToPath.filter(schemaFromInputData => matchingSchemas_?(schemaFromInputData._1, targetSchema, schemaFromInputData._2)) - - if (mapSchemaStructToPath.size != filteredMapSchemaStructToPath.size) - throw new RuntimeException("Schema does not match the input data for some of the input folders.") - - mapSchemaStructToPath.flatMap { case (schema, sourcePaths) => - sourcePaths.map { sourcePath => - Source(targetSchema, sourcePath.toString) - } - } - } - - val schemaAndSourcePath = if (!verifySchema) { - groupedHeaderPathAndSourcePaths.flatMap { case (headerPath, sourcePaths) => - val schema = if (fs.exists(headerPath)) loadHeader(headerPath, fs) else targetSchemaWithouttargetPartitions - sourcePaths.map { sourcePath => - Source(schema, sourcePath.toString) - } - } - } else { - getMapSchemaStructToPath - } - schemaAndSourcePath.toSeq - } - - private def getSchemaFromHeaderOrSource(fs: FileSystem, headerPath: Path, sourcePaths: Seq[Path], targetSchemaWithouttargetPartitions: StructType): (StructType, Seq[Path]) = { - val schema = if (fs.exists(headerPath)) { - loadHeader(headerPath, fs) - } - else { - inferSchemaFromSource(sourcePaths) - } - (schema, sourcePaths) - } - - private def inferSchemaFromSource(sourcePaths: Seq[Path]): StructType = { - val 
reader = spark.read.options(sparkReaderOptions) - val dataFormat = fileFormat match { - case "dsv" => DSVFormat() - case "parquet" => ParquetFormat() - case "json" => JSONFormat() - case anotherFormat => throw new RuntimeException(s"Unknown file format: $anotherFormat") - } - dataFormat.read(reader, sourcePaths.map(_.toString): _*).schema - } - - private def matchingSchemas_?(schemaFromInputData: StructType, targetSchema: StructType, paths: Seq[Path]): Boolean = { - val inputColumnsVector = schemaFromInputData.names.toVector - val targetColumnsVector = targetSchema.names.toVector - val diff = inputColumnsVector.diff(targetColumnsVector) - if (diff.nonEmpty) - logger.error(s"Inferred schema does not match the target schema for ${paths.toString}") - diff.isEmpty - } - - private def readSources(sources: Seq[Source], fs: FileSystem, spark: SparkSession): Vector[DataFrame] = { - groupSourcesBySchema(sources).map { - case (schema, inputPaths) => readInputFiles(inputPaths, fileFormat, schema, spark.read.options(sparkReaderOptions)) - }.toVector - } - - private def readInputFiles(inputPaths: Seq[String], fileFormat: String, schema: StructType, reader: DataFrameReader): DataFrame = { - fileFormat match { - case "dsv" => DSVFormat(Some(schema)).read(reader, inputPaths: _*) - case "parquet" => ParquetFormat(Some(schema)).read(reader, inputPaths: _*) - case "json" => JSONFormat(Some(schema)).read(reader, inputPaths: _*) - case anotherFormat => throw new RuntimeException(s"Unknown file format: $anotherFormat") - } - } -} - - -object AppendLoad { - - private val logger: Logger = LoggerFactory.getLogger(getClass) - private val headerFileName: String = "header.json" - - def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): AppendLoad = { - new AppendLoad(spark, dfs, configLocation) - } - - private def extractPathWithoutServerAndProtocol(path: String): String = { - path.replaceFirst("\\w+\\d*://.+?/", "") - } - - private def getSchemaWithouttargetPartitions(targetSchema: StructType, targetPartitions: Set[String]): StructType = { - StructType(targetSchema.fields.filterNot(field => targetPartitions.contains(field.name))) - } - - private def groupSourcesBySchema(sources: Seq[Source]): Map[StructType, Seq[String]] = { - sources.groupBy(_.schema).mapValues { sources => - sources.map(_.inputFileLocation) - } - } - - private def addtargetPartitions(columnNameToRegexPairs: Seq[(String, String)], schema: StructType)(inputDf: DataFrame): DataFrame = { - def getInputFileName: Column = { - udf((path: String) => extractPathWithoutServerAndProtocol(path)).apply(input_file_name) - } - - val tempInputFileNameColumn = col("temp_input_file_name") - val columnNameToTypeMapping = schema.fields.map(field => field.name -> field.dataType).toMap - - columnNameToRegexPairs.foldLeft(inputDf.withColumn(tempInputFileNameColumn.toString, getInputFileName)) { - case (df, (columnName, regex)) => - val targetColumnType = columnNameToTypeMapping(columnName) - df.withColumn(columnName, regexp_extract(tempInputFileNameColumn, regex, 1).cast(targetColumnType)) - }.drop(tempInputFileNameColumn) - } - - private def buildHeaderFilePath(columnNameToRegexPairs: Seq[(String, String)], schema: StructType, inputFileName: String, headerDirPath: Path): Path = { - val columnNameToTypeMapping = schema.fields.map(field => field.name -> field.dataType).toMap - val subdirectories = columnNameToRegexPairs.map { - case (columnName, regex) => - implicit val dataType: DataType = columnNameToTypeMapping(columnName) - 
extractPartitionColumnValue(inputFileName, regex) match { - case Some(columnValue) => s"$columnName=$columnValue" - case None => throw new RuntimeException(s"Unable to extract value for $columnName with '$regex' from $inputFileName") - } - } - new Path(headerDirPath.join(subdirectories), headerFileName) - } - - private def loadHeader(headerPath: Path, fs: FileSystem): StructType = { - DataType.fromJson(fs.readFile(headerPath)).asInstanceOf[StructType] - } - - protected def writeHeaders(dataFrames: Seq[DataFrame], targetPartitions: Seq[String], headerDir: String, dfs: DFSWrapper): Unit = { - logger.info(s"Writing header files to $headerDir") - val headerDirPath = new Path(headerDir) - val fs = dfs.getFileSystem(headerDirPath) - dataFrames.foreach { df => - val schemaJson = getSchemaWithouttargetPartitions(df.schema, targetPartitions.toSet).prettyJson - df.collectPartitions(targetPartitions).foreach { partitionCriteria => - val subdirectories = DataFrameUtils.mapPartitionsToDirectories(partitionCriteria) - val headerPath = new Path(headerDirPath.join(subdirectories), headerFileName) - if (!fs.exists(headerPath)) { - logger.info(s"Writing header $headerPath") - fs.writeFile(headerPath, schemaJson) - } - } - } - } - - private def extractPartitionColumnValue(fileName: String, regex: String)(implicit dataType: DataType): Option[String] = { - val matcher = Pattern.compile(regex).matcher(fileName) - Option(matcher) - .filter(_.find) - .map(_.group(1)) //modifications to regexes demand taking group 1 instead of group 0 - .map(restoreFromTypedValue) - } - - private def restoreFromTypedValue(stringColumnValue: String)(implicit dataType: DataType): String = { - val columnValue = dataType match { - case ByteType | ShortType | IntegerType | LongType => stringColumnValue.toLong - case BooleanType => stringColumnValue.toBoolean - case StringType => stringColumnValue - } - columnValue.toString - } - - protected case class Source(schema: StructType, inputFileLocation: String) - -} diff --git a/src/main/scala/com/adidas/analytics/algo/FixedSizeStringExtractor.scala b/src/main/scala/com/adidas/analytics/algo/FixedSizeStringExtractor.scala index 37cfd1e..351d3bc 100644 --- a/src/main/scala/com/adidas/analytics/algo/FixedSizeStringExtractor.scala +++ b/src/main/scala/com/adidas/analytics/algo/FixedSizeStringExtractor.scala @@ -8,14 +8,18 @@ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StructType - -final class FixedSizeStringExtractor protected(val spark: SparkSession, val dfs: DFSWrapper, val configLocation: String) - extends Algorithm with FixedSizeStringExtractorConfiguration { +final class FixedSizeStringExtractor protected ( + val spark: SparkSession, + val dfs: DFSWrapper, + val configLocation: String +) extends Algorithm + with FixedSizeStringExtractorConfiguration { override protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { val filteredDf = Option(partitionsCriteria).filter(_.nonEmpty).foldLeft(dataFrames(0)) { case (df, criteria) => - val isRequiredPartition = DataFrameUtils.buildPartitionsCriteriaMatcherFunc(Seq(criteria), df.schema) + val isRequiredPartition = + DataFrameUtils.buildPartitionsCriteriaMatcherFunc(Seq(criteria), df.schema) df.filter(isRequiredPartition) } @@ -24,31 +28,34 @@ final class FixedSizeStringExtractor protected(val spark: SparkSession, val dfs: } def extractFields(df: DataFrame, targetSchema: StructType): DataFrame = { - val nonPartitionFields = targetSchema.fields.filter(field => 
!targetPartitionsSet.contains(field.name)) - if (substringPositions.length != nonPartitionFields.length) { + val nonPartitionFields = + targetSchema.fields.filter(field => !targetPartitionsSet.contains(field.name)) + if (substringPositions.length != nonPartitionFields.length) throw new RuntimeException("Field positions do not correspond to the target schema") - } val sourceCol = col(sourceField) val extractedDf = nonPartitionFields.zip(substringPositions).foldLeft(df) { case (tempDf, (field, (startPos, endPos))) => - tempDf.withColumn(field.name, withExtractedString(sourceCol, startPos, endPos).cast(field.dataType)) + tempDf.withColumn( + field.name, + withExtractedString(sourceCol, startPos, endPos).cast(field.dataType) + ) } extractedDf.selectExpr(targetSchema.fieldNames: _*) } } - object FixedSizeStringExtractor { - def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): FixedSizeStringExtractor = { - new FixedSizeStringExtractor(spark, dfs, configLocation) - } + def apply( + spark: SparkSession, + dfs: DFSWrapper, + configLocation: String + ): FixedSizeStringExtractor = new FixedSizeStringExtractor(spark, dfs, configLocation) - private def withExtractedString(column: Column, startPos: Int, endPos: Int): Column = { - udf[Option[String], String]((in: String) => Option(in.substring(startPos - 1, endPos).trim).filter(_.nonEmpty)).apply(column) - } + private def withExtractedString(column: Column, startPos: Int, endPos: Int): Column = + udf[Option[String], String]((in: String) => + Option(in.substring(startPos - 1, endPos).trim).filter(_.nonEmpty) + ).apply(column) } - - diff --git a/src/main/scala/com/adidas/analytics/algo/FullLoad.scala b/src/main/scala/com/adidas/analytics/algo/FullLoad.scala deleted file mode 100644 index f56c5fc..0000000 --- a/src/main/scala/com/adidas/analytics/algo/FullLoad.scala +++ /dev/null @@ -1,210 +0,0 @@ -package com.adidas.analytics.algo - -import com.adidas.analytics.config.FullLoadConfiguration -import com.adidas.analytics.algo.FullLoad._ -import com.adidas.analytics.algo.core.{Algorithm, TableStatistics} -import com.adidas.analytics.algo.core.Algorithm.WriteOperation -import com.adidas.analytics.algo.shared.DateComponentDerivation -import com.adidas.analytics.util.DFSWrapper._ -import com.adidas.analytics.util.DataFormat.{DSVFormat, ParquetFormat} -import com.adidas.analytics.util._ -import org.apache.hadoop.fs.Path -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.slf4j.{Logger, LoggerFactory} - -import scala.util.{Failure, Success, Try} - - -final class FullLoad protected(val spark: SparkSession, val dfs: DFSWrapper, val configLocation: String) - extends Algorithm with WriteOperation with FullLoadConfiguration with DateComponentDerivation with TableStatistics { - - val currentHdfsDir: String = HiveTableAttributeReader(targetTable, spark).getTableLocation - - override protected def read(): Vector[DataFrame] = { - createBackupTable() - - val dataFormat: DataFormat = fileFormat match { - case "parquet" => ParquetFormat(Some(targetSchema)) - case "dsv" => DSVFormat(Some(targetSchema)) - case _ => throw new RuntimeException(s"Unsupported input data format $fileFormat.") - } - - readInputData(dataFormat) - } - - override protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { - withDatePartitions(dataFrames) - } - - override protected def write(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { - Try{ - super.write(dataFrames) - } match { - case Failure(exception) => - logger.error(s"Handled Exception: 
${exception.getMessage}. " + - s"Start Rolling Back the Full Load of table: ${targetTable}!") - recoverFailedWrite() - cleanupDirectory(backupDir) - throw new RuntimeException(exception.getMessage) - case Success(outputDaframe) => - restoreTable() - outputDaframe - } - - } - - override protected def updateStatistics(dataFrames: Vector[DataFrame]): Unit = { - if (computeTableStatistics && dataType == STRUCTURED) { - if(targetPartitions.nonEmpty) { - dataFrames.foreach(df => computeStatisticsForTablePartitions(df,targetTable, targetPartitions)) - } - computeStatisticsForTable(Option(targetTable)) - } - - } - - private def createBackupTable(): Unit = { - createDirectory(backupDir) - - // backup the data from the current dir because currently data directory for full load is varying - - backupDataDirectory(currentHdfsDir, backupDir) - - try { - dropAndRecreateTableInNewLocation(targetTable, backupDir, targetPartitions) - } catch { - case e: Throwable => - logger.error("Data backup failed", e) - logger.info(s"Restoring previous state $backupDir -> $currentDir") - recoverFailBackup() - cleanupDirectory(backupDir) - throw new RuntimeException("Unable to change table location.", e) - } - } - - private def readInputData(dataFormat: DataFormat): Vector[DataFrame] ={ - try { - Vector(dataFormat.read(spark.read.options(sparkReaderOptions), inputDir)) - } catch { - case e: Throwable => - logger.error("Data reading failed", e) - recoverFailedRead() - cleanupDirectory(backupDir) - throw new RuntimeException("Unable to read input location.", e) - } - } - - private def createDirectory(dir: String): Unit = { - val path = new Path(dir) - - logger.info(s"Creating directory ${path.toString}") - val fs = dfs.getFileSystem(path) - fs.createDirIfNotExists(path) - } - - private def cleanupDirectory(dir: String): Unit = { - DistCpLoadHelper.cleanupDirectoryContent(dfs, dir) - } - - private def backupDataDirectory(sourceDir: String, destinationDir: String): Unit = { - DistCpLoadHelper.cleanupDirectoryContent(dfs, destinationDir) - DistCpLoadHelper.backupDirectoryContent(dfs, sourceDir, destinationDir) - } - - private def dropAndRecreateTableInNewLocation(table: String, destinationDir: String, targetPartitions: Seq[String]): Unit = { - val tempTable: String = s"${table}_temp" - val tempTableDummyLocation: String = s"/tmp/$table" - - //create a temp table like the target table in a dummy location to preserve the schema - createTable(table, tempTable, tempTableDummyLocation) - - //create the target table like the temp table with data in the new directory - createTable(tempTable, table, destinationDir) - - if (targetPartitions.nonEmpty) { - spark.catalog.recoverPartitions(table) - } - } - - private def createTable(sourceTable: String, destinationTable: String, location: String): Unit ={ - val createTempTableWithLocation = createExternalTableStatement(sourceTable, destinationTable, location) - spark.sql(createTempTableWithLocation) - spark.sql(s"DROP TABLE IF EXISTS $sourceTable") - } - - private def withDatePartitions(dataFrames: Vector[DataFrame]): Vector[DataFrame] ={ - logger.info("Adding partitioning information if needed") - try { - if (targetPartitions.nonEmpty) { - dataFrames.map(df => df.transform(withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions))) - } else { - dataFrames - } - } catch { - case e: Throwable => - logger.error("Cannot add partitioning information for data frames.", e) - logger.info(s"Restoring previous state $backupDir -> $currentDir") - recoverFailedWrite() - 
cleanupDirectory(backupDir) - throw new RuntimeException("Unable to transform data frames.", e) - } - } - - private def restoreTable(): Unit ={ - try { - dropAndRecreateTableInNewLocation(targetTable, currentDir, targetPartitions) - } catch { - case e: Throwable => - logger.error("Data writing failed", e) - logger.info(s"Restoring previous state $backupDir -> $currentDir") - recoverFailedWrite() - throw new RuntimeException("Unable to change table location ", e) - } finally { - cleanupDirectory(backupDir) - } - } - - private def recoverFailBackup(): Unit = { - val tempTable: String = s"${targetTable}_temp" - - try { - createTable(tempTable, targetTable, currentDir) - } catch { - case e: Exception => logger.warn(s"Failure when restoring table from temp table",e) - } finally { - spark.sql(s"DROP TABLE IF EXISTS $tempTable") - } - - if (targetPartitions.nonEmpty) { - spark.catalog.recoverPartitions(targetTable) - } - } - - private def recoverFailedRead(): Unit = { - dropAndRecreateTableInNewLocation(targetTable, currentDir, targetPartitions) - } - - private def recoverFailedWrite(): Unit = { - restoreDirectoryContent(currentDir, backupDir) - dropAndRecreateTableInNewLocation(targetTable, currentDir, targetPartitions) - } - - private def restoreDirectoryContent(sourceDir: String, backupDir: String): Unit = { - DistCpLoadHelper.restoreDirectoryContent(dfs, sourceDir, backupDir) - } - -} - - -object FullLoad { - - private val logger: Logger = LoggerFactory.getLogger(getClass) - - def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): FullLoad = { - new FullLoad(spark, dfs, configLocation) - } - - private def createExternalTableStatement(sourceTable: String, destTable:String, location: String) : String = { - s"CREATE TABLE $destTable LIKE $sourceTable LOCATION '$location'" - } -} \ No newline at end of file diff --git a/src/main/scala/com/adidas/analytics/algo/GzipDecompressor.scala b/src/main/scala/com/adidas/analytics/algo/GzipDecompressor.scala index 17ac3c1..aff96e4 100644 --- a/src/main/scala/com/adidas/analytics/algo/GzipDecompressor.scala +++ b/src/main/scala/com/adidas/analytics/algo/GzipDecompressor.scala @@ -1,8 +1,8 @@ package com.adidas.analytics.algo import java.util.concurrent.{Executors, TimeUnit} - -import com.adidas.analytics.algo.GzipDecompressor.{changeFileExtension, compressedExtension, _} +import java.util.zip.ZipInputStream +import com.adidas.analytics.algo.GzipDecompressor.{getDecompressedFilePath, logger} import com.adidas.analytics.algo.core.JobRunner import com.adidas.analytics.config.GzipDecompressorConfiguration import com.adidas.analytics.util.DFSWrapper @@ -13,80 +13,91 @@ import org.apache.hadoop.io.IOUtils import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.spark.sql.SparkSession import org.slf4j.{Logger, LoggerFactory} - import scala.concurrent._ import scala.concurrent.duration._ -/** - * Spark Scala code utility that can decompress the gzip files and put - * them in the same location into hadoop distributed file systems. - * This will be utilised as preliminary activity before spark code execution - * is being called on these files. As gzip compressed files are not splittable, - * spark parallel processing can not be used efficiently for these files. - * This utility will help to uncompress files at runtime of a workflow. +/** Spark Scala code utility that can decompress the gzip files and put them in the same location + * into hadoop distributed file systems. 
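+  * Decompressed files keep the original base name, with the compressed extension replaced by the
+  * configured output extension; single-entry zip archives are handled as well.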
This will be utilised as preliminary activity before spark + * code execution is being called on these files. As gzip compressed files are not splittable, + * spark parallel processing can not be used efficiently for these files. This utility will help to + * uncompress files at runtime of a workflow. */ -final class GzipDecompressor protected(val spark: SparkSession, val dfs: DFSWrapper, val configLocation: String) - extends JobRunner with GzipDecompressorConfiguration { +final class GzipDecompressor protected ( + val spark: SparkSession, + val dfs: DFSWrapper, + val configLocation: String +) extends JobRunner + with GzipDecompressorConfiguration { private val hadoopConfiguration: Configuration = spark.sparkContext.hadoopConfiguration private val fileSystem: FileSystem = dfs.getFileSystem(inputDirectoryPath) - override def run(): Unit = { //check if directory exists - if (!fileSystem.exists(inputDirectoryPath)){ + if (!fileSystem.exists(inputDirectoryPath)) { logger.error(s"Input directory: $inputDirectoryPath does not exist.") throw new RuntimeException(s"Directory $inputDirectoryPath does not exist.") } - val compressedFilePaths = fileSystem.ls(inputDirectoryPath, recursive) - .filterNot(path => fileSystem.isDirectory(path)) - .filter(_.getName.toLowerCase.endsWith(compressedExtension)) - - if (compressedFilePaths.isEmpty) { - logger.warn(s"Input directory $inputDirectoryPath does not contain compressed files. Skipping...") - } else { - implicit val ec: ExecutionContext = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(threadPoolSize)) - Await.result(Future.sequence( - compressedFilePaths.map { compressedFilePath => - Future { - logger.info(s"Decompressing file: $compressedFilePath") - - val decompressedFileName = changeFileExtension(compressedFilePath.getName, compressedExtension, outputExtension) - val decompressedFilePath = new Path(compressedFilePath.getParent, decompressedFileName) - - val compressionCodecFactory = new CompressionCodecFactory(hadoopConfiguration) - val inputCodec = compressionCodecFactory.getCodec(compressedFilePath) - - val inputStream = inputCodec.createInputStream(fileSystem.open(compressedFilePath)) - val output = fileSystem.create(decompressedFilePath) - - IOUtils.copyBytes(inputStream, output, hadoopConfiguration) - logger.info(s"Finished decompressing file: $compressedFilePath") - - //Delete the compressed file - fileSystem.delete(compressedFilePath, false) - logger.info(s"Removed file: $compressedFilePath") + implicit val ec: ExecutionContext = + ExecutionContext.fromExecutor(Executors.newFixedThreadPool(threadPoolSize)) + Await.result( + Future.sequence( + fileSystem + .ls(inputDirectoryPath, recursive) + .filterNot(path => fileSystem.isDirectory(path)) + .map { compressedFilePath => + Future { + val decompressedFilePath = + getDecompressedFilePath(compressedFilePath, outputExtension) + val inputStream = + if (compressedFilePath.getName.endsWith(".zip")) { + val zin = new ZipInputStream(fileSystem.open(compressedFilePath)) + /* Warning: we intentionally only support zip files with one entry here as we want + * to control */ + /* the output name and can not merge multiple entries because they may have + * headers. 
*/ + zin.getNextEntry + zin + } else { + val compressionCodecFactory = new CompressionCodecFactory(hadoopConfiguration) + val inputCodec = compressionCodecFactory.getCodec(compressedFilePath) + if (inputCodec != null) + inputCodec.createInputStream(fileSystem.open(compressedFilePath)) + else { + logger.error(s"No codec found for file $compressedFilePath!") + throw new RuntimeException(s"No codec found for file $compressedFilePath!") + } + } + + logger.info(s"Decompressing file: $compressedFilePath") + val outputStream = fileSystem.create(decompressedFilePath) + + IOUtils.copyBytes(inputStream, outputStream, hadoopConfiguration) + logger.info(s"Finished decompressing file: $compressedFilePath") + + inputStream.close() + outputStream.close() + + fileSystem.delete(compressedFilePath, false) + logger.info(s"Removed file: $compressedFilePath") + } } - } - ), Duration(4, TimeUnit.HOURS)) - } + ), + Duration(4, TimeUnit.HOURS) + ) } } - object GzipDecompressor { private val logger: Logger = LoggerFactory.getLogger(this.getClass) - private val compressedExtension: String = ".gz" - - def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): GzipDecompressor = { + def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): GzipDecompressor = new GzipDecompressor(spark, dfs, configLocation) - } - private def changeFileExtension(fileName: String, currentExt: String, newExt: String): String = { - val newFileName = fileName.substring(0, fileName.lastIndexOf(currentExt)) - if (newFileName.endsWith(newExt)) newFileName else newFileName + newExt + private def getDecompressedFilePath(compressedFilePath: Path, outputExt: String): Path = { + val decompressedFileName = compressedFilePath.getName.replaceAll("\\.[^.]*$", ".") + outputExt + new Path(compressedFilePath.getParent, decompressedFileName) } } diff --git a/src/main/scala/com/adidas/analytics/algo/Materialization.scala b/src/main/scala/com/adidas/analytics/algo/Materialization.scala new file mode 100644 index 0000000..17c8f08 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/algo/Materialization.scala @@ -0,0 +1,156 @@ +package com.adidas.analytics.algo + +import com.adidas.analytics.algo.core.Algorithm +import com.adidas.analytics.config.MaterializationConfiguration +import com.adidas.analytics.config.MaterializationConfiguration._ +import com.adidas.analytics.config.shared.ConfigurationContext +import com.adidas.analytics.util._ +import org.apache.hadoop.fs.{Path, PathFilter} +import org.apache.spark.sql._ +import org.apache.spark.sql.functions.col +import org.slf4j.{Logger, LoggerFactory} + +/** Performs loading of partitions from existing view/table to a specified location overwriting + * existing data + */ +trait Materialization extends Algorithm with MaterializationConfiguration { + + override protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { + val inputDf = if (toCache) dataFrames(0).cache() else dataFrames(0) + val result = Option(partitionsCriteria).filter(_.nonEmpty).foldLeft(inputDf) { + case (df, partitionsCriteria) => + val isRequiredPartition = + DataFrameUtils.buildPartitionsCriteriaMatcherFunc(partitionsCriteria, df.schema) + df.filter(isRequiredPartition) + } + + Vector(result) + } +} + +object Materialization { + + def newFullMaterialization( + spark: SparkSession, + dfs: DFSWrapper, + configLocation: String + ): Materialization = new FullMaterialization(spark, dfs, LoadMode.OverwriteTable, configLocation) + + def newRangeMaterialization( + spark: SparkSession, + dfs: 
DFSWrapper, + configLocation: String + ): Materialization = + new RangeMaterialization( + spark, + dfs, + LoadMode.OverwritePartitionsWithAddedColumns, + configLocation + ) + + def newQueryMaterialization( + spark: SparkSession, + dfs: DFSWrapper, + configLocation: String + ): Materialization = + new QueryMaterialization( + spark, + dfs, + LoadMode.OverwritePartitionsWithAddedColumns, + configLocation + ) + + private class FullMaterialization( + val spark: SparkSession, + val dfs: DFSWrapper, + val loadMode: LoadMode, + val configLocation: String + ) extends Materialization + with FullMaterializationConfiguration + with ConfigurationContext { + + private val logger: Logger = LoggerFactory.getLogger(getClass) + + override protected def write(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { + val outputDfs = { + try dataFrames.map { df => + if (targetPartitions.isEmpty) + writer.write(dfs, outputFilesNum.map(df.repartition).getOrElse(df)) + else { + val partitionCols = targetPartitions.map(columnName => col(columnName)) + writer.write( + dfs, + outputFilesNum + .map(n => df.repartition(n, partitionCols: _*)) + .getOrElse(df.repartition(partitionCols: _*)) + ) + } + } catch { + case e: Throwable => + logger.info( + s"An exception occurred while writing the data... cleaning up temporary files: ${e.getMessage}" + ) + HadoopLoadHelper.cleanupDirectoryContent(dfs, nextTableLocation.toString) + throw new RuntimeException("Unable to write data for the materialized view.", e) + } + } + + try CatalogTableManager(targetTable, spark) + .recreateTable(nextTableLocation.toString, targetPartitions) + catch { + case e: Throwable => + logger.info(s"An exception occurred while recreating materialized view: ${e.getMessage}") + HadoopLoadHelper.cleanupDirectoryContent(dfs, nextTableLocation.toString) + CatalogTableManager(targetTable, spark) + .recreateTable(currentTableLocation.toString, targetPartitions) + throw new RuntimeException( + s"Unable to recreate materialized view in location: $nextTableLocation", + e + ) + } + + val orderedSubFolders = HadoopLoadHelper.getOrderedSubFolders( + dfs, + tableDataDir.toString, + Some(numVersionsToRetain + 1), + Some(new FullMaterializationPathFilter()) + ) + + HadoopLoadHelper.cleanupDirectoryLeftovers(dfs, tableDataDir.toString, orderedSubFolders) + + outputDfs + } + + /** A custom path filter to ignore EMR S3 folder placeholders, partition folders and parquet + * files when listing folders/files using HadoopFS. Such feature is useful to allow ordering + * timestamped folders containing different versions of the materialized view and keep only the + * last x versions, without letting leftover partitions or files interfere with the sorting + * logic. 
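+    * For example, when only the last numVersionsToRetain versions of the view are kept, any path
+    * whose name contains one of the configured sortingIgnoreFolderNames is excluded from the
+    * listing before the version folders are sorted.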
+ */ + private class FullMaterializationPathFilter extends PathFilter { + + override def accept(path: Path): Boolean = + !sortingIgnoreFolderNames.exists(path.toString.contains) + } + + } + + private class RangeMaterialization( + val spark: SparkSession, + val dfs: DFSWrapper, + val loadMode: LoadMode, + val configLocation: String + ) extends Materialization + with RangeMaterializationConfiguration + with ConfigurationContext {} + + private class QueryMaterialization( + val spark: SparkSession, + val dfs: DFSWrapper, + val loadMode: LoadMode, + val configLocation: String + ) extends Materialization + with QueryMaterializationConfiguration + with ConfigurationContext {} + +} diff --git a/src/main/scala/com/adidas/analytics/algo/NestedFlattener.scala b/src/main/scala/com/adidas/analytics/algo/NestedFlattener.scala index 933c2d3..e915e31 100644 --- a/src/main/scala/com/adidas/analytics/algo/NestedFlattener.scala +++ b/src/main/scala/com/adidas/analytics/algo/NestedFlattener.scala @@ -5,22 +5,33 @@ import com.adidas.analytics.config.NestedFlattenerConfiguration import com.adidas.analytics.util.DFSWrapper import org.apache.spark.sql.functions.{col, explode_outer} import org.apache.spark.sql.types.{ArrayType, StructField, StructType} -import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.{Column, DataFrame, SparkSession} +import org.slf4j.{Logger, LoggerFactory} -/** - * An algorithm for flattening semi-structured JSON data in a configurable way, i.e., giving the user the ability to choose - * which struct fields should be flattened or array fields should be exploded by the algorithm. +/** An algorithm for flattening semi-structured JSON data in a configurable way, + * i.e., giving the user the ability to choose which struct fields should be flattened or array + * fields should be exploded by the algorithm. 
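+  * For example (illustrative field names), a struct field `payment` selected for flattening yields
+  * columns such as `payment__type`, an array field selected for flattening is exploded into one
+  * row per element, and struct or array fields that are not selected are dropped.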
* - * @param spark spark session - * @param dfs distributed file system - * @param configLocation location of the configuration file for the algorithm + * @param spark + * spark session + * @param dfs + * distributed file system + * @param configLocation + * location of the configuration file for the algorithm */ -final class NestedFlattener protected(val spark: SparkSession, val dfs: DFSWrapper, val configLocation: String) extends Algorithm with NestedFlattenerConfiguration { +final class NestedFlattener protected ( + val spark: SparkSession, + val dfs: DFSWrapper, + val configLocation: String +) extends Algorithm + with NestedFlattenerConfiguration { override protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { val inputDf = dataFrames(0) - val replacedDf = NestedFlattener.replaceCharsInColumns(spark, inputDf, charsToReplace, replacementChar) - val flattenedDf = NestedFlattener.flatDataFrame(spark, replacedDf, fieldsToFlatten, columnMapping) + val replacedDf = + NestedFlattener.replaceCharsInColumns(spark, inputDf, charsToReplace, replacementChar) + val flattenedDf = + NestedFlattener.flatDataFrame(spark, replacedDf, fieldsToFlatten, columnMapping) Vector(flattenedDf) } @@ -28,22 +39,36 @@ final class NestedFlattener protected(val spark: SparkSession, val dfs: DFSWrapp object NestedFlattener { - def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): NestedFlattener = { + private val logger: Logger = LoggerFactory.getLogger(getClass) + + def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): NestedFlattener = new NestedFlattener(spark, dfs, configLocation) - } - /** - * Replaces problematic characters present in the semi-structured file format (e.g., JSON), such as "." which makes spark think that the field is a struct. - * Note: needs to run before any flattening attempt, that's why the transform function of this algorithm executes this step first. - * Moreover don't forget to consider the charsToReplace in the name mapping in the acon file, because these chars will be replaced by the replacementChar. + /** Replaces problematic characters present in the semi-structured file format (e.g., JSON), such + * as "." which makes spark think that the field is a struct. Note: needs to run before any + * flattening attempt, that's why the transform function of this algorithm executes this step + * first. Moreover don't forget to consider the charsToReplace in the name mapping in the acon + * file, because these chars will be replaced by the replacementChar. 
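+    * For example, with illustrative values charsToReplace = "[.]" and replacementChar = "_", a
+    * nested field named `order.id` is renamed to `order_id` at every level of the schema.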
* - * @param spark spark session - * @param df dataframe to process - * @param charsToReplace problematic column name characters to be replaced - * @param replacementChar char that replaces the charsToReplace - * @return a dataframe with the column names cleansed of problematic characters + * @param spark + * spark session + * @param df + * dataframe to process + * @param charsToReplace + * problematic column name characters to be replaced + * @param replacementChar + * char that replaces the charsToReplace + * @return + * a dataframe with the column names cleansed of problematic characters */ - def replaceCharsInColumns(spark: SparkSession, df: DataFrame, charsToReplace: String, replacementChar: String): DataFrame = { + def replaceCharsInColumns( + spark: SparkSession, + df: DataFrame, + charsToReplace: String, + replacementChar: String + ): DataFrame = { + + logger.info("Replacing characters $charsToReplace for $replacementChar") def changeSchemaNames(f: StructField): StructField = { val cleansedName = f.name.replaceAll(charsToReplace, replacementChar) @@ -53,9 +78,13 @@ object NestedFlattener { StructField(cleansedName, StructType(children), f.nullable, f.metadata) case at: ArrayType => val childrenDataType = changeSchemaNames(StructField("element", at.elementType)).dataType - StructField(cleansedName, ArrayType(childrenDataType, at.containsNull), f.nullable, f.metadata) - case _ => - StructField(cleansedName, f.dataType, f.nullable, f.metadata) + StructField( + cleansedName, + ArrayType(childrenDataType, at.containsNull), + f.nullable, + f.metadata + ) + case _ => StructField(cleansedName, f.dataType, f.nullable, f.metadata) } } @@ -63,79 +92,126 @@ object NestedFlattener { spark.createDataFrame(df.rdd, schema) } - /** - * Flattens a given DataFrame according to the attributes (arrays or structs) to process. - * Important Note: the chars separating parent and child fieldnames in the flattened attributes is two underscores, - * so make sure you consider this in the name mapping config in the acon file. + /** Flattens a given DataFrame according to the attributes (arrays or structs) to process. + * Important Note: the chars separating parent and child fieldnames in the flattened attributes + * is two underscores, so make sure you consider this in the name mapping config in the acon + * file. * - * @param spark spark session - * @param df dataframe to be processed - * @param fieldsToFlatten fields to include for the flattening process. Note: you should specify not only top-level attributes but sub-levels as well - * if you want them included. - * @param columnMapping columns to include in the final dataframe and with new (more friendly) names. Note: columns not in the columnMapping will be excluded - * @return flattened DataFrame according to the configuration of the algorithm + * @param spark + * spark session + * @param df + * dataframe to be processed + * @param fieldsToFlatten + * fields to include for the flattening process. Note: you should specify not only top-level + * attributes but sub-levels as well if you want them included. + * @param columnMapping + * columns to include in the final dataframe and with new (more friendly) names. Note: columns + * not in the columnMapping will be excluded + * @param sideFlatten + * a map with column name as key, a seq of string with new column name _ position on the + * initial array. 
That name must be on the columnMapping as well + * @return + * flattened DataFrame according to the configuration of the algorithm */ - def flatDataFrame(spark: SparkSession, df: DataFrame, fieldsToFlatten: Seq[String], columnMapping: Map[String, String]): DataFrame = { - - def dropFieldIfNotForFlattening(df: DataFrame, fieldName: String, callback: () => DataFrame): DataFrame = { - if (fieldsToFlatten.contains(fieldName)) - callback() - else - df.drop(fieldName) + def flatDataFrame( + spark: SparkSession, + df: DataFrame, + fieldsToFlatten: Seq[String], + columnMapping: Map[String, String], + sideFlatten: Option[Map[String, Seq[String]]] = None + ): DataFrame = { + + logger.info("Flattening source data") + + def dropFieldIfNotForFlattening( + df: DataFrame, + fieldName: String, + callback: () => DataFrame + ): DataFrame = if (fieldsToFlatten.contains(fieldName)) callback() else df.drop(fieldName) + + def regularFlattening( + df: DataFrame, + columnsWithoutArray: Array[Column], + f: StructField + ): DataFrame = { + // explode_outer based on column with data name + val columnsWithExplode = columnsWithoutArray ++ Array(explode_outer(col(f.name)).as(f.name)) + df.select(columnsWithExplode: _*) } + def sideFlattening(sideFlatteners: Map[String, Seq[String]], df: DataFrame, f: StructField) = + // get new columns from array contents + sideFlatteners(f.name) + .foldLeft(df) { (df, c) => + df.withColumn(c.split("__")(0), col(f.name).getItem(c.split("__")(1).toInt)) + } + .drop(f.name) + @scala.annotation.tailrec def flatDataFrameAux(df: DataFrame): DataFrame = { var auxDf = df var continueFlat = false - - auxDf.schema.fields.foreach(f => { + auxDf.schema.fields.foreach { f => f.dataType match { + case _: ArrayType => - auxDf = dropFieldIfNotForFlattening(auxDf, f.name, () => { - val columnsWithoutArray = auxDf.schema.fieldNames - .filter(_ != f.name) - .map(n => col(n)) - val columnsWithExplode = columnsWithoutArray ++ Array(explode_outer(col(f.name)).as(f.name)) - continueFlat = true - auxDf.select(columnsWithExplode: _*) - }) + auxDf = dropFieldIfNotForFlattening( + auxDf, + f.name, + () => { + val columnsWithoutArray = + auxDf.schema.fieldNames.filter(_ != f.name).map(n => col(n)) + sideFlatten match { + case Some(sideFlatteners) => + if (sideFlatteners.exists(_._1 == f.name)) { + continueFlat = true + sideFlattening(sideFlatteners, auxDf, f) + + } else { + continueFlat = true + regularFlattening(auxDf, columnsWithoutArray, f) + } + + case _ => + continueFlat = true + regularFlattening(auxDf, columnsWithoutArray, f) + + } + } + ) case st: StructType => - auxDf = dropFieldIfNotForFlattening(auxDf, f.name, () => { - // renames all struct fields to have full names and removes original struct root - val fullPathNames = st.fieldNames.map(n => f.name + "." + n) - val columnNamesWithoutStruct = auxDf.schema.fieldNames.filter(_ != f.name) ++ fullPathNames - val renamedColumns = columnNamesWithoutStruct.map(n => col(n).as(n.replace(".", "__"))) - continueFlat = true - auxDf.select(renamedColumns: _*) - }) + auxDf = dropFieldIfNotForFlattening( + auxDf, + f.name, + () => { + /* renames all struct fields to have full names and removes original struct root */ + val fullPathNames = st.fieldNames.map(n => f.name + "." 
+ n) + val columnNamesWithoutStruct = auxDf.schema.fieldNames.filter(_ != f.name) ++ + fullPathNames + val renamedColumns = + columnNamesWithoutStruct.map(n => col(n).as(n.replace(".", "__"))) + continueFlat = true + auxDf.select(renamedColumns: _*) + } + ) case _ => // do nothing } - }) + } - if (continueFlat) - flatDataFrameAux(auxDf) - else - auxDf + if (continueFlat) flatDataFrameAux(auxDf) else auxDf } - // Rename fields according to columnMapping and drop columns that are not mapped + /* Rename fields according to columnMapping and drop columns that are not mapped */ var flattenedDf = flatDataFrameAux(df) - flattenedDf.schema.foreach(f => { + flattenedDf.schema.foreach { f => if (columnMapping.contains(f.name)) flattenedDf = flattenedDf.withColumnRenamed(f.name, columnMapping(f.name)) - else - flattenedDf = flattenedDf.drop(f.name) - }) + else flattenedDf = flattenedDf.drop(f.name) + } flattenedDf } - } - - - - diff --git a/src/main/scala/com/adidas/analytics/algo/PartitionMaterialization.scala b/src/main/scala/com/adidas/analytics/algo/PartitionMaterialization.scala deleted file mode 100644 index aa33787..0000000 --- a/src/main/scala/com/adidas/analytics/algo/PartitionMaterialization.scala +++ /dev/null @@ -1,52 +0,0 @@ -package com.adidas.analytics.algo - -import com.adidas.analytics.algo.core.Algorithm -import com.adidas.analytics.config.PartitionMaterializationConfiguration -import com.adidas.analytics.config.PartitionMaterializationConfiguration._ -import com.adidas.analytics.config.shared.ConfigurationContext -import com.adidas.analytics.util._ -import org.apache.spark.sql._ - -/** - * Performs loading of partitions from existing view/table to a specified location overwriting existing data - */ -trait PartitionMaterialization extends Algorithm with PartitionMaterializationConfiguration { - - override protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { - val result = Option(partitionsCriteria).filter(_.nonEmpty).foldLeft(dataFrames(0)) { - case (df, partitionsCriteria) => - val isRequiredPartition = DataFrameUtils.buildPartitionsCriteriaMatcherFunc(partitionsCriteria, df.schema) - df.filter(isRequiredPartition) - } - - Vector(result) - } -} - - -object PartitionMaterialization { - - def newFullMaterialization(spark: SparkSession, dfs: DFSWrapper, configLocation: String): PartitionMaterialization = { - new FullMaterialization(spark, dfs, LoadMode.OverwriteTable, configLocation) - } - - def newRangeMaterialization(spark: SparkSession, dfs: DFSWrapper, configLocation: String): PartitionMaterialization = { - new RangeMaterialization(spark, dfs, LoadMode.OverwritePartitionsWithAddedColumns, configLocation) - } - - def newQueryMaterialization(spark: SparkSession, dfs: DFSWrapper, configLocation: String): PartitionMaterialization = { - new QueryMaterialization(spark, dfs, LoadMode.OverwritePartitionsWithAddedColumns, configLocation) - } - - private class FullMaterialization(val spark: SparkSession, val dfs: DFSWrapper, val loadMode: LoadMode, val configLocation: String) - extends PartitionMaterialization with FullMaterializationConfiguration with ConfigurationContext { - } - - private class RangeMaterialization(val spark: SparkSession, val dfs: DFSWrapper, val loadMode: LoadMode, val configLocation: String) - extends PartitionMaterialization with RangeMaterializationConfiguration with ConfigurationContext { - } - - private class QueryMaterialization(val spark: SparkSession, val dfs: DFSWrapper, val loadMode: LoadMode, val configLocation: String) - extends 
PartitionMaterialization with QueryMaterializationConfiguration with ConfigurationContext { - } -} \ No newline at end of file diff --git a/src/main/scala/com/adidas/analytics/algo/SQLRunner.scala b/src/main/scala/com/adidas/analytics/algo/SQLRunner.scala new file mode 100644 index 0000000..97927b8 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/algo/SQLRunner.scala @@ -0,0 +1,26 @@ +package com.adidas.analytics.algo + +import com.adidas.analytics.algo.core.JobRunner +import org.apache.spark.sql.SparkSession + +final class SQLRunner(spark: SparkSession, configLocation: String) extends JobRunner { + + override def run(): Unit = { + // parse parameter file + val algoParams = spark.read.option("multiline", "true").json(configLocation).cache() + val steps = algoParams.select("steps").collect()(0)(0).toString + + // execute steps provided in parameter file + for (i <- 0 until steps.toInt) { + val sql = algoParams.select("" + i).collect()(0)(0).toString + val df = spark.sql(sql) + df.show(1000, truncate = false) + } + } +} + +object SQLRunner { + + def apply(spark: SparkSession, configLocation: String): SQLRunner = + new SQLRunner(spark, configLocation) +} diff --git a/src/main/scala/com/adidas/analytics/algo/Transpose.scala b/src/main/scala/com/adidas/analytics/algo/Transpose.scala new file mode 100644 index 0000000..9c0ad30 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/algo/Transpose.scala @@ -0,0 +1,82 @@ +package com.adidas.analytics.algo + +import com.adidas.analytics.algo.core.Algorithm +import com.adidas.analytics.config.TransposeConfiguration +import com.adidas.analytics.util.DFSWrapper +import org.apache.spark.sql.functions.{col, first} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +final class Transpose protected ( + val spark: SparkSession, + val dfs: DFSWrapper, + val configLocation: String +) extends Algorithm + with TransposeConfiguration { + + override protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { + + val inputDf = dataFrames(0) + val transposeDf = Transpose + .transposeTask(spark, inputDf, pivotColumn, aggregationColumn, groupByColumn, targetSchema) + val result = + if (enforceSchema) { + var castedTargetCols = targetSchema.map(c => col(c.name).cast(c.dataType)) + targetSchema.fields.foreach { f => + if (!transposeDf.schema.fieldNames.contains(f.name)) + castedTargetCols = castedTargetCols.filter(_ != col(f.name).cast(f.dataType)) + } + transposeDf.select(castedTargetCols: _*) + } else transposeDf + Vector(result) + } +} + +object Transpose { + + private val logger: Logger = LoggerFactory.getLogger(getClass) + + def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): Transpose = + new Transpose(spark, dfs, configLocation) + + /** Transposes a given DataFrame using a group by and pivot operation. The columns used on that + * operations are passed using the acon file. Important Note: the char separating parent and + * child fieldnames in the flattened attributes is two underscores, so make sure you consider + * this in the name mapping config in the acon file. 
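+    * For example (illustrative values), with groupColumn = Seq("material"), pivotColumn =
+    * "attr_name" and aggrColumn = "attr_value", the rows (A, colour, red) and (A, size, M) become
+    * a single row (material = A, colour = red, size = M), provided `colour` and `size` are part of
+    * targetSchema.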
+ * + * @param spark + * spark session + * @param df + * dataframe to be processed + * @param pivotColumn + * column from the data frame input used to rotate the data frame + * @param aggrColumn + * column used to aggregate the values under a row + * @param groupColumn + * column or sequence of columns used to group by the data frame + * @param targetSchema + * final schema to remove columns or the flatten data frame and not on the final + * @return + * flattened and transposed DataFrame according to the configuration of the algorithm + */ + + def transposeTask( + spark: SparkSession, + df: DataFrame, + pivotColumn: String, + aggrColumn: String, + groupColumn: Seq[String], + targetSchema: StructType + ): DataFrame = { + + logger.info("Transposing source data") + val result = df + .filter(col(pivotColumn).isNotNull) + .groupBy(groupColumn map col: _*) + .pivot(pivotColumn, targetSchema.map(c => c.name).diff(groupColumn)) + .agg(first(col(aggrColumn))) + result + } + +} diff --git a/src/main/scala/com/adidas/analytics/algo/core/Algorithm.scala b/src/main/scala/com/adidas/analytics/algo/core/Algorithm.scala index da2b585..f033984 100644 --- a/src/main/scala/com/adidas/analytics/algo/core/Algorithm.scala +++ b/src/main/scala/com/adidas/analytics/algo/core/Algorithm.scala @@ -6,30 +6,29 @@ import com.adidas.analytics.util.{DFSWrapper, InputReader, OutputWriter} import org.apache.spark.sql.{DataFrame, SparkSession} import org.slf4j.{Logger, LoggerFactory} - -/** - * Base trait for algorithms that defines their base methods +/** Base trait for algorithms that defines their base methods */ -trait Algorithm extends JobRunner - with Serializable - with BaseReadOperation - with BaseWriteOperation - with BaseUpdateStatisticsOperation { +trait Algorithm + extends JobRunner + with Serializable + with BaseReadOperation + with BaseWriteOperation + with BaseUpdateStatisticsOperation { protected def spark: SparkSession protected def dfs: DFSWrapper - /** - * A function which is supposed to have DataFrame transformations, its implementation is optional + /** A function which is supposed to have DataFrame transformations, its implementation is optional * - * @param dataFrames input DataFrame - * @return modified DataFrame + * @param dataFrames + * input DataFrame + * @return + * modified DataFrame */ protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = dataFrames - /** - * The main entry point for running the algorithm + /** The main entry point for running the algorithm */ override def run(): Unit = { logger.info("Starting reading stage...") @@ -43,142 +42,119 @@ trait Algorithm extends JobRunner } } - object Algorithm { private val logger: Logger = LoggerFactory.getLogger(this.getClass) - /** - * Base trait for read operations + /** Base trait for read operations */ trait BaseReadOperation { - /** - * Reads a DataFrame using logic defined in the inheritor class + /** Reads a DataFrame using logic defined in the inheritor class * - * @return DataFrame which was read + * @return + * DataFrame which was read */ protected def read(): Vector[DataFrame] } - /** - * Base trait for update statistics operations + /** Base trait for update statistics operations */ trait BaseUpdateStatisticsOperation { - /** - * Reads the produced output dataframe and update table statistics + /** Reads the produced output dataframe and update table statistics * - * @return DataFrame written in writer() step + * @return + * DataFrame written in writer() step */ protected def updateStatistics(dataFrames: 
Vector[DataFrame]): Unit } - /** - * The simplest implementation of update statistics + /** The simplest implementation of update statistics */ trait UpdateStatisticsOperation extends BaseUpdateStatisticsOperation { - /** - * By default the Update Statistics are disabled for a given Algorithm - * @param dataFrames Dataframes to compute statistics + + /** By default the Update Statistics are disabled for a given Algorithm + * @param dataFrames + * Dataframes to compute statistics */ - override protected def updateStatistics(dataFrames: Vector[DataFrame]): Unit = logger.info("Skipping update statistics step!") + override protected def updateStatistics(dataFrames: Vector[DataFrame]): Unit = + logger.info("Skipping update statistics step!") } - /** - * Base trait for write operations + /** Base trait for write operations */ trait BaseWriteOperation { - /** - * Defines a number of output partitions + /** Defines a number of output partitions * - * @return number of output partitions + * @return + * number of output partitions */ - protected def outputFilesNum: Option[Int] = None // TODO: make it configurable for all algorithms + protected def outputFilesNum: Option[Int] = None - /** - * Writes the DataFrame using logic defined in the inheritor class + /** Writes the DataFrame using logic defined in the inheritor class * - * @param dataFrames DataFrame to write + * @param dataFrames + * DataFrame to write */ protected def write(dataFrames: Vector[DataFrame]): Vector[DataFrame] } - /** - * Simple implementation of read operation. It just reads data - * using a reader which is defined in the inheritor class + /** Simple implementation of read operation. It just reads data using a reader which is defined in + * the inheritor class */ trait ReadOperation extends BaseReadOperation { protected def spark: SparkSession - /** - * Defines a reader which is used for reading data + /** Defines a reader which is used for reading data * - * @return An implementation of InputReader + * @return + * An implementation of InputReader */ protected def readers: Vector[InputReader] override protected def read(): Vector[DataFrame] = readers.map(_.read(spark)) } - /** - * Implementation of write operation that uses a writer which is defined - * in the inheritor class for writing data to the file system in an atomic way + /** Implementation of write operation that uses a writer which is defined in the inheritor class + * for writing data to the file system in an atomic way */ trait SafeWriteOperation extends BaseWriteOperation { protected def dfs: DFSWrapper - /** - * Defines a writer which is used for writing data + /** Defines a writer which is used for writing data * - * @return An implementation of AtomicWriter which support writing data in atomic way + * @return + * An implementation of AtomicWriter which support writing data in atomic way */ protected def writer: AtomicWriter - override protected def write(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { + override protected def write(dataFrames: Vector[DataFrame]): Vector[DataFrame] = dataFrames.map { df => writer.writeWithBackup(dfs, outputFilesNum.map(df.repartition).getOrElse(df)) } - } } - /** - * Simple implementation of write operation. It just writes data - * using a writer which is defined in the inheritor class + /** Simple implementation of write operation. 
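+    * Unlike SafeWriteOperation, the data is written directly rather than through the backup
+    * mechanism.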
It just writes data using a writer which is defined + * in the inheritor class */ trait WriteOperation extends BaseWriteOperation { protected def dfs: DFSWrapper - /** - * Defines a writer which is used for writing data + /** Defines a writer which is used for writing data * - * @return An implementation of OutputWriter + * @return + * An implementation of OutputWriter */ protected def writer: OutputWriter - override protected def write(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { - dataFrames.map { df => - writer.write(dfs, outputFilesNum.map(df.repartition).getOrElse(df)) - } - } - } - - trait ComputeTableStatisticsOperation { - - protected def dfs: DFSWrapper - - protected def spark: SparkSession - - protected def computeStatisticsForTable(tableName: Option[String]): Unit = - tableName match { - case Some(table) => spark.sql(s"ANALYZE TABLE ${table} COMPUTE STATISTICS") - case None => Unit - } + override protected def write(dataFrames: Vector[DataFrame]): Vector[DataFrame] = + dataFrames.map(df => writer.write(dfs, outputFilesNum.map(df.repartition).getOrElse(df))) } -} \ No newline at end of file +} diff --git a/src/main/scala/com/adidas/analytics/algo/core/JobRunner.scala b/src/main/scala/com/adidas/analytics/algo/core/JobRunner.scala index 3ac594e..9726343 100644 --- a/src/main/scala/com/adidas/analytics/algo/core/JobRunner.scala +++ b/src/main/scala/com/adidas/analytics/algo/core/JobRunner.scala @@ -1,13 +1,11 @@ package com.adidas.analytics.algo.core -/** - * This is a generic trait for all the executable algorithms. - * It should be used when the concept of the algorithm is different from the regular ETL process. +/** This is a generic trait for all the executable algorithms. It should be used when the concept of + * the algorithm is different from the regular ETL process. 
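+  * For example, GzipDecompressor and SQLRunner implement JobRunner directly, while
+  * read-transform-write algorithms extend the Algorithm trait instead.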
*/ trait JobRunner { - /** - * Execute algorithm + /** Execute algorithm */ def run(): Unit } diff --git a/src/main/scala/com/adidas/analytics/algo/core/Metadata.scala b/src/main/scala/com/adidas/analytics/algo/core/Metadata.scala index 140f240..9b307b7 100644 --- a/src/main/scala/com/adidas/analytics/algo/core/Metadata.scala +++ b/src/main/scala/com/adidas/analytics/algo/core/Metadata.scala @@ -2,9 +2,7 @@ package com.adidas.analytics.algo.core import org.apache.spark.sql.DataFrame -/** - * This is a generic trait for all strategies that will - * add new partitions on metadata table +/** This is a generic trait for all strategies that will add new partitions on metadata table */ trait Metadata { @@ -17,4 +15,3 @@ trait Metadata { outputDataFrame.sparkSession.catalog.refreshTable(tableName) } - diff --git a/src/main/scala/com/adidas/analytics/algo/core/PartitionHelpers.scala b/src/main/scala/com/adidas/analytics/algo/core/PartitionHelpers.scala index fc3b6f1..30c66af 100644 --- a/src/main/scala/com/adidas/analytics/algo/core/PartitionHelpers.scala +++ b/src/main/scala/com/adidas/analytics/algo/core/PartitionHelpers.scala @@ -3,14 +3,16 @@ package com.adidas.analytics.algo.core import org.apache.spark.sql.functions.col import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} -/** - * This is a trait with generic logic to interact with dataframes on partition level +/** This is a trait with generic logic to interact with dataframes on partition level */ trait PartitionHelpers { - protected def getDistinctPartitions(outputDataFrame: DataFrame, targetPartitions: Seq[String]): Dataset[Row] = { - val targetPartitionsColumns: Seq[Column] = targetPartitions.map(partitionString => col(partitionString)) - + protected def getDistinctPartitions( + outputDataFrame: DataFrame, + targetPartitions: Seq[String] + ): Dataset[Row] = { + val targetPartitionsColumns: Seq[Column] = + targetPartitions.map(partitionString => col(partitionString)) outputDataFrame.select(targetPartitionsColumns: _*).distinct } @@ -19,10 +21,10 @@ trait PartitionHelpers { protected def createParameterValue(partitionRawValue: Any): String = partitionRawValue match { - case value: java.lang.Short => value.toString - case value: java.lang.Integer => value.toString + case value: java.lang.Short => value.toString + case value: java.lang.Integer => value.toString case value: scala.Predef.String => "'" + value + "'" - case null => throw new Exception("Partition Value is null. No support for null partitions!") - case value => throw new Exception("Unsupported partition DataType: " + value.getClass) + case null => throw new Exception("Partition Value is null. 
No support for null partitions!") + case value => throw new Exception("Unsupported partition DataType: " + value.getClass) } } diff --git a/src/main/scala/com/adidas/analytics/algo/core/TableStatistics.scala b/src/main/scala/com/adidas/analytics/algo/core/TableStatistics.scala index 7d4efc3..8ef8724 100644 --- a/src/main/scala/com/adidas/analytics/algo/core/TableStatistics.scala +++ b/src/main/scala/com/adidas/analytics/algo/core/TableStatistics.scala @@ -1,48 +1,124 @@ package com.adidas.analytics.algo.core +import com.adidas.analytics.util.DataFrameUtils.PartitionCriteria import org.apache.spark.sql._ -import scala.collection.JavaConversions._ -/** - * This is a generic trait to use in the algorithms where we want - * to compute statistics on table and partition level +/** This is a generic trait to use in the algorithms where we want to compute statistics on table + * and partition level */ trait TableStatistics extends PartitionHelpers { protected def spark: SparkSession - /** - * will add statistics on partition level using HiveQL statements + /** Will add statistics on partition level using HiveQL statements + * + * @param df + * dataframe + * @param targetTable + * target table + * @param targetPartitions + * target table partitions */ - protected def computeStatisticsForTablePartitions(df: DataFrame, - targetTable: String, - targetPartitions: Seq[String]): Unit = { - + protected def computeStatisticsForTablePartitions( + df: DataFrame, + targetTable: String, + targetPartitions: Seq[String] + ): Unit = { val distinctPartitions: DataFrame = getDistinctPartitions(df, targetPartitions) generateComputePartitionStatements(distinctPartitions, targetTable, targetPartitions) - .collectAsList() + .collect() .foreach((statement: String) => spark.sql(statement)) } - /** - * will add statistics on table level using HiveQL statements + /** Will add statistics on partition level using HiveQL statements, given a set of affected + * partitions + * + * @param targetTable + * target table + * @param affectedPartitions + * sequence containing the partitions in the DataFrame */ - protected def computeStatisticsForTable(tableName: Option[String]): Unit = tableName match { - case Some(table) => spark.sql(s"ANALYZE TABLE ${table} COMPUTE STATISTICS") - case None => Unit - } + protected def computeStatisticsForTablePartitions( + targetTable: String, + affectedPartitions: Seq[PartitionCriteria] + ): Unit = + generateComputePartitionStatements(targetTable, affectedPartitions) + .foreach((statement: String) => spark.sql(statement)) - private def generateComputePartitionStatements(df: DataFrame, - targetTable: String, - targetPartitions: Seq[String]): Dataset[String] = { - df.map(partitionValue => { - val partitionStatementValues: Seq[String] = targetPartitions - .map(partitionColumn => s"${partitionColumn}=${getParameterValue(partitionValue, partitionColumn)}") + /** Will add statistics for all table partitions using HiveQL statements + * + * @param targetTable + * target table + */ + protected def computeStatisticsForTablePartitions(targetTable: String): Unit = { + val partitionSpecs = spark.sql(s"SHOW PARTITIONS $targetTable").collect() + partitionSpecs.foreach { row => + val formattedSpec = row + .getAs[String]("partition") + .split('/') + .map { p => + val pSplitted = p.split('=') + "%s='%s'".format(pSplitted(0), pSplitted(1)) + } + .mkString(",") - s"ANALYZE TABLE ${targetTable} PARTITION(${partitionStatementValues.mkString(",")}) COMPUTE STATISTICS" - })(Encoders.STRING) + spark.sql(s"ANALYZE TABLE 
$targetTable PARTITION($formattedSpec) COMPUTE STATISTICS") + } } + /** Will add statistics on table level using HiveQL statements + * + * @param tableName + * table name + */ + protected def computeStatisticsForTable(tableName: Option[String]): Unit = + tableName match { + case Some(table) => spark.sql(s"ANALYZE TABLE $table COMPUTE STATISTICS") + case None => Unit + } + + /** Generates analyze table commands to be submitted per partition + * + * @param df + * dataframe + * @param targetTable + * target table + * @param targetPartitions + * target table partitions + * @return + */ + private def generateComputePartitionStatements( + df: DataFrame, + targetTable: String, + targetPartitions: Seq[String] + ): Dataset[String] = + df.map { partitionValue => + val partitionStatementValues: Seq[String] = targetPartitions.map(partitionColumn => + s"$partitionColumn=${getParameterValue(partitionValue, partitionColumn)}" + ) + + s"ANALYZE TABLE $targetTable PARTITION(${partitionStatementValues.mkString(",")}) COMPUTE STATISTICS" + }(Encoders.STRING) + + /** Generates analyze table commands to be submitted per partition + * + * @param targetTable + * target table + * @param affectedPartitions + * target table partitions that were affected by a change so stats need to be recomputed + * @return + */ + private def generateComputePartitionStatements( + targetTable: String, + affectedPartitions: Seq[PartitionCriteria] + ): Seq[String] = + affectedPartitions.map { partitionCriteria => + val partitionStatementValues: Seq[String] = + partitionCriteria.map(partition => s"${partition._1}=${partition._2}") + + s"ANALYZE TABLE $targetTable PARTITION(${partitionStatementValues.mkString(",")}) COMPUTE STATISTICS" + } + } diff --git a/src/main/scala/com/adidas/analytics/algo/loads/AppendLoad.scala b/src/main/scala/com/adidas/analytics/algo/loads/AppendLoad.scala new file mode 100644 index 0000000..b542e48 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/algo/loads/AppendLoad.scala @@ -0,0 +1,301 @@ +package com.adidas.analytics.algo.loads + +import java.util.regex.Pattern +import com.adidas.analytics.algo.core.{Algorithm, TableStatistics} +import com.adidas.analytics.algo.loads.AppendLoad._ +import com.adidas.analytics.algo.shared.DateComponentDerivation +import com.adidas.analytics.config.loads.AppendLoadConfiguration +import com.adidas.analytics.util.DFSWrapper._ +import com.adidas.analytics.util.DataFormat.{DSVFormat, JSONFormat, ParquetFormat} +import com.adidas.analytics.util.DataFrameUtils._ +import com.adidas.analytics.util._ +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.slf4j.{Logger, LoggerFactory} +import scala.collection.immutable + +/** Performs append load of new records to an existing table. 
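+  * Target partition values can be derived from a source date column (partitionSourceColumn and
+  * partitionSourceColumnFormat) or extracted from the input file names via the configured
+  * regexFilename patterns; a header file with the input schema is written to headerDir for each
+  * affected partition.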
+ */ +final class AppendLoad protected ( + val spark: SparkSession, + val dfs: DFSWrapper, + val configLocation: String +) extends Algorithm + with AppendLoadConfiguration + with TableStatistics + with DateComponentDerivation { + + override protected def read(): Vector[DataFrame] = readInputData(targetSchema, spark, dfs) + + override protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = + if (partitionSourceColumn.nonEmpty) + dataFrames.map(df => + df.transform( + withDateComponents( + partitionSourceColumn.get, + partitionSourceColumnFormat.getOrElse("yyyy-MM-dd"), + targetPartitions + ) + ) + ) + else if (regexFilename.nonEmpty) + dataFrames.map(df => + df.transform(addTargetPartitions(targetPartitions zip regexFilename.get, targetSchema)) + ) + else dataFrames + + override protected def write(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { + writeHeaders(dataFrames, targetPartitions, headerDir, dfs) + super.write(dataFrames) + } + + override protected def updateStatistics(dataFrames: Vector[DataFrame]): Unit = + if (computeTableStatistics && dataType == STRUCTURED && targetTable.isDefined) { + if (targetPartitions.nonEmpty) + dataFrames + .foreach(df => computeStatisticsForTablePartitions(df, targetTable.get, targetPartitions)) + computeStatisticsForTable(targetTable) + } + + private def readInputData( + targetSchema: StructType, + spark: SparkSession, + dfs: DFSWrapper + ): Vector[DataFrame] = { + val inputDirPath = new Path(inputDir) + val headerDirPath = new Path(headerDir) + + val fs = dfs.getFileSystem(inputDirPath) + val sources = listSources(inputDirPath, headerDirPath, fs, targetSchema) + readSources(sources, spark) + } + + private def listSources( + inputDirPath: Path, + headerDirPath: Path, + fs: FileSystem, + targetSchema: StructType + ): Seq[Source] = { + logger.info(s"Looking for input files in $inputDirPath") + val targetSchemaWithoutTargetPartitions = + getSchemaWithouttargetPartitions(targetSchema, targetPartitions.toSet) + if (partitionSourceColumn.nonEmpty) + fs.ls(inputDirPath, recursive = true) + .map(sourcePath => Source(targetSchemaWithoutTargetPartitions, sourcePath.toString)) + else { + val groupedHeaderPathAndSourcePaths = + fs.ls(inputDirPath, recursive = true).groupBy { inputPath => + buildHeaderFilePath( + targetPartitions zip regexFilename.get, + targetSchema, + extractPathWithoutServerAndProtocol(inputPath.toString), + headerDirPath + ) + } + + def getMapSchemaStructToPath: immutable.Iterable[Source] = { + val mapSchemaStructToPath = groupedHeaderPathAndSourcePaths.toSeq + .map { + case (headerPath, sourcePaths) => + getSchemaFromHeaderOrSource(fs, headerPath, sourcePaths) + } + .groupBy(_._1) + .map { case (k, v) => (k, v.flatMap(_._2)) } + + val filteredMapSchemaStructToPath = mapSchemaStructToPath.filter(schemaFromInputData => + matchingSchemas_?(schemaFromInputData._1, targetSchema, schemaFromInputData._2) + ) + + if (mapSchemaStructToPath.size != filteredMapSchemaStructToPath.size) + throw new RuntimeException( + "Schema does not match the input data for some of the input folders." 
+ ) + + mapSchemaStructToPath.flatMap { + case (_, sourcePaths) => + sourcePaths.map(sourcePath => Source(targetSchema, sourcePath.toString)) + } + } + + val schemaAndSourcePath = + if (!verifySchema) groupedHeaderPathAndSourcePaths.flatMap { + case (headerPath, sourcePaths) => + val schema = + if (fs.exists(headerPath)) loadHeader(headerPath, fs) + else targetSchemaWithoutTargetPartitions + sourcePaths.map(sourcePath => Source(schema, sourcePath.toString)) + } + else getMapSchemaStructToPath + schemaAndSourcePath.toSeq + } + } + + private def getSchemaFromHeaderOrSource( + fs: FileSystem, + headerPath: Path, + sourcePaths: Seq[Path] + ): (StructType, Seq[Path]) = { + val schema = + if (fs.exists(headerPath)) loadHeader(headerPath, fs) else inferSchemaFromSource(sourcePaths) + (schema, sourcePaths) + } + + private def inferSchemaFromSource(sourcePaths: Seq[Path]): StructType = { + val reader = spark.read.options(sparkReaderOptions) + val dataFormat = fileFormat match { + case "dsv" => DSVFormat() + case "parquet" => ParquetFormat() + case "json" => JSONFormat() + case anotherFormat => throw new RuntimeException(s"Unknown file format: $anotherFormat") + } + dataFormat.read(reader, sourcePaths.map(_.toString): _*).schema + } + + private def matchingSchemas_?( + schemaFromInputData: StructType, + targetSchema: StructType, + paths: Seq[Path] + ): Boolean = { + val inputColumnsVector = schemaFromInputData.names.toVector + val targetColumnsVector = targetSchema.names.toVector + val diff = inputColumnsVector.diff(targetColumnsVector) + if (diff.nonEmpty) + logger.error(s"Inferred schema does not match the target schema for ${paths.toString}") + diff.isEmpty + } + + private def readSources(sources: Seq[Source], spark: SparkSession): Vector[DataFrame] = + groupSourcesBySchema(sources).map { + case (schema, inputPaths) => + readInputFiles(inputPaths, fileFormat, schema, spark.read.options(sparkReaderOptions)) + }.toVector + + private def readInputFiles( + inputPaths: Seq[String], + fileFormat: String, + schema: StructType, + reader: DataFrameReader + ): DataFrame = + fileFormat match { + case "dsv" => DSVFormat(Some(schema)).read(reader, inputPaths: _*) + case "parquet" => ParquetFormat(Some(schema)).read(reader, inputPaths: _*) + case "json" => JSONFormat(Some(schema)).read(reader, inputPaths: _*) + case anotherFormat => throw new RuntimeException(s"Unknown file format: $anotherFormat") + } +} + +object AppendLoad { + + private val logger: Logger = LoggerFactory.getLogger(getClass) + private val headerFileName: String = "header.json" + + def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): AppendLoad = + new AppendLoad(spark, dfs, configLocation) + + private def extractPathWithoutServerAndProtocol(path: String): String = + path.replaceFirst("\\w+\\d*://.+?/", "") + + private def getSchemaWithouttargetPartitions( + targetSchema: StructType, + targetPartitions: Set[String] + ): StructType = + StructType(targetSchema.fields.filterNot(field => targetPartitions.contains(field.name))) + + private def groupSourcesBySchema(sources: Seq[Source]): Map[StructType, Seq[String]] = + sources.groupBy(_.schema).mapValues(sources => sources.map(_.inputFileLocation)) + + private def addTargetPartitions( + columnNameToRegexPairs: Seq[(String, String)], + schema: StructType + )(inputDf: DataFrame): DataFrame = { + def getInputFileName: Column = + udf((path: String) => extractPathWithoutServerAndProtocol(path)).apply(input_file_name) + + val tempInputFileNameColumn = col("temp_input_file_name") + 
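+    // for each (columnName, regex) pair below, the partition value is extracted from the input
+    // file name and cast to the corresponding target column type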
val columnNameToTypeMapping = schema.fields.map(field => field.name -> field.dataType).toMap + + columnNameToRegexPairs + .foldLeft(inputDf.withColumn(tempInputFileNameColumn.toString, getInputFileName)) { + case (df, (columnName, regex)) => + val targetColumnType = columnNameToTypeMapping(columnName) + df.withColumn( + columnName, + regexp_extract(tempInputFileNameColumn, regex, 1).cast(targetColumnType) + ) + } + .drop(tempInputFileNameColumn) + } + + private def buildHeaderFilePath( + columnNameToRegexPairs: Seq[(String, String)], + schema: StructType, + inputFileName: String, + headerDirPath: Path + ): Path = { + val columnNameToTypeMapping = schema.fields.map(field => field.name -> field.dataType).toMap + val subdirectories = columnNameToRegexPairs.map { + case (columnName, regex) => + implicit val dataType: DataType = columnNameToTypeMapping(columnName) + extractPartitionColumnValue(inputFileName, regex) match { + case Some(columnValue) => s"$columnName=$columnValue" + case None => + throw new RuntimeException( + s"Unable to extract value for $columnName with '$regex' from $inputFileName" + ) + } + } + new Path(headerDirPath.join(subdirectories), headerFileName) + } + + private def loadHeader(headerPath: Path, fs: FileSystem): StructType = + DataType.fromJson(fs.readFile(headerPath)).asInstanceOf[StructType] + + protected def writeHeaders( + dataFrames: Seq[DataFrame], + targetPartitions: Seq[String], + headerDir: String, + dfs: DFSWrapper + ): Unit = { + logger.info(s"Writing header files to $headerDir") + val headerDirPath = new Path(headerDir) + val fs = dfs.getFileSystem(headerDirPath) + dataFrames.foreach { df => + val schemaJson = + getSchemaWithouttargetPartitions(df.schema, targetPartitions.toSet).prettyJson + df.collectPartitions(targetPartitions).foreach { partitionCriteria => + val subdirectories = DataFrameUtils.mapPartitionsToDirectories(partitionCriteria) + val headerPath = new Path(headerDirPath.join(subdirectories), headerFileName) + if (!fs.exists(headerPath)) { + logger.info(s"Writing header $headerPath") + fs.writeFile(headerPath, schemaJson) + } + } + } + } + + private def extractPartitionColumnValue(fileName: String, regex: String)(implicit + dataType: DataType + ): Option[String] = { + val matcher = Pattern.compile(regex).matcher(fileName) + Option(matcher) + .filter(_.find) + .map(_.group(1)) //modifications to regexes demand taking group 1 instead of group 0 + .map(restoreFromTypedValue) + } + + private def restoreFromTypedValue( + stringColumnValue: String + )(implicit dataType: DataType): String = { + val columnValue = dataType match { + case ByteType | ShortType | IntegerType | LongType => stringColumnValue.toLong + case BooleanType => stringColumnValue.toBoolean + case StringType => stringColumnValue + } + columnValue.toString + } + + protected case class Source(schema: StructType, inputFileLocation: String) + +} diff --git a/src/main/scala/com/adidas/analytics/algo/loads/DeltaLakeLoad.scala b/src/main/scala/com/adidas/analytics/algo/loads/DeltaLakeLoad.scala new file mode 100644 index 0000000..dc92514 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/algo/loads/DeltaLakeLoad.scala @@ -0,0 +1,360 @@ +package com.adidas.analytics.algo.loads + +import com.adidas.analytics.algo.core.{Algorithm, TableStatistics} +import com.adidas.analytics.algo.shared.DateComponentDerivation +import com.adidas.analytics.config.loads.DeltaLakeLoadConfiguration +import com.adidas.analytics.util.DataFormat.{DSVFormat, JSONFormat, ParquetFormat} +import 
com.adidas.analytics.util.DataFrameUtils._ +import com.adidas.analytics.util.{DFSWrapper, DataFormat, DataFrameUtils} +import io.delta.tables._ +import org.apache.spark.sql.expressions.Window +import org.apache.spark.sql.functions.{col, row_number} +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} +import org.apache.spark.storage.StorageLevel +import scala.collection.mutable + +final class DeltaLakeLoad protected ( + val spark: SparkSession, + val dfs: DFSWrapper, + val configLocation: String +) extends Algorithm + with DeltaLakeLoadConfiguration + with TableStatistics + with DateComponentDerivation { + + override protected def read(): Vector[DataFrame] = + try { + val dataFormat: DataFormat = fileFormat match { + case "parquet" => ParquetFormat() + case "dsv" => DSVFormat() + case "json" => + JSONFormat(multiLine = isMultilineJSON.getOrElse(false), optionalSchema = readJsonSchema) + case _ => throw new RuntimeException(s"Unsupported input data format $fileFormat.") + } + + val inputDF = dataFormat.read(spark.read.options(sparkReaderOptions), inputDir) + Vector(inputDF.select(inputDF.columns.map(c => col(c).as(c.toLowerCase)): _*)) + } catch { case e: Throwable => throw new RuntimeException("Unable to read input data.", e) } + + override protected def transform(dataFrames: Vector[DataFrame]): Vector[Dataset[Row]] = + try Vector( + initOrUpdateDeltaTable(withDatePartitions(dataFrames(0))).selectExpr(targetTableColumns: _*) + ) + catch { case e: Throwable => throw new RuntimeException("Could not update Delta Table!", e) } + + override protected def write(dataFrames: Vector[DataFrame]): Vector[Dataset[Row]] = { + writer.writeWithBackup(dfs, dataFrames(0), Some(affectedPartitions)) + dataFrames + } + + /** Check if it is init load or not (i.e., delta table was already loaded at least once) and + * proceed to create the delta table or merge it with the new arriving data. + * + * @param newDataDF + * DataFrame containing new data to update the Delta table + * @return + * the DeltaTable object + */ + private def initOrUpdateDeltaTable(newDataDF: DataFrame): DataFrame = + if (DeltaTable.isDeltaTable(deltaTableDir)) { + newDataDF.persist(StorageLevel.MEMORY_AND_DISK) + affectedPartitions = newDataDF.collectPartitions(targetPartitions) + val affectedPartitionsFilter = + DataFrameUtils.buildPartitionsCriteriaMatcherFunc(affectedPartitions, newDataDF.schema) + + val condensedNewDataDF = condenseNewData(newDataDF) + newDataDF.unpersist() + condensedNewDataDF.persist(StorageLevel.MEMORY_AND_DISK) + + mergeDeltaTable(condensedNewDataDF) + condensedNewDataDF.unpersist() + + if (isManualRepartitioning) + repartitionDeltaTable( + DeltaTable.forPath(deltaTableDir), + affectedPartitions, + affectedPartitionsFilter + ) + + var deltaTableDF = DeltaTable.forPath(deltaTableDir).toDF + if (targetPartitions.nonEmpty) deltaTableDF = deltaTableDF.filter(affectedPartitionsFilter) + + deltaTableDF + } else { + val condensedNewDataDF = condenseNewData(newDataDF, initLoad = true) + + affectedPartitions = condensedNewDataDF.collectPartitions(targetPartitions) + + initDeltaTable(condensedNewDataDF) + + DeltaTable.forPath(deltaTableDir).toDF + } + + /** Executes the initial load of the delta table when this algorithm is executed for the first + * time on the table. 
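The init-or-merge decision described above reduces to a `DeltaTable.isDeltaTable` check: the first run writes the condensed data as a new Delta table, and every later run merges into it. A minimal sketch of that flow, assuming a SparkSession with the Delta Lake extensions enabled; the `sales_id` business key and the `currentData`/`newData` aliases are illustrative, not the algorithm's actual configuration:

    import io.delta.tables.DeltaTable
    import org.apache.spark.sql.{DataFrame, SparkSession}

    object DeltaUpsertSketch {
      // First run: create the table; subsequent runs: merge on the business key.
      def upsert(spark: SparkSession, newData: DataFrame, tableDir: String): Unit =
        if (DeltaTable.isDeltaTable(spark, tableDir))
          DeltaTable
            .forPath(spark, tableDir)
            .alias("currentData")
            .merge(newData.alias("newData"), "currentData.sales_id = newData.sales_id")
            .whenMatched()
            .updateAll()
            .whenNotMatched()
            .insertAll()
            .execute()
        else newData.write.format("delta").save(tableDir) // initial load creates the Delta table
    }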
+ * + * @param initialDataDF + * DataFrame containing the initial data to load to the delta table + */ + private def initDeltaTable(initialDataDF: DataFrame): Unit = { + val dfWriter = { + if (targetPartitions.nonEmpty) { + val partitionCols = targetPartitions.map(columnName => col(columnName)) + outputPartitionsNum + .map(n => initialDataDF.repartition(n, partitionCols: _*)) + .getOrElse(initialDataDF) + .write + .format("delta") + .partitionBy(targetPartitions: _*) + } else + outputPartitionsNum + .map(initialDataDF.repartition) + .getOrElse(initialDataDF) + .write + .format("delta") + } + + dfWriter.save(deltaTableDir) + } + + /** Merges the delta table with the new arriving data. Also vacuums old history according to the + * retention period, if necessary. + * + * @param newDataDF + * DataFrame containing the new data to merge with the existing data on the delta table + */ + private def mergeDeltaTable(newDataDF: DataFrame): Unit = { + val deltaTable = DeltaTable.forPath(deltaTableDir) + + if (isToVacuum) deltaTable.vacuum(vacuumRetentionPeriod) + + deltaTable + .alias(currentDataAlias) + .merge( + newDataDF.alias(newDataAlias), + generateMatchCondition(businessKey, businessKeyMatchOperator) + ) + .whenMatched(newDataDF(recordModeColumnName).isin(recordsToDelete: _*)) + .delete() + .whenMatched() + .updateAll() + .whenNotMatched + .insertAll() + .execute() + } + + /** Creation of a condensed set of records to be inserted/updated/deleted. This serves the purpose + * of taking only the most recent status of a specific record, as the same can can be subjected + * to several updates (including deletions) in the source which are not yet registered in the + * delta table, and we want to only store the latest status in the delta table. Steps: 1) Order + * changes in each record according to the provided business and condensation keys. There is an + * additional condensation key in cases where, in the init loads only, the record mode column + * needs to be included in the condensation logic itself (e.g., cases where on the init load + * there is more than one row per business key); 2) Get only the last change of the records + * + * @param newData + * DataFrame containing all new delta records + * @param initLoad + * indicates if the condensation is an init load condensation or a delta condensation, as there + * is the need to sometimes include the record_mode in the condensation key to properly order + * the delta init. 
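A toy version of the condensation just described: two change records arrive for the same business key and only the most recent one, per the condensation key ordering, is kept. Column names and values are made up for illustration; the real keys come from the algorithm configuration:

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.functions.{col, row_number}

    object CondensationSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("condense").getOrCreate()
        import spark.implicits._

        // Two changes for business key 1: an insert followed by a later update.
        val changes = Seq(
          (1, 20200101120000L, "N"),
          (1, 20200102090000L, "U")
        ).toDF("sales_id", "change_ts", "recordmode")

        val window = Window.partitionBy(col("sales_id")).orderBy(col("change_ts").desc)
        changes
          .withColumn("ranking", row_number().over(window))
          .filter(col("ranking") === 1)
          .drop("ranking")
          .show() // keeps only the row with change_ts 20200102090000
      }
    }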
+ * @return + * DataFrame containing the most recent records to be inserted/updated/deleted + */ + private def condenseNewData(newData: DataFrame, initLoad: Boolean = false): DataFrame = { + var partitionWindow = Window.partitionBy(businessKey.map(c => col(c)): _*) + if (initLoad && initCondensationWithRecordMode) + partitionWindow = partitionWindow.orderBy( + condensationKey.map(c => col(c).desc).union(Seq(col(recordModeColumnName).asc)): _* + ) + else partitionWindow = partitionWindow.orderBy(condensationKey.map(c => col(c).desc): _*) + + val rankedDeltaRecords = newData + .withColumn("ranking", row_number().over(partitionWindow)) + .filter(recordModesFilterFunction) + rankedDeltaRecords.filter(rankedDeltaRecords("ranking") === 1).drop("ranking") + } + + /** Adds temporal partitions (e.g., day, month, year) to an existing DataFrame containing a + * temporal column + * + * @param df + * DataFrame to which to add the partitions + * @return + */ + def withDatePartitions(df: DataFrame): DataFrame = + try if ( + targetPartitions.nonEmpty && partitionSourceColumn.nonEmpty && + partitionSourceColumnFormat.nonEmpty + ) + df.transform( + withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions) + ) + else df + catch { + case e: Throwable => + throw new RuntimeException("Cannot add partitioning information for data frames", e) + } + + /** Generates the condition to match the keys of the current data and new data in the delta table + * + * @param businessKey + * simple or composite business key that uniquely identifies a record so that new data can be + * compared with current data for deciding whether to update or insert a record + * @param logicalOperator + * logical operator in the match condition, in order to compare currentData with NewData + * according to the business key. Defaults to AND + * @return + * a string containing the match condition (e.g., current.id1 = new.id1 AND current.id2 = + * new.id2) + */ + private def generateMatchCondition(businessKey: Seq[String], logicalOperator: String): String = { + + /** Ensures that the delta merge match condition always have the null partition spec into + * consideration, otherwise it may not be correct in rare cases where in the init load there + * were two initial rows with different record modes for the same row and the condensation + * logic, despite including the record mode column in the sorting part of the condensation, + * does not pick the desired one for a very specific organizational business process. 
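For intuition, forcing the null partition spec simply appends one extra criteria entry, built from the DateComponentDerivation defaults (9999 for year, 99 for the two-digit components), to the affected partitions. A small sketch with illustrative partition values:

    object NullPartitionSketch {
      def main(args: Array[String]): Unit = {
        // Affected partitions collected from the new data (values are illustrative).
        val affected = Seq(Seq("year" -> "2020", "month" -> "8", "day" -> "25"))
        // Null partition spec built from the 9999 / 99 defaults.
        val nullPartition = Seq("year" -> "9999", "month" -> "99", "day" -> "99")
        println(affected.union(Seq(nullPartition)).distinct)
        // List(List((year,2020), (month,8), (day,25)), List((year,9999), (month,99), (day,99)))
      }
    }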
+ * + * This acts mostly as a safe guard for data sources that don't handle empty dates very well in + * their change logs, so with this we always ensure that we will search for matches in the null + * partitions (e.g., ((year, 9999), (month,99), (day,99)) + * + * @param affectedPartitions + * affected partitions + * @param targetPartitions + * target partitions + * @return + */ + def forceAdditionOfNullPartitionCriteria( + affectedPartitions: Seq[PartitionCriteria], + targetPartitions: Seq[String] + ): Seq[PartitionCriteria] = { + var partitionCriteria: PartitionCriteria = Seq.empty + targetPartitions.foreach { + case partitionName @ "year" => + partitionCriteria = partitionCriteria :+ + (partitionName, DateComponentDerivation.DEFAULT_4DIGIT_VALUE.toString) + case partitionName @ "month" => + partitionCriteria = partitionCriteria :+ + (partitionName, DateComponentDerivation.DEFAULT_2DIGIT_VALUE.toString) + case partitionName @ "day" => + partitionCriteria = partitionCriteria :+ + (partitionName, DateComponentDerivation.DEFAULT_2DIGIT_VALUE.toString) + case partitionName @ "week" => + partitionCriteria = partitionCriteria :+ + (partitionName, DateComponentDerivation.DEFAULT_2DIGIT_VALUE.toString) + } + + affectedPartitions.union(Seq(partitionCriteria)).distinct + } + + /** Given a collection of column names it builds the adequate delta merge match condition. E.g.: + * Given Seq("sales_id", "sales_date") it returns currentData.sales_id = newData.sales_id AND + * currentData.sales_date = newData.sales_date + * + * @param columns + * list of column names to build the match condition + * @return + */ + def generateCondition(columns: Seq[String]): String = + columns + .map(key => s"$currentDataAlias.$key = $newDataAlias.$key") + .mkString(s" $logicalOperator ") + + targetPartitions match { + case _ :: _ => + if (!ignoreAffectedPartitionsMerge) generateCondition(businessKey.union(targetPartitions)) + else + s"${generateCondition(businessKey)} $logicalOperator (${generateAffectedPartitionsWhere(forceAdditionOfNullPartitionCriteria(affectedPartitions, targetPartitions), s"$currentDataAlias.")})" + case _ => generateCondition(businessKey) + } + } + + /** Generates a string with the where clause used to filter a DataFrame to only select data within + * the partitions affected in this delta load process. + * + * @param affectedPartitions + * collection of affected partitions in this delta load process + * @param prefix + * optional prefix to add before the name of the partition column (e.g., currenData.year) to + * solve ambiguous column names in the merge operations. 
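As a worked example with hypothetical values, two criteria entries such as (year=2020, month=8) and (year=9999, month=99) would render as `(currentData.year = 2020 AND currentData.month = 8) OR (currentData.year = 9999 AND currentData.month = 99)`. A pure-Scala sketch of that assembly:

    object PartitionWhereSketch {
      def main(args: Array[String]): Unit = {
        val affected = Seq(
          Seq("year" -> "2020", "month" -> "8"),
          Seq("year" -> "9999", "month" -> "99")
        )
        val where = affected
          .map(criteria =>
            criteria.map { case (name, value) => s"currentData.$name = $value" }.mkString(" AND ")
          )
          .map(clause => s"($clause)")
          .mkString(" OR ")
        println(where)
        // (currentData.year = 2020 AND currentData.month = 8) OR (currentData.year = 9999 AND currentData.month = 99)
      }
    }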
+ * @return + */ + def generateAffectedPartitionsWhere( + affectedPartitions: Seq[PartitionCriteria], + prefix: String = "" + ): String = + affectedPartitions + .map(partitionCriteria => + s"(%s)".format( + partitionCriteria + .map(partition => s"$prefix${partition._1} = ${partition._2}") + .mkString(" AND ") + ) + ) + .mkString(" OR ") + + /** Repartitions the delta table + * + * @param deltaTable + * delta table DataFrame to repartition + * @param affectedPartitions + * sequence of affected partitions considering the new data + * @param affectedPartitionsFilter + * filter function to filter only the affected partitions + */ + private def repartitionDeltaTable( + deltaTable: DeltaTable, + affectedPartitions: Seq[DataFrameUtils.PartitionCriteria], + affectedPartitionsFilter: FilterFunction + ): Unit = { + + /** Auxiliary method to repartition delta table independently if the same is partitioned or not + * + * @param deltaTableDF + * delta table DataFrame + * @param options + * options to repartition the delta table (e.g., dataChange and replaceWhere) + */ + def repartitionDeltaTableAux(deltaTableDF: DataFrame, options: Map[String, String]): Unit = + outputPartitionsNum + .map(deltaTableDF.repartition) + .getOrElse(deltaTableDF) + .write + .options(options) + .format("delta") + .mode("overwrite") + .save(deltaTableDir) + + val (options, df) = { + val options = mutable.Map("dataChange" -> "false") + var df = deltaTable.toDF + if (targetPartitions.nonEmpty) { + options.put("replaceWhere", generateAffectedPartitionsWhere(affectedPartitions)) + df = df.filter(affectedPartitionsFilter) + } + (options, df) + } + + repartitionDeltaTableAux(df, options.toMap) + } + + /** Implementation of the update statistics method from the Algorithm trait + * + * @param dataFrames + * Dataframes to compute statistics + */ + override protected def updateStatistics(dataFrames: Vector[DataFrame]): Unit = + if (computeTableStatistics) { + if (targetPartitions.nonEmpty) + computeStatisticsForTablePartitions(targetTable, affectedPartitions) + computeStatisticsForTable(Some(targetTable)) + } +} + +object DeltaLakeLoad { + + def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): DeltaLakeLoad = + new DeltaLakeLoad(spark, dfs, configLocation) +} diff --git a/src/main/scala/com/adidas/analytics/algo/DeltaLoad.scala b/src/main/scala/com/adidas/analytics/algo/loads/DeltaLoad.scala similarity index 53% rename from src/main/scala/com/adidas/analytics/algo/DeltaLoad.scala rename to src/main/scala/com/adidas/analytics/algo/loads/DeltaLoad.scala index 48cc30e..f95820b 100644 --- a/src/main/scala/com/adidas/analytics/algo/DeltaLoad.scala +++ b/src/main/scala/com/adidas/analytics/algo/loads/DeltaLoad.scala @@ -1,9 +1,9 @@ -package com.adidas.analytics.algo +package com.adidas.analytics.algo.loads -import com.adidas.analytics.algo.DeltaLoad._ import com.adidas.analytics.algo.core.Algorithm +import com.adidas.analytics.algo.loads.DeltaLoad._ import com.adidas.analytics.algo.shared.DateComponentDerivation -import com.adidas.analytics.config.DeltaLoadConfiguration.PartitionedDeltaLoadConfiguration +import com.adidas.analytics.config.loads.DeltaLoadConfiguration.PartitionedDeltaLoadConfiguration import com.adidas.analytics.util.DataFrameUtils._ import com.adidas.analytics.util._ import org.apache.spark.sql.expressions.Window @@ -12,23 +12,29 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.storage.StorageLevel import org.slf4j.{Logger, LoggerFactory} -/** - * Performs 
merge of active records with delta records. +/** Performs merge of active records with delta records. */ -final class DeltaLoad protected(val spark: SparkSession, val dfs: DFSWrapper, val configLocation: String) - extends Algorithm with PartitionedDeltaLoadConfiguration with DateComponentDerivation { +final class DeltaLoad protected ( + val spark: SparkSession, + val dfs: DFSWrapper, + val configLocation: String +) extends Algorithm + with PartitionedDeltaLoadConfiguration + with DateComponentDerivation { override protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { - val dataFramesWithTargetPartitionsAdded = withDatePartitions(spark, dfs, dataFrames.take(1)) + val dataFramesWithTargetPartitionsAdded = withDatePartitions(dataFrames.take(1)) val deltaRecords = dataFramesWithTargetPartitionsAdded(0).persist(StorageLevel.MEMORY_AND_DISK) val activeRecords = dataFrames(1) val partitions = deltaRecords.collectPartitions(targetPartitions) - val isRequiredPartition = DataFrameUtils.buildPartitionsCriteriaMatcherFunc(partitions, activeRecords.schema) + val isRequiredPartition = + DataFrameUtils.buildPartitionsCriteriaMatcherFunc(partitions, activeRecords.schema) - // Create DataFrame containing full content of partitions that need to be touched - val activeRecordsTargetPartitions = activeRecords.filter(isRequiredPartition).persist(StorageLevel.MEMORY_AND_DISK) + /* Create DataFrame containing full content of partitions that need to be touched */ + val activeRecordsTargetPartitions = + activeRecords.filter(isRequiredPartition).persist(StorageLevel.MEMORY_AND_DISK) // Condense delta set val upsertRecords = getUpsertRecords(deltaRecords, activeRecords.columns) @@ -45,51 +51,53 @@ final class DeltaLoad protected(val spark: SparkSession, val dfs: DFSWrapper, va Vector(result) } - /** - * Creation of a condensed set of records to be inserted/updated - * @param deltaRecords Dataset[Row] containing all delta records - * @return Dataset[Row] containing the most recent record to be inserted/updated + /** Creation of a condensed set of records to be inserted/updated + * @param deltaRecords + * Dataset[Row] containing all delta records + * @return + * Dataset[Row] containing the most recent record to be inserted/updated */ - private def getUpsertRecords(deltaRecords: Dataset[Row], resultColumns: Seq[String]): Dataset[Row] = { - // Create partition window - Partitioning by delta records logical key (i.e. technical key of active records) + private def getUpsertRecords( + deltaRecords: Dataset[Row], + resultColumns: Seq[String] + ): Dataset[Row] = { + /* Create partition window - Partitioning by delta records logical key (i.e. 
+ * technical key of active records) */ val partitionWindow = Window .partitionBy(businessKey.map(col): _*) .orderBy(technicalKey.map(component => col(component).desc): _*) // Ranking & projection - val rankedDeltaRecords = deltaRecords - .withColumn(rankingColumnName, row_number().over(partitionWindow)) - .filter(upsertRecordsModesFilterFunction) + val rankedDeltaRecords = + deltaRecords + .withColumn(rankingColumnName, row_number().over(partitionWindow)) + .filter(upsertRecordsModesFilterFunction) rankedDeltaRecords .filter(rankedDeltaRecords(rankingColumnName) === 1) .selectExpr(resultColumns: _*) } - protected def withDatePartitions(spark: SparkSession, dfs: DFSWrapper, dataFrames: Vector[DataFrame]): Vector[DataFrame] = { + protected def withDatePartitions(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { logger.info("Adding partitioning information if needed") - try { - dataFrames.map { df => - if (df.columns.toSeq.intersect(targetPartitions) != targetPartitions){ - df.transform(withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions)) - } - else df - } + try dataFrames.map { df => + if (df.columns.toSeq.intersect(targetPartitions) != targetPartitions) + df.transform( + withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions) + ) + else df } catch { case e: Throwable => logger.error("Cannot add partitioning information for data frames.", e) - //TODO: Handle failure case properly throw new RuntimeException("Unable to transform data frames.", e) } } } - object DeltaLoad { private val logger: Logger = LoggerFactory.getLogger(getClass) - def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): DeltaLoad = { + def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): DeltaLoad = new DeltaLoad(spark, dfs, configLocation) - } -} \ No newline at end of file +} diff --git a/src/main/scala/com/adidas/analytics/algo/loads/FullLoad.scala b/src/main/scala/com/adidas/analytics/algo/loads/FullLoad.scala new file mode 100644 index 0000000..da00279 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/algo/loads/FullLoad.scala @@ -0,0 +1,98 @@ +package com.adidas.analytics.algo.loads + +import com.adidas.analytics.algo.core.Algorithm.WriteOperation +import com.adidas.analytics.algo.core.{Algorithm, TableStatistics} +import com.adidas.analytics.algo.loads.FullLoad._ +import com.adidas.analytics.algo.shared.{DataReshapingTask, DateComponentDerivation} +import com.adidas.analytics.config.loads.FullLoadConfiguration +import com.adidas.analytics.util.DataFormat.{DSVFormat, JSONFormat, ParquetFormat} +import com.adidas.analytics.util._ +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +final class FullLoad protected ( + val spark: SparkSession, + val dfs: DFSWrapper, + val configLocation: String +) extends Algorithm + with WriteOperation + with FullLoadConfiguration + with DateComponentDerivation + with TableStatistics + with DataReshapingTask { + + override protected def read(): Vector[DataFrame] = + try { + val dataFormat: DataFormat = fileFormat match { + case "parquet" => ParquetFormat(Some(targetSchema)) + case "dsv" => DSVFormat(Some(targetSchema)) + case "json" => + JSONFormat(multiLine = isMultilineJSON.getOrElse(false), optionalSchema = readJsonSchema) + case _ => throw new RuntimeException(s"Unsupported input data format $fileFormat.") + } + Vector(dataFormat.read(spark.read.options(sparkReaderOptions), inputDir)) + } catch { case e: Throwable => 
throw new RuntimeException("Unable to read input data.", e) } + + override protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = + try additionalTasksDataFrame( + spark, + dataFrames, + targetSchema, + partitionSourceColumn, + partitionSourceColumnFormat, + targetPartitions + ) + catch { case e: Throwable => throw new RuntimeException("Unable to transform data frames.", e) } + + override protected def write(dataFrames: Vector[DataFrame]): Vector[DataFrame] = { + val outputDfs = { + try super.write(dataFrames) + catch { + case e: Throwable => + logger.info( + s"An exception occurred while writing the data... cleaning up temporary files: ${e.getMessage}" + ) + cleanupDirectory(nextTableLocation) + throw new RuntimeException("Unable to write DataFrames.", e) + } + } + + try CatalogTableManager(targetTable, spark).recreateTable(nextTableLocation, targetPartitions) + catch { + case e: Throwable => + logger.info(s"An exception occurred while recreating table: ${e.getMessage}") + cleanupDirectory(nextTableLocation) + CatalogTableManager(targetTable, spark).recreateTable( + currentTableLocation, + targetPartitions + ) + throw new RuntimeException(s"Unable to recreate table in location: $nextTableLocation", e) + } + + cleanupDirectory(currentTableLocation) + cleanupTableLeftovers(tableRootDir, nextTableLocationPrefix) + + outputDfs + } + + override protected def updateStatistics(dataFrames: Vector[DataFrame]): Unit = + if (computeTableStatistics && dataType == STRUCTURED) { + if (targetPartitions.nonEmpty) computeStatisticsForTablePartitions(targetTable) + computeStatisticsForTable(Some(targetTable)) + } + + private def cleanupDirectory(dir: String): Unit = + HadoopLoadHelper.cleanupDirectoryContent(dfs, dir) + + private def cleanupTableLeftovers(dir: String, ignorePrefix: String): Unit = + HadoopLoadHelper.cleanupDirectoryLeftovers(dfs, dir, ignorePrefix) + +} + +object FullLoad { + + private val logger: Logger = LoggerFactory.getLogger(getClass) + + def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): FullLoad = + new FullLoad(spark, dfs, configLocation) +} diff --git a/src/main/scala/com/adidas/analytics/algo/shared/CustomDateFormatters.scala b/src/main/scala/com/adidas/analytics/algo/shared/CustomDateFormatters.scala index 967f51a..b107556 100644 --- a/src/main/scala/com/adidas/analytics/algo/shared/CustomDateFormatters.scala +++ b/src/main/scala/com/adidas/analytics/algo/shared/CustomDateFormatters.scala @@ -4,20 +4,19 @@ import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder} import java.time.temporal.ChronoField object CustomDateFormatters { - /* - Singletons of Custom Date Formatters - */ + + /* Singletons of Custom Date Formatters */ val YEAR_WEEK: DateTimeFormatter = new DateTimeFormatterBuilder() - .appendValue(ChronoField.YEAR, 4) - .appendValue(ChronoField.ALIGNED_WEEK_OF_YEAR, 2) - .parseDefaulting(ChronoField.DAY_OF_WEEK, 1) - .toFormatter() + .appendValue(ChronoField.YEAR, 4) + .appendValue(ChronoField.ALIGNED_WEEK_OF_YEAR, 2) + .parseDefaulting(ChronoField.DAY_OF_WEEK, 1) + .toFormatter() val YEAR_WEEK_DAY: DateTimeFormatter = new DateTimeFormatterBuilder() - .appendValue(ChronoField.YEAR, 4) - .appendValue(ChronoField.ALIGNED_WEEK_OF_YEAR, 2) - .appendValue(ChronoField.DAY_OF_WEEK, 1) - .toFormatter() + .appendValue(ChronoField.YEAR, 4) + .appendValue(ChronoField.ALIGNED_WEEK_OF_YEAR, 2) + .appendValue(ChronoField.DAY_OF_WEEK, 1) + .toFormatter() val YEAR_MONTH: DateTimeFormatter = new DateTimeFormatterBuilder() 
.appendValue(ChronoField.YEAR, 4) @@ -25,4 +24,12 @@ object CustomDateFormatters { .parseDefaulting(ChronoField.DAY_OF_MONTH, 1) .toFormatter() + val MONTH_DAY_YEAR: DateTimeFormatter = + new DateTimeFormatterBuilder() + .appendValue(ChronoField.MONTH_OF_YEAR, 2) + .appendLiteral("/") + .appendValue(ChronoField.DAY_OF_MONTH, 2) + .appendLiteral("/") + .appendValue(ChronoField.YEAR, 4) + .toFormatter } diff --git a/src/main/scala/com/adidas/analytics/algo/shared/DataReshapingTask.scala b/src/main/scala/com/adidas/analytics/algo/shared/DataReshapingTask.scala new file mode 100644 index 0000000..62554fb --- /dev/null +++ b/src/main/scala/com/adidas/analytics/algo/shared/DataReshapingTask.scala @@ -0,0 +1,104 @@ +package com.adidas.analytics.algo.shared + +import com.adidas.analytics.algo.NestedFlattener +import com.adidas.analytics.algo.Transpose +import com.adidas.analytics.config.shared.DataReshapingTaskConfig +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, SparkSession} + +trait DataReshapingTask extends DataReshapingTaskConfig with DateComponentDerivation { + + /** Trait used to implement additional tasks on data during the transform stage. It picks up + * properties via an ACON file. In the future, to include more tasks from other algorithms, you + * need to assign a new match to the result variable, in order to look for other tasks in the + * ACON file. intended or to call other algo functions + * + * @param dataFrame + * vector dataframe to process + * @param spark + * spark session + * @param targetSchema + * for schema verification + */ + + def additionalTasksDataFrame( + spark: SparkSession, + dataFrame: Vector[DataFrame], + targetSchema: StructType, + partitionSourceColumn: String, + partitionSourceColumnFormat: String, + targetPartitions: Seq[String] + ): Vector[DataFrame] = + try dataFrame.map { df => + Seq( + NestedFlat(spark)(_), + TransposeDf(spark, targetSchema)(_), + withDatePartitions(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions)(_), + checkSchema(targetSchema)(_) + ).foldLeft(df)((df, task) => df.transform(task)) + } catch { + case e: Throwable => throw new RuntimeException("Unable to execute data reshaping task.", e) + } + + def checkSchema(targetSchema: StructType)(df: DataFrame): DataFrame = + if (enforceSchema) { + var castedTargetCols = targetSchema.map(c => col(c.name).cast(c.dataType)) + targetSchema.fields.foreach { f => + if (!df.schema.fieldNames.contains(f.name)) + castedTargetCols = castedTargetCols.filter(_ != col(f.name).cast(f.dataType)) + } + df.select(castedTargetCols: _*) + } else df + + def withDatePartitions( + partitionSourceColumn: String, + partitionSourceColumnFormat: String, + targetPartitions: Seq[String] + )(df: DataFrame): DataFrame = + if (targetPartitions.nonEmpty) + df.transform( + withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions) + ) + else df + + def TransposeDf(spark: SparkSession, targetSchema: StructType)(df: DataFrame): DataFrame = + transposeTaskProperties match { + case Some(_) => + Transpose.transposeTask( + spark, + df, + getProperties[String]("transpose_task_properties", "pivot_column") + .getOrElse(throw new RuntimeException("Pivot column value is missing")), + getProperties[String]("transpose_task_properties", "aggregation_column") + .getOrElse(throw new RuntimeException("Aggregation column value is missing")), + getProperties[Seq[String]]("transpose_task_properties", "group_by_column") 
+ .getOrElse(throw new RuntimeException(" Group by value is missing")), + targetSchema + ) + case _ => df + } + + def NestedFlat(spark: SparkSession)(df: DataFrame): DataFrame = + flattenTaskProperties match { + case Some(_) => + NestedFlattener.flatDataFrame( + spark, + NestedFlattener.replaceCharsInColumns( + spark, + df, + getProperties[String]("nested_task_properties", "chars_to_replace") + .getOrElse(throw new RuntimeException(s"replacement_char value is missing")), + getProperties[String]("nested_task_properties", "replacement_char") + .getOrElse(throw new RuntimeException(s"replacement_char value is missing")) + ), + getProperties[Seq[String]]("nested_task_properties", "fields_to_flatten") + .getOrElse(throw new RuntimeException(s"fields_to_flatten value is missing")), + getProperties[Map[String, String]]("nested_task_properties", "column_mapping") + .getOrElse(throw new RuntimeException(s"column_mapping value is missing")), + getProperties[Map[String, Seq[String]]]("nested_task_properties", "side_flatten") + ) + case _ => df + } + +} diff --git a/src/main/scala/com/adidas/analytics/algo/shared/DateComponentDerivation.scala b/src/main/scala/com/adidas/analytics/algo/shared/DateComponentDerivation.scala index bf61963..3e8a67f 100644 --- a/src/main/scala/com/adidas/analytics/algo/shared/DateComponentDerivation.scala +++ b/src/main/scala/com/adidas/analytics/algo/shared/DateComponentDerivation.scala @@ -1,117 +1,166 @@ package com.adidas.analytics.algo.shared import java.time.LocalDate -import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder} +import java.time.format.DateTimeFormatter import java.time.temporal.ChronoField - import org.apache.spark.sql.DataFrame import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.{udf, _} import org.apache.spark.sql.types.{IntegerType, StringType} - import scala.util.{Failure, Success, Try} - trait DateComponentDerivation { protected val tempFormatterColumnName: String = "temp_formatter_column" - protected def withDateComponents(sourceDateColumnName: String, sourceDateFormat: String, targetDateComponentColumnNames: Seq[String])(inputDf: DataFrame): DataFrame = { - targetDateComponentColumnNames.foldLeft(inputDf.withColumn(tempFormatterColumnName, lit(sourceDateFormat))) { - (df, colName) => - colName match { + protected def withDateComponents( + sourceDateColumnName: String, + sourceDateFormat: String, + targetDateComponentColumnNames: Seq[String] + )(inputDf: DataFrame): DataFrame = + targetDateComponentColumnNames + .foldLeft( + inputDf.withColumn(tempFormatterColumnName, lit(sourceDateFormat)) + ) { (df, colName) => + if (DateComponentDerivation.ALLOWED_DERIVATIONS.contains(colName)) colName match { case "year" => - withDateComponent(df, sourceDateColumnName, colName, 9999, customYear) + withDateComponent( + df, + sourceDateColumnName, + colName, + DateComponentDerivation.DEFAULT_4DIGIT_VALUE, + customYear + ) case "month" => - withDateComponent(df, sourceDateColumnName, colName, 99, customMonth) + withDateComponent( + df, + sourceDateColumnName, + colName, + DateComponentDerivation.DEFAULT_2DIGIT_VALUE, + customMonth + ) case "day" => - withDateComponent(df, sourceDateColumnName, colName, 99, customDay) + withDateComponent( + df, + sourceDateColumnName, + colName, + DateComponentDerivation.DEFAULT_2DIGIT_VALUE, + customDay + ) case "week" => - withDateComponent(df, sourceDateColumnName, colName, 99, customWeekOfYear) - case everythingElse => - throw new RuntimeException(s"Unable to infer a 
partitioning column for: $everythingElse.") + withDateComponent( + df, + sourceDateColumnName, + colName, + DateComponentDerivation.DEFAULT_2DIGIT_VALUE, + customWeekOfYear + ) } - }.drop(tempFormatterColumnName) - } - - private def withDateComponent(inputDf: DataFrame, - sourceDateColumnName: String, - targetColumnName: String, - defaultValue: Int, - derivationFunction: UserDefinedFunction): DataFrame = { + else + throw new RuntimeException( + s"Unable to derive a partitioning column for $colName as the same does not " + + s"belong to the set of allowed derivations." + ) + } + .drop(tempFormatterColumnName) - inputDf - .withColumn(targetColumnName, - when( - derivationFunction(col(sourceDateColumnName).cast(StringType), col(tempFormatterColumnName)).isNotNull, - derivationFunction(col(sourceDateColumnName).cast(StringType), col(tempFormatterColumnName))) - .otherwise(lit(defaultValue)) - ) - } + private def withDateComponent( + inputDf: DataFrame, + sourceDateColumnName: String, + targetColumnName: String, + defaultValue: Int, + derivationFunction: UserDefinedFunction + ): DataFrame = + inputDf.withColumn( + targetColumnName, + when( + derivationFunction( + col(sourceDateColumnName).cast(StringType), + col(tempFormatterColumnName) + ).isNotNull, + derivationFunction(col(sourceDateColumnName).cast(StringType), col(tempFormatterColumnName)) + ).otherwise(lit(defaultValue)) + ) - private val customWeekOfYear = udf((ts: String, formatter: String) => { + private val customWeekOfYear = udf( + (ts: String, formatter: String) => Try { getCustomFormatter(formatter) match { case Some(customFormatter) => - LocalDate.parse(ts, customFormatter).get(ChronoField.ALIGNED_WEEK_OF_YEAR) - case None => None + LocalDate + .parse(ts, customFormatter) + .get(ChronoField.ALIGNED_WEEK_OF_YEAR) + case None => + LocalDate + .parse(ts, DateTimeFormatter.ofPattern(formatter)) + .get(ChronoField.ALIGNED_WEEK_OF_YEAR) } } match { - case Failure(_) => None + case Failure(_) => None case Success(value) => value - } - }, IntegerType) + }, + IntegerType + ) - private val customYear = udf((ts: String, formatter: String) => { - Try { - getCustomFormatter(formatter) match { - case Some(customFormatter) => - LocalDate.parse(ts, customFormatter).get(ChronoField.YEAR) - case None => - LocalDate.parse(ts, DateTimeFormatter.ofPattern(formatter)).getYear - } - } match { - case Failure(_) => None - case Success(value) => value - } - }, IntegerType) + private val customYear = udf( + (ts: String, formatter: String) => + Try { + getCustomFormatter(formatter) match { + case Some(customFormatter) => LocalDate.parse(ts, customFormatter).get(ChronoField.YEAR) + case None => LocalDate.parse(ts, DateTimeFormatter.ofPattern(formatter)).getYear + } + } match { + case Failure(_) => None + case Success(value) => value + }, + IntegerType + ) - private val customDay = udf((ts: String, formatter: String) => { - Try { - getCustomFormatter(formatter) match { - // note: this logic must be updated if we have - // customFormatters with dayOfMonth - case Some(customFormatter) => - LocalDate.parse(ts, customFormatter).get(ChronoField.DAY_OF_WEEK) - case None => - LocalDate.parse(ts, DateTimeFormatter.ofPattern(formatter)).getDayOfMonth - } - } match { - case Failure(_) => None - case Success(value) => value - } - }, IntegerType) + private val customDay = udf( + (ts: String, formatter: String) => + Try { + getCustomFormatter(formatter) match { + case Some(customFormatter) => + val day_type = + if (formatter.contains("dd")) ChronoField.DAY_OF_MONTH else 
ChronoField.DAY_OF_WEEK + LocalDate.parse(ts, customFormatter).get(day_type) + case None => LocalDate.parse(ts, DateTimeFormatter.ofPattern(formatter)).getDayOfMonth + } + } match { + case Failure(_) => None + case Success(value) => value + }, + IntegerType + ) - private val customMonth = udf((ts: String, formatter: String) => { - Try { - getCustomFormatter(formatter) match { - case Some(customFormatter) => - LocalDate.parse(ts, customFormatter).getMonthValue - case None => - LocalDate.parse(ts, DateTimeFormatter.ofPattern(formatter)).getMonthValue - } - } match { - case Failure(_) => None - case Success(value) => value - } - }, IntegerType) + private val customMonth = udf( + (ts: String, formatter: String) => + Try { + getCustomFormatter(formatter) match { + case Some(customFormatter) => LocalDate.parse(ts, customFormatter).getMonthValue + case None => LocalDate.parse(ts, DateTimeFormatter.ofPattern(formatter)).getMonthValue + } + } match { + case Failure(_) => None + case Success(value) => value + }, + IntegerType + ) private def getCustomFormatter(dateFormatter: String): Option[DateTimeFormatter] = dateFormatter match { - case "yyyyww" => Option(CustomDateFormatters.YEAR_WEEK) - case "yyyywwe" => Option(CustomDateFormatters.YEAR_WEEK_DAY) - case "yyyyMM" => Option(CustomDateFormatters.YEAR_MONTH) - case _ => None + case "yyyyww" => Option(CustomDateFormatters.YEAR_WEEK) + case "yyyywwe" => Option(CustomDateFormatters.YEAR_WEEK_DAY) + case "yyyyMM" => Option(CustomDateFormatters.YEAR_MONTH) + case "MM/dd/yyyy" => Option(CustomDateFormatters.MONTH_DAY_YEAR) + case _ => None } } + +object DateComponentDerivation { + + val ALLOWED_DERIVATIONS: Seq[String] = Seq[String]("year", "month", "day", "week") + val DEFAULT_4DIGIT_VALUE = 9999 + val DEFAULT_2DIGIT_VALUE = 99 +} diff --git a/src/main/scala/com/adidas/analytics/algo/templates/AlgorithmTemplate.scala b/src/main/scala/com/adidas/analytics/algo/templates/AlgorithmTemplate.scala new file mode 100644 index 0000000..a39d72e --- /dev/null +++ b/src/main/scala/com/adidas/analytics/algo/templates/AlgorithmTemplate.scala @@ -0,0 +1,43 @@ +package com.adidas.analytics.algo.templates + +import com.adidas.analytics.algo.core.Algorithm +import com.adidas.analytics.config.templates.AlgorithmTemplateConfiguration +import com.adidas.analytics.util.DFSWrapper +import org.apache.spark.sql._ + +final class AlgorithmTemplate protected ( + val spark: SparkSession, + val dfs: DFSWrapper, + val configLocation: String +) extends Algorithm + with AlgorithmTemplateConfiguration { + + /** In this method perform all the operations required to obtain the desired dataframe. For + * example, adding new columns, calculating values for columns, exploding, etc. + * + * @param dataFrames + * this would be a 3-D array, where each cell of the Vector has a 2-D spark dataframe + */ + override protected def transform(dataFrames: Vector[DataFrame]): Vector[DataFrame] = + throw new NotImplementedError( + "This class is not meant to be used. Please, considering implementing your own class based on this template" + ) +} + +object AlgorithmTemplate { + + /** Additionally, one can define a companion object, with different attributes and methods. These + * methods could be helpers for the transform method. In this case, an instantiation of + * AlgorithmTemplate occurs in the companion object. + * + * @param spark + * instance of SparkSession class. + * @param dfs + * instance of DFSWrapper class for FS operations helper. 
+ * @param configLocation + * path of configuration file for the algorithm. + * @return + */ + def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): AlgorithmTemplate = + new AlgorithmTemplate(spark, dfs, configLocation) +} diff --git a/src/main/scala/com/adidas/analytics/config/AlgorithmTemplateConfiguration.scala b/src/main/scala/com/adidas/analytics/config/AlgorithmTemplateConfiguration.scala deleted file mode 100644 index b6ce24f..0000000 --- a/src/main/scala/com/adidas/analytics/config/AlgorithmTemplateConfiguration.scala +++ /dev/null @@ -1,98 +0,0 @@ -package com.adidas.analytics.config - -import com.adidas.analytics.algo.core.Algorithm.{ReadOperation, SafeWriteOperation, UpdateStatisticsOperation} -import com.adidas.analytics.config.AlgorithmTemplateConfiguration.ruleToLocalDate -import com.adidas.analytics.config.shared.{ConfigurationContext, MetadataUpdateStrategy} -import com.adidas.analytics.util.DataFormat.ParquetFormat -import com.adidas.analytics.util.{InputReader, LoadMode, OutputWriter} -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.types.StructType -import org.joda.time.format.DateTimeFormat -import org.joda.time.{Days, LocalDate} - - -trait AlgorithmTemplateConfiguration extends ConfigurationContext - with ReadOperation - with SafeWriteOperation - with UpdateStatisticsOperation - with MetadataUpdateStrategy { - - protected def spark: SparkSession - - /** - * This trait contains has the responsibility to obtain the required configurations - * for a given algorithm. In this template, it can be seen that values like source and target tables, - * dates, and readers and writers, are obtained in this class by mixing ConfigurationContext, ReadOperation - * and SafeWriteOperation. - * - * At the same time, AlgorithmTemplateConfiguration is mixed in the AlgorithmTemplate class, so it can - * use the values from the provided configuration. - * - * An easy way to see this, is to think of it as the parser of the algorithm json config file. - */ - protected val sourceTable: String = configReader.getAs[String]("source_table").trim - // you can use a source location as parquet files on the lake instead of a hive table - // protected val sourceLocation: String = configReader.getAs[String]("source_location").trim - - protected val targetTable: String = configReader.getAs[String]("target_table").trim - - protected val startDate: LocalDate = ruleToLocalDate(configReader.getAs[String]("date_from").trim) - protected val endDate: LocalDate = ruleToLocalDate(configReader.getAs[String]("date_to").trim) - - protected val dateRange: Days = Days.daysBetween(startDate, endDate) - - protected val targetSchema: StructType = spark.table(targetTable).schema - - override protected val readers: Vector[InputReader.TableReader] = Vector( - /** - * Obtaining a reader for the algorithm. - */ - - InputReader.newTableReader(table = sourceTable) -// you can use a source location as parquet files on the lake instead of a hive table -// InputReader.newFileSystemReader(sourceLocation, DataFormat.ParquetFormat()) - ) - - override protected val writer: OutputWriter.AtomicWriter = { - /** - * Obtaining a writer for the algorithm. 
- * - * Note that the LoadMode can be any of the following: - * - * -- OverwriteTable: which steps on the exiting files and writes the new records - * -- OverwritePartitions: which steps on the existing files inside a partition directory - * -- AppendJoinPartitions: which appends the records to existing ones in the partition directory by a Full Outer Join - * -- AppendUnionPartition: which appends the records to existing ones in the partition directory by a Union All - */ - OutputWriter.newTableLocationWriter ( - table = targetTable, - format = ParquetFormat (Some (targetSchema) ), - metadataConfiguration = getMetaDataUpdateStrategy(targetTable, Seq ("", "", "")), - targetPartitions = Seq ("", "", ""), //If partitions are required, this would look like, e.g., Seq("year", "month") - loadMode = LoadMode.OverwritePartitionsWithAddedColumns - ) - } -} - - -object AlgorithmTemplateConfiguration { - - /** - * A companion object can alternatively be used to add helper methods - * In this case, there is a method to convert a date string to a specific - * date value, because in this example, date could also contain a string - * such as today and yesterday, as well as a pattern. - */ - - private val DatePattern = "([0-9]{4}-[0-9]{2}-[0-9]{2})".r - private val DateFormatter = DateTimeFormat.forPattern("yyyy-MM-dd") - - private def ruleToLocalDate(rule: String): LocalDate = { - rule.trim match { - case DatePattern(dateString) => LocalDate.parse(dateString, DateFormatter) - case "today" => LocalDate.now() - case "yesterday" => LocalDate.now().minus(Days.ONE) - case _ => throw new IllegalArgumentException(s"Invalid date format: $rule") - } - } -} diff --git a/src/main/scala/com/adidas/analytics/config/AppendLoadConfiguration.scala b/src/main/scala/com/adidas/analytics/config/AppendLoadConfiguration.scala deleted file mode 100644 index feb6c45..0000000 --- a/src/main/scala/com/adidas/analytics/config/AppendLoadConfiguration.scala +++ /dev/null @@ -1,78 +0,0 @@ -package com.adidas.analytics.config - -import com.adidas.analytics.algo.core.Algorithm.SafeWriteOperation -import com.adidas.analytics.config.shared.{ConfigurationContext, LoadConfiguration, MetadataUpdateStrategy} -import com.adidas.analytics.util.DataFormat.ParquetFormat -import com.adidas.analytics.util.{LoadMode, OutputWriter} -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.util.DropMalformedMode -import org.apache.spark.sql.types.{DataType, StructType} - -import scala.util.parsing.json.JSONObject - - -trait AppendLoadConfiguration extends ConfigurationContext - with LoadConfiguration - with SafeWriteOperation - with MetadataUpdateStrategy { - - protected def spark: SparkSession - - private val regexFilename: Seq[String] = configReader.getAsSeq[String]("regex_filename") - - protected val headerDir: String = configReader.getAs[String]("header_dir") - - protected val targetTable: Option[String] = configReader.getAsOption[String]("target_table") - - - // This option is used to specify whether the input data schema must be the same as target schema specified in the configuration file - // Note: if it is set to True, it will cause input data to be read more than once - private val verifySchemaOption: Option[Boolean] = configReader.getAsOption[Boolean]("verify_schema") - - protected val verifySchema: Boolean = dataType match { - case SEMISTRUCTURED => verifySchemaOption.getOrElse(true) - case _ => false - } - - protected val columnToRegexPairs: Seq[(String, String)] = targetPartitions zip regexFilename - - private val 
jsonSchemaOption: Option[JSONObject] = configReader.getAsOption[JSONObject]("schema") - - protected val targetSchema: StructType = getTargetSchema - - private val targetDir: Option[String] = configReader.getAsOption[String]("target_dir") - - override protected val writer: OutputWriter.AtomicWriter = dataType match { - case STRUCTURED if targetTable.isDefined => OutputWriter.newTableLocationWriter( - table = targetTable.get, - format = ParquetFormat(Some(targetSchema)), - targetPartitions = targetPartitions, - loadMode = LoadMode.OverwritePartitionsWithAddedColumns, - metadataConfiguration = getMetaDataUpdateStrategy(targetTable.get,targetPartitions) - ) - case SEMISTRUCTURED if targetDir.isDefined => OutputWriter.newFileSystemWriter( - location = targetDir.get, - format = ParquetFormat(Some(targetSchema)), - targetPartitions = targetPartitions, - loadMode = LoadMode.OverwritePartitions - ) - case anotherDataType => throw new RuntimeException(s"Unsupported data type: $anotherDataType in AppendLoad or the configuration file is malformed.") - } - - private def getTargetSchemaFromHiveTable: StructType = { - targetTable match { - case Some(tableName) => spark.table(tableName).schema - case None => throw new RuntimeException("No schema definition found.") - } - } - - private def getTargetSchema: StructType = { - dataType match { - case STRUCTURED => getTargetSchemaFromHiveTable - case SEMISTRUCTURED if jsonSchemaOption.isDefined => DataType.fromJson(jsonSchemaOption.get.toString()).asInstanceOf[StructType] - case anotherDataType => throw new RuntimeException(s"Unsupported data type: $anotherDataType in AppendLoad or the configuration file is malformed.") - } - } - - override def loadMode: String = readerModeSetter(DropMalformedMode.name) -} diff --git a/src/main/scala/com/adidas/analytics/config/DeltaLoadConfiguration.scala b/src/main/scala/com/adidas/analytics/config/DeltaLoadConfiguration.scala deleted file mode 100644 index fee6815..0000000 --- a/src/main/scala/com/adidas/analytics/config/DeltaLoadConfiguration.scala +++ /dev/null @@ -1,75 +0,0 @@ -package com.adidas.analytics.config - -import com.adidas.analytics.algo.core.Algorithm.{ReadOperation, SafeWriteOperation, UpdateStatisticsOperation} -import com.adidas.analytics.config.shared.{ConfigurationContext, DateComponentDerivationConfiguration, MetadataUpdateStrategy} -import com.adidas.analytics.util.DataFormat.ParquetFormat -import com.adidas.analytics.util.{DataFormat, InputReader, LoadMode, OutputWriter} -import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.{Row, SparkSession} - - -trait DeltaLoadConfiguration extends ConfigurationContext - with UpdateStatisticsOperation - with MetadataUpdateStrategy { - - protected val activeRecordsTable: String = configReader.getAs[String]("active_records_table_lake") - protected val deltaRecordsTable: Option[String] = configReader.getAsOption[String]("delta_records_table_lake") - protected val deltaRecordsFilePath: Option[String] = configReader.getAsOption[String]("delta_records_file_path") - - protected val businessKey: Seq[String] = configReader.getAsSeq[String]("business_key") - protected val technicalKey: Seq[String] = configReader.getAsSeq[String]("technical_key") - - protected val rankingColumnName: String = "DELTA_LOAD_RANK" - protected val recordModeColumnName: String = "recordmode" - protected val upsertRecordModes: Seq[String] = Seq("", "N") - protected val upsertRecordsModesFilterFunction: Row => Boolean = { row: Row => - var recordmode = "" - try { - recordmode = 
row.getAs[String](recordModeColumnName) - } catch { - case _ => recordmode = row.getAs[String](recordModeColumnName.toUpperCase) - } - recordmode == null || recordmode == "" || recordmode == "N" - } -} - - -object DeltaLoadConfiguration { - - trait PartitionedDeltaLoadConfiguration extends DeltaLoadConfiguration with DateComponentDerivationConfiguration - with ReadOperation with SafeWriteOperation { - - protected def spark: SparkSession - - override protected val targetPartitions: Seq[String] = configReader.getAsSeq[String]("target_partitions") - override protected val partitionSourceColumn: String = configReader.getAs[String]("partition_column") - override protected val partitionSourceColumnFormat: String = configReader.getAs[String]("partition_column_format") - - private val targetSchema: StructType = spark.table(activeRecordsTable).schema - - override protected val readers: Vector[InputReader] = Vector( - createDeltaInputReader(deltaRecordsFilePath, deltaRecordsTable), - InputReader.newTableReader(table = activeRecordsTable) - ) - - override protected val writer: OutputWriter.AtomicWriter = OutputWriter.newTableLocationWriter( - table = activeRecordsTable, - format = ParquetFormat(Some(targetSchema)), - targetPartitions = targetPartitions, - metadataConfiguration = getMetaDataUpdateStrategy(activeRecordsTable, targetPartitions), - loadMode = LoadMode.OverwritePartitionsWithAddedColumns - ) - } - - private def createDeltaInputReader(deltaRecordsFilePath: Option[String], deltaRecordsTable: Option[String]): InputReader = { - def createInputReaderByPath: InputReader = { - deltaRecordsFilePath.fold { - throw new RuntimeException("Unable to create a reader for the delta table: neither delta records path not delta table name is defined") - } { - location => InputReader.newFileSystemReader(s"$location*.parquet", DataFormat.ParquetFormat()) - } - } - - deltaRecordsTable.fold(createInputReaderByPath)(tableName => InputReader.newTableReader(tableName)) - } -} diff --git a/src/main/scala/com/adidas/analytics/config/FixedSizeStringExtractorConfiguration.scala b/src/main/scala/com/adidas/analytics/config/FixedSizeStringExtractorConfiguration.scala index db59dc9..60275ec 100644 --- a/src/main/scala/com/adidas/analytics/config/FixedSizeStringExtractorConfiguration.scala +++ b/src/main/scala/com/adidas/analytics/config/FixedSizeStringExtractorConfiguration.scala @@ -1,75 +1,86 @@ package com.adidas.analytics.config -import com.adidas.analytics.algo.core.Algorithm.{ReadOperation, SafeWriteOperation, UpdateStatisticsOperation} +import com.adidas.analytics.algo.core.Algorithm.{ + ReadOperation, + SafeWriteOperation, + UpdateStatisticsOperation +} import com.adidas.analytics.config.FixedSizeStringExtractorConfiguration._ import com.adidas.analytics.config.shared.{ConfigurationContext, MetadataUpdateStrategy} import com.adidas.analytics.util.DataFormat.ParquetFormat import com.adidas.analytics.util.DataFrameUtils.PartitionCriteria -import com.adidas.analytics.util.{InputReader, LoadMode, OutputWriter} +import com.adidas.analytics.util.{CatalogTableManager, InputReader, LoadMode, OutputWriter} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.StructType import org.joda.time._ import org.slf4j.{Logger, LoggerFactory} - -trait FixedSizeStringExtractorConfiguration extends ConfigurationContext - with ReadOperation - with SafeWriteOperation - with UpdateStatisticsOperation - with MetadataUpdateStrategy { +trait FixedSizeStringExtractorConfiguration + extends ConfigurationContext + with 
ReadOperation + with SafeWriteOperation + with UpdateStatisticsOperation + with MetadataUpdateStrategy { private val logger: Logger = LoggerFactory.getLogger(getClass) protected def spark: SparkSession private val sourceTable: String = configReader.getAs[String]("source_table").trim + private val targetTable: String = configReader.getAs[String]("target_table").trim protected val sourceField: String = configReader.getAs[String]("source_field").trim - protected val targetPartitionsOrdered: Seq[String] = configReader.getAsSeq[String]("target_partitions") + + protected val targetPartitionsOrdered: Seq[String] = + configReader.getAsSeq[String]("target_partitions") protected val targetPartitionsSet: Set[String] = targetPartitionsOrdered.toSet protected val partitionsCriteria: PartitionCriteria = { - if (configReader.contains("select_conditions")) { - if (targetPartitionsOrdered.nonEmpty) { + if (configReader.contains("select_conditions")) + if (targetPartitionsOrdered.nonEmpty) parseConditions(configReader.getAsSeq[String]("select_conditions")) - } else { + else { logger.warn("Select conditions can be applied to partitioned tables only. Ignoring.") Seq.empty } - } else if (configReader.contains("select_rules")) { - if (targetPartitionsOrdered.nonEmpty) { - parseRules(configReader.getAsSeq[String]("select_rules"), targetPartitionsOrdered, targetPartitionsSet) - } else { + else if (configReader.contains("select_rules")) + if (targetPartitionsOrdered.nonEmpty) + parseRules( + configReader.getAsSeq[String]("select_rules"), + targetPartitionsOrdered, + targetPartitionsSet + ) + else { logger.warn("Select rules can be applied to partitioned tables only. Ignoring.") Seq.empty } - } else { - Seq.empty - } + else Seq.empty } - protected val substringPositions: Seq[(Int, Int)] = configReader.getAsSeq[String]("substring_positions").map { - case NumberPairPattern(start, end) => (start.toInt, end.toInt) - case another => throw new IllegalArgumentException(s"Wrong select condition: $another") - } + protected val substringPositions: Seq[(Int, Int)] = + configReader.getAsSeq[String]("substring_positions").map { + case NumberPairPattern(start, end) => (start.toInt, end.toInt) + case another => throw new IllegalArgumentException(s"Wrong select condition: $another") + } - protected val targetSchema: StructType = spark.table(targetTable).schema + protected val targetSchema: StructType = + CatalogTableManager(targetTable, spark).getSchemaSafely(dfs) - override protected val readers: Vector[InputReader] = Vector( - InputReader.newTableReader(table = sourceTable) - ) + override protected val readers: Vector[InputReader] = + Vector(InputReader.newTableReader(table = sourceTable)) override protected val writer: OutputWriter.AtomicWriter = OutputWriter.newTableLocationWriter( table = targetTable, format = ParquetFormat(Some(targetSchema)), metadataConfiguration = getMetaDataUpdateStrategy(targetTable, targetPartitionsOrdered), targetPartitions = targetPartitionsOrdered, - loadMode = if (targetPartitionsOrdered.nonEmpty) LoadMode.OverwritePartitionsWithAddedColumns else LoadMode.OverwriteTable + loadMode = + if (targetPartitionsOrdered.nonEmpty) LoadMode.OverwritePartitionsWithAddedColumns + else LoadMode.OverwriteTable ) } - object FixedSizeStringExtractorConfiguration { private val Year = "year" @@ -81,53 +92,55 @@ object FixedSizeStringExtractorConfiguration { private val ConditionPattern = "(.+?)[ ]*=[ ]*(.+)".r private val NumberPairPattern = "([0-9]+?)[ ]*,[ ]*([0-9]+)".r - private def parseConditions(conditions: 
Seq[String]): PartitionCriteria = { + private def parseConditions(conditions: Seq[String]): PartitionCriteria = conditions.map { case ConditionPattern(columnName, columnValue) => (columnName.trim, columnValue.trim) - case condition => throw new IllegalArgumentException(s"Wrong select condition: $condition") + case condition => throw new IllegalArgumentException(s"Wrong select condition: $condition") } - } - private def parseRules(rules: Seq[String], targetPartitionsOrdered: Seq[String], targetPartitionsSet: Set[String]): PartitionCriteria = { + private def parseRules( + rules: Seq[String], + targetPartitionsOrdered: Seq[String], + targetPartitionsSet: Set[String] + ): PartitionCriteria = if (rules.nonEmpty) { val selectDate = rules.foldLeft(LocalDate.now()) { case (date, RulePattern(period, "-", value)) => - if (!targetPartitionsSet.contains(period)) { + if (!targetPartitionsSet.contains(period)) throw new RuntimeException(s"Unsupported period: $period") - } date.minus(createPeriodByNameAndValue(period, value.toInt)) case (date, RulePattern(period, "+", value)) => - if (!targetPartitionsSet.contains(period)) { + if (!targetPartitionsSet.contains(period)) throw new RuntimeException(s"Unsupported period: $period") - } date.plus(createPeriodByNameAndValue(period, value.toInt)) - case rule => - throw new IllegalArgumentException(s"Wrong select rule: $rule") + case rule => throw new IllegalArgumentException(s"Wrong select rule: $rule") } createCriteriaForDate(selectDate, targetPartitionsOrdered) - } else { - Seq.empty - } - } + } else Seq.empty - private def createCriteriaForDate(date: LocalDate, targetPartitions: Seq[String]): PartitionCriteria = { + private def createCriteriaForDate( + date: LocalDate, + targetPartitions: Seq[String] + ): PartitionCriteria = targetPartitions match { case Year :: Month :: Day :: Nil => - Seq(Year -> date.getYear.toString, Month -> date.getMonthOfYear.toString, Day -> date.getDayOfMonth.toString) + Seq( + Year -> date.getYear.toString, + Month -> date.getMonthOfYear.toString, + Day -> date.getDayOfMonth.toString + ) case Year :: Month :: Nil => Seq(Year -> date.getYear.toString, Month -> date.getMonthOfYear.toString) case Year :: Week :: Nil => Seq(Year -> date.getYear.toString, Week -> date.getWeekOfWeekyear.toString) case _ => throw new RuntimeException(s"Unsupported partitioning schema: $targetPartitions") } - } - private def createPeriodByNameAndValue(name: String, value: Int): ReadablePeriod = { + private def createPeriodByNameAndValue(name: String, value: Int): ReadablePeriod = name match { - case Year => Years.years(value) + case Year => Years.years(value) case Month => Months.months(value) - case Week => Weeks.weeks(value) - case Day => Days.days(value) + case Week => Weeks.weeks(value) + case Day => Days.days(value) } - } } diff --git a/src/main/scala/com/adidas/analytics/config/FullLoadConfiguration.scala b/src/main/scala/com/adidas/analytics/config/FullLoadConfiguration.scala deleted file mode 100644 index 9bd89e5..0000000 --- a/src/main/scala/com/adidas/analytics/config/FullLoadConfiguration.scala +++ /dev/null @@ -1,42 +0,0 @@ -package com.adidas.analytics.config - -import com.adidas.analytics.config.shared.{ConfigurationContext, DateComponentDerivationConfiguration, LoadConfiguration} -import com.adidas.analytics.util.DataFormat.ParquetFormat -import com.adidas.analytics.util.{LoadMode, OutputWriter} -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.util.FailFastMode -import org.apache.spark.sql.types.StructType - - 
-trait FullLoadConfiguration extends ConfigurationContext with LoadConfiguration with DateComponentDerivationConfiguration { - - protected def spark: SparkSession - - protected val currentDir: String = configReader.getAs[String]("current_dir") - - protected val backupDir: String = configReader.getAs[String]("backup_dir") - - protected val targetTable: String = configReader.getAs[String]("target_table") - - protected val targetSchema: StructType = spark.table(targetTable).schema - - protected val writer: OutputWriter.AtomicWriter = dataType match { - case STRUCTURED => OutputWriter.newFileSystemWriter( - location = currentDir, - format = ParquetFormat(Some(targetSchema)), - targetPartitions = targetPartitions, - loadMode = LoadMode.OverwriteTable - ) - case anotherDataType => throw new RuntimeException(s"Unsupported data type: $anotherDataType for FullLoad.") - } - - - override protected val partitionSourceColumn: String = configReader.getAs[String]("partition_column") - override protected val partitionSourceColumnFormat: String = configReader.getAs[String]("partition_column_format") - - override protected def readNullValue: Option[String] = { - super.readNullValue.orElse(Some("XXNULLXXX")) - } - - override def loadMode: String = readerModeSetter(FailFastMode.name) -} diff --git a/src/main/scala/com/adidas/analytics/config/GzipDecompressorConfiguration.scala b/src/main/scala/com/adidas/analytics/config/GzipDecompressorConfiguration.scala index 0c4f1a7..2191656 100644 --- a/src/main/scala/com/adidas/analytics/config/GzipDecompressorConfiguration.scala +++ b/src/main/scala/com/adidas/analytics/config/GzipDecompressorConfiguration.scala @@ -3,12 +3,11 @@ package com.adidas.analytics.config import com.adidas.analytics.config.shared.ConfigurationContext import org.apache.hadoop.fs.Path - trait GzipDecompressorConfiguration extends ConfigurationContext { - protected val recursive: Boolean = true + protected val outputExtension: String = configReader.getAs[String]("format") - protected val outputExtension: String = "." 
+ configReader.getAs[String]("format") protected val inputDirectoryPath: Path = new Path(configReader.getAs[String]("directory")) + protected val threadPoolSize: Int = configReader.getAs[Int]("thread_pool_size") } diff --git a/src/main/scala/com/adidas/analytics/config/MaterializationConfiguration.scala b/src/main/scala/com/adidas/analytics/config/MaterializationConfiguration.scala new file mode 100644 index 0000000..602d475 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/config/MaterializationConfiguration.scala @@ -0,0 +1,171 @@ +package com.adidas.analytics.config + +import com.adidas.analytics.algo.core.Algorithm.{ + ReadOperation, + SafeWriteOperation, + UpdateStatisticsOperation +} +import com.adidas.analytics.config.shared.MetadataUpdateStrategy +import com.adidas.analytics.util.DataFormat.ParquetFormat +import com.adidas.analytics.util.DataFrameUtils.PartitionCriteria +import com.adidas.analytics.util.{ + ConfigReader, + HadoopLoadHelper, + CatalogTableManager, + InputReader, + LoadMode, + OutputWriter +} +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.types.StructType +import org.joda.time._ +import org.joda.time.format.DateTimeFormat + +trait MaterializationConfiguration + extends ReadOperation + with SafeWriteOperation + with UpdateStatisticsOperation + with MetadataUpdateStrategy { + + protected def configReader: ConfigReader + + protected def spark: SparkSession + + protected def loadMode: LoadMode + + protected def partitionsCriteria: Seq[PartitionCriteria] + + protected val sourceTable: String = configReader.getAs[String]("source_table") + protected val targetTable: String = configReader.getAs[String]("target_table") + + protected val targetSchema: StructType = + CatalogTableManager(targetTable, spark).getSchemaSafely(dfs) + + protected val targetPartitions: Seq[String] = + configReader.getAsSeq[String]("target_partitions").toList + + protected val toCache: Boolean = configReader.getAsOption[Boolean]("to_cache").getOrElse(true) + + override protected val readers: Vector[InputReader.TableReader] = + Vector(InputReader.newTableReader(table = sourceTable)) + + override protected val writer: OutputWriter.AtomicWriter = OutputWriter.newTableLocationWriter( + table = targetTable, + format = ParquetFormat(Some(targetSchema)), + targetPartitions = targetPartitions, + loadMode = loadMode, + metadataConfiguration = getMetaDataUpdateStrategy(targetTable, targetPartitions) + ) + + override protected def outputFilesNum: Option[Int] = + configReader.getAsOption[Int]("number_output_partitions") +} + +object MaterializationConfiguration { + + private val ConditionPattern = "(.+?)=(.+)".r + + private val FormatYearMonthDay = "yyyy-MM-dd" + private val FormatYearMonth = "yyyy-MM" + private val FormatYearWeek = "yyyy-ww" + + private val Year = "year" + private val Month = "month" + private val Week = "week" + private val Day = "day" + + trait FullMaterializationConfiguration extends MaterializationConfiguration { + + override protected val partitionsCriteria: Seq[PartitionCriteria] = Seq.empty + + protected val numVersionsToRetain: Int = configReader.getAs[Int]("num_versions_to_retain") + + protected val baseDataDir: String = configReader.getAs[String]("base_data_dir") + + protected val currentTableLocation: Path = + new Path(CatalogTableManager(targetTable, spark).getTableLocation) + + protected val sortingIgnoreFolderNames: Seq[String] = + configReader + .getAsOption[Seq[String]]("sorting_ignore_folder_names") + 
.getOrElse(Seq("_$folder$", "=", ".parquet", "_SUCCESS", "_tmp_")) + + protected val tableDataDir: Path = { + if (currentTableLocation.getName == baseDataDir.replace("/", "")) + /* currentTableLocation is baseDataDir (e.g., data/), so tableDataDir will be the same */ + currentTableLocation + else + /* currentTableLocation is e.g., data/20200612_101214_UTC, so we need to get the parent + * folder */ + currentTableLocation.getParent + } + + protected val nextTableLocation: Path = + HadoopLoadHelper.buildUTCTimestampedTablePath(tableDataDir) + protected val nextTableLocationPrefix: String = nextTableLocation.getName + + override protected val writer: OutputWriter.AtomicWriter = OutputWriter.newFileSystemWriter( + location = nextTableLocation.toString, + format = ParquetFormat(Some(targetSchema)), + targetPartitions = targetPartitions, + loadMode = loadMode + ) + } + + trait QueryMaterializationConfiguration extends MaterializationConfiguration { + + override protected val partitionsCriteria: Seq[PartitionCriteria] = { + val conditions = configReader.getAsSeq[String]("select_conditions").map { + case ConditionPattern(columnName, columnValue) => (columnName.trim, columnValue.trim) + case condition => throw new IllegalArgumentException(s"Wrong select condition: $condition") + } + + if (conditions.isEmpty) + throw new RuntimeException(s"Unable to run materialization by query: conditions are empty") + + Seq(conditions) + } + } + + trait RangeMaterializationConfiguration extends MaterializationConfiguration { + + private val fromDateString = configReader.getAs[String]("date_from") + private val toDateString = configReader.getAs[String]("date_to") + + override protected val partitionsCriteria: Seq[PartitionCriteria] = targetPartitions match { + case Year :: Month :: Day :: Nil => + getDatesRange(FormatYearMonthDay, Days.ONE).map { date => + Seq( + Year -> date.getYear.toString, + Month -> date.getMonthOfYear.toString, + Day -> date.getDayOfMonth.toString + ) + }.toSeq + case Year :: Month :: Nil => + getDatesRange(FormatYearMonth, Months.ONE).map { date => + Seq(Year -> date.getYear.toString, Month -> date.getMonthOfYear.toString) + }.toSeq + case Year :: Week :: Nil => + getDatesRange(FormatYearWeek, Weeks.ONE).map { date => + Seq(Year -> date.getYear.toString, Week -> date.getWeekOfWeekyear.toString) + }.toSeq + case _ => + throw new RuntimeException( + s"Unable to run materialization by date range: unsupported partitioning schema: $targetPartitions" + ) + } + + private def getDatesRange(pattern: String, period: ReadablePeriod): Iterator[LocalDate] = { + val dateFormatter = DateTimeFormat.forPattern(pattern) + val startDate = LocalDate.parse(fromDateString, dateFormatter) + val endDate = LocalDate.parse(toDateString, dateFormatter) + if (startDate.isAfter(endDate)) + throw new RuntimeException( + "Unable to run materialization by date range: date_start is after date_end" + ) + Iterator.iterate(startDate)(_.plus(period)).takeWhile(!_.isAfter(endDate)) + } + } + +} diff --git a/src/main/scala/com/adidas/analytics/config/NestedFlattenerConfiguration.scala b/src/main/scala/com/adidas/analytics/config/NestedFlattenerConfiguration.scala index 71e8f00..12a6fc0 100644 --- a/src/main/scala/com/adidas/analytics/config/NestedFlattenerConfiguration.scala +++ b/src/main/scala/com/adidas/analytics/config/NestedFlattenerConfiguration.scala @@ -1,55 +1,65 @@ package com.adidas.analytics.config import com.adidas.analytics.config.shared.{ConfigurationContext, MetadataUpdateStrategy} -import 
com.adidas.analytics.algo.core.Algorithm.{ReadOperation, SafeWriteOperation, UpdateStatisticsOperation} +import com.adidas.analytics.algo.core.Algorithm.{ + ReadOperation, + SafeWriteOperation, + UpdateStatisticsOperation +} import com.adidas.analytics.util.DataFormat.ParquetFormat -import com.adidas.analytics.util.{InputReader, LoadMode, OutputWriter} +import com.adidas.analytics.util.{CatalogTableManager, InputReader, LoadMode, OutputWriter} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.StructType - -trait NestedFlattenerConfiguration extends ConfigurationContext - with ReadOperation - with SafeWriteOperation - with UpdateStatisticsOperation - with MetadataUpdateStrategy { +trait NestedFlattenerConfiguration + extends ConfigurationContext + with ReadOperation + with SafeWriteOperation + with UpdateStatisticsOperation + with MetadataUpdateStrategy { protected def spark: SparkSession private val sourceLocation: String = configReader.getAs[String]("source_location") private val targetTable: String = configReader.getAs[String]("target_table") - protected val targetPartitions: Option[Seq[String]] = configReader.getAsOptionSeq[String]("target_partitions") - protected val targetSchema: StructType = spark.table(targetTable).schema - protected val charsToReplace: String = configReader.getAs[String]("chars_to_replace") - protected val replacementChar: String = configReader.getAs[String]("replacement_char") - - /* - * Be aware of the naming pattern after flattening, because you also need to include sub-level structs or arrays if you want them. - * Example: events__data if you want to flatten an array called "data" inside a struct called "events" - */ + + protected val targetPartitions: Option[Seq[String]] = + configReader.getAsOptionSeq[String]("target_partitions") + + protected val targetSchema: StructType = + CatalogTableManager(targetTable, spark).getSchemaSafely(dfs) + + protected val charsToReplace: String = + configReader.getAsOption[String]("chars_to_replace").getOrElse("[.:#]+") + + protected val replacementChar: String = + configReader.getAsOption[String]("replacement_char").getOrElse("_") + + /* Be aware of the naming pattern after flattening, because you also need to include sub-level + * structs or arrays if you want them. + * Example: events__data if you want to flatten an array called "data" inside a struct called + * "events" */ protected val fieldsToFlatten: Seq[String] = configReader.getAsSeq[String]("fields_to_flatten") - /* - * columnMapping provides the columns (with user-friendly names) to include in the final DataFrame. - * Columns not in the nameMapping will be excluded - */ + /* columnMapping provides the columns (with user-friendly names) to include in the final + * DataFrame. 
+ * Columns not in the nameMapping will be excluded */ protected val columnMapping: Map[String, String] = configReader.getAsMap("column_mapping") - override protected val readers: Vector[InputReader] = Vector( - InputReader.newFileSystemReader(sourceLocation, ParquetFormat()) - ) + override protected val readers: Vector[InputReader] = + Vector(InputReader.newFileSystemReader(sourceLocation, ParquetFormat())) override protected val writer: OutputWriter.AtomicWriter = { var loadMode: LoadMode = LoadMode.OverwritePartitions - if (targetPartitions.isEmpty) - loadMode = LoadMode.OverwriteTable + if (targetPartitions.isEmpty) loadMode = LoadMode.OverwriteTable OutputWriter.newTableLocationWriter( targetTable, ParquetFormat(Some(targetSchema)), targetPartitions.getOrElse(Seq.empty), loadMode = loadMode, - metadataConfiguration = getMetaDataUpdateStrategy(targetTable, targetPartitions.getOrElse(Seq.empty)) + metadataConfiguration = + getMetaDataUpdateStrategy(targetTable, targetPartitions.getOrElse(Seq.empty)) ) } diff --git a/src/main/scala/com/adidas/analytics/config/PartitionMaterializationConfiguration.scala b/src/main/scala/com/adidas/analytics/config/PartitionMaterializationConfiguration.scala deleted file mode 100644 index 0ab7c9e..0000000 --- a/src/main/scala/com/adidas/analytics/config/PartitionMaterializationConfiguration.scala +++ /dev/null @@ -1,121 +0,0 @@ -package com.adidas.analytics.config - -import com.adidas.analytics.algo.core.Algorithm.{ReadOperation, SafeWriteOperation, UpdateStatisticsOperation} -import com.adidas.analytics.config.shared.MetadataUpdateStrategy -import com.adidas.analytics.util.DataFormat.ParquetFormat -import com.adidas.analytics.util.DataFrameUtils.PartitionCriteria -import com.adidas.analytics.util.{ConfigReader, InputReader, LoadMode, OutputWriter} -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.types.StructType -import org.joda.time._ -import org.joda.time.format.DateTimeFormat - - -trait PartitionMaterializationConfiguration extends ReadOperation - with SafeWriteOperation - with UpdateStatisticsOperation - with MetadataUpdateStrategy { - - protected def configReader: ConfigReader - - protected def spark: SparkSession - - protected def loadMode: LoadMode - - protected def partitionsCriteria: Seq[PartitionCriteria] - - private val sourceTable: String = configReader.getAs[String]("source_table") - private val targetTable: String = configReader.getAs[String]("target_table") - private val targetSchema: StructType = spark.table(targetTable).schema - - protected val targetPartitions: Seq[String] = configReader.getAsSeq[String]("target_partitions").toList - - override protected val readers: Vector[InputReader.TableReader] = Vector( - InputReader.newTableReader(table = sourceTable) - ) - - override protected val writer: OutputWriter.AtomicWriter = OutputWriter.newTableLocationWriter( - table = targetTable, - format = ParquetFormat(Some(targetSchema)), - targetPartitions = targetPartitions, - loadMode = loadMode, - metadataConfiguration = getMetaDataUpdateStrategy(targetTable, targetPartitions) - ) - - override protected def outputFilesNum: Option[Int] = configReader.getAsOption[Int]("number_output_partitions") -} - - -object PartitionMaterializationConfiguration { - - private val ConditionPattern = "(.+?)=(.+)".r - - private val FormatYearMonthDay = "yyyy-MM-dd" - private val FormatYearMonth = "yyyy-MM" - private val FormatYearWeek = "yyyy-ww" - - private val Year = "year" - private val Month = "month" - private val Week = "week" - private 
val Day = "day" - - trait FullMaterializationConfiguration { - - protected val partitionsCriteria: Seq[PartitionCriteria] = Seq.empty - } - - trait QueryMaterializationConfiguration { - - protected def configReader: ConfigReader - - protected val partitionsCriteria: Seq[PartitionCriteria] = { - val conditions = configReader.getAsSeq[String]("select_conditions").map { - case ConditionPattern(columnName, columnValue) => (columnName.trim, columnValue.trim) - case condition => throw new IllegalArgumentException(s"Wrong select condition: $condition") - } - - if (conditions.isEmpty) { - throw new RuntimeException(s"Unable to run materialization by query: conditions are empty") - } - - Seq(conditions) - } - } - - trait RangeMaterializationConfiguration { - - private val fromDateString = configReader.getAs[String]("date_from") - private val toDateString = configReader.getAs[String]("date_to") - - protected def targetPartitions: Seq[String] - - protected def configReader: ConfigReader - - protected val partitionsCriteria: Seq[PartitionCriteria] = targetPartitions match { - case Year :: Month :: Day :: Nil => - getDatesRange(FormatYearMonthDay, Days.ONE).map { date => - Seq(Year -> date.getYear.toString, Month -> date.getMonthOfYear.toString, Day -> date.getDayOfMonth.toString) - }.toSeq - case Year :: Month :: Nil => - getDatesRange(FormatYearMonth, Months.ONE).map { date => - Seq(Year -> date.getYear.toString, Month -> date.getMonthOfYear.toString) - }.toSeq - case Year :: Week :: Nil => - getDatesRange(FormatYearWeek, Weeks.ONE).map { date => - Seq(Year -> date.getYear.toString, Week -> date.getWeekOfWeekyear.toString) - }.toSeq - case _ => throw new RuntimeException(s"Unable to run materialization by date range: unsupported partitioning schema: $targetPartitions") - } - - private def getDatesRange(pattern: String, period: ReadablePeriod): Iterator[LocalDate] = { - val dateFormatter = DateTimeFormat.forPattern(pattern) - val startDate = LocalDate.parse(fromDateString, dateFormatter) - val endDate = LocalDate.parse(toDateString, dateFormatter) - if (startDate.isAfter(endDate)) { - throw new RuntimeException("Unable to run materialization by date range: date_start is after date_end") - } - Iterator.iterate(startDate)(_.plus(period)).takeWhile(!_.isAfter(endDate)) - } - } - -} diff --git a/src/main/scala/com/adidas/analytics/config/TransposeConfiguration.scala b/src/main/scala/com/adidas/analytics/config/TransposeConfiguration.scala new file mode 100644 index 0000000..e49f63a --- /dev/null +++ b/src/main/scala/com/adidas/analytics/config/TransposeConfiguration.scala @@ -0,0 +1,55 @@ +package com.adidas.analytics.config + +import com.adidas.analytics.algo.core.Algorithm.{ + ReadOperation, + SafeWriteOperation, + UpdateStatisticsOperation +} +import com.adidas.analytics.config.shared.{ConfigurationContext, MetadataUpdateStrategy} +import com.adidas.analytics.util.DataFormat.ParquetFormat +import com.adidas.analytics.util.{CatalogTableManager, InputReader, LoadMode, OutputWriter} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.types.StructType + +trait TransposeConfiguration + extends ConfigurationContext + with ReadOperation + with SafeWriteOperation + with UpdateStatisticsOperation + with MetadataUpdateStrategy { + + protected def spark: SparkSession + protected val sourceTable: String = configReader.getAs[String]("source_table") + protected val targetTable: String = configReader.getAs[String]("target_table") + + protected val targetPartitions: Option[Seq[String]] = + 
configReader.getAsOptionSeq[String]("target_partitions") + + protected val targetSchema: StructType = + CatalogTableManager(targetTable, spark).getSchemaSafely(dfs) + + protected val aggregationColumn: String = configReader.getAs[String]("aggregation_column") + protected val pivotColumn: String = configReader.getAs[String]("pivot_column") + + protected val groupByColumn: Seq[String] = configReader.getAsSeq[String]("group_by_column") + + protected val enforceSchema: Boolean = + configReader.getAsOption[Boolean]("enforce_schema").getOrElse(false) + + override protected val readers: Vector[InputReader] = + Vector(InputReader.newTableReader(sourceTable)) + + override protected val writer: OutputWriter.AtomicWriter = { + var loadMode: LoadMode = LoadMode.OverwritePartitions + if (targetPartitions.isEmpty) loadMode = LoadMode.OverwriteTable + + OutputWriter.newTableLocationWriter( + targetTable, + ParquetFormat(Some(targetSchema)), + targetPartitions.getOrElse(Seq.empty), + loadMode = loadMode, + metadataConfiguration = + getMetaDataUpdateStrategy(targetTable, targetPartitions.getOrElse(Seq.empty)) + ) + } +} diff --git a/src/main/scala/com/adidas/analytics/config/loads/AppendLoadConfiguration.scala b/src/main/scala/com/adidas/analytics/config/loads/AppendLoadConfiguration.scala new file mode 100644 index 0000000..58bb288 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/config/loads/AppendLoadConfiguration.scala @@ -0,0 +1,99 @@ +package com.adidas.analytics.config.loads + +import com.adidas.analytics.algo.core.Algorithm.SafeWriteOperation +import com.adidas.analytics.config.shared.{ConfigurationContext, MetadataUpdateStrategy} +import com.adidas.analytics.util.DataFormat.ParquetFormat +import com.adidas.analytics.util.{CatalogTableManager, LoadMode, OutputWriter} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.util.DropMalformedMode +import org.apache.spark.sql.types.{DataType, StructType} +import scala.util.parsing.json.JSONObject + +trait AppendLoadConfiguration + extends ConfigurationContext + with LoadConfiguration + with SafeWriteOperation + with MetadataUpdateStrategy { + + protected def spark: SparkSession + + protected val regexFilename: Option[Seq[String]] = + configReader.getAsOptionSeq[String]("regex_filename") + + protected val partitionSourceColumn: Option[String] = + configReader.getAsOption[String]("partition_column") + + protected val partitionSourceColumnFormat: Option[String] = + configReader.getAsOption[String]("partition_column_format") + protected val headerDir: String = configReader.getAs[String]("header_dir") + + protected val targetTable: Option[String] = configReader.getAsOption[String]("target_table") + + /* This option is used to specify whether the input data schema must be the same as target schema + * specified in the configuration file */ + /* Note: if it is set to True, it will cause input data to be read more than once */ + private val verifySchemaOption: Option[Boolean] = + configReader.getAsOption[Boolean]("verify_schema") + + protected val verifySchema: Boolean = dataType match { + case SEMISTRUCTURED => verifySchemaOption.getOrElse(true) + case _ => false + } + + private val jsonSchemaOption: Option[JSONObject] = configReader.getAsOption[JSONObject]("schema") + + protected val dropDateDerivedColumns: Boolean = + configReader.getAsOption[Boolean]("drop_date_derived_columns").getOrElse(false) + + protected val targetSchema: StructType = getTargetSchema + + private val targetDir: Option[String] = 
configReader.getAsOption[String]("target_dir") + + override protected val writer: OutputWriter.AtomicWriter = dataType match { + case STRUCTURED if targetTable.isDefined => + OutputWriter.newTableLocationWriter( + table = targetTable.get, + format = ParquetFormat(Some(targetSchema)), + targetPartitions = targetPartitions, + loadMode = LoadMode.OverwritePartitionsWithAddedColumns, + metadataConfiguration = getMetaDataUpdateStrategy(targetTable.get, targetPartitions) + ) + case SEMISTRUCTURED if targetDir.isDefined => + OutputWriter.newFileSystemWriter( + location = targetDir.get, + format = ParquetFormat(Some(targetSchema)), + targetPartitions = targetPartitions, + loadMode = LoadMode.OverwritePartitions + ) + case anotherDataType => + throw new RuntimeException( + s"Unsupported data type: $anotherDataType in AppendLoad or the configuration file is malformed." + ) + } + + private def getTargetSchemaFromHiveTable: StructType = + targetTable match { + case Some(tableName) => + CatalogTableManager(tableName, spark).getSchemaSafely( + dfs, + targetPartitions, + dropDateDerivedColumns, + addCorruptRecordColumn, + Some(loadMode) + ) + case None => throw new RuntimeException("No schema definition found.") + } + + private def getTargetSchema: StructType = + dataType match { + case STRUCTURED => getTargetSchemaFromHiveTable + case SEMISTRUCTURED if jsonSchemaOption.isDefined => + DataType.fromJson(jsonSchemaOption.get.toString()).asInstanceOf[StructType] + case anotherDataType => + throw new RuntimeException( + s"Unsupported data type: $anotherDataType in AppendLoad or the configuration file is malformed." + ) + } + + override def loadMode: String = readerModeSetter(DropMalformedMode.name) +} diff --git a/src/main/scala/com/adidas/analytics/config/loads/DeltaLakeLoadConfiguration.scala b/src/main/scala/com/adidas/analytics/config/loads/DeltaLakeLoadConfiguration.scala new file mode 100644 index 0000000..7aeab40 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/config/loads/DeltaLakeLoadConfiguration.scala @@ -0,0 +1,162 @@ +package com.adidas.analytics.config.loads + +import com.adidas.analytics.algo.core.Algorithm.{SafeWriteOperation, UpdateStatisticsOperation} +import com.adidas.analytics.config.loads.DeltaLakeLoadConfiguration._ +import com.adidas.analytics.config.shared.{ + ConfigurationContext, + DateComponentDerivationConfiguration, + MetadataUpdateStrategy +} +import com.adidas.analytics.util.DataFormat.ParquetFormat +import com.adidas.analytics.util.DataFrameUtils.PartitionCriteria +import com.adidas.analytics.util.{CatalogTableManager, LoadMode, OutputWriter} +import org.apache.spark.sql.catalyst.util.PermissiveMode +import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.sql.{Row, SparkSession} +import org.slf4j.{Logger, LoggerFactory} +import scala.util.parsing.json.JSONObject + +trait DeltaLakeLoadConfiguration + extends ConfigurationContext + with LoadConfiguration + with UpdateStatisticsOperation + with MetadataUpdateStrategy + with DateComponentDerivationConfiguration + with SafeWriteOperation { + + protected def spark: SparkSession + + protected val isManualRepartitioning: Boolean = + if (configReader.getAsOption[Int]("output_partitions_num").isEmpty) false else true + + protected val outputPartitionsNum: Option[Int] = + configReader.getAsOption[Int]("output_partitions_num") + + configureDeltaSparkSession() + + // Delta Table Properties + protected val deltaTableDir: String = configReader.getAs[String]("delta_table_dir") + + protected val businessKey: 
Seq[String] = configReader.getAsSeq[String]("business_key") + + protected val businessKeyMatchOperator: String = + configReader.getAsOption[String]("business_key_match_operator").getOrElse("AND") + protected val currentDataAlias = "currentData" + protected val newDataAlias = "newData" + + protected val isToVacuum: Boolean = + configReader.getAsOption[Boolean]("is_to_vacuum").getOrElse(true) + + protected val vacuumRetentionPeriod: Int = + configReader.getAsOption[Int]("vacuum_retention_period").getOrElse(12) + + // Delta Condensation + protected val condensationKey: Seq[String] = configReader.getAsSeq[String]("condensation_key") + + protected val recordModeColumnName: String = configReader.getAs[String]("record_mode_column") + + protected val recordsToCondense: Seq[String] = + configReader.getAsSeq[String]("records_to_condense") + + protected val recordsToDelete: Seq[String] = configReader.getAsSeq[String]("records_to_delete") + + protected def recordModesFilterFunction: Row => Boolean = { row: Row => + recordsToCondense.contains(row.getAs[String](recordModeColumnName)) + } + + protected val initCondensationWithRecordMode: Boolean = + configReader.getAsOption[Boolean]("init_condensation_with_record_mode").getOrElse(true) + + /* ignoreAffectedPartitionsMerge: covers the case where the partition key of the lake table can't + * be used to control the merge match conditions, because it is not constant per record, meaning + * it can change with time with the rest of the attributes of a record. It instructs the delta + * lake load algorithm to ignore the delta table partitions when merging the current with new + * data. */ + protected val ignoreAffectedPartitionsMerge: Boolean = + configReader.getAsOption[Boolean]("ignore_affected_partitions_merge").getOrElse(true) + + protected var affectedPartitions: Seq[PartitionCriteria] = _ + + // Target Properties + protected val targetTable: String = configReader.getAs[String]("target_table") + + protected val targetSchema: StructType = + CatalogTableManager(targetTable, spark).getSchemaSafely(dfs) + + protected val targetTableColumns: Array[String] = spark.table(targetTable).columns + + override protected val targetPartitions: Seq[String] = + configReader.getAsSeq[String]("target_partitions") + + override protected val partitionSourceColumn: String = + configReader.getAs[String]("partition_column") + + override protected val partitionSourceColumnFormat: String = + configReader.getAs[String]("partition_column_format") + + override protected lazy val writer: OutputWriter.AtomicWriter = { + if (targetPartitions.nonEmpty) + OutputWriter.newTableLocationWriter( + table = targetTable, + format = ParquetFormat(Some(targetSchema)), + targetPartitions = targetPartitions, + metadataConfiguration = getMetaDataUpdateStrategy(targetTable, targetPartitions), + loadMode = LoadMode.OverwritePartitions + ) + else + OutputWriter.newTableLocationWriter( + table = targetTable, + format = ParquetFormat(Some(targetSchema)), + metadataConfiguration = getMetaDataUpdateStrategy(targetTable, targetPartitions), + loadMode = LoadMode.OverwriteTable + ) + } + + // JSON Source properties + protected val isMultilineJSON: Option[Boolean] = + configReader.getAsOption[Boolean]("is_multiline_json") + + protected val readJsonSchema: Option[StructType] = + configReader.getAsOption[JSONObject]("schema") match { + case Some(value) => Some(DataType.fromJson(value.toString()).asInstanceOf[StructType]) + case _ => None + } + + override def loadMode: String = readerModeSetter(PermissiveMode.name) 
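The optional "schema" parameter that feeds readJsonSchema above is expected to be Spark's own JSON encoding of a StructType. A small standalone sketch of that conversion (the field names are made up for the example):

import org.apache.spark.sql.types.{DataType, StructType}

object ReadJsonSchemaSketch {
  def main(args: Array[String]): Unit = {
    // JSON in the format produced by StructType.json / DataType.json.
    val json =
      """{"type":"struct","fields":[
        |{"name":"salesorder","type":"string","nullable":true,"metadata":{}},
        |{"name":"amount","type":"double","nullable":true,"metadata":{}}]}""".stripMargin

    val schema = DataType.fromJson(json).asInstanceOf[StructType]
    schema.printTreeString()
    // root
    //  |-- salesorder: string (nullable = true)
    //  |-- amount: double (nullable = true)
  }
}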
+ + /** Configures the Spark session specially for the DeltaLakeLoad algorithm + */ + def configureDeltaSparkSession(): Unit = { + spark.conf.set( + "spark.delta.logStore.class", + "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore" + ) + spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true") + spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false") + spark.conf.set("spark.delta.merge.repartitionBeforeWrite", "true") + spark.conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + if (!isManualRepartitioning) + spark.conf.set( + "spark.sql.shuffle.partitions", + Math.round( + spark.conf.getOption("spark.executor.instances").getOrElse("100").toInt * + spark.conf.getOption("spark.executor.cores").getOrElse("2").toInt + ) + ) + + logger.info( + s"Auto schema merge is ${spark.conf.get("spark.databricks.delta.schema.autoMerge.enabled")}" + ) + logger.info( + s"Repartition before write is ${spark.conf.get("spark.delta.merge.repartitionBeforeWrite")}" + ) + logger.info(s"Number of shuffle partitions: ${spark.conf.get("spark.sql.shuffle.partitions")}") + logger.info(s"Spark serializer: ${spark.conf.get("spark.serializer")}") + } + +} + +object DeltaLakeLoadConfiguration { + + private val logger: Logger = LoggerFactory.getLogger(this.getClass) +} diff --git a/src/main/scala/com/adidas/analytics/config/loads/DeltaLoadConfiguration.scala b/src/main/scala/com/adidas/analytics/config/loads/DeltaLoadConfiguration.scala new file mode 100644 index 0000000..fbd139a --- /dev/null +++ b/src/main/scala/com/adidas/analytics/config/loads/DeltaLoadConfiguration.scala @@ -0,0 +1,97 @@ +package com.adidas.analytics.config.loads + +import com.adidas.analytics.algo.core.Algorithm.{ + ReadOperation, + SafeWriteOperation, + UpdateStatisticsOperation +} +import com.adidas.analytics.config.shared.{ + ConfigurationContext, + DateComponentDerivationConfiguration, + MetadataUpdateStrategy +} +import com.adidas.analytics.util.DataFormat.ParquetFormat +import com.adidas.analytics.util._ +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{Row, SparkSession} + +trait DeltaLoadConfiguration + extends ConfigurationContext + with UpdateStatisticsOperation + with MetadataUpdateStrategy { + + protected val activeRecordsTable: String = configReader.getAs[String]("active_records_table_lake") + + protected val deltaRecordsTable: Option[String] = + configReader.getAsOption[String]("delta_records_table_lake") + + protected val deltaRecordsFilePath: Option[String] = + configReader.getAsOption[String]("delta_records_file_path") + + protected val businessKey: Seq[String] = configReader.getAsSeq[String]("business_key") + + protected val technicalKey: Seq[String] = configReader.getAsSeq[String]("technical_key") + + protected val rankingColumnName: String = "DELTA_LOAD_RANK" + protected val recordModeColumnName: String = "recordmode" + protected val upsertRecordModes: Seq[String] = Seq("", "N") + + protected def upsertRecordsModesFilterFunction: Row => Boolean = { row: Row => + var recordmode = "" + try recordmode = row.getAs[String](recordModeColumnName) + catch { case _: Throwable => recordmode = row.getAs[String](recordModeColumnName.toUpperCase) } + recordmode == null || recordmode == "" || recordmode == "N" + } +} + +object DeltaLoadConfiguration { + + trait PartitionedDeltaLoadConfiguration + extends DeltaLoadConfiguration + with DateComponentDerivationConfiguration + with ReadOperation + with SafeWriteOperation { + + protected def spark: 
SparkSession + + override protected val targetPartitions: Seq[String] = + configReader.getAsSeq[String]("target_partitions") + + override protected val partitionSourceColumn: String = + configReader.getAs[String]("partition_column") + + override protected val partitionSourceColumnFormat: String = + configReader.getAs[String]("partition_column_format") + + private val targetSchema: StructType = + CatalogTableManager(activeRecordsTable, spark).getSchemaSafely(dfs) + + override protected val readers: Vector[InputReader] = Vector( + createDeltaInputReader(deltaRecordsFilePath, deltaRecordsTable), + InputReader.newTableReader(table = activeRecordsTable) + ) + + override protected val writer: OutputWriter.AtomicWriter = OutputWriter.newTableLocationWriter( + table = activeRecordsTable, + format = ParquetFormat(Some(targetSchema)), + targetPartitions = targetPartitions, + metadataConfiguration = getMetaDataUpdateStrategy(activeRecordsTable, targetPartitions), + loadMode = LoadMode.OverwritePartitionsWithAddedColumns + ) + } + + private def createDeltaInputReader( + deltaRecordsFilePath: Option[String], + deltaRecordsTable: Option[String] + ): InputReader = { + def createInputReaderByPath: InputReader = + deltaRecordsFilePath.fold { + throw new RuntimeException( + "Unable to create a reader for the delta table: neither delta records path not delta table name is defined" + ) + }(location => InputReader.newFileSystemReader(location, DataFormat.ParquetFormat())) + + deltaRecordsTable + .fold(createInputReaderByPath)(tableName => InputReader.newTableReader(tableName)) + } +} diff --git a/src/main/scala/com/adidas/analytics/config/loads/FullLoadConfiguration.scala b/src/main/scala/com/adidas/analytics/config/loads/FullLoadConfiguration.scala new file mode 100644 index 0000000..9753630 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/config/loads/FullLoadConfiguration.scala @@ -0,0 +1,90 @@ +package com.adidas.analytics.config.loads + +import com.adidas.analytics.algo.core.Algorithm.BaseWriteOperation +import com.adidas.analytics.config.shared.{ + ConfigurationContext, + DateComponentDerivationConfiguration +} +import com.adidas.analytics.util.DataFormat.ParquetFormat +import com.adidas.analytics.util._ +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.util.FailFastMode +import org.apache.spark.sql.types.{DataType, StructType} +import scala.util.parsing.json.JSONObject + +trait FullLoadConfiguration + extends ConfigurationContext + with LoadConfiguration + with DateComponentDerivationConfiguration + with BaseWriteOperation { + + protected def spark: SparkSession + + protected def dfs: DFSWrapper + + protected val targetTable: String = configReader.getAs[String]("target_table") + + protected val baseDataDir: String = configReader.getAs[String]("base_data_dir") + + protected val currentTableLocation: String = + CatalogTableManager(targetTable, spark).getTableLocation + + protected val tableRootDir: String = + currentTableLocation.substring(0, currentTableLocation.lastIndexOf('/')) + + protected val nextTableLocation: String = + HadoopLoadHelper.buildTimestampedTablePath(new Path(tableRootDir, baseDataDir)).toString + + protected val nextTableLocationPrefix: String = + nextTableLocation.substring(nextTableLocation.lastIndexOf('/')) + + protected val isMultilineJSON: Option[Boolean] = + configReader.getAsOption[Boolean]("is_multiline_json") + + protected val dropDateDerivedColumns: Boolean = + configReader + 
.getAsOption[Boolean]("drop_date_derived_columns") + .getOrElse(if (loadMode == FailFastMode.name) true else false) + + protected val targetSchema: StructType = CatalogTableManager(targetTable, spark).getSchemaSafely( + dfs, + targetPartitions, + dropDateDerivedColumns, + addCorruptRecordColumn, + Some(loadMode) + ) + + protected val partitionSourceColumn: String = configReader.getAs[String]("partition_column") + + protected val partitionSourceColumnFormat: String = + configReader.getAs[String]("partition_column_format") + + override protected val outputFilesNum: Option[Int] = + configReader.getAsOption[Int]("output_files_num").orElse(Some(10)) + + protected val writer: OutputWriter.AtomicWriter = dataType match { + case STRUCTURED => + OutputWriter.newFileSystemWriter( + location = nextTableLocation, + format = ParquetFormat(Some(targetSchema)), + targetPartitions = targetPartitions, + loadMode = LoadMode.OverwriteTable + ) + case anotherDataType => + throw new RuntimeException(s"Unsupported data type: $anotherDataType for FullLoad.") + } + + protected val readJsonSchema: Option[StructType] = + configReader.getAsOption[JSONObject]("schema") match { + case Some(value) => Some(DataType.fromJson(value.toString()).asInstanceOf[StructType]) + case _ => None + } + + // effectively disable the empty string to null conversion here per default + override protected def readNullValue: Option[String] = + super.readNullValue.orElse(Some("XXNULLXXX")) + + override def loadMode: String = readerModeSetter(FailFastMode.name) + +} diff --git a/src/main/scala/com/adidas/analytics/config/shared/LoadConfiguration.scala b/src/main/scala/com/adidas/analytics/config/loads/LoadConfiguration.scala similarity index 58% rename from src/main/scala/com/adidas/analytics/config/shared/LoadConfiguration.scala rename to src/main/scala/com/adidas/analytics/config/loads/LoadConfiguration.scala index 748ab59..2e5f431 100644 --- a/src/main/scala/com/adidas/analytics/config/shared/LoadConfiguration.scala +++ b/src/main/scala/com/adidas/analytics/config/loads/LoadConfiguration.scala @@ -1,4 +1,4 @@ -package com.adidas.analytics.config.shared +package com.adidas.analytics.config.loads import com.adidas.analytics.util.ConfigReader import org.apache.spark.sql.catalyst.util.{DropMalformedMode, FailFastMode, PermissiveMode} @@ -8,31 +8,36 @@ trait LoadConfiguration { val SEMISTRUCTURED = "semistructured" private val fileDelimiter: Option[String] = configReader.getAsOption[String]("delimiter") + private val hasHeader: Option[Boolean] = configReader.getAsOption[Boolean]("has_header") + private val dateFormat: String = + configReader.getAsOption[String]("date_format").getOrElse("yyyy-MM-dd") + private val optionalSparkOptions: Map[String, String] = Map[String, Option[String]]( "nullValue" -> readNullValue, - "quote" -> readQuoteValue - ).collect { - case (key, Some(value)) => (key, value) - } + "quote" -> readQuoteValue, + "dateFormat" -> Some(dateFormat) + ).collect { case (key, Some(value)) => (key, value) } private val requiredSparkOptions: Map[String, String] = Map[String, Option[Any]]( "delimiter" -> fileDelimiter, "header" -> hasHeader, "mode" -> Some(loadMode) - ).collect { - case (key, Some(value)) => (key, value.toString) - } - + ).collect { case (key, Some(value)) => (key, value.toString) } protected val targetPartitions: Seq[String] = configReader.getAsSeq[String]("target_partitions") protected val inputDir: String = configReader.getAs[String]("source_dir") protected val fileFormat: String = 
configReader.getAs[String]("file_format") - protected val dataType: String = configReader.getAsOption[String]("data_type").getOrElse(STRUCTURED) + protected val dataType: String = + configReader.getAsOption[String]("data_type").getOrElse(STRUCTURED) + + protected val addCorruptRecordColumn: Boolean = + configReader.getAsOption[Boolean]("add_corrupt_record_column").getOrElse(false) - protected val sparkReaderOptions: Map[String, String] = requiredSparkOptions ++ optionalSparkOptions + protected val sparkReaderOptions: Map[String, String] = requiredSparkOptions ++ + optionalSparkOptions protected def configReader: ConfigReader @@ -42,19 +47,18 @@ trait LoadConfiguration { protected def readQuoteValue: Option[String] = configReader.getAsOption[String]("quote_character") - protected def computeTableStatistics: Boolean = configReader.getAsOption[Boolean]("compute_table_statistics").getOrElse(true) + protected def computeTableStatistics: Boolean = + configReader.getAsOption[Boolean]("compute_table_statistics").getOrElse(true) - protected def readerModeSetter(defaultMode: String): String = { + protected def readerModeSetter(defaultMode: String): String = configReader.getAsOption[String]("reader_mode") match { - case Some(mode) => { + case Some(mode) => mode.toUpperCase match { - case PermissiveMode.name => PermissiveMode.name - case FailFastMode.name => FailFastMode.name + case PermissiveMode.name => PermissiveMode.name + case FailFastMode.name => FailFastMode.name case DropMalformedMode.name => DropMalformedMode.name - case _ => throw new RuntimeException(s"Invalid reader mode: $mode provided") + case _ => throw new RuntimeException(s"Invalid reader mode: $mode provided") } - } case None => defaultMode } - } } diff --git a/src/main/scala/com/adidas/analytics/config/shared/ConfigurationContext.scala b/src/main/scala/com/adidas/analytics/config/shared/ConfigurationContext.scala index 4f4231f..902685b 100644 --- a/src/main/scala/com/adidas/analytics/config/shared/ConfigurationContext.scala +++ b/src/main/scala/com/adidas/analytics/config/shared/ConfigurationContext.scala @@ -4,7 +4,6 @@ import com.adidas.analytics.util.DFSWrapper._ import com.adidas.analytics.util.{ConfigReader, DFSWrapper} import org.apache.hadoop.fs.Path - trait ConfigurationContext extends Serializable { protected def dfs: DFSWrapper diff --git a/src/main/scala/com/adidas/analytics/config/shared/DataReshapingTaskConfig.scala b/src/main/scala/com/adidas/analytics/config/shared/DataReshapingTaskConfig.scala new file mode 100644 index 0000000..beb8146 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/config/shared/DataReshapingTaskConfig.scala @@ -0,0 +1,45 @@ +package com.adidas.analytics.config.shared + +import com.adidas.analytics.util.ConfigReader +import scala.util.parsing.json.{JSON, JSONObject} + +trait DataReshapingTaskConfig { + + protected def configReader: ConfigReader + + protected val additonalTask: Option[Map[String, String]] = + configReader.getAsOption[JSONObject]("additional_task") match { + case Some(value) => JSON.parseFull(value.toString()).asInstanceOf[Option[Map[String, String]]] + case _ => None + } + + protected val flattenTaskProperties: Option[Map[String, String]] = + getAdditionalSetting[Map[String, String]]("nested_task_properties", additonalTask) + + protected val transposeTaskProperties: Option[Map[String, String]] = + getAdditionalSetting[Map[String, String]]("transpose_task_properties", additonalTask) + + protected val enforceSchema: Boolean = + getAdditionalSetting[Boolean]("enforce_schema", 
additonalTask).getOrElse(false) + + protected def getAdditionalSetting[T]( + propertyName: String, + optSetting: Option[Map[String, String]] + ): Option[T] = + optSetting match { + case Some(x) => x.get(propertyName).asInstanceOf[Option[T]] + case _ => None + } + + protected def getProperties[T](propertyName: String, secondProperty: String): Option[T] = { + val property = Some(additonalTask.get(propertyName).asInstanceOf[Map[String, String]]) match { + case Some(x) => x.get(secondProperty).asInstanceOf[Option[T]] + case _ => None + } + property + } + + protected def getAdditionalTaskProperty[T](option: Option[T]): T = + option.getOrElse(throw new RuntimeException(s"$option value is missing")) + +} diff --git a/src/main/scala/com/adidas/analytics/config/shared/DateComponentDerivationConfiguration.scala b/src/main/scala/com/adidas/analytics/config/shared/DateComponentDerivationConfiguration.scala index 205fe69..eeaced7 100644 --- a/src/main/scala/com/adidas/analytics/config/shared/DateComponentDerivationConfiguration.scala +++ b/src/main/scala/com/adidas/analytics/config/shared/DateComponentDerivationConfiguration.scala @@ -1,6 +1,5 @@ package com.adidas.analytics.config.shared - trait DateComponentDerivationConfiguration { protected def partitionSourceColumn: String @@ -9,4 +8,3 @@ trait DateComponentDerivationConfiguration { protected def targetPartitions: Seq[String] } - diff --git a/src/main/scala/com/adidas/analytics/config/shared/MetadataUpdateStrategy.scala b/src/main/scala/com/adidas/analytics/config/shared/MetadataUpdateStrategy.scala index 891fbe3..7a7a9ef 100644 --- a/src/main/scala/com/adidas/analytics/config/shared/MetadataUpdateStrategy.scala +++ b/src/main/scala/com/adidas/analytics/config/shared/MetadataUpdateStrategy.scala @@ -5,12 +5,17 @@ import com.adidas.analytics.util.{RecoverPartitionsCustom, RecoverPartitionsNati trait MetadataUpdateStrategy extends ConfigurationContext { - protected def getMetaDataUpdateStrategy(targetTable: String, - partitionColumns: Seq[String]): Metadata = + protected def getMetaDataUpdateStrategy( + targetTable: String, + partitionColumns: Seq[String] + ): Metadata = configReader.getAsOption[String]("metadata_update_strategy") match { - case Some("SparkRecoverPartitionsNative") => RecoverPartitionsNative(targetTable, partitionColumns) - case Some("SparkRecoverPartitionsCustom") => RecoverPartitionsCustom(targetTable, partitionColumns) - case Some(invalidConfig) => throw new Exception(s"Invalid metadata update strategy ${invalidConfig}") + case Some("SparkRecoverPartitionsNative") => + RecoverPartitionsNative(targetTable, partitionColumns) + case Some("SparkRecoverPartitionsCustom") => + RecoverPartitionsCustom(targetTable, partitionColumns) + case Some(invalidConfig) => + throw new Exception(s"Invalid metadata update strategy $invalidConfig") case None => RecoverPartitionsNative(targetTable, partitionColumns) } diff --git a/src/main/scala/com/adidas/analytics/config/templates/AlgorithmTemplateConfiguration.scala b/src/main/scala/com/adidas/analytics/config/templates/AlgorithmTemplateConfiguration.scala new file mode 100644 index 0000000..db52128 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/config/templates/AlgorithmTemplateConfiguration.scala @@ -0,0 +1,98 @@ +package com.adidas.analytics.config.templates + +import com.adidas.analytics.algo.core.Algorithm.{ + ReadOperation, + SafeWriteOperation, + UpdateStatisticsOperation +} +import com.adidas.analytics.config.shared.{ConfigurationContext, MetadataUpdateStrategy} +import 
com.adidas.analytics.config.templates.AlgorithmTemplateConfiguration.ruleToLocalDate +import com.adidas.analytics.util.DataFormat.ParquetFormat +import com.adidas.analytics.util.{CatalogTableManager, InputReader, LoadMode, OutputWriter} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.types.StructType +import org.joda.time.format.DateTimeFormat +import org.joda.time.{Days, LocalDate} + +trait AlgorithmTemplateConfiguration + extends ConfigurationContext + with ReadOperation + with SafeWriteOperation + with UpdateStatisticsOperation + with MetadataUpdateStrategy { + + protected def spark: SparkSession + + /** This trait has the responsibility to obtain the required configurations for a given algorithm. + * In this template, it can be seen that values like source and target tables, dates, and readers + * and writers, are obtained in this class by mixing ConfigurationContext, ReadOperation and + * SafeWriteOperation. + * + * At the same time, AlgorithmTemplateConfiguration is mixed in the AlgorithmTemplate class, so + * it can use the values from the provided configuration. + * + * An easy way to see this, is to think of it as the parser of the algorithm json config file. + */ + protected val sourceTable: String = configReader.getAs[String]("source_table").trim + /* you can use a source location as parquet files on the lake instead of a hive table */ + /* protected val sourceLocation: String = + * configReader.getAs[String]("source_location").trim */ + + protected val targetTable: String = configReader.getAs[String]("target_table").trim + + protected val startDate: LocalDate = ruleToLocalDate(configReader.getAs[String]("date_from").trim) + + protected val endDate: LocalDate = ruleToLocalDate(configReader.getAs[String]("date_to").trim) + + protected val dateRange: Days = Days.daysBetween(startDate, endDate) + + protected val targetSchema: StructType = + CatalogTableManager(targetTable, spark).getSchemaSafely(dfs) + + override protected val readers: Vector[InputReader.TableReader] = Vector( + // Obtaining a reader for the algorithm + InputReader.newTableReader(table = sourceTable) + /* you can use a source location as parquet files on the lake instead of a hive table */ + /* InputReader.newFileSystemReader(sourceLocation, DataFormat.ParquetFormat()) */ + ) + + override protected val writer: OutputWriter.AtomicWriter = + /** Obtaining a writer for the algorithm. 
+ * + * Note that the LoadMode can be any of the following: + * + * -- OverwriteTable: which steps on the exiting files and writes the new records + * -- OverwritePartitions: which steps on the existing files inside a partition directory + * -- AppendJoinPartitions: which appends the records to existing ones in the partition + * directory by a Full Outer Join + * -- AppendUnionPartition: which appends the records to existing ones in the partition + * directory by a Union All + */ + OutputWriter.newTableLocationWriter( + table = targetTable, + format = ParquetFormat(Some(targetSchema)), + metadataConfiguration = getMetaDataUpdateStrategy(targetTable, Seq.empty), + targetPartitions = Seq.empty, + /* If partitions are required, this would look like, e.g., Seq("year", "month") */ + loadMode = LoadMode.OverwritePartitionsWithAddedColumns + ) +} + +object AlgorithmTemplateConfiguration { + + /** A companion object can alternatively be used to add helper methods In this case, there is a + * method to convert a date string to a specific date value, because in this example, date could + * also contain a string such as today and yesterday, as well as a pattern. + */ + + private val DatePattern = "([0-9]{4}-[0-9]{2}-[0-9]{2})".r + private val DateFormatter = DateTimeFormat.forPattern("yyyy-MM-dd") + + private def ruleToLocalDate(rule: String): LocalDate = + rule.trim match { + case DatePattern(dateString) => LocalDate.parse(dateString, DateFormatter) + case "today" => LocalDate.now() + case "yesterday" => LocalDate.now().minus(Days.ONE) + case _ => throw new IllegalArgumentException(s"Invalid date format: $rule") + } +} diff --git a/src/main/scala/com/adidas/analytics/util/CatalogTableManager.scala b/src/main/scala/com/adidas/analytics/util/CatalogTableManager.scala new file mode 100644 index 0000000..4754637 --- /dev/null +++ b/src/main/scala/com/adidas/analytics/util/CatalogTableManager.scala @@ -0,0 +1,183 @@ +package com.adidas.analytics.util + +import com.adidas.analytics.algo.shared.DateComponentDerivation +import com.adidas.analytics.util.DFSWrapper._ +import com.adidas.analytics.util.CatalogTableManager._ +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.catalyst.util.{FailFastMode, PermissiveMode} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.sql.{Row, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +class CatalogTableManager(table: String, sparkSession: SparkSession) { + private val KeyColumn = col("col_name") + private val ValueColumn = col("data_type") + private val LocationKey = "LOCATION" + + private lazy val attributes: Map[String, String] = sparkSession + .sql(s"describe formatted $table") + .select(KeyColumn, ValueColumn) + .withColumn(KeyColumn.toString, upper(KeyColumn)) + .collect() + .map(row => (row.getAs[String](KeyColumn.toString), row.getAs[String](ValueColumn.toString))) + .toMap + + /** Generic method to read attribute values from hive table description output + * + * @param key + * Key to be read (First column) + * @return + * Value (Second Column) + */ + def getValueForKey(key: String): String = attributes(key.toUpperCase) + + /** Read table location + * + * @return + * table location + */ + def getTableLocation: String = getValueForKey(LocationKey) + + /** Set table location + * + * @param location + * location of the table + * @return + * DataFrame + */ + def setTableLocation(location: String): Unit = + sparkSession.sql(s"ALTER TABLE $table SET 
$LocationKey '$location'") + + /** Set location of all table partitions + * + * @param location + * location of the table + */ + def setTablePartitionsLocation(location: String): Unit = { + val partitionSpecs = sparkSession.sql(s"SHOW PARTITIONS $table").collect() + partitionSpecs.foreach { row => + val originalSpec = row.getAs[String]("partition") + val formattedSpec = getFormattedPartitionSpec(row) + sparkSession.sql( + s"ALTER TABLE $table PARTITION ($formattedSpec) SET $LocationKey '$location/$originalSpec'" + ) + } + } + + /** Drop all table partitions + */ + def dropTablePartitions(): Unit = { + val partitionSpecs = sparkSession.sql(s"SHOW PARTITIONS $table").collect() + partitionSpecs.foreach { row => + val formattedSpec = getFormattedPartitionSpec(row) + sparkSession.sql(s"ALTER TABLE $table DROP IF EXISTS PARTITION ($formattedSpec)") + } + } + + /** Recreate the table in a new location by creating a new table like the old one in terms of + * schema. It uses a temporary table to transition from the old table definition to the new one, + * because the old table cannot be dropped until the new one is created. + * + * @param location + * location for the target table + * @param targetPartitions + * target table partitions + */ + def recreateTable(location: String, targetPartitions: Seq[String]): Unit = { + logger.info(s"Recreating table $table in location: $location") + + sparkSession.sql(createExternalTableStatement(table, s"${table}_tmp", location)) + sparkSession.sql(s"DROP TABLE IF EXISTS $table") + + sparkSession.sql(createExternalTableStatement(s"${table}_tmp", table, location)) + sparkSession.sql(s"DROP TABLE IF EXISTS ${table}_tmp") + + if (targetPartitions.nonEmpty) sparkSession.catalog.recoverPartitions(table) + + logger.info(s"Finished recreating table $table in location: $location") + } + + /** A method to get the target schema by first checking if the data folder exists. If not, data + * folder is created, avoiding getSchema to fail for certain Hive/Spark metastores. Target schema + * is given accordingly to the respective reader mode. 
+ * + * @param dfs + * DFSWrapper to create directory if the same does not exist + * @param targetPartitions + * List with the target partitions + * @param isDropDerivedColumns + * Boolean to remove or maintain target partitions in table initial schema + * @param addCorruptRecord + * Boolean to add corrupt_record column when permissive mode is used + * @param readerMode + * Failfast, Permissive or DropMalformed modes are available + * @return + * Target table schema + */ + def getSchemaSafely( + dfs: DFSWrapper, + targetPartitions: Seq[String] = Seq.empty[String], + isDropDerivedColumns: Boolean = false, + addCorruptRecord: Boolean = false, + readerMode: Option[String] = None + ): StructType = { + + def dropDateDerivedColumns(schema: StructType): StructType = { + var schemaStruct = new StructType() + schema.foreach { column => + if (!DateComponentDerivation.ALLOWED_DERIVATIONS.contains(column.name)) + schemaStruct = schemaStruct.add(column) + } + schemaStruct + } + + def addCorruptRecordColumn(schema: StructType): StructType = + schema.add(StructField("_corrupt_record", StringType, nullable = true)) + + try { + val tableLocation = new Path(getTableLocation) + dfs.getFileSystem(tableLocation).createDirIfNotExists(tableLocation) + logger.info(s"Retrieved table location: $tableLocation") + + val schema = sparkSession.table(table).schema + readerMode match { + case Some(FailFastMode.name) if isDropDerivedColumns => dropDateDerivedColumns(schema) + case Some(PermissiveMode.name) => + if (isDropDerivedColumns) + if (addCorruptRecord) addCorruptRecordColumn(dropDateDerivedColumns(schema)) + else dropDateDerivedColumns(schema) + else if (addCorruptRecord) addCorruptRecordColumn(schema) + else schema + case _ => schema + } + } catch { + case e: Throwable => + throw new RuntimeException(s"Unable to return or create table location: ", e) + } + } + + private def createExternalTableStatement( + sourceTable: String, + destinationTable: String, + location: String + ): String = s"CREATE TABLE IF NOT EXISTS $destinationTable LIKE $sourceTable LOCATION '$location'" + + private def getFormattedPartitionSpec(row: Row): String = + row + .getAs[String]("partition") + .split('/') + .map { p => + val pSplitted = p.split('=') + "%s='%s'".format(pSplitted(0), pSplitted(1)) + } + .mkString(",") +} + +object CatalogTableManager { + + private val logger: Logger = LoggerFactory.getLogger(getClass) + + def apply(tableName: String, sparkSession: SparkSession): CatalogTableManager = + new CatalogTableManager(tableName, sparkSession) +} diff --git a/src/main/scala/com/adidas/analytics/util/ConfigReader.scala b/src/main/scala/com/adidas/analytics/util/ConfigReader.scala index 46ddd2a..03dab88 100644 --- a/src/main/scala/com/adidas/analytics/util/ConfigReader.scala +++ b/src/main/scala/com/adidas/analytics/util/ConfigReader.scala @@ -1,61 +1,53 @@ package com.adidas.analytics.util import java.text.DecimalFormatSymbols - -import org.slf4j.{Logger, LoggerFactory} - import scala.util.parsing.json.{JSON, JSONArray, JSONObject} -/** - * Base class capable of reading parameters from config content +/** Base class capable of reading parameters from config content */ class ConfigReader(jsonContent: String) extends Serializable { - private val logger: Logger = LoggerFactory.getLogger(getClass) - private val decimalSeparator: Char = new DecimalFormatSymbols().getDecimalSeparator - JSON.globalNumberParser = (in: String) => if (in.contains(decimalSeparator)) in.toDouble else in.toInt + JSON.globalNumberParser = (in: String) => + if 
(in.contains(decimalSeparator)) in.toDouble else in.toInt private lazy val config = JSON.parseRaw(jsonContent) match { case Some(JSONObject(obj)) => obj - case _ => throw new IllegalArgumentException(s"Wrong format of the configuration file: $jsonContent") + case _ => + throw new IllegalArgumentException(s"Wrong format of the configuration file: $jsonContent") } - def getAsSeq[T](propertyName: String): Seq[T] = { + def getAsSeq[T](propertyName: String): Seq[T] = config.get(propertyName) match { case Some(JSONArray(list)) => list.map(_.asInstanceOf[T]) - case _ => throw new IllegalArgumentException(s"Unable to find configuration property $propertyName") + case _ => + throw new IllegalArgumentException(s"Unable to find configuration property $propertyName") } - } - def getAsMap[K, V](propertyName: String): Map[K,V] = { + def getAsMap[K, V](propertyName: String): Map[K, V] = config.get(propertyName) match { - case Some(JSONObject(obj)) => obj.asInstanceOf[Map[K,V]] - case _ => throw new IllegalArgumentException(s"Unable to find configuration property $propertyName") + case Some(JSONObject(obj)) => obj.asInstanceOf[Map[K, V]] + case _ => + throw new IllegalArgumentException(s"Unable to find configuration property $propertyName") } - } - def getAs[T](propertyName: String): T = { + def getAs[T](propertyName: String): T = config.get(propertyName) match { case Some(property) => property.asInstanceOf[T] - case None => throw new IllegalArgumentException(s"Unable to find configuration property $propertyName") + case None => + throw new IllegalArgumentException(s"Unable to find configuration property $propertyName") } - } - def getAsOption[T](propertyName: String): Option[T] = { + def getAsOption[T](propertyName: String): Option[T] = config.get(propertyName).map(property => property.asInstanceOf[T]) - } - def getAsOptionSeq[T](propertyName: String): Option[Seq[T]] = { + def getAsOptionSeq[T](propertyName: String): Option[Seq[T]] = config.get(propertyName).map(_ => getAsSeq(propertyName)) - } - def contains(propertyName: String): Boolean = { - config.contains(propertyName) - } + def contains(propertyName: String): Boolean = config.contains(propertyName) } object ConfigReader { def apply(jsonContent: String): ConfigReader = new ConfigReader(jsonContent) -} \ No newline at end of file +} diff --git a/src/main/scala/com/adidas/analytics/util/DFSWrapper.scala b/src/main/scala/com/adidas/analytics/util/DFSWrapper.scala index 1018b52..91e755e 100644 --- a/src/main/scala/com/adidas/analytics/util/DFSWrapper.scala +++ b/src/main/scala/com/adidas/analytics/util/DFSWrapper.scala @@ -1,41 +1,37 @@ package com.adidas.analytics.util import java.io.{IOException, ObjectInputStream, ObjectOutputStream, PrintWriter} - import com.adidas.analytics.util.DFSWrapper.ConfigurationWrapper import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.slf4j.{Logger, LoggerFactory} - import scala.collection.mutable import scala.io.Source import scala.util.{Failure, Success, Try} - -class DFSWrapper private(config: ConfigurationWrapper) extends Serializable { +class DFSWrapper private (config: ConfigurationWrapper) extends Serializable { @transient private val fsCache: mutable.Map[String, FileSystem] = mutable.Map.empty - def getFileSystem(path: Path): FileSystem = { + def getFileSystem(path: Path): FileSystem = fsCache.getOrElseUpdate(path.toUri.getHost, path.getFileSystem(config.hadoopConfiguration)) - } } object DFSWrapper { private val logger: Logger = LoggerFactory.getLogger(getClass) 
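A minimal usage sketch of how the reformatted ConfigReader and DFSWrapper above are typically combined when an algorithm bootstraps from a JSON parameter file on HDFS — assuming a SparkSession named spark is in scope; the HDFS path and the JSON keys are hypothetical, used only for illustration:

    import org.apache.hadoop.fs.Path
    import com.adidas.analytics.util.{ConfigReader, DFSWrapper}
    import com.adidas.analytics.util.DFSWrapper._ // brings readFile and the other FileSystem helpers into scope

    val dfs = DFSWrapper(spark.sparkContext.hadoopConfiguration)
    val paramsPath = new Path("hdfs:///tmp/params.json")                       // hypothetical location
    val configReader = ConfigReader(dfs.getFileSystem(paramsPath).readFile(paramsPath))
    val targetTable = configReader.getAs[String]("target_table")               // hypothetical key
    val targetPartitions = configReader.getAsSeq[String]("target_partitions")  // hypothetical key
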
- def apply(hadoopConfiguration: Configuration): DFSWrapper = { + def apply(hadoopConfiguration: Configuration): DFSWrapper = new DFSWrapper(new ConfigurationWrapper(hadoopConfiguration)) - } - final class ConfigurationWrapper(@transient var hadoopConfiguration: Configuration) extends Serializable { + final class ConfigurationWrapper( + @transient + var hadoopConfiguration: Configuration + ) extends Serializable { //noinspection ScalaUnusedSymbol - private def writeObject(out: ObjectOutputStream): Unit = { - hadoopConfiguration.write(out) - } + private def writeObject(out: ObjectOutputStream): Unit = hadoopConfiguration.write(out) //noinspection ScalaUnusedSymbol private def readObject(in: ObjectInputStream): Unit = { @@ -44,7 +40,6 @@ object DFSWrapper { } } - implicit class ExtendedFileSystem(fs: FileSystem) { def writeFile(path: Path, content: String): Unit = { @@ -54,16 +49,14 @@ object DFSWrapper { writer.close() } - def readFile(path: Path): String = { - Source.fromInputStream(fs.open(path)).mkString - } + def readFile(path: Path): String = Source.fromInputStream(fs.open(path)).mkString def ls(inputPath: Path, recursive: Boolean = false): Seq[Path] = { import RemoteIteratorWrapper._ fs.listFiles(inputPath, recursive).remoteIteratorToIterator.map(_.getPath).toVector } - def deleteAll(paths: Seq[Path], recursive: Boolean = false): Unit = { + def deleteAll(paths: Seq[Path], recursive: Boolean = false): Unit = paths.par.foreach { path => Try(fs.delete(path, recursive)) match { case Failure(e) => @@ -77,9 +70,8 @@ object DFSWrapper { case Success(true) => } } - } - def renameAll(sources: Seq[Path], targetDir: Path): Unit = { + def renameAll(sources: Seq[Path], targetDir: Path): Unit = sources.par.foreach { source => val path = new Path(targetDir, source.getName) Try(fs.rename(source, path)) match { @@ -94,29 +86,25 @@ object DFSWrapper { case Success(true) => } } - } - def createDirIfNotExists(path: Path): Unit = { - if(!fs.exists(path)) { - Try(fs.mkdirs(path)) match { - case Failure(e) => - val ex = new IOException(s"Unable to create $path", e) - logger.error(ex.getMessage) - throw new IOException(ex) - case Success(false) => - val ex = new IOException(s"Unable to create $path") - logger.error(ex.getMessage) - throw new IOException(ex) - case Success(true) => - } + def createDirIfNotExists(path: Path): Unit = + if (!fs.exists(path)) Try(fs.mkdirs(path)) match { + case Failure(e) => + val ex = new IOException(s"Unable to create $path", e) + logger.error(ex.getMessage) + throw new IOException(ex) + case Success(false) => + val ex = new IOException(s"Unable to create $path") + logger.error(ex.getMessage) + throw new IOException(ex) + case Success(true) => } - } } implicit class ExtendedPath(path: Path) { - def join(children: Seq[String]): Path = { + def join(children: Seq[String]): Path = children.foldLeft(path)((parentPath, suffix) => new Path(parentPath, suffix)) - } } + } diff --git a/src/main/scala/com/adidas/analytics/util/DataFormat.scala b/src/main/scala/com/adidas/analytics/util/DataFormat.scala index 79b3b76..b6971a9 100644 --- a/src/main/scala/com/adidas/analytics/util/DataFormat.scala +++ b/src/main/scala/com/adidas/analytics/util/DataFormat.scala @@ -4,7 +4,6 @@ import org.apache.spark.sql._ import org.apache.spark.sql.types.StructType import org.slf4j.{Logger, LoggerFactory} - sealed trait DataFormat { protected val logger: Logger = LoggerFactory.getLogger(getClass) @@ -14,7 +13,6 @@ sealed trait DataFormat { def write(writer: DataFrameWriter[Row], location: String): Unit } - 
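The DataFormat implementations in the companion object below wrap Spark's DataFrameReader/DataFrameWriter, so a caller picks a format once and reuses it for both reads and writes. A minimal sketch under that assumption, using only the constructors visible in this hunk (ParquetFormat and the new JSONFormat with its multiLine flag); the paths are hypothetical and spark is an assumed SparkSession:

    import com.adidas.analytics.util.DataFormat.{JSONFormat, ParquetFormat}

    val json = JSONFormat(multiLine = true) // schema inference is used while optionalSchema is None
    val df = json.read(spark.read, "/tmp/input/events.json")                          // hypothetical path
    ParquetFormat().write(df.write.mode("overwrite"), "/tmp/output/events_parquet")   // hypothetical path
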
object DataFormat { case class ParquetFormat(optionalSchema: Option[StructType] = None) extends DataFormat { @@ -36,7 +34,9 @@ object DataFormat { override def read(reader: DataFrameReader, locations: String*): DataFrame = { val filesString = locations.mkString(", ") logger.info(s"Reading DSV data from $filesString") - optionalSchema.fold(reader.option("inferSchema", "true"))(schema => reader.schema(schema)).csv(locations: _*) + optionalSchema + .fold(reader.option("inferSchema", "true"))(schema => reader.schema(schema)) + .csv(locations: _*) } override def write(writer: DataFrameWriter[Row], location: String): Unit = { @@ -45,12 +45,16 @@ object DataFormat { } } - case class JSONFormat(optionalSchema: Option[StructType] = None) extends DataFormat { + case class JSONFormat(optionalSchema: Option[StructType] = None, multiLine: Boolean = false) + extends DataFormat { override def read(reader: DataFrameReader, locations: String*): DataFrame = { val filesString = locations.mkString(", ") logger.info(s"Reading JSON data from $filesString") - optionalSchema.fold(reader.option("inferSchema", "true"))(schema => reader.schema(schema)).json(locations: _*) + optionalSchema + .fold(reader.option("inferSchema", "true"))(schema => reader.schema(schema)) + .option("multiline", multiLine) + .json(locations: _*) } override def write(writer: DataFrameWriter[Row], location: String): Unit = { @@ -58,5 +62,5 @@ object DataFormat { writer.json(location) } } -} +} diff --git a/src/main/scala/com/adidas/analytics/util/DataFrameUtils.scala b/src/main/scala/com/adidas/analytics/util/DataFrameUtils.scala index e45d522..f506ab1 100644 --- a/src/main/scala/com/adidas/analytics/util/DataFrameUtils.scala +++ b/src/main/scala/com/adidas/analytics/util/DataFrameUtils.scala @@ -4,7 +4,6 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, functions} import org.slf4j.{Logger, LoggerFactory} - object DataFrameUtils { private val logger: Logger = LoggerFactory.getLogger(getClass) @@ -13,32 +12,49 @@ object DataFrameUtils { type PartitionCriteria = Seq[(String, String)] - def mapPartitionsToDirectories(partitionCriteria: PartitionCriteria): Seq[String] = { - partitionCriteria.map { - case (columnName, columnValue) => s"$columnName=$columnValue" - } - } + def mapPartitionsToDirectories(partitionCriteria: PartitionCriteria): Seq[String] = + partitionCriteria.map { case (columnName, columnValue) => s"$columnName=$columnValue" } - def buildPartitionsCriteriaMatcherFunc(multiplePartitionsCriteria: Seq[PartitionCriteria], schema: StructType): FilterFunction = { + def buildPartitionsCriteriaMatcherFunc( + multiplePartitionsCriteria: Seq[PartitionCriteria], + schema: StructType + ): FilterFunction = { val targetPartitions = multiplePartitionsCriteria.flatten.map(_._1).toSet - val fieldNameToMatchFunctionMapping = schema.fields.filter { - case StructField(name, _, _, _) => targetPartitions.contains(name) - }.map { - case StructField(name, _: ByteType, _, _) => name -> ((r: Row, value: String) => r.getAs[Byte](name) == value.toByte) - case StructField(name, _: ShortType, _, _) => name -> ((r: Row, value: String) => r.getAs[Short](name) == value.toShort) - case StructField(name, _: IntegerType, _, _) => name -> ((r: Row, value: String) => r.getAs[Int](name) == value.toInt) - case StructField(name, _: LongType, _, _) => name -> ((r: Row, value: String) => r.getAs[Long](name) == value.toLong) - case StructField(name, _: FloatType, _, _) => name -> ((r: Row, value: String) => r.getAs[Float](name) == 
value.toFloat) - case StructField(name, _: DoubleType, _, _) => name -> ((r: Row, value: String) => r.getAs[Double](name) == value.toDouble) - case StructField(name, _: BooleanType, _, _) => name -> ((r: Row, value: String) => r.getAs[Boolean](name) == value.toBoolean) - case StructField(name, _: StringType, _, _) => name -> ((r: Row, value: String) => r.getAs[String](name) == value) - }.toMap - - def convertPartitionCriteriaToFilterFunctions(partitionCriteria: PartitionCriteria): Seq[FilterFunction] = partitionCriteria.map { - case (name, value) => (row: Row) => fieldNameToMatchFunctionMapping(name)(row, value) - } + val fieldNameToMatchFunctionMapping = + schema.fields + .filter { case StructField(name, _, _, _) => targetPartitions.contains(name) } + .map { + case StructField(name, _: ByteType, _, _) => + name -> ((r: Row, value: String) => r.getAs[Byte](name) == value.toByte) + case StructField(name, _: ShortType, _, _) => + name -> ((r: Row, value: String) => r.getAs[Short](name) == value.toShort) + case StructField(name, _: IntegerType, _, _) => + name -> ((r: Row, value: String) => r.getAs[Int](name) == value.toInt) + case StructField(name, _: LongType, _, _) => + name -> ((r: Row, value: String) => r.getAs[Long](name) == value.toLong) + case StructField(name, _: FloatType, _, _) => + name -> ((r: Row, value: String) => r.getAs[Float](name) == value.toFloat) + case StructField(name, _: DoubleType, _, _) => + name -> ((r: Row, value: String) => r.getAs[Double](name) == value.toDouble) + case StructField(name, _: BooleanType, _, _) => + name -> ((r: Row, value: String) => r.getAs[Boolean](name) == value.toBoolean) + case StructField(name, _: StringType, _, _) => + name -> ((r: Row, value: String) => r.getAs[String](name) == value) + case StructField(_, dataType, _, _) => + throw new Exception("Unsupported partition data type: " + dataType.getClass) + } + .toMap - def joinSinglePartitionFilterFunctionsWithAnd(partitionFilterFunctions: Seq[FilterFunction]): FilterFunction = + def convertPartitionCriteriaToFilterFunctions( + partitionCriteria: PartitionCriteria + ): Seq[FilterFunction] = + partitionCriteria.map { + case (name, value) => (row: Row) => fieldNameToMatchFunctionMapping(name)(row, value) + } + + def joinSinglePartitionFilterFunctionsWithAnd( + partitionFilterFunctions: Seq[FilterFunction] + ): FilterFunction = partitionFilterFunctions .reduceOption((predicate1, predicate2) => (row: Row) => predicate1(row) && predicate2(row)) .getOrElse((_: Row) => false) @@ -50,18 +66,20 @@ object DataFrameUtils { .getOrElse((_: Row) => false) } - implicit class DataFrameHelper(df: DataFrame) { def collectPartitions(targetPartitions: Seq[String]): Seq[PartitionCriteria] = { - logger.info(s"Collecting unique partitions for partitions columns (${targetPartitions.mkString(", ")})") + logger.info( + s"Collecting unique partitions for partitions columns (${targetPartitions.mkString(", ")})" + ) val partitions = df.selectExpr(targetPartitions: _*).distinct().collect() partitions.map { row => targetPartitions.map { columnName => Option(row.getAs[Any](columnName)) match { case Some(columnValue) => columnName -> columnValue.toString - case None => throw new RuntimeException(s"Partition column '$columnName' contains null value") + case None => + throw new RuntimeException(s"Partition column '$columnName' contains null value") } } } @@ -70,11 +88,8 @@ object DataFrameUtils { def addMissingColumns(targetSchema: StructType): DataFrame = { val dataFieldsSet = df.schema.fieldNames.toSet val selectColumns = 
targetSchema.fields.map { field => - if (dataFieldsSet.contains(field.name)) { - functions.col(field.name) - } else { - functions.lit(null).cast(field.dataType).as(field.name) - } + if (dataFieldsSet.contains(field.name)) functions.col(field.name) + else functions.lit(null).cast(field.dataType).as(field.name) } df.select(selectColumns: _*) } diff --git a/src/main/scala/com/adidas/analytics/util/DistCpLoadHelper.scala b/src/main/scala/com/adidas/analytics/util/DistCpLoadHelper.scala deleted file mode 100644 index 0e3604e..0000000 --- a/src/main/scala/com/adidas/analytics/util/DistCpLoadHelper.scala +++ /dev/null @@ -1,144 +0,0 @@ -package com.adidas.analytics.util - -import java.io.IOException - -import com.adidas.analytics.util.DFSWrapper._ -import org.apache.hadoop.fs.{FileSystem, Path} -import org.joda.time.LocalDateTime -import org.slf4j.{Logger, LoggerFactory} - -import scala.util.{Failure, Success, Try} - - -object DistCpLoadHelper { - - private val logger: Logger = LoggerFactory.getLogger(getClass) - - - def buildTempPath(directoryPath: Path): Path = { - val dateTime = LocalDateTime.now().toString("yyyyMMdd_HHmm") - new Path(directoryPath.getParent, s"${directoryPath.getName}_tmp_$dateTime") - } - - def cleanupDirectoryContent(fs: FileSystem, dir: Path): Unit = { - logger.info(s"Cleaning up location $dir") - try { - val childObjects = fs.listStatus(dir).map(_.getPath) - fs.deleteAll(childObjects, recursive = true) - logger.info("Cleanup successfully completed") - } catch { - case e: Throwable => throw new RuntimeException(s"Unable to cleanup directory $dir", e) - } - } - - def cleanupDirectoryContent(fs: FileSystem, dirString: String): Unit = { - val dir = new Path(dirString) - cleanupDirectoryContent(fs, dir) - } - - def cleanupDirectoryContent(dfs: DFSWrapper, dirString: String): Unit = { - val dir = new Path(dirString) - val fs = dfs.getFileSystem(dir) - cleanupDirectoryContent(fs, dir) - } - - def backupDirectoryContent(fs: FileSystem, sourceDir: Path, backupDir: Path): Unit = { - logger.info(s"Creating backup $sourceDir -> $backupDir") - try { - copyChildren(fs, sourceDir, backupDir) - logger.info("Backup successfully created") - } catch { - case e: Throwable => throw new RuntimeException(s"Unable to backup content of $sourceDir", e) - } - } - - def backupDirectoryContent(fs: FileSystem, sourceDirString: String, backupDirString: String): Unit = { - val sourceDir = new Path(sourceDirString) - val backupDir = new Path(backupDirString) - backupDirectoryContent(fs, sourceDir, backupDir) - } - - def backupDirectoryContent(dfs: DFSWrapper, sourceDirString: String, backupDirString: String): Unit = { - val sourceDir = new Path(sourceDirString) - val backupDir = new Path(backupDirString) - val fs = dfs.getFileSystem(sourceDir) - backupDirectoryContent(fs, sourceDir, backupDir) - } - - def restoreDirectoryContent(fs: FileSystem, sourceDir: Path, backupDir: Path): Unit = { - logger.info(s"Restoring directory state $backupDir -> $sourceDir") - try { - cleanupDirectoryContent(fs, sourceDir) - copyChildren(fs, backupDir, sourceDir) - logger.info("Previous state was successfully restored") - } catch { - case e: Throwable => throw new RuntimeException(s"Unable to restore state of $sourceDir", e) - } - } - - def restoreDirectoryContent(fs: FileSystem, sourceDirString: String, backupDirString: String): Unit = { - val sourcePath = new Path(sourceDirString) - val backupPath = new Path(backupDirString) - restoreDirectoryContent(fs, sourcePath, backupPath) - } - - def restoreDirectoryContent(dfs: 
DFSWrapper, sourceDirString: String, backupDirString: String): Unit = { - val sourcePath = new Path(sourceDirString) - val backupPath = new Path(backupDirString) - val fs = dfs.getFileSystem(sourcePath) - restoreDirectoryContent(fs, sourcePath, backupPath) - } - - def createCopySpecs(fs: FileSystem, sourceDir: Path, targetDir: Path, partitionsCriteria: Seq[Seq[(String, String)]]): Seq[DistCpSpec] = { - partitionsCriteria.map { partitionCriteria => - val subdirectories = DataFrameUtils.mapPartitionsToDirectories(partitionCriteria) - DistCpSpec(sourceDir.join(subdirectories), targetDir.join(subdirectories)) - }.filter(spec => fs.exists(spec.source)) - } - - def backupPartitions(fs: FileSystem, backupSpecs: Seq[DistCpSpec]): Unit = { - try { - copyDirectories(fs, backupSpecs) - fs.deleteAll(backupSpecs.map(_.source), recursive = true) - } catch { - case e: Throwable => throw new RuntimeException("Unable to backup partitions", e) - } - } - - def restorePartitions(fs: FileSystem, backupSpecs: Seq[DistCpSpec]): Unit = { - try { - copyDirectories(fs, backupSpecs.map(spec => DistCpSpec(spec.target, spec.source))) - } catch { - case e: Throwable => throw new RuntimeException("Unable to restore partitions", e) - } - } - - def loadPartitions(fs: FileSystem, loadSpecs: Seq[DistCpSpec]): Unit = { - try { - copyDirectories(fs, loadSpecs) - } catch { - case e: Throwable => throw new RuntimeException("Unable to load partitions", e) - } - } - - def copyChildren(fs: FileSystem, sourceDir: Path, targetDir: Path): Unit = { - Try { - DistCpWrapper(fs.getConf, Seq(sourceDir), targetDir).run(overwrite = true) - } match { - case Failure(e) => throw new IOException(s"Unable to copy directory content $sourceDir -> $targetDir", e) - case Success(_) => - } - } - - private def copyDirectories(fs: FileSystem, specs: Seq[DistCpSpec]): Unit = { - specs.foreach { spec => - logger.info(s"Copying partition directory ${spec.source} -> ${spec.target}") - if (fs.exists(spec.target) || !fs.mkdirs(spec.target)) { - throw new IOException(s"Unable to create target directory ${spec.target}") - } - copyChildren(fs, spec.source, spec.target) - } - } - - protected case class DistCpSpec(source: Path, target: Path) -} diff --git a/src/main/scala/com/adidas/analytics/util/DistCpWrapper.scala b/src/main/scala/com/adidas/analytics/util/DistCpWrapper.scala index 9e8fd76..3bc2c7c 100644 --- a/src/main/scala/com/adidas/analytics/util/DistCpWrapper.scala +++ b/src/main/scala/com/adidas/analytics/util/DistCpWrapper.scala @@ -4,13 +4,11 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.tools.{DistCp, DistCpOptions} - -import scala.collection.JavaConversions._ - +import scala.collection.JavaConverters._ class DistCpWrapper(conf: Configuration, sources: Seq[Path], target: Path) { - private val baseOptions = new DistCpOptions(sources, target) + private val baseOptions = new DistCpOptions(sources.asJava, target) def run(mapsNum: Int = 10, atomic: Boolean = false, overwrite: Boolean = false): Job = { val options = new DistCpOptions(baseOptions) @@ -29,7 +27,6 @@ class DistCpWrapper(conf: Configuration, sources: Seq[Path], target: Path) { object DistCpWrapper { - def apply(conf: Configuration, sources: Seq[Path], target: Path): DistCpWrapper = { + def apply(conf: Configuration, sources: Seq[Path], target: Path): DistCpWrapper = new DistCpWrapper(conf, sources, target) - } -} \ No newline at end of file +} diff --git 
a/src/main/scala/com/adidas/analytics/util/HadoopLoadHelper.scala b/src/main/scala/com/adidas/analytics/util/HadoopLoadHelper.scala index b8abf25..51173ef 100644 --- a/src/main/scala/com/adidas/analytics/util/HadoopLoadHelper.scala +++ b/src/main/scala/com/adidas/analytics/util/HadoopLoadHelper.scala @@ -1,17 +1,25 @@ package com.adidas.analytics.util import java.io.IOException - import com.adidas.analytics.util.DFSWrapper._ -import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.fs.{FileSystem, FileUtil, Path, PathFilter} import org.joda.time.LocalDateTime import org.slf4j.{Logger, LoggerFactory} - +import scala.util.{Failure, Success, Try} object HadoopLoadHelper { private val logger: Logger = LoggerFactory.getLogger(getClass) + def buildTimestampedTablePath(directoryPath: Path): Path = { + val dateTime = LocalDateTime.now().toString("yyyyMMddHHmmssSSS") + new Path(directoryPath.getParent, s"${directoryPath.getName}_$dateTime") + } + + def buildUTCTimestampedTablePath(directoryPath: Path): Path = { + val dateTime = s"${LocalDateTime.now().toString("yyyyMMdd_HHmmss")}_UTC" + new Path(directoryPath, s"$dateTime") + } def buildTempPath(directoryPath: Path): Path = { val dateTime = LocalDateTime.now().toString("yyyyMMdd_HHmm") @@ -20,7 +28,7 @@ object HadoopLoadHelper { def cleanupDirectoryContent(fs: FileSystem, dir: Path): Unit = { logger.info(s"Cleaning up location $dir") - try { + try if (fs.exists(dir)) { fs.deleteAll(fs.listStatus(dir).map(_.getPath), recursive = true) logger.info("Cleanup successfully completed") } catch { @@ -33,104 +41,205 @@ object HadoopLoadHelper { cleanupDirectoryContent(fs, dir) } - def backupDirectoryContent(fs: FileSystem, sourceDir: Path, backupDir: Path): Unit = { + def cleanupDirectoryContent(dfs: DFSWrapper, dirString: String): Unit = { + val dir = new Path(dirString) + val fs = dfs.getFileSystem(dir) + cleanupDirectoryContent(fs, dir) + } + + def cleanupDirectoryLeftovers(fs: FileSystem, dir: Path, ignorePrefixes: Seq[String]): Unit = { + logger.info(s"Cleaning up leftovers in directory $dir") + try if (fs.exists(dir)) { + fs.deleteAll( + fs.listStatus(dir) + .map(_.getPath) + .filter(path => !ignorePrefixes.exists(path.toString.contains)), + recursive = true + ) + logger.info("Cleaning up leftovers successfully completed") + } else logger.info(s"Folder $dir did not exist! Cleanup of leftovers skipped") + catch { + case e: Throwable => + throw new RuntimeException(s"Unable to cleanup leftovers in directory $dir", e) + } + } + + def cleanupDirectoryLeftovers(dfs: DFSWrapper, dirString: String, ignorePrefix: String): Unit = { + val dir = new Path(dirString) + val fs = dfs.getFileSystem(dir) + cleanupDirectoryLeftovers(fs, dir, Seq(ignorePrefix)) + } + + def cleanupDirectoryLeftovers( + dfs: DFSWrapper, + dirString: String, + ignorePrefixes: Seq[String] + ): Unit = { + val dir = new Path(dirString) + val fs = dfs.getFileSystem(dir) + cleanupDirectoryLeftovers(fs, dir, ignorePrefixes) + } + + /** Gets a list of the ordered subfolders in a specific folder. 
+ * + * @param dfs + * distributed file system + * @param dir + * directory to check for subfolders + * @param maxResults + * Optional number of subfolders to get + * @param fileFilter + * optional PathFilter to avoid considering some folders/files in the ordering process (e.g., + * S3 EMR folder placeholders, partition folders) + * @param ordering + * type of string ordering + * @return + * list of ordered subfolders + */ + def getOrderedSubFolders( + dfs: DFSWrapper, + dir: String, + maxResults: Option[Int] = None, + fileFilter: Option[PathFilter] = None, + ordering: Ordering[String] = Ordering.String.reverse + ): Seq[String] = { + val tableParentDirPath = new Path(dir) + val fs = dfs.getFileSystem(tableParentDirPath) + + val files = { + if (fileFilter.isDefined) fs.listStatus(tableParentDirPath, fileFilter.get) + else fs.listStatus(tableParentDirPath) + } + + val prefixes = files.map(_.getPath.getName).toSeq.sorted(ordering) + + if (maxResults.isDefined) prefixes.take(maxResults.get) else prefixes + } + + def backupDirectoryContent( + fs: FileSystem, + sourceDir: Path, + backupDir: Path, + move: Boolean = true + ): Unit = { logger.info(s"Creating backup $sourceDir -> $backupDir") try { - if (fs.exists(backupDir) || !fs.mkdirs(backupDir)) { - throw new IOException(s"Unable to create target directory ${backupDir}") - } - moveChildren(fs, sourceDir, backupDir) + if (move) { + if (fs.exists(backupDir) || !fs.mkdirs(backupDir)) + throw new IOException(s"Unable to create target directory $backupDir") + moveChildren(fs, sourceDir, backupDir) + } else copyChildren(fs, sourceDir, backupDir) logger.info("Backup successfully created") } catch { case e: Throwable => throw new RuntimeException(s"Unable to backup content of $sourceDir", e) } } - def backupDirectoryContent(fs: FileSystem, sourceDirString: String, backupDirString: String): Unit = { + def backupDirectoryContent( + dfs: DFSWrapper, + sourceDirString: String, + backupDirString: String, + move: Boolean + ): Unit = { val sourceDir = new Path(sourceDirString) val backupDir = new Path(backupDirString) - backupDirectoryContent(fs, sourceDir, backupDir) + val fs = dfs.getFileSystem(sourceDir) + backupDirectoryContent(fs, sourceDir, backupDir, move) } - def restoreDirectoryContent(fs: FileSystem, sourceDir: Path, backupDir: Path): Unit = { + def restoreDirectoryContent( + fs: FileSystem, + sourceDir: Path, + backupDir: Path, + move: Boolean = true + ): Unit = { logger.info(s"Restoring directory state $backupDir -> $sourceDir") try { cleanupDirectoryContent(fs, sourceDir) - moveChildren(fs, backupDir, sourceDir) + if (move) moveChildren(fs, backupDir, sourceDir) else copyChildren(fs, backupDir, sourceDir) logger.info("Previous state was successfully restored") } catch { case e: Throwable => throw new RuntimeException(s"Unable to restore state of $sourceDir", e) } } - def restoreDirectoryContent(fs: FileSystem, sourceDirString: String, backupDirString: String): Unit = { + def restoreDirectoryContent( + dfs: DFSWrapper, + sourceDirString: String, + backupDirString: String, + move: Boolean + ): Unit = { val sourcePath = new Path(sourceDirString) val backupPath = new Path(backupDirString) - restoreDirectoryContent(fs, sourcePath, backupPath) + val fs = dfs.getFileSystem(sourcePath) + restoreDirectoryContent(fs, sourcePath, backupPath, move) } - def createMoveSpecs(fs: FileSystem, sourceDir: Path, targetDir: Path, partitionsCriteria: Seq[Seq[(String, String)]]): Seq[MoveSpec] = { - partitionsCriteria.map { partitionCriteria => - val subdirectories = 
DataFrameUtils.mapPartitionsToDirectories(partitionCriteria) - MoveSpec(sourceDir.join(subdirectories), targetDir.join(subdirectories)) - }.filter(spec => fs.exists(spec.source)) - } + def createMoveSpecs( + fs: FileSystem, + sourceDir: Path, + targetDir: Path, + partitionsCriteria: Seq[Seq[(String, String)]] + ): Seq[MoveSpec] = + partitionsCriteria + .map { partitionCriteria => + val subdirectories = DataFrameUtils.mapPartitionsToDirectories(partitionCriteria) + MoveSpec(sourceDir.join(subdirectories), targetDir.join(subdirectories)) + } + .filter(spec => fs.exists(spec.source)) - def backupPartitions(fs: FileSystem, backupSpecs: Seq[MoveSpec]): Unit = { + def backupPartitions(fs: FileSystem, backupSpecs: Seq[MoveSpec]): Unit = try { moveDirectories(fs, backupSpecs) - fs.deleteAll(backupSpecs.map(_.source), recursive = true) - } catch { - case e: Throwable => throw new RuntimeException("Unable to backup partitions", e) - } - } + fs.deleteAll(backupSpecs.filter(ms => fs.exists(ms.source)).map(_.source), recursive = true) + } catch { case e: Throwable => throw new RuntimeException("Unable to backup partitions", e) } - def restorePartitions(fs: FileSystem, backupSpecs: Seq[MoveSpec]): Unit = { + def restorePartitions(fs: FileSystem, backupSpecs: Seq[MoveSpec]): Unit = try { val restoreSpecs = backupSpecs.map(_.reverse) - fs.deleteAll(restoreSpecs.map(_.target), recursive = true) + fs.deleteAll(restoreSpecs.filter(ms => fs.exists(ms.target)).map(_.target), recursive = true) moveDirectories(fs, restoreSpecs) - } catch { - case e: Throwable => throw new RuntimeException("Unable to restore partitions", e) - } - } + } catch { case e: Throwable => throw new RuntimeException("Unable to restore partitions", e) } - def loadTable(fs: FileSystem, sourceDir: Path, targetDir: Path): Unit = { - try { - moveChildren(fs, sourceDir, targetDir) - } catch { - case e: Throwable => throw new RuntimeException("Unable to load table", e) - } - } + def loadTable(fs: FileSystem, sourceDir: Path, targetDir: Path): Unit = + try moveChildren(fs, sourceDir, targetDir) + catch { case e: Throwable => throw new RuntimeException("Unable to load table", e) } - def loadPartitions(fs: FileSystem, loadSpecs: Seq[MoveSpec]): Unit = { - try { - moveDirectories(fs, loadSpecs) - } catch { - case e: Throwable => throw new RuntimeException("Unable to load partitions", e) - } - } + def loadPartitions(fs: FileSystem, loadSpecs: Seq[MoveSpec]): Unit = + try moveDirectories(fs, loadSpecs) + catch { case e: Throwable => throw new RuntimeException("Unable to load partitions", e) } - private def moveChildren(fs: FileSystem, sourceDir: Path, targetDir: Path): Unit = { + private def moveChildren(fs: FileSystem, sourceDir: Path, targetDir: Path): Unit = try { val childObjects = fs.listStatus(sourceDir).map(_.getPath) fs.renameAll(childObjects, targetDir) } catch { - case e: Throwable => throw new IOException(s"Unable to move directory content $sourceDir -> $targetDir", e) + case e: Throwable => + throw new IOException(s"Unable to move directory content $sourceDir -> $targetDir", e) } - } - private def moveDirectories(fs: FileSystem, specs: Seq[MoveSpec]): Unit = { + def copyChildren(fs: FileSystem, sourceDir: Path, targetDir: Path): Unit = + Try { + fs.listStatus(sourceDir).foreach { file => + FileUtil + .copy(fs, file, fs, targetDir.join(Seq(file.getPath.getName)), false, true, fs.getConf) + } + } match { + case Failure(e) => + throw new IOException(s"Unable to copy directory content $sourceDir -> $targetDir", e) + case Success(_) => + } + + 
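The helpers above compose into the backup / load / restore-on-failure pattern that the OutputWriter changes later in this patch rely on. A minimal sketch, assuming fs is a Hadoop FileSystem obtained through DFSWrapper and that staged data has already been written under the temporary directory; the directory layout and table location are hypothetical:

    import org.apache.hadoop.fs.Path
    import com.adidas.analytics.util.DFSWrapper._ // for Path.join

    val finalDir   = new Path("/lake/test_db/test_table")     // hypothetical table location
    val tempDir    = HadoopLoadHelper.buildTempPath(finalDir) // <table>_tmp_<timestamp>
    val stagingDir = tempDir.join(Seq("data"))
    val backupDir  = tempDir.join(Seq("backup"))

    HadoopLoadHelper.backupDirectoryContent(fs, finalDir, backupDir)      // move current content aside
    try HadoopLoadHelper.loadTable(fs, stagingDir, finalDir)              // promote staged data
    catch {
      case e: Throwable =>
        HadoopLoadHelper.restoreDirectoryContent(fs, finalDir, backupDir) // roll back on failure
        throw e
    }
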
private def moveDirectories(fs: FileSystem, specs: Seq[MoveSpec]): Unit = specs.par.foreach { spec => logger.info(s"Moving partition directory ${spec.source} -> ${spec.target}") - if (fs.exists(spec.target) || !fs.mkdirs(spec.target)) { + if (fs.exists(spec.target) || !fs.mkdirs(spec.target)) throw new IOException(s"Unable to create target directory ${spec.target}") - } moveChildren(fs, spec.source, spec.target) } - } protected case class MoveSpec(source: Path, target: Path) { def reverse: MoveSpec = MoveSpec(target, source) } + } diff --git a/src/main/scala/com/adidas/analytics/util/HiveTableAttributeReader.scala b/src/main/scala/com/adidas/analytics/util/HiveTableAttributeReader.scala deleted file mode 100644 index c4582d2..0000000 --- a/src/main/scala/com/adidas/analytics/util/HiveTableAttributeReader.scala +++ /dev/null @@ -1,39 +0,0 @@ -package com.adidas.analytics.util - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.functions._ - - -class HiveTableAttributeReader(table: String, sparkSession: SparkSession) { - private val KeyColumn = col("col_name") - private val ValueColumn = col("data_type") - private val LocationKey = "LOCATION" - - - private lazy val attributes: Map[String, String] = sparkSession.sql(s"describe formatted $table") - .select(KeyColumn, ValueColumn) - .withColumn(KeyColumn.toString, upper(KeyColumn)) - .collect() - .map(row => (row.getAs[String](KeyColumn.toString), row.getAs[String](ValueColumn.toString))) - .toMap - - /** - * Generic method to read attribute values from hive table description output - * @param key Key to be read (First column) - * @return Value (Second Column) - */ - def getValueForKey(key: String): String = attributes(key.toUpperCase) - - /** - * Read table location - * @return table location - */ - def getTableLocation: String = getValueForKey(LocationKey) -} - - -object HiveTableAttributeReader { - def apply(tableName: String, sparkSession: SparkSession): HiveTableAttributeReader = { - new HiveTableAttributeReader(tableName, sparkSession) - } -} diff --git a/src/main/scala/com/adidas/analytics/util/InputReader.scala b/src/main/scala/com/adidas/analytics/util/InputReader.scala index 0db0373..adbca2d 100644 --- a/src/main/scala/com/adidas/analytics/util/InputReader.scala +++ b/src/main/scala/com/adidas/analytics/util/InputReader.scala @@ -3,8 +3,7 @@ package com.adidas.analytics.util import org.apache.spark.sql.{DataFrame, SparkSession} import org.slf4j.{Logger, LoggerFactory} -/** - * Base trait for classes which are capable of reading data into DataFrames +/** Base trait for classes which are capable of reading data into DataFrames */ sealed abstract class InputReader { @@ -13,61 +12,76 @@ sealed abstract class InputReader { def read(sparkSession: SparkSession): DataFrame } - object InputReader { - /** - * Factory method which creates TableReader + /** Factory method which creates TableReader * - * @param table source table to read data from - * @param options Spark reader options - * @return TableReader + * @param table + * source table to read data from + * @param options + * Spark reader options + * @return + * TableReader */ - def newTableReader(table: String, options: Map[String, String] = Map.empty): TableReader = { + def newTableReader(table: String, options: Map[String, String] = Map.empty): TableReader = TableReader(table, options) - } - /** - * Factory method which creates FileSystemReader + /** Factory method which creates FileSystemReader * - * @param location location to read data from - * @param format 
format of source data - * @param options Spark reader options - * @return FileSystemReader + * @param location + * location to read data from + * @param format + * format of source data + * @param options + * Spark reader options + * @return + * FileSystemReader */ - def newFileSystemReader(location: String, format: DataFormat, options: Map[String, String] = Map.empty): FileSystemReader = { - FileSystemReader(location, format, options) - } + def newFileSystemReader( + location: String, + format: DataFormat, + options: Map[String, String] = Map.empty + ): FileSystemReader = FileSystemReader(location, format, options) - /** - * Factory method which creates TableLocationReader + /** Factory method which creates TableLocationReader * - * @param table source table which location is used to read data from - * @param format format of source data - * @param options Spark reader options - * @return TableLocationReader + * @param table + * source table which location is used to read data from + * @param format + * format of source data + * @param options + * Spark reader options + * @return + * TableLocationReader */ - def newTableLocationReader(table: String, format: DataFormat, options: Map[String, String] = Map.empty): TableLocationReader = { - TableLocationReader(table, format, options) - } + def newTableLocationReader( + table: String, + format: DataFormat, + options: Map[String, String] = Map.empty + ): TableLocationReader = TableLocationReader(table, format, options) case class TableReader(table: String, options: Map[String, String]) extends InputReader { + override def read(sparkSession: SparkSession): DataFrame = { logger.info(s"Reading data from table $table") sparkSession.read.options(options).table(table) } } - case class FileSystemReader(location: String, format: DataFormat, options: Map[String, String]) extends InputReader { + case class FileSystemReader(location: String, format: DataFormat, options: Map[String, String]) + extends InputReader { + override def read(sparkSession: SparkSession): DataFrame = { logger.info(s"Reading data from location $location") format.read(sparkSession.read.options(options), location) } } - case class TableLocationReader(table: String, format: DataFormat, options: Map[String, String]) extends InputReader { + case class TableLocationReader(table: String, format: DataFormat, options: Map[String, String]) + extends InputReader { + override def read(sparkSession: SparkSession): DataFrame = { - val location = HiveTableAttributeReader(table, sparkSession).getTableLocation + val location = CatalogTableManager(table, sparkSession).getTableLocation logger.info(s"Reading data from location $location") format.read(sparkSession.read.options(options), location) } diff --git a/src/main/scala/com/adidas/analytics/util/JavaConsumable.scala b/src/main/scala/com/adidas/analytics/util/JavaConsumable.scala index fc127ce..553f803 100644 --- a/src/main/scala/com/adidas/analytics/util/JavaConsumable.scala +++ b/src/main/scala/com/adidas/analytics/util/JavaConsumable.scala @@ -1,18 +1,14 @@ package com.adidas.analytics.util import org.apache.spark.sql.DataFrame - import scala.collection.JavaConverters._ trait JavaConsumable { def algorithm(dataFrames: Vector[DataFrame]): Vector[DataFrame] - def algorithm(dataFrames: java.util.List[DataFrame]): java.util.List[DataFrame] = { + def algorithm(dataFrames: java.util.List[DataFrame]): java.util.List[DataFrame] = algorithm(dataFrames.asScala.toVector).asJava - } - def algorithm(dataFrame: DataFrame): DataFrame = { - 
algorithm(Vector(dataFrame))(0) - } + def algorithm(dataFrame: DataFrame): DataFrame = algorithm(Vector(dataFrame))(0) } diff --git a/src/main/scala/com/adidas/analytics/util/LoadMode.scala b/src/main/scala/com/adidas/analytics/util/LoadMode.scala index 043bf2d..e24affc 100644 --- a/src/main/scala/com/adidas/analytics/util/LoadMode.scala +++ b/src/main/scala/com/adidas/analytics/util/LoadMode.scala @@ -28,6 +28,3 @@ object LoadMode { override def sparkMode: SaveMode = SaveMode.Append } } - - - diff --git a/src/main/scala/com/adidas/analytics/util/OutputWriter.scala b/src/main/scala/com/adidas/analytics/util/OutputWriter.scala index 7b65dc4..e872f72 100644 --- a/src/main/scala/com/adidas/analytics/util/OutputWriter.scala +++ b/src/main/scala/com/adidas/analytics/util/OutputWriter.scala @@ -5,11 +5,9 @@ import com.adidas.analytics.util.DataFrameUtils._ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.{DataFrameWriter, _} import org.slf4j.{Logger, LoggerFactory} - import scala.util.{Failure, Success, Try} -/** - * Base trait for classes which are capable of persisting DataFrames +/** Base trait for classes which are capable of persisting DataFrames */ sealed abstract class OutputWriter { @@ -21,88 +19,121 @@ sealed abstract class OutputWriter { def write(dfs: DFSWrapper, df: DataFrame): DataFrame - protected def getWriter(df: DataFrame): DataFrameWriter[Row] = { - if (targetPartitions.nonEmpty) { - df.write.partitionBy(targetPartitions: _*) - } else { - df.write - } - } + protected def getWriter(df: DataFrame): DataFrameWriter[Row] = + if (targetPartitions.nonEmpty) df.write.partitionBy(targetPartitions: _*) else df.write } - object OutputWriter { - /** - * Factory method which creates TableWriter + /** Factory method which creates TableWriter * - * @param table target table - * @param targetPartitions specifies how data should be partitioned - * @param options options which are provided to Spark DataFrameWriter - * @param loadMode specifies LoadMode for the writer (see more details for SaveMode in Spark documentation) - * @return TableWriter + * @param table + * target table + * @param targetPartitions + * specifies how data should be partitioned + * @param options + * options which are provided to Spark DataFrameWriter + * @param loadMode + * specifies LoadMode for the writer (see more details for SaveMode in Spark documentation) + * @return + * TableWriter */ - def newTableWriter(table: String, targetPartitions: Seq[String] = Seq.empty, options: Map[String, String] = Map.empty, - loadMode: LoadMode = LoadMode.OverwritePartitionsWithAddedColumns): TableWriter = { - TableWriter(table, targetPartitions, options, loadMode) - } - - /** - * Factory method which creates TableLocationWriter + def newTableWriter( + table: String, + targetPartitions: Seq[String] = Seq.empty, + options: Map[String, String] = Map.empty, + loadMode: LoadMode = LoadMode.OverwritePartitionsWithAddedColumns + ): TableWriter = TableWriter(table, targetPartitions, options, loadMode) + + /** Factory method which creates TableLocationWriter * - * @param table target table which location is used for writing data to - * @param format format of result data - * @param targetPartitions specifies how data should be partitioned - * @param options options which are provided to Spark DataFrameWriter - * @param loadMode specifies LoadMode for the writer (see more details for SaveMode in Spark documentation) - * @return TableLocationWriter + * @param table + * target table which location is used for writing 
data to + * @param format + * format of result data + * @param targetPartitions + * specifies how data should be partitioned + * @param options + * options which are provided to Spark DataFrameWriter + * @param loadMode + * specifies LoadMode for the writer (see more details for SaveMode in Spark documentation) + * @return + * TableLocationWriter */ - def newTableLocationWriter(table: String, format: DataFormat, targetPartitions: Seq[String] = Seq.empty, - options: Map[String, String] = Map.empty, loadMode: LoadMode = LoadMode.OverwritePartitionsWithAddedColumns, - metadataConfiguration: Metadata): TableLocationWriter = { + def newTableLocationWriter( + table: String, + format: DataFormat, + targetPartitions: Seq[String] = Seq.empty, + options: Map[String, String] = Map.empty, + loadMode: LoadMode = LoadMode.OverwritePartitionsWithAddedColumns, + metadataConfiguration: Metadata + ): TableLocationWriter = TableLocationWriter(table, format, targetPartitions, options, loadMode, metadataConfiguration) - } - /** - * Factory method which creates FileSystemWriter + /** Factory method which creates FileSystemWriter * - * @param location output location on the filesystem - * @param format format of result data - * @param targetPartitions specifies how data should be partitioned - * @param options options which are provided to Spark DataFrameWriter - * @param loadMode specifies LoadMode for the writer (see more details for SaveMode in Spark documentation) - * @return FileSystemWriter + * @param location + * output location on the filesystem + * @param format + * format of result data + * @param targetPartitions + * specifies how data should be partitioned + * @param options + * options which are provided to Spark DataFrameWriter + * @param loadMode + * specifies LoadMode for the writer (see more details for SaveMode in Spark documentation) + * @return + * FileSystemWriter */ - def newFileSystemWriter(location: String, format: DataFormat, targetPartitions: Seq[String] = Seq.empty, - options: Map[String, String] = Map.empty, loadMode: LoadMode = LoadMode.OverwritePartitionsWithAddedColumns): FileSystemWriter = { - FileSystemWriter(location, format, targetPartitions, options, loadMode) - } - - /** - * Base trait for writers which are capable of writing data in safer way. - * The data is written in three steps: - * - write data to a temporary location - * - create backups for existing partitions which are supposed to be replaced - * - move new partitions to the final location + def newFileSystemWriter( + location: String, + format: DataFormat, + targetPartitions: Seq[String] = Seq.empty, + options: Map[String, String] = Map.empty, + loadMode: LoadMode = LoadMode.OverwritePartitionsWithAddedColumns + ): FileSystemWriter = FileSystemWriter(location, format, targetPartitions, options, loadMode) + + /** Base trait for writers which are capable of writing data in safer way. 
The data is written in + * three steps: + * - write data to a temporary location + * - create backups for existing partitions which are supposed to be replaced + * - move new partitions to the final location */ sealed trait AtomicWriter extends OutputWriter { def format: DataFormat - def writeWithBackup(dfs: DFSWrapper, df: DataFrame): DataFrame - - protected def writeUnsafe(dfs: DFSWrapper, df: DataFrame, finalLocation: String, loadMode: LoadMode): DataFrame = { + def writeWithBackup( + dfs: DFSWrapper, + df: DataFrame, + affectedPartitions: Option[Seq[PartitionCriteria]] = None + ): DataFrame + + protected def writeUnsafe( + dfs: DFSWrapper, + df: DataFrame, + finalLocation: String, + loadMode: LoadMode + ): DataFrame = { val finalPath = new Path(finalLocation) val fs = dfs.getFileSystem(finalPath) - if (loadMode == LoadMode.OverwriteTable) { + if (loadMode == LoadMode.OverwriteTable) HadoopLoadHelper.cleanupDirectoryContent(fs, finalPath) - } - write(fs, df, finalPath, loadMode) + write(df, finalPath, loadMode) } - protected def writeSafe(dfs: DFSWrapper, df: DataFrame, finalLocation: String, loadMode: LoadMode): DataFrame = { + protected def writeSafe( + dfs: DFSWrapper, + df: DataFrame, + finalLocation: String, + loadMode: LoadMode, + affectedPartitions: Option[Seq[PartitionCriteria]] = None + ): DataFrame = Try { - lazy val partitionsCriteria = df.collectPartitions(targetPartitions) + lazy val partitionsCriteria = { + if (affectedPartitions.nonEmpty) affectedPartitions.get + else df.collectPartitions(targetPartitions) + } val finalPath = new Path(finalLocation) val fs = dfs.getFileSystem(finalPath) @@ -114,54 +145,80 @@ object OutputWriter { fs.delete(tempPath, true) loadMode match { - case LoadMode.OverwriteTable => - loadTable(fs, df, finalPath, tempDataPath, tempBackupPath) + case LoadMode.OverwriteTable => loadTable(fs, df, finalPath, tempDataPath, tempBackupPath) case LoadMode.OverwritePartitions => loadPartitions(fs, df, finalPath, tempDataPath, tempBackupPath, partitionsCriteria) case LoadMode.OverwritePartitionsWithAddedColumns => val existingDf = format.read(df.sparkSession.read, finalLocation) val outputDf = df.addMissingColumns(existingDf.schema) - loadPartitions(fs, outputDf, finalPath, tempDataPath, tempBackupPath, partitionsCriteria) + loadPartitions( + fs, + outputDf, + finalPath, + tempDataPath, + tempBackupPath, + partitionsCriteria + ) case LoadMode.AppendJoinPartitions => - val isRequiredPartition = DataFrameUtils.buildPartitionsCriteriaMatcherFunc(partitionsCriteria, df.schema) - val existingDf = format.read(df.sparkSession.read, finalLocation).filter(isRequiredPartition) + val isRequiredPartition = + DataFrameUtils.buildPartitionsCriteriaMatcherFunc(partitionsCriteria, df.schema) + val existingDf = + format.read(df.sparkSession.read, finalLocation).filter(isRequiredPartition) val joinColumns = existingDf.columns.toSet intersect df.columns.toSet val combinedDf = existingDf.join(df, joinColumns.toSeq, "FULL_OUTER") - loadPartitions(fs, combinedDf, finalPath, tempDataPath, tempBackupPath, partitionsCriteria) + loadPartitions( + fs, + combinedDf, + finalPath, + tempDataPath, + tempBackupPath, + partitionsCriteria + ) case LoadMode.AppendUnionPartitions => - val isRequiredPartition = DataFrameUtils.buildPartitionsCriteriaMatcherFunc(partitionsCriteria, df.schema) - val existingDf = format.read(df.sparkSession.read, finalLocation).filter(isRequiredPartition) + val isRequiredPartition = + DataFrameUtils.buildPartitionsCriteriaMatcherFunc(partitionsCriteria, df.schema) + 
val existingDf = + format.read(df.sparkSession.read, finalLocation).filter(isRequiredPartition) val combinedDf = df.addMissingColumns(existingDf.schema).union(existingDf) - loadPartitions(fs, combinedDf, finalPath, tempDataPath, tempBackupPath, partitionsCriteria) + loadPartitions( + fs, + combinedDf, + finalPath, + tempDataPath, + tempBackupPath, + partitionsCriteria + ) } fs.delete(tempPath, true) } match { case Failure(exception) => throw exception - case Success(_) => df + case Success(_) => df } - } - - private def write(fs: FileSystem, df: DataFrame, finalPath: Path, loadMode: LoadMode): DataFrame = { + private def write(df: DataFrame, finalPath: Path, loadMode: LoadMode) = Try { - val writer = getWriter(df).options(options).mode(loadMode.sparkMode) - format.write(writer, finalPath.toUri.toString) - logger.info(s"Data was successfully written to $finalPath") + val writer = getWriter(df).options(options).mode(loadMode.sparkMode) + format.write(writer, finalPath.toUri.toString) + logger.info(s"Data was successfully written to $finalPath") } match { - case Failure(exception) => throw new RuntimeException("Unable to process data", exception) - case Success(_) => df + case Failure(exception) => throw new RuntimeException("Unable to process data", exception) + case Success(_) => df } - } - private def loadTable(fs: FileSystem, df: DataFrame, finalPath: Path, dataPath: Path, backupPath: Path): Unit = { - write(fs, df, dataPath, LoadMode.OverwriteTable) + private def loadTable( + fs: FileSystem, + df: DataFrame, + finalPath: Path, + dataPath: Path, + backupPath: Path + ): Unit = { + write(df, dataPath, LoadMode.OverwriteTable) HadoopLoadHelper.backupDirectoryContent(fs, finalPath, backupPath) logger.info(s"Loading data to final location $finalPath") - try { - HadoopLoadHelper.loadTable(fs, dataPath, finalPath) - } catch { + try HadoopLoadHelper.loadTable(fs, dataPath, finalPath) + catch { case e: Throwable => logger.error("Data processing failed", e) logger.info(s"Restoring previous state $backupPath -> $finalPath") @@ -170,37 +227,48 @@ object OutputWriter { } } - private def loadPartitions(fs: FileSystem, df: DataFrame, finalPath: Path, dataPath: Path, backupPath: Path, - partitionsCriteria: Seq[Seq[(String, String)]]): Unit = { + private def loadPartitions( + fs: FileSystem, + df: DataFrame, + finalPath: Path, + dataPath: Path, + backupPath: Path, + partitionsCriteria: Seq[Seq[(String, String)]] + ): Unit = if (partitionsCriteria.nonEmpty) { - write(fs, df, dataPath, LoadMode.OverwritePartitionsWithAddedColumns) + write(df, dataPath, LoadMode.OverwritePartitionsWithAddedColumns) logger.info(s"Creating backup in $backupPath") - val backupSpecs = HadoopLoadHelper.createMoveSpecs(fs, finalPath, backupPath, partitionsCriteria) + val backupSpecs = + HadoopLoadHelper.createMoveSpecs(fs, finalPath, backupPath, partitionsCriteria) HadoopLoadHelper.backupPartitions(fs, backupSpecs) logger.info(s"Loading data to final location $finalPath") - val loadSpecs = HadoopLoadHelper.createMoveSpecs(fs, dataPath, finalPath, partitionsCriteria) + val loadSpecs = + HadoopLoadHelper.createMoveSpecs(fs, dataPath, finalPath, partitionsCriteria) - try { - HadoopLoadHelper.loadPartitions(fs, loadSpecs) - } catch { + try HadoopLoadHelper.loadPartitions(fs, loadSpecs) + catch { case e: Throwable => logger.error("Data processing failed", e) logger.info(s"Restoring previous state $backupPath -> $finalPath") HadoopLoadHelper.restorePartitions(fs, backupSpecs) throw new RuntimeException(s"Unable to load data to 
$finalPath", e) } - } else { - logger.warn(s"Unable to load data, output data has no partitions for partition columns $targetPartitions") - } - } + } else + logger.warn( + s"Unable to load data, output data has no partitions for partition columns $targetPartitions" + ) } - case class TableWriter(table: String, targetPartitions: Seq[String], options: Map[String, String], - loadMode: LoadMode) extends OutputWriter { + case class TableWriter( + table: String, + targetPartitions: Seq[String], + options: Map[String, String], + loadMode: LoadMode + ) extends OutputWriter { - override def write(dfs: DFSWrapper, df: DataFrame): DataFrame = { + override def write(dfs: DFSWrapper, df: DataFrame): DataFrame = Try { logger.info(s"Writing data to table $table") if (loadMode == LoadMode.OverwriteTable) { @@ -210,27 +278,37 @@ object OutputWriter { getWriter(df).options(options).mode(loadMode.sparkMode).saveAsTable(table) } match { case Failure(exception) => throw exception - case Success(_) => df + case Success(_) => df } - } } - case class FileSystemWriter(location: String, format: DataFormat, targetPartitions: Seq[String], - options: Map[String, String], loadMode: LoadMode) extends AtomicWriter { + case class FileSystemWriter( + location: String, + format: DataFormat, + targetPartitions: Seq[String], + options: Map[String, String], + loadMode: LoadMode + ) extends AtomicWriter { - override def write(dfs: DFSWrapper, df: DataFrame): DataFrame = { + override def write(dfs: DFSWrapper, df: DataFrame): DataFrame = writeUnsafe(dfs, df, location, loadMode) - } - override def writeWithBackup(dfs: DFSWrapper, df: DataFrame): DataFrame = { - writeSafe(dfs, df, location, loadMode) - } + override def writeWithBackup( + dfs: DFSWrapper, + df: DataFrame, + affectedPartitions: Option[Seq[PartitionCriteria]] = None + ): DataFrame = writeSafe(dfs, df, location, loadMode) } - case class TableLocationWriter(table: String, format: DataFormat, targetPartitions: Seq[String], - options: Map[String, String], loadMode: LoadMode, - metadataConfiguration: Metadata) extends AtomicWriter { + case class TableLocationWriter( + table: String, + format: DataFormat, + targetPartitions: Seq[String], + options: Map[String, String], + loadMode: LoadMode, + metadataConfiguration: Metadata + ) extends AtomicWriter { override def write(dfs: DFSWrapper, df: DataFrame): DataFrame = { val spark = df.sparkSession @@ -239,28 +317,27 @@ object OutputWriter { updatePartitionsMetadata(df) } - override def writeWithBackup(dfs: DFSWrapper, df: DataFrame): DataFrame = { + override def writeWithBackup( + dfs: DFSWrapper, + df: DataFrame, + affectedPartitions: Option[Seq[PartitionCriteria]] = None + ): DataFrame = { val spark = df.sparkSession val location = getTableLocation(spark) - writeSafe(dfs, df, location, loadMode) + writeSafe(dfs, df, location, loadMode, affectedPartitions) updatePartitionsMetadata(df) } - private def updatePartitionsMetadata(df: DataFrame): DataFrame = { + private def updatePartitionsMetadata(df: DataFrame): DataFrame = Try { - if (targetPartitions.nonEmpty) { - metadataConfiguration.recoverPartitions(df) - } else { - metadataConfiguration.refreshTable(df) - } + if (targetPartitions.nonEmpty) metadataConfiguration.recoverPartitions(df) + else metadataConfiguration.refreshTable(df) } match { case Failure(exception) => throw exception - case Success(_) => df + case Success(_) => df } - } - private def getTableLocation(spark: SparkSession): String = { - HiveTableAttributeReader(table, spark).getTableLocation - } + private def 
getTableLocation(spark: SparkSession): String = + CatalogTableManager(table, spark).getTableLocation } } diff --git a/src/main/scala/com/adidas/analytics/util/RecoverPartitionsCustom.scala b/src/main/scala/com/adidas/analytics/util/RecoverPartitionsCustom.scala index 30f8f87..3cec510 100644 --- a/src/main/scala/com/adidas/analytics/util/RecoverPartitionsCustom.scala +++ b/src/main/scala/com/adidas/analytics/util/RecoverPartitionsCustom.scala @@ -1,12 +1,13 @@ package com.adidas.analytics.util -import com.adidas.analytics.algo.core.{PartitionHelpers, Metadata} +import com.adidas.analytics.algo.core.{Metadata, PartitionHelpers} import org.apache.spark.sql._ -import scala.collection.JavaConversions._ - -case class RecoverPartitionsCustom(override val tableName: String, - override val targetPartitions: Seq[String]) extends Metadata with PartitionHelpers{ +case class RecoverPartitionsCustom( + override val tableName: String, + override val targetPartitions: Seq[String] +) extends Metadata + with PartitionHelpers { override def recoverPartitions(outputDataFrame: DataFrame): Unit = { val spark: SparkSession = outputDataFrame.sparkSession @@ -14,19 +15,18 @@ case class RecoverPartitionsCustom(override val tableName: String, val distinctPartitions: DataFrame = getDistinctPartitions(outputDataFrame, targetPartitions) generateAddPartitionStatements(distinctPartitions) - .collectAsList() + .collect() .foreach((statement: String) => spark.sql(statement)) } - private def generateAddPartitionStatements(df: DataFrame): Dataset[String] = { - df.map(partitionValue => { - val partitionStatementValues: Seq[String] = targetPartitions - .map(partitionColumn => s"${partitionColumn}=${getParameterValue(partitionValue, partitionColumn)}") - - s"ALTER TABLE ${tableName} ADD IF NOT EXISTS PARTITION(${partitionStatementValues.mkString(",")})" - })(Encoders.STRING) - } + private def generateAddPartitionStatements(df: DataFrame): Dataset[String] = + df.map { partitionValue => + val partitionStatementValues: Seq[String] = targetPartitions.map(partitionColumn => + s"$partitionColumn=${getParameterValue(partitionValue, partitionColumn)}" + ) + s"ALTER TABLE $tableName ADD IF NOT EXISTS PARTITION(${partitionStatementValues.mkString(",")})" + }(Encoders.STRING) } diff --git a/src/main/scala/com/adidas/analytics/util/RecoverPartitionsNative.scala b/src/main/scala/com/adidas/analytics/util/RecoverPartitionsNative.scala index 6b6d5d1..1b7ff33 100644 --- a/src/main/scala/com/adidas/analytics/util/RecoverPartitionsNative.scala +++ b/src/main/scala/com/adidas/analytics/util/RecoverPartitionsNative.scala @@ -3,8 +3,10 @@ package com.adidas.analytics.util import com.adidas.analytics.algo.core.Metadata import org.apache.spark.sql.DataFrame -case class RecoverPartitionsNative(override val tableName: String, - override val targetPartitions: Seq[String]) extends Metadata { +case class RecoverPartitionsNative( + override val tableName: String, + override val targetPartitions: Seq[String] +) extends Metadata { override def recoverPartitions(outputDataFrame: DataFrame): Unit = outputDataFrame.sparkSession.catalog.recoverPartitions(tableName) diff --git a/src/main/scala/com/adidas/analytics/util/RemoteIteratorWrapper.scala b/src/main/scala/com/adidas/analytics/util/RemoteIteratorWrapper.scala index 91b57ea..928110d 100644 --- a/src/main/scala/com/adidas/analytics/util/RemoteIteratorWrapper.scala +++ b/src/main/scala/com/adidas/analytics/util/RemoteIteratorWrapper.scala @@ -1,16 +1,17 @@ package com.adidas.analytics.util import 
org.apache.hadoop.fs.RemoteIterator - import scala.collection.Iterator -/** - * Convert RemoteIterator from Hadoop to Scala Iterator that provides all the functions such as map, filter, foreach, etc. +/** Convert RemoteIterator from Hadoop to Scala Iterator that provides all the functions such as + * map, filter, foreach, etc. */ object RemoteIteratorWrapper { - implicit class RemoteIteratorToIterator[T](underlying: RemoteIterator[T]){ - def remoteIteratorToIterator : Iterator[T] = RemoteIteratorWrapper[T](underlying) + + implicit class RemoteIteratorToIterator[T](underlying: RemoteIterator[T]) { + + def remoteIteratorToIterator: Iterator[T] = RemoteIteratorWrapper[T](underlying) } } @@ -18,4 +19,3 @@ case class RemoteIteratorWrapper[T](underlying: RemoteIterator[T]) extends Itera override def hasNext: Boolean = underlying.hasNext override def next(): T = underlying.next() } - diff --git a/src/main/scala/com/adidas/analytics/util/SparkRecoverPartitionsCustom.scala b/src/main/scala/com/adidas/analytics/util/SparkRecoverPartitionsCustom.scala deleted file mode 100644 index 8c57fcf..0000000 --- a/src/main/scala/com/adidas/analytics/util/SparkRecoverPartitionsCustom.scala +++ /dev/null @@ -1,49 +0,0 @@ -package com.adidas.analytics.util - -import com.adidas.analytics.algo.core.Metadata -import org.apache.spark.sql._ -import org.apache.spark.sql.functions.col - -import scala.collection.JavaConversions._ - -case class SparkRecoverPartitionsCustom(override val tableName: String, - override val targetPartitions: Seq[String]) extends Metadata { - - override def recoverPartitions(outputDataFrame: DataFrame): Unit = { - - val spark: SparkSession = outputDataFrame.sparkSession - - val targetPartitionsColumns: Seq[Column] = targetPartitions.map(partitionString => col(partitionString)) - - val distinctPartitions: DataFrame = outputDataFrame.select(targetPartitionsColumns: _*).distinct - - val sqlStatements: Dataset[String] = generateAddPartitionStatements(distinctPartitions) - - sqlStatements.collectAsList().foreach((statement: String) => spark.sql(statement)) - } - - private def generateAddPartitionStatements(partitionsDataset: DataFrame): Dataset[String] = { - partitionsDataset.map(row => { - val partitionStatementValues: Seq[String] = targetPartitions - .map(partitionString => s"${partitionString}=${getParameterValue(row, partitionString)}") - - s"ALTER TABLE ${tableName} ADD IF NOT EXISTS PARTITION(${partitionStatementValues.mkString(",")})" - })(Encoders.STRING) - } - - private def getParameterValue(row: Row, partitionString: String): String = - createParameterValue(row.get(row.fieldIndex(partitionString))) - - private def createParameterValue(partitionRawValue: Any): String = { - partitionRawValue match { - case value: java.lang.Short => value.toString - case value: java.lang.Integer => value.toString - case value: scala.Predef.String => "'" + value + "'" - case null => throw new Exception("Partition Value is null. 
No support for null partitions!") - case value => throw new Exception("Unsupported partition DataType: " + value.getClass) - - } - - } - -} diff --git a/src/main/scala/com/adidas/analytics/util/SparkRecoverPartitionsNative.scala b/src/main/scala/com/adidas/analytics/util/SparkRecoverPartitionsNative.scala deleted file mode 100644 index c79c8c4..0000000 --- a/src/main/scala/com/adidas/analytics/util/SparkRecoverPartitionsNative.scala +++ /dev/null @@ -1,11 +0,0 @@ -package com.adidas.analytics.util - -import com.adidas.analytics.algo.core.Metadata -import org.apache.spark.sql.DataFrame - -case class SparkRecoverPartitionsNative(override val tableName: String, - override val targetPartitions: Seq[String]) extends Metadata { - - override def recoverPartitions(outputDataFrame: DataFrame): Unit = outputDataFrame.sparkSession.catalog.recoverPartitions(tableName) - -} diff --git a/src/test/resources/DeltaLoadTest/parquet_test_delta_merge_unpartitioned/params.json b/src/test/resources/DeltaLoadTest/parquet_test_delta_merge_unpartitioned/params.json deleted file mode 100644 index fb9b4f7..0000000 --- a/src/test/resources/DeltaLoadTest/parquet_test_delta_merge_unpartitioned/params.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "active_records_table_lake": "test_lake.delta_load_active_data", - "business_key": ["salesorder", "item"], - "delta_records_file_path": "hdfs:///tmp/tests/", - "technical_key": ["m3d_timestamp", "datapakid", "partno", "record"], - "target_partitions": ["year", "month", "day"], - "partition_column": "date", - "partition_column_format": "yyyyMMdd" -} \ No newline at end of file diff --git a/src/test/resources/GzipDecompressorTest/data_20180719111849_data_3-3.gz b/src/test/resources/GzipDecompressorTest/data_20180719111849_data_3-3.gz deleted file mode 100644 index 13343ad13b20e40c3a201a3536aa677b130184f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 182 zcmV;n07?HJiwFoSY9L+&0AyiwVP7&ZF*qSwCCL@Jn=5()*`QjUb^nGn{> zIZ5gZPhlqbl-Gr^kuOMj;kUT$esj=(QTm~rR;@K0HHn@2=8#eC-t@c`bn&+5M?6n_ kZ10RpH*OC;{LYKqx#@8Ych}=7yGrhK3d!z{2-2E#l8P%?)TZvY(1CwpKi%o`*4ltn=2EY-L_U={v38;{&w5 - val dataLocation = resolveResource(s"$testResourceDir/$fileName", withProtocol = true) - table.write(Seq(dataLocation), dsvReader, LoadMode.OverwritePartitionsWithAddedColumns, fillNulls = true) - table - case None => table - } - } - - def placeParametersFile(testResourceDir: String, paramsFileName: String): Unit ={ - paramsFileHdfsPath = new Path(hdfsRootTestPath, paramsFileName) - copyResourceFileToHdfs(s"$testResourceDir/$paramsFileName", paramsFileHdfsPath) - } - - override def beforeEach(): Unit = { - super.beforeEach() - spark.sql(s"DROP DATABASE IF EXISTS $testDatabase CASCADE") - spark.sql(s"CREATE DATABASE $testDatabase") - } -} diff --git a/src/test/scala/com/adidas/analytics/feature/FixedSizeStringExtractorTest.scala b/src/test/scala/com/adidas/analytics/feature/FixedSizeStringExtractorTest.scala index d83b15a..8d8c3f8 100644 --- a/src/test/scala/com/adidas/analytics/feature/FixedSizeStringExtractorTest.scala +++ b/src/test/scala/com/adidas/analytics/feature/FixedSizeStringExtractorTest.scala @@ -6,13 +6,13 @@ import com.adidas.analytics.util.{DFSWrapper, LoadMode} import com.adidas.utils.{BaseAlgorithmTest, FileReader, Table} import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} -import org.scalatest.FeatureSpec -import org.scalatest.Matchers._ +import org.scalatest.featurespec.AnyFeatureSpec +import 
org.scalatest.matchers.should.Matchers._ - -class FixedSizeStringExtractorTest extends FeatureSpec with BaseAlgorithmTest { +class FixedSizeStringExtractorTest extends AnyFeatureSpec with BaseAlgorithmTest { private val paramsFileName: String = "params.json" + private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) private val database: String = "test_lake" @@ -22,9 +22,10 @@ class FixedSizeStringExtractorTest extends FeatureSpec with BaseAlgorithmTest { private var sourceTable: Table = _ private var targetTable: Table = _ - - feature("Fixed-size string can be extractor from the input table and stored to the output table") { - scenario("Extracted strings match to the target schema") { + Feature( + "Fixed-size strings can be extracted from the input table and stored to the output table" + ) { + Scenario("Extracted strings match the target schema") { val testResourceDir = "matched_schema" prepare(testResourceDir) @@ -37,7 +38,8 @@ class FixedSizeStringExtractorTest extends FeatureSpec with BaseAlgorithmTest { // read expected data val testDataReader = FileReader.newDSVFileReader(Some(targetTable.schema)) - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = testDataReader.read(spark, expectedDataLocation) // compare the result @@ -45,7 +47,9 @@ class FixedSizeStringExtractorTest extends FeatureSpec with BaseAlgorithmTest { actualDf.hasDiff(expectedDf) shouldBe false } - scenario("Number of string to extract is less than the number of non-partition fields in the target schema") { + Scenario( + "Number of strings to extract is less than the number of non-partition fields in the target schema" + ) { val testResourceDir = "non_matched_schema1" prepare(testResourceDir, initialData = false) @@ -57,7 +61,9 @@ class FixedSizeStringExtractorTest extends FeatureSpec with BaseAlgorithmTest { caught.getMessage shouldBe "Field positions do not correspond to the target schema" } - scenario("Number of string to extract is greater than the number of non-partition fields in the target schema") { + Scenario( + "Number of strings to extract is greater than the number of non-partition fields in the target schema" + ) { val testResourceDir = "non_matched_schema2" prepare(testResourceDir, initialData = false) @@ -69,7 +75,7 @@ class FixedSizeStringExtractorTest extends FeatureSpec with BaseAlgorithmTest { caught.getMessage shouldBe "Field positions do not correspond to the target schema" } - scenario("Data matches to the schema and partitioning type is year/month") { + Scenario("Data matches the schema and partitioning type is year/month") { val testResourceDir = "matched_schema_partitioned" prepare(testResourceDir, Seq("year", "month")) @@ -82,7 +88,8 @@ class FixedSizeStringExtractorTest extends FeatureSpec with BaseAlgorithmTest { // read expected data val testDataReader = FileReader.newDSVFileReader(Some(targetTable.schema)) - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = testDataReader.read(spark, expectedDataLocation) // compare the result @@ -97,19 +104,37 @@ class FixedSizeStringExtractorTest extends FeatureSpec with BaseAlgorithmTest { spark.sql(s"CREATE DATABASE $database") } - private def createTable(tableName: String, 
database: String, schema: StructType, targetPartitions: Seq[String]): Table = { - val table = Table.newBuilder(tableName, database, fs.makeQualified(new Path(hdfsRootTestPath, tableName)).toString, schema) - - if (targetPartitions.nonEmpty) { - table.withPartitions(targetPartitions).buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) - } else { - table.buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) - } + private def createTable( + tableName: String, + database: String, + schema: StructType, + targetPartitions: Seq[String] + ): Table = { + val table = Table.newBuilder( + tableName, + database, + fs.makeQualified(new Path(hdfsRootTestPath, tableName)).toString, + schema + ) + + if (targetPartitions.nonEmpty) + table + .withPartitions(targetPartitions) + .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) + else table.buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) } - private def prepare(testResourceDir: String, targetPartitions: Seq[String] = Seq.empty, initialData: Boolean = true): Unit = { - val sourceSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/source_schema.json")).asInstanceOf[StructType] - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + private def prepare( + testResourceDir: String, + targetPartitions: Seq[String] = Seq.empty, + initialData: Boolean = true + ): Unit = { + val sourceSchema = DataType + .fromJson(getResourceAsText(s"$testResourceDir/source_schema.json")) + .asInstanceOf[StructType] + val targetSchema = DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] // copy job parameters to HDFS copyResourceFileToHdfs(s"$testResourceDir/$paramsFileName", paramsFileHdfsPath) diff --git a/src/test/scala/com/adidas/analytics/feature/FullLoadTest.scala b/src/test/scala/com/adidas/analytics/feature/FullLoadTest.scala deleted file mode 100644 index 62327d2..0000000 --- a/src/test/scala/com/adidas/analytics/feature/FullLoadTest.scala +++ /dev/null @@ -1,445 +0,0 @@ -package com.adidas.analytics.feature - -import com.adidas.analytics.algo.FullLoad -import com.adidas.analytics.util.{DFSWrapper, HiveTableAttributeReader, LoadMode} -import com.adidas.utils.TestUtils._ -import com.adidas.utils.{BaseAlgorithmTest, FileReader, Table} -import org.apache.hadoop.fs.Path -import org.apache.spark.sql.types.{DataType, StructType} -import org.scalatest.FeatureSpec -import org.scalatest.Matchers._ - -class FullLoadTest extends FeatureSpec with BaseAlgorithmTest { - - private val sourceEnvironmentLocation: String = "test_landing" - private val targetDatabase: String = "test_lake" - private val tableName: String = "test_table" - - private val paramsFileName: String = "params.json" - private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) - - private val sourceDirPath: Path = new Path(hdfsRootTestPath, s"$sourceEnvironmentLocation/test/$tableName/data") - private val targetDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/test/$tableName/data") - private val backupDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/test/$tableName/data_backup") - - feature("Reader mode can be specified in configuration") { - scenario("when reader_mode is invalid string an exception is thrown") { - val resourceDir = "failfast_option" - copyResourceFileToHdfs(s"$resourceDir/params_invalid_reader_mode.json", paramsFileHdfsPath) - - val targetSchema = 
DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - - val targetTable = createNonPartitionedTargetTable(targetSchema) - setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) - prepareSourceData(Seq(s"$resourceDir/new_data_wrong.psv")) - - // checking pre-conditions - spark.read.csv(sourceDirPath.toString).count() shouldBe 25 - targetTable.read().count() shouldBe 19 - - val caught = intercept[RuntimeException] { - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - } - caught.getMessage shouldBe "Invalid reader mode: invalid_mode provided" - } - - scenario("when reader mode is FailFast and malformed records are present, an exception is thrown") { - val resourceDir = "failfast_option" - copyResourceFileToHdfs(s"$resourceDir/params.json", paramsFileHdfsPath) - - val targetSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - - val targetTable = createNonPartitionedTargetTable(targetSchema) - setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) - prepareSourceData(Seq(s"$resourceDir/new_data_wrong.psv")) - - // checking pre-conditions - spark.read.csv(sourceDirPath.toString).count() shouldBe 25 - targetTable.read().count() shouldBe 19 - - val caught = intercept[RuntimeException] { - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - } - caught.getMessage shouldBe "Unable to process data" - } - - scenario("when reader mode is FailFast and no malformed records are present, load is completed correctly") { - val resourceDir = "failfast_option" - copyResourceFileToHdfs(s"$resourceDir/params.json", paramsFileHdfsPath) - - val targetSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - - val targetTable = createNonPartitionedTargetTable(targetSchema) - setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) - prepareDefaultSourceData() - - // checking pre-conditions - spark.read.csv(sourceDirPath.toString).count() shouldBe 25 - targetTable.read().count() shouldBe 19 - - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - - // validating result - val expectedDataLocation = resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) - val expectedDf = dataReader.read(spark, expectedDataLocation) - val actualDf = targetTable.read() - actualDf.hasDiff(expectedDf) shouldBe false - - // check the resulting table location is /data folder - val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation - tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - } - - scenario("when reader mode is DROPMALFORMED and malformed records are present, some records are not loaded") { - val resourceDir = "failfast_option" - copyResourceFileToHdfs(s"$resourceDir/params_dropmalformed_mode.json", paramsFileHdfsPath) - - val targetSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - - val targetTable = createNonPartitionedTargetTable(targetSchema) - setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) - prepareSourceData(Seq(s"$resourceDir/new_data_wrong.psv")) - 
- // checking pre-conditions - spark.read.csv(sourceDirPath.toString).count() shouldBe 25 - targetTable.read().count() shouldBe 19 - - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - - // validating result - val expectedDataLocation = resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) - val expectedDf = dataReader.read(spark, expectedDataLocation) - val actualDf = targetTable.read() - assert(actualDf.count() < expectedDf.count()) - - // check the resulting table location is /data folder - val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation - tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - } - - scenario("when reader mode is PERMISSIVE and malformed records are present, malformed records are also loaded") { - val resourceDir = "failfast_option" - copyResourceFileToHdfs(s"$resourceDir/params_permissive_mode.json", paramsFileHdfsPath) - - val targetSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - - val targetTable = createNonPartitionedTargetTable(targetSchema) - setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) - prepareSourceData(Seq(s"$resourceDir/new_data_wrong.psv")) - - // checking pre-conditions - spark.read.csv(sourceDirPath.toString).count() shouldBe 25 - targetTable.read().count() shouldBe 19 - - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - - // validating result - val expectedDataLocation = resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) - val expectedDf = dataReader.read(spark, expectedDataLocation) - val actualDf = targetTable.read() - actualDf.hasDiff(expectedDf) shouldBe true - actualDf.count() shouldBe expectedDf.count() - - // check the resulting table location is /data folder - val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation - tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - } - } - - feature("Data can be loaded from source to target with full mode") { - scenario("Loading data to non-partitioned table") { - val resourceDir = "non_partitioned" - copyResourceFileToHdfs(s"$resourceDir/$paramsFileName", paramsFileHdfsPath) - - val targetSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - - val targetTable = createNonPartitionedTargetTable(targetSchema) - setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) - prepareDefaultSourceData() - - // checking pre-conditions - spark.read.csv(sourceDirPath.toString).count() shouldBe 25 - targetTable.read().count() shouldBe 19 - - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - - // validating result - val expectedDataLocation = resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) - val expectedDf = dataReader.read(spark, expectedDataLocation) - val actualDf = targetTable.read() - actualDf.hasDiff(expectedDf) shouldBe false - - // check the resulting table location is /data folder - val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation - tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - - //check backUp dir is empty - fs.listStatus(backupDirPath).length shouldBe 0 - } - - scenario("Loading data to 
partitioned table") { - val resourceDir = "partitioned" - copyResourceFileToHdfs(s"$resourceDir/$paramsFileName", paramsFileHdfsPath) - - val targetPath20180110 = new Path(targetDirPath, "year=2018/month=1/day=10") - val targetSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - - val targetTable = createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName) - setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) - prepareDefaultSourceData() - - // checking pre-conditions - spark.read.csv(sourceDirPath.toString).count() shouldBe 25 - targetTable.read().count() shouldBe 19 - fs.exists(targetPath20180110) shouldBe false - - // executing load - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - - // validating result - val expectedDataLocation = resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) - val expectedDf = dataReader.read(spark, expectedDataLocation) - val actualDf = targetTable.read() - actualDf.hasDiff(expectedDf) shouldBe false - - fs.exists(targetPath20180110) shouldBe true - - // check the resulting table location is /data folder - val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation - tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - - //check backUp dir is empty - fs.listStatus(backupDirPath).length shouldBe 0 - } - - scenario("Loading data to partitioned table in weekly mode") { - val resourceDir = "partitioned_weekly" - copyResourceFileToHdfs(s"$resourceDir/$paramsFileName", paramsFileHdfsPath) - - val targetPath201801 = new Path(targetDirPath, "year=2018/week=1") - val targetSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - - val targetTable = createPartitionedTargetTable(Seq("year", "week"), targetSchema, tableName) - setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) - prepareDefaultSourceData("landing/new_data_weekly.psv") - - // checking pre-conditions - spark.read.csv(sourceDirPath.toString).count() shouldBe 25 - targetTable.read().count() shouldBe 19 - - fs.exists(targetPath201801) shouldBe false - - // executing load - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - - // validating result - val expectedDataLocation = resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) - val expectedDf = dataReader.read(spark, expectedDataLocation) - val actualDf = targetTable.read() - actualDf.hasDiff(expectedDf) shouldBe false - - fs.exists(targetPath201801) shouldBe true - - // check the resulting table location is /data folder - val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation - tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - - //check backUp dir is empty - fs.listStatus(backupDirPath).length shouldBe 0 - } - - scenario("Try loading data from location that does not exist and expect the data to be as it was before load") { - val resourceDir = "partitioned" - copyResourceFileToHdfs(s"partitioned_not_exist_dir/$paramsFileName", paramsFileHdfsPath) - - val targetPath20180110 = new Path(targetDirPath, "year=2018/month=1/day=10") - val targetSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] - val 
dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - - val targetTable = createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName) - setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) - - targetTable.read().count() shouldBe 19 - - // executing load - val caught = intercept[RuntimeException]{ - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - } - - assert(caught.getMessage.equals("Unable to read input location.")) - - // validating result - val expectedDataLocation = resolveResource(s"$resourceDir/lake_data_pre.psv", withProtocol = true) - val expectedDf = dataReader.read(spark, expectedDataLocation) - val actualDf = targetTable.read() - actualDf.hasDiff(expectedDf) shouldBe false - - fs.exists(targetPath20180110) shouldBe false - - // check the resulting table location is /data folder - val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation - tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - - //check backUp dir is empty - fs.listStatus(backupDirPath).length shouldBe 0 - } - - scenario("Try loading data while generating error in backup table creation") { - val resourceDir = "partitioned" - copyResourceFileToHdfs(s"$resourceDir/$paramsFileName", paramsFileHdfsPath) - - val targetPath20180110 = new Path(targetDirPath, "year=2018/month=1/day=10") - val targetSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - - val targetTable = createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName) - createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName + "_temp") - setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) - - targetTable.read().count() shouldBe 19 - - // executing load - val caught = intercept[RuntimeException]{ - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - } - - assert(caught.getMessage.equals("Unable to change table location.")) - - // validating result - val expectedDataLocation = resolveResource(s"$resourceDir/lake_data_pre.psv", withProtocol = true) - val expectedDf = dataReader.read(spark, expectedDataLocation) - val actualDf = targetTable.read() - actualDf.hasDiff(expectedDf) shouldBe false - - fs.exists(targetPath20180110) shouldBe false - - // check the resulting table location is /data folder - val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation - tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - - //check backUp dir is empty - fs.listStatus(backupDirPath).length shouldBe 0 - } - - scenario("Try loading data while partitioning column is missing") { - val resourceDir = "partitioned" - copyResourceFileToHdfs(s"partitioned_partition_column_wrong/$paramsFileName", paramsFileHdfsPath) - - val targetPath20180110 = new Path(targetDirPath, "year=2018/month=1/day=10") - val targetSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - - val targetTable = createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName) - setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) - prepareDefaultSourceData() - - // checking pre-conditions - targetTable.read().count() shouldBe 19 - 
fs.exists(targetPath20180110) shouldBe false - - // executing load - val caught = intercept[RuntimeException]{ - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - } - - assert(caught.getMessage.equals("Unable to transform data frames.")) - - // validating result - val expectedDataLocation = resolveResource(s"$resourceDir/lake_data_pre.psv", withProtocol = true) - val expectedDf = dataReader.read(spark, expectedDataLocation) - val actualDf = targetTable.read() - actualDf.hasDiff(expectedDf) shouldBe false - - fs.exists(targetPath20180110) shouldBe false - - // check the resulting table location is /data folder - val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation - tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - - //check backUp dir is empty - fs.listStatus(backupDirPath).length shouldBe 0 - } - - scenario("Try loading data while date format is wrong") { - val resourceDir = "partitioned_date_format_wrong" - copyResourceFileToHdfs(s"$resourceDir/$paramsFileName", paramsFileHdfsPath) - - val targetPath99999999 = new Path(targetDirPath, "year=9999/month=99/day=99") - val targetSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - - val targetTable = createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName) - setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) - prepareDefaultSourceData() - - // checking pre-conditions - targetTable.read().count() shouldBe 19 - fs.exists(targetPath99999999) shouldBe false - - // executing load - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - - // validating result - val expectedDataLocation = resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) - val expectedDf = dataReader.read(spark, expectedDataLocation) - val actualDf = targetTable.read() - actualDf.hasDiff(expectedDf) shouldBe false - - fs.exists(targetPath99999999) shouldBe true - - // check the resulting table location is /data folder - val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation - tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - - //check backUp dir is empty - fs.listStatus(backupDirPath).length shouldBe 0 - } - } - - override def beforeEach(): Unit = { - super.beforeEach() - spark.sql(s"DROP DATABASE IF EXISTS $targetDatabase CASCADE") - spark.sql(s"CREATE DATABASE $targetDatabase") - logger.info(s"Creating ${sourceDirPath.toString}") - fs.mkdirs(sourceDirPath) - logger.info(s"Creating ${targetDirPath.toString}") - fs.mkdirs(targetDirPath) - } - - private def createPartitionedTargetTable(targetPartitions: Seq[String], targetSchema: StructType, tableName: String): Table = { - val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - Table.newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) - .withPartitions(targetPartitions) - .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) - } - - private def createNonPartitionedTargetTable(targetSchema: StructType): Table = { - val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - Table.newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) - .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) - } - - private def 
prepareDefaultSourceData(sourceData: String = "landing/new_data.psv"): Unit = { - prepareSourceData(Seq(sourceData)) - } - - private def prepareSourceData(sourceFiles: Seq[String]): Unit = { - sourceFiles.foreach { file => - logger.info(s"copyResourceFileToHdfs $file to ${sourceDirPath.toString}") - copyResourceFileToHdfs(s"$file", sourceDirPath) - } - } - - private def setupInitialState(targetTable: Table, localDataFile: String, dataReader: FileReader): Unit = { - val initialDataLocation = resolveResource(localDataFile, withProtocol = true) - targetTable.write(Seq(initialDataLocation), dataReader, LoadMode.OverwritePartitionsWithAddedColumns) - } -} diff --git a/src/test/scala/com/adidas/analytics/feature/GzipDecompressorTest.scala b/src/test/scala/com/adidas/analytics/feature/GzipDecompressorTest.scala index 69f2961..569c3cc 100644 --- a/src/test/scala/com/adidas/analytics/feature/GzipDecompressorTest.scala +++ b/src/test/scala/com/adidas/analytics/feature/GzipDecompressorTest.scala @@ -1,110 +1,105 @@ package com.adidas.analytics.feature -import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.GzipDecompressor import com.adidas.analytics.util.DFSWrapper._ import com.adidas.utils.BaseAlgorithmTest import org.apache.hadoop.fs.Path -import org.scalatest.FeatureSpec -import org.scalatest.Matchers._ +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.matchers.should.Matchers._ - -class GzipDecompressorTest extends FeatureSpec with BaseAlgorithmTest { +class GzipDecompressorTest extends AnyFeatureSpec with BaseAlgorithmTest { private val paramFileName: String = "params.json" + private val paramFilePathHDFS: Path = new Path(hdfsRootTestPath, paramFileName) + private val paramFilePathLocal: Path = new Path(resolveResource(paramFileName)) - private val fileNames: Seq[String] = Seq( - "data_20180719111849_data_1-3", - "data_20180719111849_data_2-3", - "data_20180719111849_data_3-3" - ) + private val fileNamesGZip: Seq[String] = + Seq("data_20180719111849_data_1-3", "data_20180719111849_data_2-3") - private val compressedFiles: Seq[String] = fileNames.map(_ + ".gz").map(fileName => resolveResource(fileName, withProtocol = true)) - private val uncompressedFiles: Seq[String] = fileNames.map(fileName => resolveResource(fileName, withProtocol = true)) + private val fileNamesZip: Seq[String] = Seq("data_20180719111849_data_3-3") - private val testDataPath: Path = new Path(hdfsRootTestPath, "test_landing/bi/test_data_table/data") - private val testDataLocation: String = testDataPath.toUri.toString + private val compressedFiles: Seq[String] = fileNamesGZip + .map(_ + ".gz") + .map(fileName => resolveResource(fileName, withProtocol = true)) ++ fileNamesZip + .map(_ + ".zip") + .map(fileName => resolveResource(fileName, withProtocol = true)) - private val uncompressedDataPath: Path = new Path(hdfsRootTestPath, "test_landing/bi/uncompressed_data_table/data") - private val uncompressedDataLocation: String = uncompressedDataPath.toUri.toString + private val uncompressedFiles: Seq[String] = fileNamesGZip + .map(fileName => resolveResource(fileName, withProtocol = true)) ++ + fileNamesZip.map(fileName => resolveResource(fileName, withProtocol = true)) + private val testDataPath: Path = + new Path(hdfsRootTestPath, "test_landing/bi/test_data_table/data") private val recursive: Boolean = true - - feature("Correct execution of GzipDecompressorBytes") { - scenario("File names are correct, compressed files are deleted and file size increases") { + Feature("Correct execution of 
GzipDecompressorBytes") { + Scenario("File names are correct, compressed files are deleted and file size increases") { // prepare data fs.mkdirs(testDataPath) - fs.mkdirs(uncompressedDataPath) - compressedFiles.foreach(location => fs.copyFromLocalFile(new Path(location), testDataPath)) - uncompressedFiles.foreach(location => fs.copyFromLocalFile(new Path(location), uncompressedDataPath)) // checking pre-conditions - val sourceData = spark.read.textFile(testDataLocation).toDF() - val expectedData = spark.read.textFile(uncompressedDataLocation).toDF().persist() - - expectedData.hasDiff(sourceData) shouldEqual false - sourceData.count() shouldEqual expectedData.count() - fs.ls(testDataPath, recursive).size shouldEqual 3 - fs.ls(testDataPath, recursive).map(path=> fs.getFileStatus(path).getLen) - fs.ls(testDataPath, recursive).forall(_.getName.endsWith(".gz")) shouldBe true - val baseFileNamesAndSizesBeforeDecompression = fs.ls(testDataPath, recursive).map(path => - path.getName.substring(0, path.getName.lastIndexOf("."))-> fs.getFileStatus(path).getLen - ).toMap - - - fs.ls(uncompressedDataPath, recursive).size shouldEqual 3 - fs.ls(uncompressedDataPath, recursive).forall(!_.getName.endsWith(".gz")) shouldBe true + fs.ls(testDataPath, recursive).map(path => fs.getFileStatus(path).getLen) + fs.ls(testDataPath, recursive) + .forall(p => p.getName.endsWith(".gz") || p.getName.endsWith(".zip")) shouldBe true + val baseFileNamesAndSizesBeforeDecompression = fs + .ls(testDataPath, recursive) + .map(path => + path.getName.substring(0, path.getName.lastIndexOf(".")) -> fs.getFileStatus(path).getLen + ) + .toMap + + fs.ls(testDataPath, recursive) + .count(r => !(r.getName.endsWith(".gz") || r.getName.endsWith(".zip"))) shouldEqual 0 + fs.ls(testDataPath, recursive) + .count(r => r.getName.endsWith(".gz") || r.getName.endsWith(".zip")) shouldEqual 3 // running the algorithm GzipDecompressor(spark, dfs, paramFilePathHDFS.toUri.toString).run() // validating results fs.ls(testDataPath, recursive).length shouldEqual 3 - fs.ls(testDataPath, recursive).count(!_.getName.endsWith(".gz")) shouldEqual 3 - fs.ls(testDataPath, recursive).count(_.getName.endsWith(".gz")) shouldEqual 0 - val baseFileNamesAndSizesAfterDecompression = fs.ls(testDataPath, recursive).map(path => - path.getName.substring(0, path.getName.lastIndexOf("."))-> fs.getFileStatus(path).getLen - ).toMap - - baseFileNamesAndSizesBeforeDecompression.size shouldBe baseFileNamesAndSizesAfterDecompression.size - baseFileNamesAndSizesBeforeDecompression.forall{ - case (k, v) => baseFileNamesAndSizesAfterDecompression.contains(k) && baseFileNamesAndSizesAfterDecompression(k) >= v + fs.ls(testDataPath, recursive) + .count(r => !(r.getName.endsWith(".gz") || r.getName.endsWith(".zip"))) shouldEqual 3 + fs.ls(testDataPath, recursive) + .count(r => r.getName.endsWith(".gz") || r.getName.endsWith(".zip")) shouldEqual 0 + val baseFileNamesAndSizesAfterDecompression = fs + .ls(testDataPath, recursive) + .map(path => + path.getName.substring(0, path.getName.lastIndexOf(".")) -> fs.getFileStatus(path).getLen + ) + .toMap + + baseFileNamesAndSizesBeforeDecompression.size shouldBe + baseFileNamesAndSizesAfterDecompression.size + baseFileNamesAndSizesBeforeDecompression.forall { + case (k, v) => + baseFileNamesAndSizesAfterDecompression.contains(k) && + baseFileNamesAndSizesAfterDecompression(k) >= v } } - scenario("Only .gz files are processed"){ + Scenario("Should throw an exception if uncompressed files are already present") { // prepare data 
fs.mkdirs(testDataPath) - uncompressedFiles.foreach(location => fs.copyFromLocalFile(new Path(location), testDataPath)) - val expectedFiles = fs.ls(testDataPath, recursive) - val expectedData = spark.read.textFile(testDataLocation).collect() - // running the algorithm - GzipDecompressor(spark, dfs, paramFilePathHDFS.toUri.toString).run() - - // validating results - val actualFiles = fs.ls(testDataPath, recursive) - actualFiles.length shouldEqual expectedFiles.length - actualFiles.toSet shouldEqual expectedFiles.toSet + val caught = intercept[RuntimeException] { + GzipDecompressor(spark, dfs, paramFilePathHDFS.toUri.toString).run() + } - val actualData = spark.read.textFile(testDataLocation).collect() - actualData.length shouldEqual expectedData.length - actualData.toSet shouldEqual expectedData.toSet + assert(caught.getMessage.contains("No codec found for file")) } - scenario("Expect exception if directory does not exist"){ + Scenario("Should throw an exception if directory does not exist") { - val caught = - intercept[RuntimeException] { + val caught = intercept[RuntimeException] { GzipDecompressor(spark, dfs, paramFilePathHDFS.toUri.toString).run() } diff --git a/src/test/scala/com/adidas/analytics/feature/MaterializationTest.scala b/src/test/scala/com/adidas/analytics/feature/MaterializationTest.scala new file mode 100644 index 0000000..48f8c15 --- /dev/null +++ b/src/test/scala/com/adidas/analytics/feature/MaterializationTest.scala @@ -0,0 +1,559 @@ +package com.adidas.analytics.feature + +import com.adidas.analytics.algo.Materialization +import com.adidas.analytics.util.{DFSWrapper, CatalogTableManager, LoadMode} +import com.adidas.utils.TestUtils._ +import com.adidas.utils.{BaseAlgorithmTest, FileReader, Table} +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.types.{DataType, StructType} +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.matchers.should.Matchers._ + +class MaterializationTest extends AnyFeatureSpec with BaseAlgorithmTest { + + private val paramsFileName: String = "params.json" + + private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) + + private val sourceDatabase: String = "test_mart_mod" + private val targetDatabase: String = "test_mart_cal" + + private var sourceTable: Table = _ + private var targetTable: Table = _ + private var fileReader: FileReader = _ + + Feature("View should be materialized in full") { + Scenario("When there is no partitioning scheme and old materialization leftovers") { + val testDir = "full_materialization" + prepareTables(testDir, Seq.empty[String]) + copyResourceFileToHdfs(s"$testDir/no_partitions/$paramsFileName", paramsFileHdfsPath) + + // check pre-conditions + sourceTable.read().count() shouldBe 19 + targetTable.read().count() shouldBe 0 + + //manually create previous versions of the materialized view + val tableDataPath = new Path(hdfsRootTestPath, s"target_table/data/") + val tableLocations = createPreviousVersionsAndLeftovers(tableDataPath) + + var numFilesInTableDataDir = fs.listStatus(tableDataPath).count(_ => true) + numFilesInTableDataDir shouldBe 16 + + // execute materialization + Materialization.newFullMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + + // compare data + val actualDf = targetTable.read() + val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) + actualDf.hasDiff(expectedDf) shouldBe false + + /* table data folder contains only the new materialization plus the default previous versions */ + val finalTableLocation 
= CatalogTableManager(targetTable.table, spark).getTableLocation + val fileStatus = fs.listStatus(tableDataPath) + + numFilesInTableDataDir = fileStatus.count(_ => true) + numFilesInTableDataDir shouldBe 5 + + fs.exists(tableLocations.head) shouldBe false + fs.exists(tableLocations(1)) shouldBe false + fs.exists(tableLocations(2)) shouldBe false + fs.exists(tableLocations(3)) shouldBe false + fs.exists(tableLocations(4)) shouldBe false + fs.exists(tableLocations(5)) shouldBe false + fs.exists(tableLocations(6)) shouldBe false + fs.exists(tableLocations(7)) shouldBe false + fs.exists(tableLocations(8)) shouldBe true + fs.exists(tableLocations(9)) shouldBe true + + /* most recent subfolder is the table location and its parent folder is as expected */ + new Path(finalTableLocation).getParent.getName shouldBe tableDataPath.getName + val mostRecentSubFolder = + fileStatus.map(_.getPath.toString).toSeq.sorted(Ordering.String.reverse).head + (mostRecentSubFolder == finalTableLocation) shouldBe true + } + + Scenario("When view is partitioned and there was a previous materialization") { + val testDir = "full_materialization" + prepareTables(testDir, Seq("year", "month", "day")) + copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) + + // check pre-conditions + sourceTable.read().count() shouldBe 19 + targetTable.read().count() shouldBe 0 + + //manually create previous versions of the materialized view + val tableDataPath = new Path(hdfsRootTestPath, s"target_table/data/") + val tableLocations = createPreviousVersionsAndLeftovers(tableDataPath) + + var numFilesInTableDataDir = fs.listStatus(tableDataPath).count(_ => true) + numFilesInTableDataDir shouldBe 16 + + /* execute two materializations (wait two seconds between them to avoid folders with same + * timestamp) */ + /* to check if there is no problem with a materialized view that is not initially pointing to + * base_data_dir */ + Materialization.newFullMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + Thread.sleep(2000) + Materialization.newFullMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + + // compare data + val actualDf = targetTable.read() + val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) + actualDf.hasDiff(expectedDf) shouldBe false + + /* table data folder contains the new materialization plus the 3 previous versions */ + val finalTableLocation = CatalogTableManager(targetTable.table, spark).getTableLocation + val fileStatus = fs.listStatus(tableDataPath) + + numFilesInTableDataDir = fileStatus.count(_ => true) + numFilesInTableDataDir shouldBe 6 + + fs.exists(tableLocations.head) shouldBe false + fs.exists(tableLocations(1)) shouldBe false + fs.exists(tableLocations(2)) shouldBe false + fs.exists(tableLocations(3)) shouldBe false + fs.exists(tableLocations(4)) shouldBe false + fs.exists(tableLocations(5)) shouldBe false + fs.exists(tableLocations(6)) shouldBe false + fs.exists(tableLocations(7)) shouldBe false + fs.exists(tableLocations(8)) shouldBe true + fs.exists(tableLocations(9)) shouldBe true + + // check that partitions were created + fs.exists(new Path(finalTableLocation, "year=2016")) shouldBe true + fs.exists(new Path(finalTableLocation, "year=2017")) shouldBe true + fs.exists(new Path(finalTableLocation, "year=2018")) shouldBe true + + /* most recent subfolder is the table location and its parent folder is as expected */ + new Path(finalTableLocation).getParent.getName shouldBe tableDataPath.getName + val mostRecentSubFolder = + 
fileStatus.map(_.getPath.toString).toSeq.sorted(Ordering.String.reverse).head + (mostRecentSubFolder == finalTableLocation) shouldBe true + } + } + + Feature( + "Partitions should be loaded for ranges of dates if correct partitioning schema is specified" + ) { + Scenario("When partitioning schema is year/month/day") { + val testDir = "range_materialization/year_month_day_single_day" + prepareTables(testDir, Seq("year", "month", "day")) + copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) + + // check pre-conditions + sourceTable.read().count() shouldBe 19 + targetTable.read().count() shouldBe 0 + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) shouldBe + false + + // execute load + Materialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + + // compare data + val actualDf = targetTable.read() + val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) + expectedDf.hasDiff(actualDf) shouldBe false + + // check that new partition was created + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) shouldBe + true + } + + Scenario("When partitioning schema is year/month") { + val testDir = "range_materialization/year_month" + prepareTables(testDir, Seq("year", "month")) + copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) + + // check pre-conditions + sourceTable.read().count() shouldBe 19 + targetTable.read().count() shouldBe 0 + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2")) shouldBe false + + // execute load + Materialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + + // compare data + val actualDf = targetTable.read() + val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) + expectedDf.hasDiff(actualDf) shouldBe false + + // check that new partition was created + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2")) shouldBe true + } + + Scenario("When partitioning schema is year/week") { + val testDir = "range_materialization/year_week" + prepareTables(testDir, Seq("year", "week")) + copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) + + // adding data to target table + targetTable.write( + Seq(getInitialDataFile(testDir)), + fileReader, + LoadMode.OverwritePartitionsWithAddedColumns + ) + + // check pre-conditions + sourceTable.read().count() shouldBe 19 + targetTable.read().count() shouldBe 2 + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/week=1")) shouldBe false + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/week=2")) shouldBe false + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/week=3")) shouldBe false + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/week=4")) shouldBe false + + // execute load + Materialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + + // compare data + val actualDf = targetTable.read() + val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) + expectedDf.hasDiff(actualDf) shouldBe false + + // check that new partition was created + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/week=1")) shouldBe true + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/week=2")) shouldBe true + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/week=3")) shouldBe true + fs.exists(new Path(hdfsRootTestPath, 
"target_table/data/year=2017/week=4")) shouldBe true + } + + Scenario("When partitioning schema is year/week/day") { + val testDir = "range_materialization/year_week_day" + prepareTables(testDir, Seq("year", "week", "day")) + copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) + + // execute load + assertThrows[RuntimeException] { + Materialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + } + } + + Scenario("When the same partition exists in the target table") { + val testDir = "range_materialization/year_month_day_single_day" + prepareTables(testDir, Seq("year", "month", "day")) + copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) + + // add data to target table + targetTable.write( + Seq(Seq(9999, 1111, "", 20170215, "CUSTOMER99", "ARTICLE", 99, 2017, 2, 15)), + LoadMode.OverwritePartitionsWithAddedColumns + ) + + // checking pre-conditions + sourceTable.read().count() shouldBe 19 + targetTable.read().count() shouldBe 1 + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) shouldBe + true + + // execute load + Materialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + + // compare data + val actualDf = targetTable.read() + val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) + expectedDf.hasDiff(actualDf) shouldBe false + + // check that the partition exists + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) shouldBe + true + } + + Scenario("When other partitions exist in the target table") { + val testDir = "range_materialization/year_month_day_single_day" + prepareTables(testDir, Seq("year", "month", "day")) + copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) + + // adding data to target table + targetTable.write( + Seq(getInitialDataFile(testDir)), + fileReader, + LoadMode.OverwritePartitionsWithAddedColumns + ) + + // check pre-conditions + sourceTable.read().count() shouldBe 19 + targetTable.read().count() shouldBe 2 + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=3/day=15")) shouldBe + true + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=6/day=15")) shouldBe + true + + // execute load + Materialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + + // compare data + val actualDf = targetTable.read() + val newPartitionDf = fileReader.read(spark, getExpectedDataFile(testDir)) + val existingPartitionsDf = fileReader.read(spark, getInitialDataFile(testDir)) + val expectedDf = newPartitionDf.union(existingPartitionsDf) + actualDf.hasDiff(expectedDf) shouldBe false + + // check that new partition was created + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) shouldBe + true + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=3/day=15")) shouldBe + true + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=6/day=15")) shouldBe + true + } + + Scenario("When a range of multiple days is specified in the job configuration") { + val testDir = "range_materialization/year_month_day_multiple_days" + prepareTables(testDir, Seq("year", "month", "day")) + copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) + + // check pre-conditions + sourceTable.read().count() shouldBe 19 + targetTable.read().count() shouldBe 0 + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) 
shouldBe + false + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=16")) shouldBe + false + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=17")) shouldBe + false + + // execute load + Materialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + + // compare data + val actualDf = targetTable.read() + val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) + expectedDf.hasDiff(actualDf) shouldBe false + + // check that new partition was created + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) shouldBe + true + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=16")) shouldBe + true + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=17")) shouldBe + true + } + } + + Feature("Partitions should be loaded if they correspond to the specified conditions") { + Scenario("When the same partition does not exist on the filesystem") { + val testDir = "query_materialization" + prepareTables(testDir, Seq("year", "month", "day")) + copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) + + // check pre-conditions + sourceTable.read().count() shouldBe 19 + targetTable.read().count() shouldBe 0 + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) shouldBe + false + + // execute load + Materialization.newQueryMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + + // compare data + val actualDf = targetTable.read() + val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) + expectedDf.hasDiff(actualDf) shouldBe false + + // check that new partition was created + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) shouldBe + true + } + + Scenario("When the same partition exists on the filesystem") { + val testDir = "query_materialization" + prepareTables(testDir, Seq("year", "month", "day")) + copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) + + // add data to target table + targetTable.write( + Seq(Seq(9999, 1111, "", 20170215, "CUSTOMER99", "ARTICLE", 99, 2017, 2, 15)), + LoadMode.OverwritePartitionsWithAddedColumns + ) + + // checking pre-conditions + sourceTable.read().count() shouldBe 19 + targetTable.read().count() shouldBe 1 + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) shouldBe + true + + // execute load + Materialization.newQueryMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + + // compare data + val actualDf = targetTable.read() + val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) + expectedDf.hasDiff(actualDf) shouldBe false + + // check that the partition exists + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) shouldBe + true + } + + Scenario("When other partitions exist on the filesystem") { + val testDir = "query_materialization" + prepareTables(testDir, Seq("year", "month", "day")) + copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) + + // adding data to target table + targetTable.write( + Seq(getInitialDataFile(testDir)), + fileReader, + LoadMode.OverwritePartitionsWithAddedColumns + ) + + // check pre-conditions + sourceTable.read().count() shouldBe 19 + targetTable.read().count() shouldBe 2 + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=3/day=15")) shouldBe + true + fs.exists(new 
Path(hdfsRootTestPath, "target_table/data/year=2017/month=6/day=15")) shouldBe + true + + // execute load + Materialization.newQueryMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + + // compare data + val actualDf = targetTable.read() + val newPartitionDf = fileReader.read(spark, getExpectedDataFile(testDir)) + val intermediatePartitionsDf = fileReader.read(spark, getInitialDataFile(testDir)) + val expectedDf = newPartitionDf.union(intermediatePartitionsDf) + actualDf.hasDiff(expectedDf) shouldBe false + + // check that new partition was created + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) shouldBe + true + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=3/day=15")) shouldBe + true + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=6/day=15")) shouldBe + true + } + } + + Feature("The number of output partitions should be configurable") { + Scenario("When number of output partitions is 5") { + val testDir = "query_materialization" + prepareTables(testDir, Seq("year", "month", "day")) + copyResourceFileToHdfs(s"$testDir/output_files_5/$paramsFileName", paramsFileHdfsPath) + + // adding data to target table + targetTable.write( + Seq(getInitialDataFile(testDir)), + fileReader, + LoadMode.OverwritePartitionsWithAddedColumns + ) + + // check pre-conditions + sourceTable.read().count() shouldBe 19 + targetTable.read().count() shouldBe 2 + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=3/day=15")) shouldBe + true + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=6/day=15")) shouldBe + true + + // execute load + Materialization.newQueryMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() + + // compare data + val actualDf = targetTable.read() + val newPartitionDf = fileReader.read(spark, getExpectedDataFile(testDir)) + val intermediatePartitionsDf = fileReader.read(spark, getInitialDataFile(testDir)) + val expectedDf = newPartitionDf.union(intermediatePartitionsDf) + actualDf.hasDiff(expectedDf) shouldBe false + + // check that new partition was created + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) shouldBe + true + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=3/day=15")) shouldBe + true + fs.exists(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=6/day=15")) shouldBe + true + fs.listStatus(new Path(hdfsRootTestPath, "target_table/data/year=2017/month=2/day=15")) + .length shouldBe 5 + } + } + + override def beforeEach(): Unit = { + super.beforeEach() + spark.sql(s"DROP DATABASE IF EXISTS $targetDatabase CASCADE") + spark.sql(s"DROP DATABASE IF EXISTS $sourceDatabase CASCADE") + spark.sql(s"CREATE DATABASE $sourceDatabase") + spark.sql(s"CREATE DATABASE $targetDatabase") + } + + private def getSourceDataFile(testDir: String): String = + resolveResource(s"$testDir/source_data.psv", withProtocol = true) + + private def getInitialDataFile(testDir: String): String = + resolveResource(s"$testDir/initial_data.psv", withProtocol = true) + + private def getExpectedDataFile(testDir: String): String = + resolveResource(s"$testDir/expected_data.psv", withProtocol = true) + + private def prepareTables(testDir: String, partitions: Seq[String]): Unit = { + val schema = + DataType.fromJson(getResourceAsText(s"$testDir/schema.json")).asInstanceOf[StructType] + + // create file reader with the current schema + fileReader = 
FileReader.newDSVFileReader(optionalSchema = Some(schema)) + + // create source table and add data to it + sourceTable = createTable("source_table", sourceDatabase, partitions, "source_table", schema) + sourceTable.write( + Seq(getSourceDataFile(testDir)), + fileReader, + LoadMode.OverwritePartitionsWithAddedColumns + ) + + // create target table + targetTable = createTable("target_table", targetDatabase, partitions, "target_table", schema) + } + + private def createTable( + table: String, + database: String, + partitions: Seq[String], + tableDirName: String, + schema: StructType + ): Table = + Table + .newBuilder( + table, + database, + fs.makeQualified(new Path(hdfsRootTestPath, s"$tableDirName/data")).toString, + schema + ) + .withPartitions(partitions) + .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) + + /** Creates folders and files in the filesystem from previous materialization attempts, also + * including some leftovers from executions of other versions of the algorithm that did not use + * the timestamped folder structure. + * + * @param tableDataPath + * Path to table's base data dir (e.g., "data/") + * @return + * previous versions and leftover files locations + */ + private def createPreviousVersionsAndLeftovers(tableDataPath: Path): Seq[Path] = { + val leftoverParquetFiles = Seq[Path]( + new Path(tableDataPath, "part-0001.parquet"), + new Path(tableDataPath, "part-0002.parquet"), + new Path(tableDataPath, "part-0003.parquet"), + new Path(tableDataPath, "part-0004.parquet") + ) + + leftoverParquetFiles.foreach(location => fs.createNewFile(location)) + + val tableLocations = Seq[Path]( + new Path(tableDataPath, "year=2019/"), + new Path(tableDataPath, "year=2020/"), + new Path(tableDataPath, "20200101_124514_UTC/"), + new Path(tableDataPath, "20200102_123012_UTC/"), + new Path(tableDataPath, "20200103_114329_UTC/"), + new Path(tableDataPath, "20200201_103210_UTC/") + ) + + tableLocations.foreach { location => + fs.mkdirs(location) + fs.createNewFile(new Path(tableDataPath, s"${location.getName}_$$folder$$")) + fs.createNewFile(new Path(location, "sample_file.parquet")) + } + + leftoverParquetFiles ++ tableLocations + } +} diff --git a/src/test/scala/com/adidas/analytics/feature/NestedFlattenerTest.scala b/src/test/scala/com/adidas/analytics/feature/NestedFlattenerTest.scala index 96ad025..b8e728d 100644 --- a/src/test/scala/com/adidas/analytics/feature/NestedFlattenerTest.scala +++ b/src/test/scala/com/adidas/analytics/feature/NestedFlattenerTest.scala @@ -5,16 +5,18 @@ import com.adidas.analytics.algo.NestedFlattener import com.adidas.utils.{BaseAlgorithmTest, FileReader} import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} -import org.scalatest.FeatureSpec -import org.scalatest.Matchers._ +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.matchers.should.Matchers._ -class NestedFlattenerTest extends FeatureSpec with BaseAlgorithmTest { +class NestedFlattenerTest extends AnyFeatureSpec with BaseAlgorithmTest { private val database = "test_lake" private val paramsFileName = "params.json" + private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) private val rootSourceDirPath: Path = new Path(hdfsRootTestPath, s"$database/nest") + private val sourceDirPath: Path = new Path(hdfsRootTestPath, s"$database/nest/nest_test/data") private val sourceDataLocalDir = "nest_test" @@ -22,12 +24,17 @@ class NestedFlattenerTest extends FeatureSpec with BaseAlgorithmTest { private val 
expectedDataFileName = "expected_target_data.psv" private val targetTableName = "nest_flattened" + private val targetDirPath: Path = new Path(hdfsRootTestPath, s"$database/$targetTableName/data") - private val targetSchema = DataType.fromJson(getResourceAsText(s"target_schema.json")).asInstanceOf[StructType] - feature("Semi-structured data is fully flattened ... and problematic characters are replaced") { + private val targetSchema = + DataType.fromJson(getResourceAsText(s"target_schema.json")).asInstanceOf[StructType] - scenario("Test Case 1: target Schema is correct and non-partitioned table was successfully flattened and exploded") { + Feature("Semi-structured data is fully flattened ... and problematic characters are replaced") { + + Scenario( + "Test Case 1: target Schema is correct and non-partitioned table was successfully flattened and exploded" + ) { val testCaseId = "scenario1" copyResourceFileToHdfs(s"$testCaseId/$paramsFileName", paramsFileHdfsPath) @@ -51,14 +58,17 @@ class NestedFlattenerTest extends FeatureSpec with BaseAlgorithmTest { tableName = expectedTargetTableName, schema = targetSchema, filePath = expectedDataFileName, - reader = FileReader.newDSVFileReader(Some(targetSchema))) + reader = FileReader.newDSVFileReader(Some(targetSchema)) + ) val expectedTargetDf = expectedTargetTable.read() // target table has exactly the same data as the expected data targetDf.hasDiff(expectedTargetDf) shouldBe false } - scenario("Test Case 2: target Schema is correct and partitioned table was successfully flattened and exploded") { + Scenario( + "Test Case 2: target Schema is correct and partitioned table was successfully flattened and exploded" + ) { val testCaseId = "scenario2" copyResourceFileToHdfs(s"$testCaseId/$paramsFileName", paramsFileHdfsPath) @@ -68,7 +78,12 @@ class NestedFlattenerTest extends FeatureSpec with BaseAlgorithmTest { // source table has the expected number of records sourceDf.count() shouldBe 3 - val targetTable = createParquetTable(database, targetTableName, partitionColumns = Some(Seq("device_brand")), schema = targetSchema) + val targetTable = createParquetTable( + database, + targetTableName, + partitionColumns = Some(Seq("device_brand")), + schema = targetSchema + ) val nestedFlattener = NestedFlattener(spark, dfs, paramsFileHdfsPath.toString) nestedFlattener.run() @@ -84,7 +99,8 @@ class NestedFlattenerTest extends FeatureSpec with BaseAlgorithmTest { partitionColumns = Some(Seq("device_brand")), schema = targetSchema, filePath = expectedDataFileName, - reader = FileReader.newDSVFileReader(Some(targetSchema))) + reader = FileReader.newDSVFileReader(Some(targetSchema)) + ) val expectedTargetDf = expectedTargetTable.read() // target table has exactly the same data as the expected data @@ -93,9 +109,7 @@ class NestedFlattenerTest extends FeatureSpec with BaseAlgorithmTest { } - /* - * Creates the FS folders, sends the parameters and data to FS, and creates the database - */ + /* Creates the FS folders, sends the parameters and data to FS, and creates the database */ override def beforeEach(): Unit = { super.beforeEach() diff --git a/src/test/scala/com/adidas/analytics/feature/PartitionMaterializationTest.scala b/src/test/scala/com/adidas/analytics/feature/PartitionMaterializationTest.scala deleted file mode 100644 index 09fd9fb..0000000 --- a/src/test/scala/com/adidas/analytics/feature/PartitionMaterializationTest.scala +++ /dev/null @@ -1,377 +0,0 @@ -package com.adidas.analytics.feature - -import com.adidas.analytics.algo.PartitionMaterialization -import 
com.adidas.analytics.util.{DFSWrapper, LoadMode} -import com.adidas.utils.TestUtils._ -import com.adidas.utils.{BaseAlgorithmTest, FileReader, Table} -import org.apache.hadoop.fs.Path -import org.apache.spark.sql.types.{DataType, StructType} -import org.scalatest.FeatureSpec -import org.scalatest.Matchers._ - - -class PartitionMaterializationTest extends FeatureSpec with BaseAlgorithmTest { - - private val paramsFileName: String = "params.json" - private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) - - private val sourceDatabase: String = "test_mart_mod" - private val targetDatabase: String = "test_mart_cal" - - private var sourceTable: Table = _ - private var targetTable: Table = _ - private var fileReader: FileReader = _ - - - feature("Partitions should be loaded for ranges of dates if correct partitioning schema is specified") { - scenario("When partitioning schema is year/month/day") { - val testDir = "range_materialization/year_month_day_single_day" - prepareTables(testDir, Seq("year", "month", "day")) - copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) - - // check pre-conditions - sourceTable.read().count() shouldBe 19 - targetTable.read().count() shouldBe 0 - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe false - - // execute load - PartitionMaterialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() - - // compare data - val actualDf = targetTable.read() - val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) - expectedDf.hasDiff(actualDf) shouldBe false - - // check that new partition was created - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe true - } - - scenario("When partitioning schema is year/month") { - val testDir = "range_materialization/year_month" - prepareTables(testDir, Seq("year", "month")) - copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) - - // check pre-conditions - sourceTable.read().count() shouldBe 19 - targetTable.read().count() shouldBe 0 - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2")) shouldBe false - - // execute load - PartitionMaterialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() - - // compare data - val actualDf = targetTable.read() - val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) - expectedDf.hasDiff(actualDf) shouldBe false - - // check that new partition was created - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2")) shouldBe true - } - - scenario("When partitioning schema is year/week") { - val testDir = "range_materialization/year_week" - prepareTables(testDir, Seq("year", "week")) - copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) - - // adding data to target table - targetTable.write(Seq(getInitialDataFile(testDir)), fileReader, LoadMode.OverwritePartitionsWithAddedColumns) - - // check pre-conditions - sourceTable.read().count() shouldBe 19 - targetTable.read().count() shouldBe 2 - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/week=1")) shouldBe false - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/week=2")) shouldBe false - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/week=3")) shouldBe false - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/week=4")) shouldBe false - - // execute load - 
PartitionMaterialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() - - // compare data - val actualDf = targetTable.read() - val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) - expectedDf.hasDiff(actualDf) shouldBe false - - // check that new partition was created - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/week=1")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/week=2")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/week=3")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/week=4")) shouldBe true - } - - scenario("When partitioning schema is year/week/day") { - val testDir = "range_materialization/year_week_day" - prepareTables(testDir, Seq("year", "week", "day")) - copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) - - // execute load - assertThrows[RuntimeException] { - PartitionMaterialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() - } - } - - scenario("When the same partition exists in the target table") { - val testDir = "range_materialization/year_month_day_single_day" - prepareTables(testDir, Seq("year", "month", "day")) - copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) - - // add data to target table - targetTable.write(Seq(Seq(9999, 1111, "", 20170215, "CUSTOMER99", "ARTICLE", 99, 2017, 2, 15)), LoadMode.OverwritePartitionsWithAddedColumns) - - // checking pre-conditions - sourceTable.read().count() shouldBe 19 - targetTable.read().count() shouldBe 1 - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe true - - // execute load - PartitionMaterialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() - - // compare data - val actualDf = targetTable.read() - val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) - expectedDf.hasDiff(actualDf) shouldBe false - - // check that the partition exists - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe true - } - - scenario("When other partitions exist in the target table") { - val testDir = "range_materialization/year_month_day_single_day" - prepareTables(testDir, Seq("year", "month", "day")) - copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) - - // adding data to target table - targetTable.write(Seq(getInitialDataFile(testDir)), fileReader, LoadMode.OverwritePartitionsWithAddedColumns) - - // check pre-conditions - sourceTable.read().count() shouldBe 19 - targetTable.read().count() shouldBe 2 - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=3/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=6/day=15")) shouldBe true - - // execute load - PartitionMaterialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() - - // compare data - val actualDf = targetTable.read() - val newPartitionDf = fileReader.read(spark, getExpectedDataFile(testDir)) - val existingPartitionsDf = fileReader.read(spark, getInitialDataFile(testDir)) - val expectedDf = newPartitionDf.union(existingPartitionsDf) - actualDf.hasDiff(expectedDf) shouldBe false - - // check that new partition was created - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, 
"target_table_data/year=2017/month=3/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=6/day=15")) shouldBe true - } - - scenario("When a range of multiple days is specified in the job configuration") { - val testDir = "range_materialization/year_month_day_multiple_days" - prepareTables(testDir, Seq("year", "month", "day")) - copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) - - // check pre-conditions - sourceTable.read().count() shouldBe 19 - targetTable.read().count() shouldBe 0 - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe false - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=16")) shouldBe false - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=17")) shouldBe false - - // execute load - PartitionMaterialization.newRangeMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() - - // compare data - val actualDf = targetTable.read() - val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) - expectedDf.hasDiff(actualDf) shouldBe false - - // check that new partition was created - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=16")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=17")) shouldBe true - } - } - - feature("Partitions should be loaded if they correspond to the specified conditions") { - scenario("When the same partition does not exist on the filesystem") { - val testDir = "condition_materialization" - prepareTables(testDir, Seq("year", "month", "day")) - copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) - - // check pre-conditions - sourceTable.read().count() shouldBe 19 - targetTable.read().count() shouldBe 0 - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe false - - // execute load - PartitionMaterialization.newQueryMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() - - // compare data - val actualDf = targetTable.read() - val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) - expectedDf.hasDiff(actualDf) shouldBe false - - // check that new partition was created - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe true - } - - scenario("When the same partition exists on the filesystem") { - val testDir = "condition_materialization" - prepareTables(testDir, Seq("year", "month", "day")) - copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) - - // add data to target table - targetTable.write(Seq(Seq(9999, 1111, "", 20170215, "CUSTOMER99", "ARTICLE", 99, 2017, 2, 15)), LoadMode.OverwritePartitionsWithAddedColumns) - - // checking pre-conditions - sourceTable.read().count() shouldBe 19 - targetTable.read().count() shouldBe 1 - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe true - - // execute load - PartitionMaterialization.newQueryMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() - - // compare data - val actualDf = targetTable.read() - val expectedDf = fileReader.read(spark, getExpectedDataFile(testDir)) - expectedDf.hasDiff(actualDf) shouldBe false - - // check that the partition exists - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) 
shouldBe true - } - - scenario("When other partitions exist on the filesystem") { - val testDir = "condition_materialization" - prepareTables(testDir, Seq("year", "month", "day")) - copyResourceFileToHdfs(s"$testDir/$paramsFileName", paramsFileHdfsPath) - - // adding data to target table - targetTable.write(Seq(getInitialDataFile(testDir)), fileReader, LoadMode.OverwritePartitionsWithAddedColumns) - - // check pre-conditions - sourceTable.read().count() shouldBe 19 - targetTable.read().count() shouldBe 2 - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=3/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=6/day=15")) shouldBe true - - // execute load - PartitionMaterialization.newQueryMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() - - // compare data - val actualDf = targetTable.read() - val newPartitionDf = fileReader.read(spark, getExpectedDataFile(testDir)) - val intermediatePartitionsDf = fileReader.read(spark, getInitialDataFile(testDir)) - val expectedDf = newPartitionDf.union(intermediatePartitionsDf) - actualDf.hasDiff(expectedDf) shouldBe false - - // check that new partition was created - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=3/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=6/day=15")) shouldBe true - } - } - - feature("The number of output partitions should be configurable") { - scenario("When number of output partitions is 3") { - val testDir = "condition_materialization" - prepareTables(testDir, Seq("year", "month", "day")) - copyResourceFileToHdfs(s"output_files_3/$paramsFileName", paramsFileHdfsPath) - - // adding data to target table - targetTable.write(Seq(getInitialDataFile(testDir)), fileReader, LoadMode.OverwritePartitionsWithAddedColumns) - - // check pre-conditions - sourceTable.read().count() shouldBe 19 - targetTable.read().count() shouldBe 2 - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=3/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=6/day=15")) shouldBe true - - // execute load - PartitionMaterialization.newQueryMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() - - // compare data - val actualDf = targetTable.read() - val newPartitionDf = fileReader.read(spark, getExpectedDataFile(testDir)) - val intermediatePartitionsDf = fileReader.read(spark, getInitialDataFile(testDir)) - val expectedDf = newPartitionDf.union(intermediatePartitionsDf) - actualDf.hasDiff(expectedDf) shouldBe false - - // check that new partition was created - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=3/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=6/day=15")) shouldBe true - fs.listStatus(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")).length shouldBe 3 - } - - scenario("When number of output partitions is 5") { - val testDir = "condition_materialization" - prepareTables(testDir, Seq("year", "month", "day")) - copyResourceFileToHdfs(s"output_files_5/$paramsFileName", paramsFileHdfsPath) - - // adding data to target table - targetTable.write(Seq(getInitialDataFile(testDir)), fileReader, 
LoadMode.OverwritePartitionsWithAddedColumns) - - // check pre-conditions - sourceTable.read().count() shouldBe 19 - targetTable.read().count() shouldBe 2 - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=3/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=6/day=15")) shouldBe true - - // execute load - PartitionMaterialization.newQueryMaterialization(spark, dfs, paramsFileHdfsPath.toUri.toString).run() - - // compare data - val actualDf = targetTable.read() - val newPartitionDf = fileReader.read(spark, getExpectedDataFile(testDir)) - val intermediatePartitionsDf = fileReader.read(spark, getInitialDataFile(testDir)) - val expectedDf = newPartitionDf.union(intermediatePartitionsDf) - actualDf.hasDiff(expectedDf) shouldBe false - - // check that new partition was created - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=3/day=15")) shouldBe true - fs.exists(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=6/day=15")) shouldBe true - fs.listStatus(new Path(hdfsRootTestPath, "target_table_data/year=2017/month=2/day=15")).length shouldBe 5 - } - } - - override def beforeEach(): Unit = { - super.beforeEach() - spark.sql(s"DROP DATABASE IF EXISTS $targetDatabase CASCADE") - spark.sql(s"DROP DATABASE IF EXISTS $sourceDatabase CASCADE") - spark.sql(s"CREATE DATABASE $sourceDatabase") - spark.sql(s"CREATE DATABASE $targetDatabase") - } - - private def getSourceDataFile(testDir: String): String = { - resolveResource(s"$testDir/source_data.psv", withProtocol = true) - } - - private def getInitialDataFile(testDir: String): String = { - resolveResource(s"$testDir/initial_data.psv", withProtocol = true) - } - - private def getExpectedDataFile(testDir: String): String = { - resolveResource(s"$testDir/expected_data.psv", withProtocol = true) - } - - private def prepareTables(testDir: String, partitions: Seq[String]): Unit = { - val schema = DataType.fromJson(getResourceAsText(s"$testDir/schema.json")).asInstanceOf[StructType] - - // create file reader with the current schema - fileReader = FileReader.newDSVFileReader(optionalSchema = Some(schema)) - - // create source table and add data to it - sourceTable = createTable("source_table", sourceDatabase, partitions, "source_table_data", schema) - sourceTable.write(Seq(getSourceDataFile(testDir)), fileReader, LoadMode.OverwritePartitionsWithAddedColumns) - - // create target table - targetTable = createTable("target_table", targetDatabase, partitions, "target_table_data", schema) - } - - private def createTable(table: String, database: String, partitions: Seq[String], tableDirName: String, schema: StructType): Table = { - Table.newBuilder(table, database, fs.makeQualified(new Path(hdfsRootTestPath, tableDirName)).toString, schema) - .withPartitions(partitions) - .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) - } -} diff --git a/src/test/scala/com/adidas/analytics/feature/SQLRunnerTest.scala b/src/test/scala/com/adidas/analytics/feature/SQLRunnerTest.scala new file mode 100644 index 0000000..f6b5a53 --- /dev/null +++ b/src/test/scala/com/adidas/analytics/feature/SQLRunnerTest.scala @@ -0,0 +1,81 @@ +package com.adidas.analytics.feature + +import com.adidas.utils.TestUtils._ +import com.adidas.analytics.algo.SQLRunner +import com.adidas.analytics.util.{DFSWrapper, CatalogTableManager, LoadMode} +import 
com.adidas.utils.{BaseAlgorithmTest, FileReader, Table} +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.types.StructType +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.matchers.should.Matchers._ + +class SQLRunnerTest extends AnyFeatureSpec with BaseAlgorithmTest { + + private val sourceDatabase: String = "test_landing" + private val targetDatabase: String = "test_lake" + private val tableName: String = "bi_sales_order" + + private val paramsFileName: String = "params.json" + + private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) + + Feature("Data can be loaded with Hive runner") { + Scenario("Data can be loaded from source to target with full mode") { + val dsvReader: FileReader = FileReader.newDSVFileReader(header = true) + val sourceDataLocation = resolveResource("sql_runner_dataset.psv", withProtocol = true) + + val schema: StructType = dsvReader.read(spark, sourceDataLocation).schema + + val sourceTableLocation = + fs.makeQualified(new Path(hdfsRootTestPath, s"$sourceDatabase/$tableName")) + val oldTargetTableLocation = fs.makeQualified( + new Path(hdfsRootTestPath, s"$targetDatabase/$tableName/20180505_020927_EDT") + ) + val newTargetTableLocation = fs.makeQualified( + new Path(hdfsRootTestPath, s"$targetDatabase/$tableName/20190201_020927_EDT") + ) + copyResourceFileToHdfs(paramsFileName, paramsFileHdfsPath) + + val sourceTable = + Table + .newBuilder(tableName, sourceDatabase, sourceTableLocation.toString, schema) + .buildDSVTable(DFSWrapper(fs.getConf), spark, external = true) + + val targetTable = + Table + .newBuilder(tableName, targetDatabase, oldTargetTableLocation.toString, schema) + .withPartitions(Seq("year", "month", "day")) + .buildDSVTable(DFSWrapper(fs.getConf), spark, external = true) + + sourceTable.write(Seq(sourceDataLocation), dsvReader, LoadMode.OverwritePartitions) + + sourceTable.read().count() shouldBe 19 + + SQLRunner(spark, paramsFileHdfsPath.toString).run() + + targetTable.read().count() shouldBe 19 + + val fullTargetTableName = s"$targetDatabase.$tableName" + spark.catalog.tableExists(s"${fullTargetTableName}_swap") shouldBe false + sourceTable.read().hasDiff(targetTable.read()) shouldBe false + + val actualTableLocation = + new Path(CatalogTableManager(fullTargetTableName, spark).getTableLocation) + actualTableLocation shouldBe newTargetTableLocation + } + } + + override def beforeEach(): Unit = { + super.beforeEach() + spark.sql(s"DROP DATABASE IF EXISTS $targetDatabase CASCADE") + spark.sql(s"DROP DATABASE IF EXISTS $sourceDatabase CASCADE") + spark.sql(s"CREATE DATABASE $sourceDatabase") + spark.sql(s"CREATE DATABASE $targetDatabase") + } + + override def beforeAll(): Unit = { + super.beforeAll() + spark.conf.set("hive.exec.dynamic.partition", "true") + spark.conf.set("hive.exec.dynamic.partition.mode", "nonstrict") + } +} diff --git a/src/test/scala/com/adidas/analytics/feature/TransposeTest.scala b/src/test/scala/com/adidas/analytics/feature/TransposeTest.scala new file mode 100644 index 0000000..3c193e1 --- /dev/null +++ b/src/test/scala/com/adidas/analytics/feature/TransposeTest.scala @@ -0,0 +1,83 @@ +package com.adidas.analytics.feature + +import com.adidas.utils.TestUtils._ +import com.adidas.analytics.algo.Transpose +import com.adidas.utils.{BaseAlgorithmTest, FileReader, Table} +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.types.{DataType, StructType} +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.matchers.should.Matchers._ + +class 
TransposeTest extends AnyFeatureSpec with BaseAlgorithmTest { + + private val lake_database = "test_lake" + private val paramsFileName = "params.json" + + private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) + private val sourceDataLocalDir = "input_data.psv" + private val expectedTargetTableName = "expected_pos_transpose" + private val expectedDataFileName = "expected_target_data.psv" + + private val sourceSchema: StructType = + DataType.fromJson(getResourceAsText("source_schema.json")).asInstanceOf[StructType] + private val targetTableName = "pos_transpose" + + private val targetDirPath: Path = + new Path(hdfsRootTestPath, s"$lake_database/$targetTableName/data") + + private val targetSchema = + DataType.fromJson(getResourceAsText(s"target_schema.json")).asInstanceOf[StructType] + private val sourceTableName = "pre_transpose" + + private var sourceTable: Table = _ + + Feature("Transpose Algorithm from table") { + Scenario("Simple transformation on a non-partitioned table") { + val sourceDf = sourceTable.read() + // source table has the expected number of records + sourceDf.count() shouldBe 6 + val targetTable = createParquetTable(lake_database, targetTableName, schema = targetSchema) + + val transposeTransformation = Transpose(spark, dfs, paramsFileHdfsPath.toString) + transposeTransformation.run() + + // target table has correct schema + targetTable.schema.equals(targetSchema) shouldBe true + + val targetDf = targetTable.read() + val expectedTargetTable = createAndLoadParquetTable( + database = lake_database, + tableName = expectedTargetTableName, + schema = targetSchema, + filePath = expectedDataFileName, + reader = FileReader.newDSVFileReader(Some(targetSchema)) + ) + + val expectedTargetDf = expectedTargetTable.read() + + // target table has exactly the same data as the expected data + targetDf.hasDiff(expectedTargetDf) shouldBe false + } + } + + /* Creates the FS folders, sends the parameters and data to FS, and creates the lake database */ + override def beforeEach(): Unit = { + super.beforeEach() + fs.mkdirs(targetDirPath) + spark.sql(s"DROP DATABASE IF EXISTS $lake_database CASCADE") + spark.sql(s"CREATE DATABASE $lake_database") + + // copy job parameters to HDFS + copyResourceFileToHdfs(paramsFileName, paramsFileHdfsPath) + // create tables + sourceTable = createAndLoadParquetTable( + lake_database, + sourceTableName, + None, + schema = sourceSchema, + sourceDataLocalDir, + FileReader.newDSVFileReader(Some(sourceSchema)) + ) + } + +} diff --git a/src/test/scala/com/adidas/analytics/feature/AppendLoadTest.scala b/src/test/scala/com/adidas/analytics/feature/loads/AppendLoadTest.scala similarity index 63% rename from src/test/scala/com/adidas/analytics/feature/AppendLoadTest.scala rename to src/test/scala/com/adidas/analytics/feature/loads/AppendLoadTest.scala index f91c0a0..8febe2f 100644 --- a/src/test/scala/com/adidas/analytics/feature/AppendLoadTest.scala +++ b/src/test/scala/com/adidas/analytics/feature/loads/AppendLoadTest.scala @@ -1,6 +1,6 @@ -package com.adidas.analytics.feature +package com.adidas.analytics.feature.loads -import com.adidas.analytics.algo.AppendLoad +import com.adidas.analytics.algo.loads.AppendLoad import com.adidas.analytics.util.DFSWrapper._ import com.adidas.analytics.util.{DFSWrapper, LoadMode} import com.adidas.utils.TestUtils._ @@ -8,33 +8,38 @@ import com.adidas.utils.{BaseAlgorithmTest, FileReader, Table} import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import 
org.apache.spark.sql.types.{DataType, StructType} -import org.scalatest.FeatureSpec -import org.scalatest.Matchers._ +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.matchers.should.Matchers._ - -class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { +class AppendLoadTest extends AnyFeatureSpec with BaseAlgorithmTest { private val sourceDatabase: String = "test_landing" private val targetDatabase: String = "test_lake" private val tableName: String = "test_table" private val paramsFileName: String = "params.json" + private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) private val sourceDirPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableName/data") + private val headerDirPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableName/header") + private val targetDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableName") - feature("Reader mode can be specified in configuration") { - scenario("when reader_mode is set to an invalid string, an exception is thrown") { + Feature("Reader mode can be specified in configuration") { + Scenario("when reader_mode is set to an invalid string, an exception is thrown") { val testResourceDir = "reader_mode_specification" val headerPath20180422 = new Path(headerDirPath, "year=2018/month=4/day=22/header.json") val targetPath20180422 = new Path(targetDirPath, "year=2018/month=4/day=22") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) prepareSourceData(testResourceDir, Seq("wrong_data_20180422-00001.psv")) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) uploadParameters(testResourceDir, "params_invalid_reader_mode.json") @@ -47,22 +52,24 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180422) shouldBe false // executing load - val caught = intercept[RuntimeException] { - AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() - } + val caught = + intercept[RuntimeException](AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run()) caught.getMessage shouldBe "Invalid reader mode: invalid_mode provided" } - scenario("when reader mode is not set, DROPMALFORMED is used") { + Scenario("when reader mode is not set, DROPMALFORMED is used") { val testResourceDir = "reader_mode_specification" val headerPath20180422 = new Path(headerDirPath, "year=2018/month=4/day=22/header.json") val targetPath20180422 = new Path(targetDirPath, "year=2018/month=4/day=22") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) prepareSourceData(testResourceDir, Seq("wrong_data_20180422-00001.psv")) setupInitialState(targetTable, 
s"$testResourceDir/lake_data_pre.psv", dataReader) uploadParameters(testResourceDir, "params_no_reader_mode.json") @@ -78,7 +85,8 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe true @@ -87,15 +95,20 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180422) shouldBe true } - scenario("when reader mode is set to FAILFAST and there are malformed records, an exception is thrown") { + Scenario( + "when reader mode is set to FAILFAST and there are malformed records, an exception is thrown" + ) { val testResourceDir = "reader_mode_specification" val headerPath20180422 = new Path(headerDirPath, "year=2018/month=4/day=22/header.json") val targetPath20180422 = new Path(targetDirPath, "year=2018/month=4/day=22") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) prepareSourceData(testResourceDir, Seq("wrong_data_20180422-00001.psv")) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) uploadParameters(testResourceDir, "params_failfast_mode.json") @@ -108,20 +121,23 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180422) shouldBe false // executing load - assertThrows[SparkException] { - AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() - } + assertThrows[SparkException](AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run()) } - scenario("when reader mode is set to PERMISSIVE and there are malformed records, data is incorrectly loaded") { + Scenario( + "when reader mode is set to PERMISSIVE and there are malformed records, data is incorrectly loaded" + ) { val testResourceDir = "reader_mode_specification" val headerPath20180422 = new Path(headerDirPath, "year=2018/month=4/day=22/header.json") val targetPath20180422 = new Path(targetDirPath, "year=2018/month=4/day=22") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) prepareSourceData(testResourceDir, Seq("wrong_data_20180422-00001.psv")) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) uploadParameters(testResourceDir, "params_permissive_mode.json") @@ -137,41 +153,55 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, 
paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe true } } - - feature("Data can be loaded from source to target with append mode") { - scenario("Data can be loaded with append mode by creating partitions from full path") { + Feature("Data can be loaded from source to target with append mode") { + Scenario("Data can be loaded with append mode by creating partitions from full path") { val tableNamePartFromFullPath: String = "test_table_full_path_part" val paramsFileModdedRegexHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) - val sourceDirFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNamePartFromFullPath/data/year=2018/month=01/day=01/") - val headerDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNamePartFromFullPath/header") - val targetDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNamePartFromFullPath") + val sourceDirFullPath: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNamePartFromFullPath/data/year=2018/month=01/day=01/" + ) + val headerDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNamePartFromFullPath/header") + val targetDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNamePartFromFullPath") fs.mkdirs(sourceDirFullPath) fs.mkdirs(headerDirPathPartFromFullPath) fs.mkdirs(targetDirPathPartFromFullPath) val testResourceDir = "partition_from_full_path" - val headerPath20180101 = new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=1/header.json") + val headerPath20180101 = + new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPathPartFromFullPath, "year=2018/month=1/day=1") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] - val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString - val targetTable = - Table.newBuilder(tableNamePartFromFullPath, targetDatabase, targetTableLocation, targetSchema) + val targetTableLocation = + fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString + val targetTable = Table + .newBuilder(tableNamePartFromFullPath, targetDatabase, targetTableLocation, targetSchema) .withPartitions(Seq("year", "month", "day")) .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) - prepareSourceData(testResourceDir, Seq("data-nodate-part-00000.psv", "data-nodate-part-00001.psv"), sourceDirFullPath) + prepareSourceData( + testResourceDir, + Seq("data-nodate-part-00000.psv", "data-nodate-part-00001.psv"), + sourceDirFullPath + ) uploadParameters(testResourceDir, paramsFileName, paramsFileModdedRegexHdfsPath) // checking pre-conditions @@ -185,7 +215,8 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest 
{ AppendLoad(spark, dfs, paramsFileModdedRegexHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe false @@ -194,17 +225,23 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180101) shouldBe true } - scenario("Loading data from multiple files") { + Scenario("Loading data from multiple files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) - prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) + prepareSourceData( + testResourceDir, + Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv") + ) uploadParameters(testResourceDir) // checking pre-conditions @@ -218,7 +255,8 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe false @@ -227,7 +265,9 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180101) shouldBe true } - scenario("Loading data from hierarchical directory structure and one of the partitions has the only bad record") { + Scenario( + "Loading data from hierarchical directory structure and one of the partitions has the only bad record" + ) { val testResourceDir = "hierarchical_load" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") @@ -235,10 +275,13 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetPath20180105 = new Path(targetDirPath, "year=2018/month=1/day=5") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, 
s"$testResourceDir/lake_data_pre.psv", dataReader) copyResourceFileToHdfs(s"$testResourceDir/20180101_schema.json", headerPath20180101) @@ -259,7 +302,8 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe false @@ -268,20 +312,26 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180101) shouldBe true } - scenario("Loading data when some header files are available and schemas are different") { + Scenario("Loading data when some header files are available and schemas are different") { val testResourceDir = "different_schemas" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val headerPath20180105 = new Path(headerDirPath, "year=2018/month=1/day=5/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetPath20180105 = new Path(targetDirPath, "year=2018/month=1/day=5") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) copyResourceFileToHdfs(s"$testResourceDir/20180101_schema.json", headerPath20180101) - prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180105-part-00000.psv")) + prepareSourceData( + testResourceDir, + Seq("data_20180101-part-00000.psv", "data_20180105-part-00000.psv") + ) uploadParameters(testResourceDir) // checking pre-conditions @@ -294,14 +344,18 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180101) shouldBe true fs.exists(headerPath20180105) shouldBe false - val expectedSchema20180101 = DataType.fromJson(getResourceAsText(s"$testResourceDir/20180101_schema.json")).asInstanceOf[StructType] + val expectedSchema20180101 = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/20180101_schema.json")) + .asInstanceOf[StructType] val expectedSchema20180105 = StructType(targetSchema.dropRight(3)) // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe false @@ -312,27 +366,35 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180101) shouldBe true fs.exists(headerPath20180105) shouldBe true - val actualSchema20180101 = DataType.fromJson(fs.readFile(headerPath20180101)).asInstanceOf[StructType] - val 
actualSchema20180105 = DataType.fromJson(fs.readFile(headerPath20180105)).asInstanceOf[StructType] + val actualSchema20180101 = + DataType.fromJson(fs.readFile(headerPath20180101)).asInstanceOf[StructType] + val actualSchema20180105 = + DataType.fromJson(fs.readFile(headerPath20180105)).asInstanceOf[StructType] actualSchema20180101 shouldBe expectedSchema20180101 actualSchema20180105 shouldBe expectedSchema20180105 } - scenario("Loading data when some header files are available and schemas are the same") { + Scenario("Loading data when some header files are available and schemas are the same") { val testResourceDir = "similar_schemas" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val headerPath20180105 = new Path(headerDirPath, "year=2018/month=1/day=5/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetPath20180105 = new Path(targetDirPath, "year=2018/month=1/day=5") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) copyResourceFileToHdfs(s"$testResourceDir/20180101_schema.json", headerPath20180101) - prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180105-part-00000.psv")) + prepareSourceData( + testResourceDir, + Seq("data_20180101-part-00000.psv", "data_20180105-part-00000.psv") + ) uploadParameters(testResourceDir) // checking pre-conditions @@ -345,14 +407,18 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180101) shouldBe true fs.exists(headerPath20180105) shouldBe false - val expectedSchema20180101 = DataType.fromJson(getResourceAsText(s"$testResourceDir/20180101_schema.json")).asInstanceOf[StructType] + val expectedSchema20180101 = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/20180101_schema.json")) + .asInstanceOf[StructType] val expectedSchema20180105 = StructType(targetSchema.dropRight(3)) // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe false @@ -363,27 +429,37 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180101) shouldBe true fs.exists(headerPath20180105) shouldBe true - val actualSchema20180101 = DataType.fromJson(fs.readFile(headerPath20180101)).asInstanceOf[StructType] - val actualSchema20180105 = DataType.fromJson(fs.readFile(headerPath20180105)).asInstanceOf[StructType] + val actualSchema20180101 = + DataType.fromJson(fs.readFile(headerPath20180101)).asInstanceOf[StructType] + val actualSchema20180105 = + DataType.fromJson(fs.readFile(headerPath20180105)).asInstanceOf[StructType] actualSchema20180101 shouldBe expectedSchema20180101 
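+      // The header files written by AppendLoad are plain Spark StructType JSON, so they can be
+      // parsed and compared structurally, for example (illustrative only):
+      //   DataType.fromJson(fs.readFile(headerPath20180105)).asInstanceOf[StructType] shouldBe
+      //     StructType(targetSchema.dropRight(3)) // the target schema minus its partition columns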
actualSchema20180105 shouldBe expectedSchema20180105 } - scenario("Loading data with time partition columns when some there are duplicates for some records") { + Scenario( + "Loading data with time partition columns when some there are duplicates for some records" + ) { val testResourceDir = "duplicate_values" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val headerPath20180105 = new Path(headerDirPath, "year=2018/month=1/day=5/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetPath20180105 = new Path(targetDirPath, "year=2018/month=1/day=5") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) copyResourceFileToHdfs(s"$testResourceDir/20180101_schema.json", headerPath20180101) - prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180105-part-00000.psv")) + prepareSourceData( + testResourceDir, + Seq("data_20180101-part-00000.psv", "data_20180105-part-00000.psv") + ) uploadParameters(testResourceDir) // checking pre-conditions @@ -400,7 +476,8 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe false @@ -412,15 +489,18 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180105) shouldBe true } - scenario("Loading data without partition columns from parquet files") { + Scenario("Loading data without partition columns from parquet files") { val testResourceDir = "parquet_test" val headerPath20180422 = new Path(headerDirPath, "year=2018/month=4/day=22/header.json") val targetPath20180422 = new Path(targetDirPath, "year=2018/month=4/day=22") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) prepareSourceData(testResourceDir, Seq("data_20180422-00001.parquet")) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) uploadParameters(testResourceDir) @@ -436,7 +516,8 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val 
expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe false @@ -444,15 +525,18 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(targetPath20180422) shouldBe true fs.exists(headerPath20180422) shouldBe true } - scenario("Loading input data that has missing columns and expecting them to be dropped") { + Scenario("Loading input data that has missing columns and expecting them to be dropped") { val testResourceDir = "missing_columns" val headerPath20180422 = new Path(headerDirPath, "year=2018/month=4/day=22/header.json") val targetPath20180422 = new Path(targetDirPath, "year=2018/month=4/day=22") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) prepareSourceData(testResourceDir, Seq("data_20180422-00001.psv")) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) uploadParameters(testResourceDir) @@ -468,7 +552,8 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe false @@ -477,15 +562,18 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180422) shouldBe true } - scenario("Loading data without partition columns from psv files") { + Scenario("Loading data without partition columns from psv files") { val testResourceDir = "main_test" val headerPath20180422 = new Path(headerDirPath, "year=2018/month=4/day=22/header.json") val targetPath20180422 = new Path(targetDirPath, "year=2018/month=4/day=22") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) prepareSourceData(testResourceDir, Seq("data_20180422-00001.psv")) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) uploadParameters(testResourceDir) @@ -501,7 +589,8 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val 
expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe false @@ -509,6 +598,44 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(targetPath20180422) shouldBe true fs.exists(headerPath20180422) shouldBe true } + + Scenario("Using Append Load Algorithm to integrate date columns as date format") { + val testResourceDir = "partitioned_and_date_columns" + val headerPath202020 = new Path(headerDirPath, "year=2020/week=20/header.json") + val targetPath202020 = new Path(targetDirPath, "year=2020/week=20") + + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema), dateFormat = "MM/dd/yyyy") + + val targetTable = createTargetTable(Seq("year", "week"), targetSchema) + setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) + prepareSourceData(testResourceDir, Seq("new_data.psv")) + uploadParameters(testResourceDir) + + // checking pre-conditions + spark.read.csv(sourceDirPath.toString).count() shouldBe 4 + targetTable.read().count() shouldBe 10 + + fs.exists(headerPath202020) shouldBe false + fs.exists(targetPath202020) shouldBe false + + // executing load + AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + // validating result + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + val actualDf = targetTable.read() + + actualDf.hasDiff(expectedDf) shouldBe false + + fs.exists(headerPath202020) shouldBe true + fs.exists(targetPath202020) shouldBe true + } } override def beforeEach(): Unit = { @@ -522,23 +649,34 @@ class AppendLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.mkdirs(targetDirPath) } - private def uploadParameters(testResourceDir: String, whichParamsFile: String = paramsFileName, whichParamsPath: Path = paramsFileHdfsPath): Unit = { - copyResourceFileToHdfs(s"$testResourceDir/$whichParamsFile", whichParamsPath) - } + private def uploadParameters( + testResourceDir: String, + whichParamsFile: String = paramsFileName, + whichParamsPath: Path = paramsFileHdfsPath + ): Unit = copyResourceFileToHdfs(s"$testResourceDir/$whichParamsFile", whichParamsPath) - private def createTargetTable(testResourceDir: String, targetPartitions: Seq[String], targetSchema: StructType): Table = { + private def createTargetTable(targetPartitions: Seq[String], targetSchema: StructType) = { val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - Table.newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) + Table + .newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) .withPartitions(targetPartitions) .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) } - private def prepareSourceData(testResourceDir: String, sourceFiles: Seq[String], sourceDirPath: Path = sourceDirPath): Unit = { + private def prepareSourceData( + testResourceDir: String, + sourceFiles: Seq[String], + sourceDirPath: Path = sourceDirPath + ): Unit = sourceFiles.foreach(file => copyResourceFileToHdfs(s"$testResourceDir/$file", sourceDirPath)) - } - private def setupInitialState(targetTable: Table, localDataFile: String, dataReader: FileReader): Unit = { + private def setupInitialState( + targetTable: Table, + localDataFile: String, + dataReader: 
FileReader + ): Unit = { val initialDataLocation = resolveResource(localDataFile, withProtocol = true) - targetTable.write(Seq(initialDataLocation), dataReader, LoadMode.OverwritePartitionsWithAddedColumns) + targetTable + .write(Seq(initialDataLocation), dataReader, LoadMode.OverwritePartitionsWithAddedColumns) } } diff --git a/src/test/scala/com/adidas/analytics/feature/loads/DeltaLakeLoadTest.scala b/src/test/scala/com/adidas/analytics/feature/loads/DeltaLakeLoadTest.scala new file mode 100644 index 0000000..5c79cff --- /dev/null +++ b/src/test/scala/com/adidas/analytics/feature/loads/DeltaLakeLoadTest.scala @@ -0,0 +1,285 @@ +package com.adidas.analytics.feature.loads + +import com.adidas.analytics.algo.loads.DeltaLakeLoad +import com.adidas.analytics.util.{DFSWrapper, LoadMode} +import com.adidas.utils.TestUtils._ +import com.adidas.utils.{BaseAlgorithmTest, FileReader, Table} +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.types.{DataType, StructType} +import org.scalatest.GivenWhenThen +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.matchers.should.Matchers._ + +class DeltaLakeLoadTest extends AnyFeatureSpec with BaseAlgorithmTest with GivenWhenThen { + + spark.conf.set("spark.executor.instances", "1") + spark.conf.set("spark.executor.cores", "2") + + private val sourceSystem: String = "sales" + private val testTableName: String = "orders" + private val layerLanding: String = "test_landing" + private val layerLake: String = "test_lake" + + private val paramsFile = "params.json" + private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFile) + + private val testInitDataFile = "init_data.psv" + + private val testLandingDeltaTableDirPath = + new Path(hdfsRootTestPath, s"$layerLanding/$sourceSystem/$testTableName/delta_table") + + private val testNewDataFile = "new_data.psv" + + private val testLandingNewDataDirPath = + new Path(hdfsRootTestPath, s"$layerLanding/$sourceSystem/$testTableName/data") + + private val testLakePath = new Path(hdfsRootTestPath, s"$layerLake") + private val testLakePartitions = Seq("year", "month", "day") + private val lakeSchemaName = "lake_schema.json" + + private val testControlDataFile = "control_data.psv" + private val testControlTableName = "control" + + Feature( + "Handle merges after init loads with duplicate rows with different record modes and dates" + ) { + + Scenario("New data is available with new columns") { + val resourceDir = "added_columns_and_duplicates_in_init" + + var targetTable = createTable( + tableName = s"${sourceSystem}_$testTableName", + targetPartitions = testLakePartitions, + testResourceDir = resourceDir, + schemaName = "lake_schema_initial.json" + ) + + Given("there was an init load of the delta table") + And( + "init load contains more than 1 row per business key with different record modes and null dates" + ) + uploadNewDataToLanding(resourceDir, testInitDataFile) + DeltaLakeLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + And("target table was then recreated with new schema that includes discount column") + spark.sql(s"DROP TABLE IF EXISTS $layerLake.${sourceSystem}_$testTableName") + targetTable = createTable( + tableName = s"${sourceSystem}_$testTableName", + targetPartitions = testLakePartitions, + testResourceDir = resourceDir, + schemaName = "lake_schema_final.json" + ) + spark.catalog.recoverPartitions(s"$layerLake.${sourceSystem}_$testTableName") + + When("new delta data includes new discount column") + uploadNewDataToLanding(resourceDir, testNewDataFile) + 
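+      /* A clarifying note on the staging pattern used by every When step in this suite:
+         uploadNewDataToLanding (defined at the bottom of this file) first clears the landing
+         data directory and only then copies the requested resource in, roughly:
+
+           fs.listStatus(testLandingNewDataDirPath).foreach(f => fs.delete(f.getPath, false))
+           copyResourceFileToHdfs(s"$testResourceDir/$fileName", testLandingNewDataDirPath)
+
+         so each DeltaLakeLoad run below only ever sees the single latest delta file. */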
DeltaLakeLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + Then("data in target table is correct") + val controlTable = createTable( + s"${sourceSystem}_${testTableName}_$testControlTableName", + testLakePartitions, + testResourceDir = resourceDir, + Some(testControlDataFile), + "lake_schema_final.json" + ) + + targetTable.read().hasDiff(controlTable.read()) shouldBe false + spark.read.load(targetTable.location).schema shouldEqual targetTable.schema + } + + } + + Feature("Regular Delta Lake Loads with (updates, inserts and deletes)") { + + Scenario("New data is available and target table is not partitioned") { + val resourceDir = "nonpartitioned" + + copyResourceFileToHdfs(s"$resourceDir/$paramsFile", paramsFileHdfsPath) + + val targetTable = createTable( + tableName = s"${sourceSystem}_$testTableName", + targetPartitions = Seq(), + testResourceDir = resourceDir, + schemaName = lakeSchemaName + ) + + Given("there was an init load of the delta table") + uploadNewDataToLanding(resourceDir, testInitDataFile) + DeltaLakeLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + When("merge is executed in non partitioned delta table") + uploadNewDataToLanding(resourceDir, testNewDataFile) + DeltaLakeLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + Then("data in non partitioned target table is correct") + val controlTable = createTable( + s"${sourceSystem}_${testTableName}_$testControlTableName", + Seq(), + testResourceDir = resourceDir, + Some(testControlDataFile), + lakeSchemaName + ) + + targetTable.read().hasDiff(controlTable.read()) shouldBe false + spark.read.load(targetTable.location).schema shouldEqual targetTable.schema + } + + Scenario("New data is available with removed columns") { + val resourceDir = "removed_columns" + + val targetTable = createTable( + tableName = s"${sourceSystem}_$testTableName", + targetPartitions = testLakePartitions, + testResourceDir = resourceDir, + schemaName = lakeSchemaName + ) + + Given("there was an init load of the delta table") + uploadNewDataToLanding(resourceDir, testInitDataFile) + DeltaLakeLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + When("new delta data does not include one column") + uploadNewDataToLanding(resourceDir, testNewDataFile) + DeltaLakeLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + Then("data in target table is correct") + val controlTable = createTable( + s"${sourceSystem}_${testTableName}_$testControlTableName", + testLakePartitions, + testResourceDir = resourceDir, + Some(testControlDataFile), + lakeSchemaName + ) + + targetTable.read().hasDiff(controlTable.read()) shouldBe false + spark.read.load(targetTable.location).schema shouldEqual targetTable.schema + } + + } + + Feature("Unstable target partitions whose values change over time for several rows") { + + Scenario(s"Unstable target partitions, and parameters are not properly configured") { + Given("ignore_affected_partitions_merge is false") + val resourceDir = "unstable_partitions_wrong_params" + + copyResourceFileToHdfs(s"$resourceDir/$paramsFile", paramsFileHdfsPath) + + val targetTable = createTable( + tableName = s"${sourceSystem}_$testTableName", + targetPartitions = testLakePartitions, + testResourceDir = resourceDir, + schemaName = lakeSchemaName + ) + + Given("there was an init load of the delta table") + uploadNewDataToLanding(resourceDir, testInitDataFile) + DeltaLakeLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + When("values for the target partition column have changed for several rows") + uploadNewDataToLanding(resourceDir, 
testNewDataFile) + DeltaLakeLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + val controlTable = createTable( + s"${sourceSystem}_${testTableName}_$testControlTableName", + testLakePartitions, + testResourceDir = resourceDir, + Some(testControlDataFile), + lakeSchemaName + ) + + val targetDf = targetTable.read() + val controlDf = controlTable.read() + + Then("there is two extra rows which should not be in the target table") + targetDf.count() shouldBe controlDf.count() + 2 + + And("target table does not match control table in terms of content") + targetDf.hasDiff(controlDf) shouldBe true + spark.read.load(targetTable.location).schema shouldEqual targetTable.schema + } + + Scenario(s"Unstable target partitions, but parameters are properly configured") { + val resourceDir = "unstable_partitions_right_params" + + Given("ignore_affected_partitions_merge is true") + copyResourceFileToHdfs(s"$resourceDir/$paramsFile", paramsFileHdfsPath) + + val targetTable = createTable( + tableName = s"${sourceSystem}_$testTableName", + targetPartitions = testLakePartitions, + testResourceDir = resourceDir, + schemaName = lakeSchemaName + ) + + Given("there was an init load of the delta table") + uploadNewDataToLanding(resourceDir, testInitDataFile) + DeltaLakeLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + When("values for the target partition column have changed for several rows") + uploadNewDataToLanding(resourceDir, testNewDataFile) + DeltaLakeLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + Then("target table is as expected") + val controlTable = createTable( + s"${sourceSystem}_${testTableName}_$testControlTableName", + testLakePartitions, + testResourceDir = resourceDir, + Some(testControlDataFile), + lakeSchemaName + ) + + targetTable.read().hasDiff(controlTable.read()) shouldBe false + spark.read.load(targetTable.location).schema shouldEqual targetTable.schema + } + + } + + override def beforeEach(): Unit = { + super.beforeEach() + spark.sql(s"DROP DATABASE IF EXISTS $layerLake CASCADE") + spark.sql(s"CREATE DATABASE $layerLake") + + fs.mkdirs(testLandingNewDataDirPath) + fs.mkdirs(testLandingDeltaTableDirPath) + fs.mkdirs(testLakePath) + + copyResourceFileToHdfs(paramsFile, paramsFileHdfsPath) + } + + private def createTable( + tableName: String, + targetPartitions: Seq[String], + testResourceDir: String, + dsvFileName: Option[String] = None, + schemaName: String + ): Table = { + val tableLocation = fs.makeQualified(new Path(testLakePath, tableName)) + val schema = + DataType.fromJson(getResourceAsText(s"$testResourceDir/$schemaName")).asInstanceOf[StructType] + + val table = Table + .newBuilder(tableName, layerLake, tableLocation.toString, schema) + .withPartitions(targetPartitions) + .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) + + if (dsvFileName.nonEmpty) { + val dataLocation = + resolveResource(s"$testResourceDir/${dsvFileName.get}", withProtocol = true) + table.write( + Seq(dataLocation), + FileReader.newDSVFileReader(header = true), + LoadMode.OverwritePartitionsWithAddedColumns, + fillNulls = true + ) + } + + table + } + + private def uploadNewDataToLanding(testResourceDir: String, fileName: String): Unit = { + fs.listStatus(testLandingNewDataDirPath).foreach(f => fs.delete(f.getPath, false)) + copyResourceFileToHdfs(s"$testResourceDir/$fileName", testLandingNewDataDirPath) + } +} diff --git a/src/test/scala/com/adidas/analytics/feature/loads/DeltaLoadTest.scala b/src/test/scala/com/adidas/analytics/feature/loads/DeltaLoadTest.scala new file mode 
100644 index 0000000..4725089 --- /dev/null +++ b/src/test/scala/com/adidas/analytics/feature/loads/DeltaLoadTest.scala @@ -0,0 +1,311 @@ +package com.adidas.analytics.feature.loads + +import com.adidas.analytics.algo.loads.DeltaLoad +import com.adidas.analytics.util.{DFSWrapper, LoadMode} +import com.adidas.utils.TestUtils._ +import com.adidas.utils.{BaseAlgorithmTest, FileReader, Table} +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.types.{DataType, StructType} +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.matchers.should.Matchers._ + +class DeltaLoadTest extends AnyFeatureSpec with BaseAlgorithmTest { + + private val testDatabase: String = "test_lake" + private val dataTableName: String = "delta_load_active_data" + private val deltaTableName: String = "delta_load_delta_data" + private val controlTableName: String = "control_table" + + private val dsvReader: FileReader = FileReader.newDSVFileReader(header = true) + + private var dataTable: Table = _ + private var deltaTable: Table = _ + private var controlTable: Table = _ + + private var paramsFileHdfsPath: Path = _ + + Feature("Correct execution of delta load with delta records from csv file") { + Scenario("Correct execution of delta load partitioned by date/time columns") { + setupEnvironment(Seq("year", "month", "day"), "params.json", "csv_test") + DeltaLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + val actualDf = dataTable.read() + val expectedDf = controlTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + } + + Scenario("Correct execution of delta load partitioned by customer") { + setupEnvironment(Seq("customer"), "params_part.json", "csv_test") + + DeltaLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + val actualDf = dataTable.read() + val expectedDf = controlTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + } + } + + Feature("Correct execution of delta load with delta records from parquet file") { + Scenario("Delta Init") { + val testResourceDir = "parquet_test_delta_init" + + dataTable = createTableWithSchema( + dataTableName, + Seq("year", "month", "day"), + testResourceDir, + None, + "active_data_schema.json" + ) + controlTable = createTableWithSchema( + controlTableName, + Seq("year", "month", "day"), + testResourceDir, + Some("active_data_post.psv"), + "active_data_schema.json" + ) + + createParquetFileFromDSVfileandWriteToHDSF( + testResourceDir, + "delta_data.psv", + "delta_data_schema.json" + ) + + placeParametersFile(testResourceDir, "params.json") + + DeltaLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + val actualDf = dataTable.read() + val expectedDf = controlTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + } + + Scenario("Delta Merge Partitioned") { + val testResourceDir = "parquet_test_delta_merge_partitioned" + + dataTable = createTableWithSchema( + dataTableName, + Seq("year", "month", "day"), + testResourceDir, + Some("active_data_pre.psv"), + "active_data_schema.json" + ) + controlTable = createTableWithSchema( + controlTableName, + Seq("year", "month", "day"), + testResourceDir, + Some("active_data_post.psv"), + "active_data_schema.json" + ) + + createParquetFileFromDSVfileandWriteToHDSF( + testResourceDir, + "delta_data.psv", + "delta_data_schema.json" + ) + + placeParametersFile(testResourceDir, "params.json") + + DeltaLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + val actualDf = dataTable.read() + val expectedDf = controlTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + } + + Scenario("Delta 
Merge Unpartitioned") { + val testResourceDir = "parquet_test_delta_merge_unpartitioned" + + dataTable = createTableWithSchema( + dataTableName, + Seq("year", "month", "day"), + testResourceDir, + Some("active_data_pre.psv"), + "active_data_schema.json" + ) + controlTable = createTableWithSchema( + controlTableName, + Seq("year", "month", "day"), + testResourceDir, + Some("active_data_post.psv"), + "active_data_schema.json" + ) + + createParquetFileFromDSVfileandWriteToHDSF( + testResourceDir, + "delta_data.psv", + "delta_data_schema.json" + ) + + placeParametersFile(testResourceDir, "params.json") + + DeltaLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + val actualDf = dataTable.read() + val expectedDf = controlTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + } + + Scenario("Delta Merge with additional columns in the delta file") { + val testResourceDir = "parquet_test_delta_merge_additional_columns" + + dataTable = createTableWithSchema( + dataTableName, + Seq("year", "month", "day"), + testResourceDir, + Some("active_data_pre.psv"), + "active_data_schema.json" + ) + controlTable = createTableWithSchema( + controlTableName, + Seq("year", "month", "day"), + testResourceDir, + Some("active_data_post.psv"), + "active_data_schema.json" + ) + + createParquetFileFromDSVfileandWriteToHDSF( + testResourceDir, + "delta_data.psv", + "delta_data_schema.json" + ) + + placeParametersFile(testResourceDir, "params.json") + + DeltaLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + val actualDf = dataTable.read() + val expectedDf = controlTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + } + + Scenario("Delta Merge with missing columns in the delta file") { + val testResourceDir = "parquet_test_delta_merge_missing_columns" + + dataTable = createTableWithSchema( + dataTableName, + Seq("year", "month", "day"), + testResourceDir, + Some("active_data_pre.psv"), + "active_data_schema.json" + ) + controlTable = createTableWithSchema( + controlTableName, + Seq("year", "month", "day"), + testResourceDir, + Some("active_data_post.psv"), + "active_data_schema.json" + ) + + createParquetFileFromDSVfileandWriteToHDSF( + testResourceDir, + "delta_data.psv", + "delta_data_schema.json" + ) + + placeParametersFile(testResourceDir, "params.json") + + DeltaLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + val actualDf = dataTable.read() + val expectedDf = controlTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + } + } + + private def createParquetFileFromDSVfileandWriteToHDSF( + testResourceDir: String, + dsvResource: String, + schemaResource: String + ): Unit = { + val dsvFilePath = resolveResource(s"$testResourceDir/$dsvResource", withProtocol = true) + val deltaDataSchema = DataType + .fromJson(getResourceAsText(s"$testResourceDir/$schemaResource")) + .asInstanceOf[StructType] + spark.read + .option("header", "true") + .option("delimiter", "|") + .schema(deltaDataSchema) + .csv(dsvFilePath) + .write + .parquet(s"hdfs:/tmp/tests/delta_data") + } + + private def setupEnvironment( + targetPartitions: Seq[String], + paramsFileName: String, + testResourceDir: String + ): Unit = { + def createTable(tableName: String, dataFile: String, parquet: Boolean): Table = { + val dataLocation = resolveResource(dataFile, withProtocol = true) + val schema: StructType = dsvReader.read(spark, dataLocation).schema + val tableLocation = fs.makeQualified(new Path(hdfsRootTestPath, s"$testDatabase/$tableName")) + + val tableBuilder = Table + .newBuilder(tableName, testDatabase, 
tableLocation.toString, schema) + .withPartitions(targetPartitions) + + val table = + if (parquet) tableBuilder.buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) + else tableBuilder.buildDSVTable(DFSWrapper(fs.getConf), spark, external = true) + + table.write( + Seq(dataLocation), + dsvReader, + LoadMode.OverwritePartitionsWithAddedColumns, + fillNulls = true + ) + table + } + + dataTable = createTable(dataTableName, s"$testResourceDir/active_data_pre.psv", parquet = true) + if (testResourceDir == "csv_test") + deltaTable = createTable(deltaTableName, s"$testResourceDir/delta_data.psv", parquet = false) + controlTable = + createTable(controlTableName, s"$testResourceDir/active_data_post.psv", parquet = true) + + paramsFileHdfsPath = new Path(hdfsRootTestPath, paramsFileName) + copyResourceFileToHdfs(s"$testResourceDir/$paramsFileName", paramsFileHdfsPath) + } + + private def createTableWithSchema( + tableName: String, + targetPartitions: Seq[String], + testResourceDir: String, + dsvFileName: Option[String], + schemaFileName: String + ): Table = { + val tableLocation = fs.makeQualified(new Path(hdfsRootTestPath, s"$testDatabase/$tableName")) + val schema = DataType + .fromJson(getResourceAsText(s"$testResourceDir/$schemaFileName")) + .asInstanceOf[StructType] + + val table = Table + .newBuilder(tableName, testDatabase, tableLocation.toString, schema) + .withPartitions(targetPartitions) + .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) + + dsvFileName match { + case Some(fileName) => + val dataLocation = resolveResource(s"$testResourceDir/$fileName", withProtocol = true) + table.write( + Seq(dataLocation), + dsvReader, + LoadMode.OverwritePartitionsWithAddedColumns, + fillNulls = true + ) + table + case None => table + } + } + + def placeParametersFile(testResourceDir: String, paramsFileName: String): Unit = { + paramsFileHdfsPath = new Path(hdfsRootTestPath, paramsFileName) + copyResourceFileToHdfs(s"$testResourceDir/$paramsFileName", paramsFileHdfsPath) + } + + override def beforeEach(): Unit = { + super.beforeEach() + spark.sql(s"DROP DATABASE IF EXISTS $testDatabase CASCADE") + spark.sql(s"CREATE DATABASE $testDatabase") + } +} diff --git a/src/test/scala/com/adidas/analytics/feature/loads/FullLoadTest.scala b/src/test/scala/com/adidas/analytics/feature/loads/FullLoadTest.scala new file mode 100644 index 0000000..9b2b995 --- /dev/null +++ b/src/test/scala/com/adidas/analytics/feature/loads/FullLoadTest.scala @@ -0,0 +1,599 @@ +package com.adidas.analytics.feature.loads + +import com.adidas.analytics.algo.loads.FullLoad +import com.adidas.analytics.util.{DFSWrapper, HadoopLoadHelper, CatalogTableManager, LoadMode} +import com.adidas.utils.TestUtils._ +import com.adidas.utils.{BaseAlgorithmTest, FileReader, Table} +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.types.{DataType, StructType} +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.matchers.should.Matchers._ + +class FullLoadTest extends AnyFeatureSpec with BaseAlgorithmTest { + + private val sourceEnvironmentLocation: String = "test_landing" + private val targetDatabase: String = "test_lake" + private val tableName: String = "test_table" + + private val paramsFileName: String = "params.json" + + private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) + + private val sourceDirPath: Path = + new Path(hdfsRootTestPath, s"$sourceEnvironmentLocation/test/$tableName/data") + + private val baseTargetDirPath: Path = + new Path(hdfsRootTestPath, 
s"$targetDatabase/test/$tableName/data") + + Feature("Reader mode can be specified in configuration") { + Scenario("when reader_mode is invalid string an exception is thrown") { + val resourceDir = "failfast_option" + copyResourceFileToHdfs(s"$resourceDir/params_invalid_reader_mode.json", paramsFileHdfsPath) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = createNonPartitionedTargetTable(targetSchema) + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + prepareSourceData(Seq(s"$resourceDir/new_data_wrong.psv")) + + // checking pre-conditions + spark.read.csv(sourceDirPath.toString).count() shouldBe 25 + targetTable.read().count() shouldBe 19 + + val caught = + intercept[RuntimeException](FullLoad(spark, dfs, paramsFileHdfsPath.toString).run()) + caught.getMessage shouldBe "Invalid reader mode: invalid_mode provided" + } + + Scenario( + "when reader mode is FailFast and malformed records are present, an exception is thrown" + ) { + val resourceDir = "failfast_option" + copyResourceFileToHdfs(s"$resourceDir/params.json", paramsFileHdfsPath) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = createNonPartitionedTargetTable(targetSchema) + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + prepareSourceData(Seq(s"$resourceDir/new_data_wrong.psv")) + + // checking pre-conditions + spark.read.csv(sourceDirPath.toString).count() shouldBe 25 + targetTable.read().count() shouldBe 19 + + val caught = + intercept[RuntimeException](FullLoad(spark, dfs, paramsFileHdfsPath.toString).run()) + caught.getMessage shouldBe "Unable to write DataFrames." 
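+      /* The reader mode names used across this feature (FailFast, DROPMALFORMED, PERMISSIVE)
+         mirror Spark's CSV parser modes, so the failure asserted above can be reproduced,
+         roughly, with a plain DataFrameReader over the same pipe-delimited source; a minimal
+         sketch, assuming the targetSchema and sourceDirPath from this scenario:
+
+           spark.read
+             .option("delimiter", "|")
+             .option("mode", "FAILFAST") // DROPMALFORMED and PERMISSIVE are covered below
+             .schema(targetSchema)
+             .csv(sourceDirPath.toString)
+             .count() // FAILFAST raises on the first malformed row at action time
+      */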
+ } + + Scenario( + "when reader mode is FailFast and no malformed records are present, load is completed correctly" + ) { + val resourceDir = "failfast_option" + copyResourceFileToHdfs(s"$resourceDir/params.json", paramsFileHdfsPath) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = createNonPartitionedTargetTable(targetSchema) + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + prepareDefaultSourceData() + + // checking pre-conditions + spark.read.csv(sourceDirPath.toString).count() shouldBe 25 + targetTable.read().count() shouldBe 19 + + FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + val actualDf = targetTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + } + + Scenario( + "when reader mode is DROPMALFORMED and malformed records are present, some records are not loaded" + ) { + val resourceDir = "failfast_option" + copyResourceFileToHdfs(s"$resourceDir/params_dropmalformed_mode.json", paramsFileHdfsPath) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = createNonPartitionedTargetTable(targetSchema) + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + prepareSourceData(Seq(s"$resourceDir/new_data_wrong.psv")) + + // checking pre-conditions + spark.read.csv(sourceDirPath.toString).count() shouldBe 25 + targetTable.read().count() shouldBe 19 + + FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + val actualDf = targetTable.read() + assert(actualDf.count() < expectedDf.count()) + } + + Scenario( + "when reader mode is PERMISSIVE and malformed records are present, malformed records are also loaded" + ) { + val resourceDir = "failfast_option" + copyResourceFileToHdfs(s"$resourceDir/params_permissive_mode.json", paramsFileHdfsPath) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = createNonPartitionedTargetTable(targetSchema) + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + prepareSourceData(Seq(s"$resourceDir/new_data_wrong.psv")) + + // checking pre-conditions + spark.read.csv(sourceDirPath.toString).count() shouldBe 25 + targetTable.read().count() shouldBe 19 + + FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + val actualDf = targetTable.read() + actualDf.hasDiff(expectedDf) shouldBe true + actualDf.count() shouldBe expectedDf.count() + } + } + + Feature("Data can be loaded from source to target with full mode") { + Scenario("Previous lake table location folder does not exist.") { + val resourceDir = 
"non_partitioned" + copyResourceFileToHdfs(s"$resourceDir/$paramsFileName", paramsFileHdfsPath) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = createNonPartitionedTargetTable(targetSchema) + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + prepareDefaultSourceData() + + // checking pre-conditions + spark.read.csv(sourceDirPath.toString).count() shouldBe 25 + targetTable.read().count() shouldBe 19 + + // Deleting table location folder before full load + fs.delete(new Path(targetTable.location), true) + + FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + val actualDf = targetTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + + // validating schema + val expectedTS = targetSchema + val actualTS = spark.table(targetDatabase + '.' + tableName).schema + actualTS.equals(expectedTS) shouldBe true + } + + Scenario("Loading data to non-partitioned table") { + val resourceDir = "non_partitioned" + copyResourceFileToHdfs(s"$resourceDir/$paramsFileName", paramsFileHdfsPath) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = createNonPartitionedTargetTable(targetSchema) + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + prepareDefaultSourceData() + + // checking pre-conditions + spark.read.csv(sourceDirPath.toString).count() shouldBe 25 + targetTable.read().count() shouldBe 19 + + FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + val actualDf = targetTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + } + + Scenario("Loading data to partitioned table") { + val resourceDir = "partitioned" + copyResourceFileToHdfs(s"$resourceDir/$paramsFileName", paramsFileHdfsPath) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = + createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName) + var targetPath20180110 = new Path(targetTable.location, "year=2018/month=1/day=10") + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + prepareDefaultSourceData() + + // checking pre-conditions + spark.read.csv(sourceDirPath.toString).count() shouldBe 25 + targetTable.read().count() shouldBe 19 + fs.exists(targetPath20180110) shouldBe false + + // executing load + FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + val actualDf = targetTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + + targetPath20180110 = new Path( + CatalogTableManager(targetTable.table, spark).getTableLocation, + "year=2018/month=1/day=10" 
+ ) + fs.exists(targetPath20180110) shouldBe true + } + + Scenario( + "Partitioned table is loaded and old leftovers are cleansed properly after successful load" + ) { + val resourceDir = "partitioned" + copyResourceFileToHdfs(s"$resourceDir/$paramsFileName", paramsFileHdfsPath) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = + createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName) + var targetPath20180110 = new Path(targetTable.location, "year=2018/month=1/day=10") + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + prepareDefaultSourceData() + + //manually create old leftovers + val tableRootPath = new Path(hdfsRootTestPath, s"$targetDatabase/test/$tableName") + val oldTableLocation1 = new Path(tableRootPath, "data_20000101124514567/year=2000") + val oldTableLocation2 = new Path(tableRootPath, "data_20000221124511234/year=2000") + fs.mkdirs(oldTableLocation1) + fs.mkdirs(oldTableLocation2) + fs.createNewFile(new Path(tableRootPath, "data_20000101124514567_$folder$")) + fs.createNewFile(new Path(tableRootPath, "data_20000221124511234_$folder$")) + fs.createNewFile(new Path(oldTableLocation1, "sample_file1.parquet")) + fs.createNewFile(new Path(oldTableLocation1, "sample_file2.parquet")) + fs.createNewFile(new Path(oldTableLocation2, "sample_file1.parquet")) + fs.createNewFile(new Path(oldTableLocation2, "sample_file2.parquet")) + + /* num of files in table's root dir should be the leftovers plus the initial state folder */ + var numFilesInTableRootDir = fs.listStatus(tableRootPath).count(_ => true) + numFilesInTableRootDir shouldBe 5 + + // checking pre-conditions + spark.read.csv(sourceDirPath.toString).count() shouldBe 25 + targetTable.read().count() shouldBe 19 + fs.exists(targetPath20180110) shouldBe false + + // executing load + FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + val actualDf = targetTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + + val finalTableLocation = CatalogTableManager(targetTable.table, spark).getTableLocation + targetPath20180110 = new Path(finalTableLocation, "year=2018/month=1/day=10") + fs.exists(targetPath20180110) shouldBe true + + // table root folder only contains the final table location + val fileStatus = fs.listStatus(tableRootPath) + numFilesInTableRootDir = fileStatus.count(_ => true) + numFilesInTableRootDir shouldBe 1 + + /* most recent subfolder is the table location and its parent folder is as expected */ + val mostRecentSubFolder = fileStatus.toList.head.getPath + mostRecentSubFolder.getParent.getName shouldBe tableRootPath.getName + (mostRecentSubFolder.toString == finalTableLocation) shouldBe true + } + + Scenario("Loading data to partitioned table in weekly mode") { + val resourceDir = "partitioned_weekly" + copyResourceFileToHdfs(s"$resourceDir/$paramsFileName", paramsFileHdfsPath) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = createPartitionedTargetTable(Seq("year", "week"), targetSchema, tableName) + var targetPath201801 = new 
Path(targetTable.location, "year=2018/week=1") + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + prepareDefaultSourceData("landing/new_data_weekly.psv") + + // checking pre-conditions + spark.read.csv(sourceDirPath.toString).count() shouldBe 25 + targetTable.read().count() shouldBe 19 + + fs.exists(targetPath201801) shouldBe false + + // executing load + FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + val actualDf = targetTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + + targetPath201801 = + new Path(CatalogTableManager(targetTable.table, spark).getTableLocation, "year=2018/week=1") + fs.exists(targetPath201801) shouldBe true + } + + Scenario( + "Try loading data from location that does not exist and expect the data to be as it was before load" + ) { + val resourceDir = "partitioned" + copyResourceFileToHdfs(s"partitioned_not_exist_dir/$paramsFileName", paramsFileHdfsPath) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = + createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName) + val targetPath20180110 = new Path(targetTable.location, "year=2018/month=1/day=10") + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + + targetTable.read().count() shouldBe 19 + + // executing load + val caught = + intercept[RuntimeException](FullLoad(spark, dfs, paramsFileHdfsPath.toString).run()) + + assert(caught.getMessage.equals("Unable to read input data.")) + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/lake_data_pre.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + val actualDf = targetTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + + fs.exists(targetPath20180110) shouldBe false + } + + Scenario("Try loading data while partitioning column is missing") { + val resourceDir = "partitioned" + copyResourceFileToHdfs( + s"partitioned_partition_column_wrong/$paramsFileName", + paramsFileHdfsPath + ) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = + createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName) + val targetPath20180110 = new Path(targetTable.location, "year=2018/month=1/day=10") + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + prepareDefaultSourceData() + + // checking pre-conditions + targetTable.read().count() shouldBe 19 + fs.exists(targetPath20180110) shouldBe false + + // executing load + val caught = + intercept[RuntimeException](FullLoad(spark, dfs, paramsFileHdfsPath.toString).run()) + + assert(caught.getMessage.equals("Unable to transform data frames.")) + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/lake_data_pre.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + val actualDf = targetTable.read() + + actualDf.hasDiff(expectedDf) shouldBe false + + fs.exists(targetPath20180110) shouldBe false + } + + Scenario("Try loading data while 
date format is wrong") { + val resourceDir = "partitioned_date_format_wrong" + copyResourceFileToHdfs(s"$resourceDir/$paramsFileName", paramsFileHdfsPath) + + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = + createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName) + var targetPath99999999 = new Path(targetTable.location, "year=9999/month=99/day=99") + setupInitialState(targetTable, s"$resourceDir/lake_data_pre.psv", dataReader) + prepareDefaultSourceData() + + // checking pre-conditions + targetTable.read().count() shouldBe 19 + fs.exists(targetPath99999999) shouldBe false + + // executing load + FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + val actualDf = targetTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + + targetPath99999999 = new Path( + CatalogTableManager(targetTable.table, spark).getTableLocation, + "year=9999/month=99/day=99" + ) + fs.exists(targetPath99999999) shouldBe true + } + } + + Feature("Full load with nested flattener job") { + Scenario("Full load nested with nested flattener and transpose") { + val resourceDir = "nested_flattener" + + val paramsFileNameExtended: String = "params_transpose_scenario.json" + + val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileNameExtended) + + copyResourceFileToHdfs(s"$resourceDir/$paramsFileNameExtended", paramsFileHdfsPath) + + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema_transpose_scenario.json")) + .asInstanceOf[StructType] + + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = createNonPartitionedTargetTable(targetSchema) + + setupInitialState(targetTable, s"$resourceDir/data_transpose_test.json", dataReader) + prepareDefaultSourceData("nested_flattener/data_transpose_test.json") + + FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/expected_target_data_tranpose.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + + val actualDf = targetTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + } + + Scenario("Full load nested with nested flattener only") { + val resourceDir = "nested_flattener" + + val paramsFileNameExtended: String = "params_normal_scenario.json" + val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileNameExtended) + + copyResourceFileToHdfs(s"$resourceDir/$paramsFileNameExtended", paramsFileHdfsPath) + + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema_extend.json")) + .asInstanceOf[StructType] + + copyResourceFileToHdfs(s"$resourceDir/$paramsFileNameExtended", paramsFileHdfsPath) + + val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) + + val targetTable = createNonPartitionedTargetTable(targetSchema) + + setupInitialState(targetTable, s"$resourceDir/data_normal_test.json", dataReader) + prepareDefaultSourceData("nested_flattener/data_normal_test.json") + + // checking pre-conditions + spark.read.json(sourceDirPath.toString).count() shouldBe 1 + targetTable.read().count() shouldBe 1 + + FullLoad(spark, dfs, 
paramsFileHdfsPath.toString).run() + + // validating result + val expectedDataLocation = + resolveResource(s"$resourceDir/expected_target_data_extend.psv", withProtocol = true) + val expectedDf = dataReader.read(spark, expectedDataLocation) + + val actualDf = targetTable.read() + actualDf.hasDiff(expectedDf) shouldBe false + } + } + + override def beforeEach(): Unit = { + super.beforeEach() + spark.sql(s"DROP DATABASE IF EXISTS $targetDatabase CASCADE") + spark.sql(s"CREATE DATABASE $targetDatabase") + logger.info(s"Creating ${sourceDirPath.toString}") + fs.mkdirs(sourceDirPath) + } + + private def createPartitionedTargetTable( + targetPartitions: Seq[String], + targetSchema: StructType, + tableName: String + ): Table = { + val targetTableLocation = fs + .makeQualified( + new Path(hdfsRootTestPath, HadoopLoadHelper.buildTimestampedTablePath(baseTargetDirPath)) + ) + .toString + Table + .newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) + .withPartitions(targetPartitions) + .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) + } + + private def createNonPartitionedTargetTable(targetSchema: StructType): Table = + createNonPartitionedTargetTable( + targetSchema, + HadoopLoadHelper.buildTimestampedTablePath(baseTargetDirPath) + ) + + private def createNonPartitionedTargetTable(targetSchema: StructType, dir: Path): Table = { + val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, dir)).toString + Table + .newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) + .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) + } + + private def prepareDefaultSourceData(sourceData: String = "landing/new_data.psv"): Unit = + prepareSourceData(Seq(sourceData)) + + private def prepareSourceData(sourceFiles: Seq[String]): Unit = + sourceFiles.foreach { file => + logger.info(s"copyResourceFileToHdfs $file to ${sourceDirPath.toString}") + copyResourceFileToHdfs(s"$file", sourceDirPath) + } + + private def setupInitialState( + targetTable: Table, + localDataFile: String, + dataReader: FileReader + ): Unit = { + val initialDataLocation = resolveResource(localDataFile, withProtocol = true) + targetTable + .write(Seq(initialDataLocation), dataReader, LoadMode.OverwritePartitionsWithAddedColumns) + } +} diff --git a/src/test/scala/com/adidas/analytics/feature/SemiStructuredLoadTest.scala b/src/test/scala/com/adidas/analytics/feature/loads/SemiStructuredLoadTest.scala similarity index 52% rename from src/test/scala/com/adidas/analytics/feature/SemiStructuredLoadTest.scala rename to src/test/scala/com/adidas/analytics/feature/loads/SemiStructuredLoadTest.scala index 10a3012..f20ee00 100644 --- a/src/test/scala/com/adidas/analytics/feature/SemiStructuredLoadTest.scala +++ b/src/test/scala/com/adidas/analytics/feature/loads/SemiStructuredLoadTest.scala @@ -1,39 +1,48 @@ -package com.adidas.analytics.feature +package com.adidas.analytics.feature.loads -import com.adidas.utils.TestUtils._ -import com.adidas.analytics.algo.AppendLoad +import com.adidas.analytics.algo.loads.AppendLoad import com.adidas.analytics.util.DFSWrapper._ import com.adidas.analytics.util.DataFormat.ParquetFormat import com.adidas.analytics.util.OutputWriter +import com.adidas.utils.TestUtils._ import com.adidas.utils.{BaseAlgorithmTest, FileReader} import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} -import org.scalatest.FeatureSpec -import org.scalatest.Matchers._ +import org.scalatest.featurespec.AnyFeatureSpec +import 
org.scalatest.matchers.should.Matchers._ - -class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { +class SemiStructuredLoadTest extends AnyFeatureSpec with BaseAlgorithmTest { private val sourceDatabase: String = "test_landing" private val targetDatabase: String = "test_lake" private val tableName: String = "test_table" private val paramsFileName: String = "params.json" + private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) private val sourceDirPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableName/data") + private val headerDirPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableName/header") - private val targetDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableName") + private val targetDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableName") - feature("Data can be loaded from source to target with append mode") { - scenario("SemiStructured Data can be loaded with append mode by creating partitions from full path") { + Feature("Data can be loaded from source to target with append mode") { + Scenario( + "SemiStructured Data can be loaded with append mode by creating partitions from full path" + ) { val tableNameJson: String = "test_table_semistructured" val paramsFileModdedRegexHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) - val sourceDirFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/") - val targetDirFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") - val headerDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") - val targetDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") + val sourceDirFullPath: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/" + ) + val targetDirFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") + val headerDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") + val targetDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") fs.mkdirs(sourceDirFullPath) fs.mkdirs(headerDirPathPartFromFullPath) @@ -41,15 +50,21 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.mkdirs(targetDirFullPath) val testResourceDir = "semistructured_json_load" - val headerPath20180102 = new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") + val headerPath20180102 = + new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") val targetPath20180102 = new Path(targetDirFullPath, "year=2018/month=1/day=2") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] - val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] + val targetTableLocation = + fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString val dataReader = FileReader.newJsonFileReader(Some(targetSchema)) val dataFormat = ParquetFormat() - val dataWriter = OutputWriter.newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) + val dataWriter = OutputWriter + 
.newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) setupInitialState(s"$testResourceDir/lake_data_pre.txt", dataReader, dataWriter) prepareSourceData(testResourceDir, Seq("data-nodate-part-00001.txt"), sourceDirFullPath) uploadParameters(testResourceDir, paramsFileName, paramsFileModdedRegexHdfsPath) @@ -65,7 +80,8 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileModdedRegexHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = spark.read.schema(targetSchema).parquet(targetTableLocation) actualDf.hasDiff(expectedDf) shouldBe false @@ -74,13 +90,21 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180102) shouldBe true } - scenario("Nested SemiStructured Data can be loaded with append mode by creating partitions from full path") { + Scenario( + "Nested SemiStructured Data can be loaded with append mode by creating partitions from full path" + ) { val tableNameJson: String = "test_table_semistructured" val paramsFileModdedRegexHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) - val sourceDirFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/") - val targetDirFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") - val headerDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") - val targetDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") + val sourceDirFullPath: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/" + ) + val targetDirFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") + val headerDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") + val targetDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") fs.mkdirs(sourceDirFullPath) fs.mkdirs(headerDirPathPartFromFullPath) @@ -88,15 +112,21 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.mkdirs(targetDirFullPath) val testResourceDir = "semistructured_nested_json_load" - val headerPath20180102 = new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") + val headerPath20180102 = + new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") val targetPath20180102 = new Path(targetDirFullPath, "year=2018/month=1/day=2") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] - val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] + val targetTableLocation = + fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString val dataReader = FileReader.newJsonFileReader(Some(targetSchema)) val dataFormat = ParquetFormat() - val dataWriter = OutputWriter.newFileSystemWriter(targetDirFullPath.toString, dataFormat, 
Seq("year", "month", "day")) + val dataWriter = OutputWriter + .newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) setupInitialState(s"$testResourceDir/lake_data_pre.txt", dataReader, dataWriter) prepareSourceData(testResourceDir, Seq("data-nodate-part-00001.txt"), sourceDirFullPath) uploadParameters(testResourceDir, paramsFileName, paramsFileModdedRegexHdfsPath) @@ -112,7 +142,8 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileModdedRegexHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = spark.read.schema(targetSchema).parquet(targetTableLocation) actualDf.hasDiff(expectedDf) shouldBe false @@ -121,13 +152,21 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180102) shouldBe true } - scenario("SemiStructured Parquet Data can be loaded with append mode by creating partitions from full path") { + Scenario( + "SemiStructured Parquet Data can be loaded with append mode by creating partitions from full path" + ) { val tableNameJson: String = "test_table_semistructured" val paramsFileModdedRegexHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) - val sourceDirFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/") - val targetDirFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") - val headerDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") - val targetDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") + val sourceDirFullPath: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/" + ) + val targetDirFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") + val headerDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") + val targetDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") fs.mkdirs(sourceDirFullPath) fs.mkdirs(headerDirPathPartFromFullPath) @@ -135,15 +174,21 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.mkdirs(targetDirFullPath) val testResourceDir = "semistructured_parquet_test" - val headerPath20180102 = new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") + val headerPath20180102 = + new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") val targetPath20180102 = new Path(targetDirFullPath, "year=2018/month=1/day=2") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] - val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] + val targetTableLocation = + fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString val dataReader = FileReader.newJsonFileReader(Some(targetSchema)) val dataFormat = ParquetFormat(Some(targetSchema)) - val 
dataWriter = OutputWriter.newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) + val dataWriter = OutputWriter + .newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) setupInitialState(s"$testResourceDir/lake_data_pre.txt", dataReader, dataWriter) prepareSourceData(testResourceDir, Seq("sales.parquet"), sourceDirFullPath) uploadParameters(testResourceDir, paramsFileName, paramsFileModdedRegexHdfsPath) @@ -159,23 +204,29 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileModdedRegexHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = spark.read.schema(targetSchema).parquet(targetTableLocation) actualDf.hasDiff(expectedDf) shouldBe false - fs.exists(targetPath20180102) shouldBe true fs.exists(headerPath20180102) shouldBe true - } + } - scenario("SemiStructured Data can be loaded with append mode with evolving schema") { + Scenario("SemiStructured Data can be loaded with append mode with evolving schema") { val tableNameJson: String = "test_table_semistructured" val paramsFileModdedRegexHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) - val sourceDirFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/") - val targetDirFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") - val headerDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") - val targetDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") + val sourceDirFullPath: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/" + ) + val targetDirFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") + val headerDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") + val targetDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") fs.mkdirs(sourceDirFullPath) fs.mkdirs(headerDirPathPartFromFullPath) @@ -183,15 +234,21 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.mkdirs(targetDirFullPath) val testResourceDir = "semistructured_json_load_evolving_schema" - val headerPath20180102 = new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") + val headerPath20180102 = + new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") val targetPath20180102 = new Path(targetDirFullPath, "year=2018/month=1/day=2") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] - val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] + val targetTableLocation = + fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString val dataReader = FileReader.newJsonFileReader(Some(targetSchema)) val dataFormat = ParquetFormat() - val dataWriter = 
OutputWriter.newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) + val dataWriter = OutputWriter + .newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) setupInitialState(s"$testResourceDir/lake_data_pre.txt", dataReader, dataWriter) prepareSourceData(testResourceDir, Seq("data-nodate-part-00001.txt"), sourceDirFullPath) uploadParameters(testResourceDir, paramsFileName, paramsFileModdedRegexHdfsPath) @@ -204,9 +261,19 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileModdedRegexHdfsPath.toString).run() // Executing append load with the evolved schema - val sourceDirFullPath20180103: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=03/") - prepareSourceData(testResourceDir, Seq("data-nodate-part-00002.txt"), sourceDirFullPath20180103) - val targetSchemaEvolved = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema_evolved.json")).asInstanceOf[StructType] + val sourceDirFullPath20180103: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=03/" + ) + prepareSourceData( + testResourceDir, + Seq("data-nodate-part-00002.txt"), + sourceDirFullPath20180103 + ) + val targetSchemaEvolved = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema_evolved.json")) + .asInstanceOf[StructType] val dataReaderEvolved = FileReader.newJsonFileReader(Some(targetSchemaEvolved)) fs.delete(paramsFileModdedRegexHdfsPath, false) @@ -215,7 +282,8 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileModdedRegexHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) val expectedDf = dataReaderEvolved.read(spark, expectedDataLocation) val actualDf = spark.read.schema(targetSchemaEvolved).parquet(targetTableLocation) @@ -225,13 +293,19 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180102) shouldBe true } - scenario("SemiStructured Data can be loaded with append mode with dropping columns") { + Scenario("SemiStructured Data can be loaded with append mode with dropping columns") { val tableNameJson: String = "test_table_semistructured" val paramsFileModdedRegexHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) - val sourceDirFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/") - val targetDirFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") - val headerDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") - val targetDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") + val sourceDirFullPath: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/" + ) + val targetDirFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") + val headerDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") + val targetDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") fs.mkdirs(sourceDirFullPath) 
fs.mkdirs(headerDirPathPartFromFullPath) @@ -239,17 +313,24 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.mkdirs(targetDirFullPath) val testResourceDir = "semistructured_json_load_dropping_column" - val headerPath20180102 = new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") - val headerPath20180103 = new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=3/header.json") + val headerPath20180102 = + new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") + val headerPath20180103 = + new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=3/header.json") val targetPath20180102 = new Path(targetDirFullPath, "year=2018/month=1/day=2") val targetPath20180103 = new Path(targetDirFullPath, "year=2018/month=1/day=3") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] - val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] + val targetTableLocation = + fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString val dataReader = FileReader.newJsonFileReader(Some(targetSchema)) val dataFormat = ParquetFormat() - val dataWriter = OutputWriter.newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) + val dataWriter = OutputWriter + .newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) setupInitialState(s"$testResourceDir/lake_data_pre.txt", dataReader, dataWriter) prepareSourceData(testResourceDir, Seq("data-nodate-part-00001.txt"), sourceDirFullPath) @@ -269,11 +350,28 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.delete(paramsFileModdedRegexHdfsPath, false) // prepare new data - val sourceDirFullPath20180103: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=03/") - val sourceDirFullPath20180104: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=04/") - prepareSourceData(testResourceDir, Seq("data-nodate-part-00002.txt"), sourceDirFullPath20180103) - prepareSourceData(testResourceDir, Seq("data-nodate-part-00003.txt"), sourceDirFullPath20180104) - val targetSchemaDroppedCol = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema_column_dropped.json")).asInstanceOf[StructType] + val sourceDirFullPath20180103: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=03/" + ) + val sourceDirFullPath20180104: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=04/" + ) + prepareSourceData( + testResourceDir, + Seq("data-nodate-part-00002.txt"), + sourceDirFullPath20180103 + ) + prepareSourceData( + testResourceDir, + Seq("data-nodate-part-00003.txt"), + sourceDirFullPath20180104 + ) + val targetSchemaDroppedCol = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema_column_dropped.json")) + .asInstanceOf[StructType] val dataReaderDroppedCol = FileReader.newJsonFileReader(Some(targetSchemaDroppedCol)) val paramsDroppedColFileName: String = "params_column_dropped.json" uploadParameters(testResourceDir, paramsDroppedColFileName, paramsFileModdedRegexHdfsPath) @@ -282,7 +380,8 @@ class SemiStructuredLoadTest extends 
FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, paramsFileModdedRegexHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) val expectedDf = dataReaderDroppedCol.read(spark, expectedDataLocation) val actualDf = spark.read.schema(targetSchemaDroppedCol).parquet(targetTableLocation) @@ -293,13 +392,21 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { } - scenario("SemiStructured Data cannot be loaded when data contains more columns than target schema") { + Scenario( + "SemiStructured Data cannot be loaded when data contains more columns than target schema" + ) { val tableNameJson: String = "test_table_semistructured" val paramsFileModdedRegexHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) - val sourceDirFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/") - val targetDirFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") - val headerDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") - val targetDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") + val sourceDirFullPath: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/" + ) + val targetDirFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") + val headerDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") + val targetDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") fs.mkdirs(sourceDirFullPath) fs.mkdirs(headerDirPathPartFromFullPath) @@ -307,15 +414,21 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.mkdirs(targetDirFullPath) val testResourceDir = "semistructured_json_load_mismatching_schema" - val headerPath20180102 = new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") + val headerPath20180102 = + new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") val targetPath20180102 = new Path(targetDirFullPath, "year=2018/month=1/day=2") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] - val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] + val targetTableLocation = + fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString val dataReader = FileReader.newJsonFileReader(Some(targetSchema)) val dataFormat = ParquetFormat() - val dataWriter = OutputWriter.newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) + val dataWriter = OutputWriter + .newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) setupInitialState(s"$testResourceDir/lake_data_pre.txt", dataReader, dataWriter) prepareSourceData(testResourceDir, Seq("data-nodate-part-00001.txt"), sourceDirFullPath) @@ -330,10 +443,14 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { AppendLoad(spark, dfs, 
paramsFileModdedRegexHdfsPath.toString).run() } logger.info(caught.getMessage) - assert(caught.getMessage.equals(s"Schema does not match the input data for some of the input folders.")) + assert( + caught.getMessage + .equals(s"Schema does not match the input data for some of the input folders.") + ) // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = spark.read.schema(targetSchema).parquet(targetTableLocation) @@ -343,13 +460,19 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(targetPath20180102) shouldBe false } - scenario("SemiStructured Data cannot be loaded with wrong configuration") { + Scenario("SemiStructured Data cannot be loaded with wrong configuration") { val tableNameJson: String = "test_table_semistructured" val paramsFileModdedRegexHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) - val sourceDirFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/") - val targetDirFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") - val headerDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") - val targetDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") + val sourceDirFullPath: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/" + ) + val targetDirFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") + val headerDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") + val targetDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") fs.mkdirs(sourceDirFullPath) fs.mkdirs(headerDirPathPartFromFullPath) @@ -357,31 +480,50 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.mkdirs(targetDirFullPath) val testResourceDir = "semistructured_json_load_wrong_configuration" - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newJsonFileReader(Some(targetSchema)) val dataFormat = ParquetFormat() - val dataWriter = OutputWriter.newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) + val dataWriter = OutputWriter + .newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) setupInitialState(s"$testResourceDir/lake_data_pre.txt", dataReader, dataWriter) prepareSourceData(testResourceDir, Seq("data-nodate-part-00001.txt"), sourceDirFullPath) uploadParameters(testResourceDir, paramsFileName, paramsFileModdedRegexHdfsPath) - val caught = intercept[RuntimeException]{ + val caught = intercept[RuntimeException] { AppendLoad(spark, dfs, paramsFileModdedRegexHdfsPath.toString).run() } logger.info(caught.getMessage) - assert(caught.getMessage.equals(s"Unsupported data type: unstructured in AppendLoad or the configuration file is malformed.")) + assert( + caught.getMessage.equals( + s"Unsupported data type: unstructured in AppendLoad or the 
configuration file is malformed." + ) + ) } - scenario("Loading semistructured data when some header files are available and schemas are the same") { + Scenario( + "Loading semistructured data when some header files are available and schemas are the same" + ) { val tableNameJson: String = "test_table_semistructured" val paramsFileModdedRegexHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) - val sourceDirFullPath20180101: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=01/") - val sourceDirFullPath20180102: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/") - val targetDirFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") - val headerDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") - val targetDirPathPartFromFullPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") + val sourceDirFullPath20180101: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=01/" + ) + val sourceDirFullPath20180102: Path = new Path( + hdfsRootTestPath, + s"$sourceDatabase/$tableNameJson/data/year=2018/month=01/day=02/" + ) + val targetDirFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson/data") + val headerDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$sourceDatabase/$tableNameJson/header") + val targetDirPathPartFromFullPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/$tableNameJson") fs.mkdirs(sourceDirFullPath20180102) fs.mkdirs(headerDirPathPartFromFullPath) @@ -389,21 +531,36 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.mkdirs(targetDirFullPath) val testResourceDir = "semistructured_load_with_existing_header" - val headerPath20180101 = new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=1/header.json") - val headerPath20180102 = new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") + val headerPath20180101 = + new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=1/header.json") + val headerPath20180102 = + new Path(headerDirPathPartFromFullPath, "year=2018/month=1/day=2/header.json") val targetPath20180101 = new Path(targetDirFullPath, "year=2018/month=1/day=1") val targetPath20180102 = new Path(targetDirFullPath, "year=2018/month=1/day=2") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] - val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] + val targetTableLocation = + fs.makeQualified(new Path(hdfsRootTestPath, targetDirPathPartFromFullPath)).toString val dataReader = FileReader.newJsonFileReader(Some(targetSchema)) val dataFormat = ParquetFormat() - val dataWriter = OutputWriter.newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) + val dataWriter = OutputWriter + .newFileSystemWriter(targetDirFullPath.toString, dataFormat, Seq("year", "month", "day")) setupInitialState(s"$testResourceDir/lake_data_pre.txt", dataReader, dataWriter) copyResourceFileToHdfs(s"$testResourceDir/20180101_schema.json", headerPath20180101) - prepareSourceData(testResourceDir, Seq("data-nodate-part-00001.txt"), sourceDirFullPath20180101) - 
prepareSourceData(testResourceDir, Seq("data-nodate-part-00002.txt"), sourceDirFullPath20180102) + prepareSourceData( + testResourceDir, + Seq("data-nodate-part-00001.txt"), + sourceDirFullPath20180101 + ) + prepareSourceData( + testResourceDir, + Seq("data-nodate-part-00002.txt"), + sourceDirFullPath20180102 + ) uploadParameters(testResourceDir, paramsFileName, paramsFileModdedRegexHdfsPath) // checking pre-conditions @@ -416,14 +573,18 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180101) shouldBe true fs.exists(headerPath20180102) shouldBe false - val expectedSchema20180101 = DataType.fromJson(getResourceAsText(s"$testResourceDir/20180101_schema.json")).asInstanceOf[StructType] + val expectedSchema20180101 = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/20180101_schema.json")) + .asInstanceOf[StructType] val expectedSchema20180102 = StructType(targetSchema.dropRight(3)) // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = spark.read.schema(targetSchema).parquet(targetTableLocation) @@ -435,8 +596,10 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.exists(headerPath20180101) shouldBe true fs.exists(headerPath20180102) shouldBe true - val actualSchema20180101 = DataType.fromJson(fs.readFile(headerPath20180101)).asInstanceOf[StructType] - val actualSchema20180105 = DataType.fromJson(fs.readFile(headerPath20180102)).asInstanceOf[StructType] + val actualSchema20180101 = + DataType.fromJson(fs.readFile(headerPath20180101)).asInstanceOf[StructType] + val actualSchema20180105 = + DataType.fromJson(fs.readFile(headerPath20180102)).asInstanceOf[StructType] actualSchema20180101 shouldBe expectedSchema20180101 actualSchema20180105 shouldBe expectedSchema20180102 @@ -450,15 +613,24 @@ class SemiStructuredLoadTest extends FeatureSpec with BaseAlgorithmTest { fs.mkdirs(targetDirPath) } - private def uploadParameters(testResourceDir: String, whichParamsFile: String = paramsFileName, whichParamsPath: Path = paramsFileHdfsPath): Unit = { - copyResourceFileToHdfs(s"$testResourceDir/$whichParamsFile", whichParamsPath) - } - - private def prepareSourceData(testResourceDir: String, sourceFiles: Seq[String], sourceDirPath: Path = sourceDirPath): Unit = { + private def uploadParameters( + testResourceDir: String, + whichParamsFile: String = paramsFileName, + whichParamsPath: Path = paramsFileHdfsPath + ): Unit = copyResourceFileToHdfs(s"$testResourceDir/$whichParamsFile", whichParamsPath) + + private def prepareSourceData( + testResourceDir: String, + sourceFiles: Seq[String], + sourceDirPath: Path = sourceDirPath + ): Unit = sourceFiles.foreach(file => copyResourceFileToHdfs(s"$testResourceDir/$file", sourceDirPath)) - } - private def setupInitialState(localDataFile: String, dataReader: FileReader, dataWriter: OutputWriter): Unit = { + private def setupInitialState( + localDataFile: String, + dataReader: FileReader, + dataWriter: OutputWriter + ): Unit = { val initialDataLocation = resolveResource(localDataFile, withProtocol = true) dataWriter.write(dfs, dataReader.read(spark, initialDataLocation)) } diff --git a/src/test/scala/com/adidas/analytics/feature/AlgorithmTemplateTest.scala 
b/src/test/scala/com/adidas/analytics/feature/templates/AlgorithmTemplateTest.scala similarity index 57% rename from src/test/scala/com/adidas/analytics/feature/AlgorithmTemplateTest.scala rename to src/test/scala/com/adidas/analytics/feature/templates/AlgorithmTemplateTest.scala index c785eb4..4a9b643 100644 --- a/src/test/scala/com/adidas/analytics/feature/AlgorithmTemplateTest.scala +++ b/src/test/scala/com/adidas/analytics/feature/templates/AlgorithmTemplateTest.scala @@ -1,35 +1,39 @@ -package com.adidas.analytics.feature +package com.adidas.analytics.feature.templates -import com.adidas.analytics.algo.FullLoad -import com.adidas.analytics.util.{DFSWrapper, HiveTableAttributeReader, LoadMode} +import com.adidas.analytics.algo.loads.FullLoad +import com.adidas.analytics.util.{DFSWrapper, LoadMode} import com.adidas.utils.TestUtils._ import com.adidas.utils.{BaseAlgorithmTest, FileReader, Table} import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} -import org.scalatest.FeatureSpec -import org.scalatest.Matchers._ +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.matchers.should.Matchers._ -class AlgorithmTemplateTest extends FeatureSpec with BaseAlgorithmTest { +class AlgorithmTemplateTest extends AnyFeatureSpec with BaseAlgorithmTest { private val sourceEnvironmentLocation: String = "test_landing" private val targetDatabase: String = "test_lake" private val tableName: String = "test_table" private val paramsFileName: String = "algorithm_template_params.json" + private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) - private val sourceDirPath: Path = new Path(hdfsRootTestPath, s"$sourceEnvironmentLocation/test/$tableName/data") - private val targetDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/test/$tableName/data") - private val backupDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/test/$tableName/data_backup") + private val sourceDirPath: Path = + new Path(hdfsRootTestPath, s"$sourceEnvironmentLocation/test/$tableName/data") + + private val targetDirPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/test/$tableName/data") + + Feature("Algorithm template successfully loads files to lake") { + Scenario("when table is not partitioned, load is successful") { - feature("Algorithm template successfully loads files to lake") { - scenario("when table is not partitioned, load is successful") { - /** - * Implement here the steps required for the given test case. + /** Implement here the steps required for the given test case. 
*/ copyResourceFileToHdfs(s"$paramsFileName", paramsFileHdfsPath) - val targetSchema = DataType.fromJson(getResourceAsText("target_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType.fromJson(getResourceAsText("target_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val targetTable = createNonPartitionedTargetTable(targetSchema) @@ -47,33 +51,31 @@ class AlgorithmTemplateTest extends FeatureSpec with BaseAlgorithmTest { val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe false - - // check the resulting table location is /data folder - val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation - tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - - //check backUp dir is empty - fs.listStatus(backupDirPath).length shouldBe 0 } } private def createNonPartitionedTargetTable(targetSchema: StructType): Table = { val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - Table.newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) + Table + .newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) } - private def setupInitialState(targetTable: Table, localDataFile: String, dataReader: FileReader): Unit = { + private def setupInitialState( + targetTable: Table, + localDataFile: String, + dataReader: FileReader + ): Unit = { val initialDataLocation = resolveResource(localDataFile, withProtocol = true) - targetTable.write(Seq(initialDataLocation), dataReader, LoadMode.OverwritePartitionsWithAddedColumns) + targetTable + .write(Seq(initialDataLocation), dataReader, LoadMode.OverwritePartitionsWithAddedColumns) } - private def prepareDefaultSourceData(): Unit = { + private def prepareDefaultSourceData(): Unit = Seq("new_data.psv").foreach { file => logger.info(s"copyResourceFileToHdfs $file to ${sourceDirPath.toString}") copyResourceFileToHdfs(s"$file", sourceDirPath) } - } override def beforeEach(): Unit = { super.beforeEach() @@ -84,4 +86,4 @@ class AlgorithmTemplateTest extends FeatureSpec with BaseAlgorithmTest { logger.info(s"Creating ${targetDirPath.toString}") fs.mkdirs(targetDirPath) } -} \ No newline at end of file +} diff --git a/src/test/scala/com/adidas/analytics/integration/BaseIntegrationTest.scala b/src/test/scala/com/adidas/analytics/integration/BaseIntegrationTest.scala index 6ab8a41..24b098c 100644 --- a/src/test/scala/com/adidas/analytics/integration/BaseIntegrationTest.scala +++ b/src/test/scala/com/adidas/analytics/integration/BaseIntegrationTest.scala @@ -12,12 +12,15 @@ trait BaseIntegrationTest extends BaseAlgorithmTest { protected val tableName: String = "test_table" protected val paramsFileName: String = "params.json" + protected val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) protected val sourceDirPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableName/data") - protected val headerDirPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/$tableName/header") - protected val targetDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableName") + protected val headerDirPath: Path = + new Path(hdfsRootTestPath, s"$sourceDatabase/$tableName/header") + + protected val targetDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/$tableName") override def 
beforeEach(): Unit = { super.beforeEach() @@ -30,42 +33,59 @@ trait BaseIntegrationTest extends BaseAlgorithmTest { fs.mkdirs(targetDirPath) } - protected def uploadParameters(testResourceDir: String, whichParamsFile: String = paramsFileName, whichParamsPath: Path = paramsFileHdfsPath): Unit = { - copyResourceFileToHdfs(s"$testResourceDir/$whichParamsFile", whichParamsPath) - } + protected def uploadParameters( + testResourceDir: String, + whichParamsFile: String = paramsFileName, + whichParamsPath: Path = paramsFileHdfsPath + ): Unit = copyResourceFileToHdfs(s"$testResourceDir/$whichParamsFile", whichParamsPath) - protected def createTargetTable(testResourceDir: String, targetPartitions: Seq[String], targetSchema: StructType): Table = { + protected def createTargetTable( + targetPartitions: Seq[String], + targetSchema: StructType + ): Table = { val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - Table.newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) + Table + .newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) .withPartitions(targetPartitions) .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) } - protected def prepareSourceData(testResourceDir: String, sourceFiles: Seq[String], sourceDirPath: Path = sourceDirPath): Unit = { + protected def prepareSourceData( + testResourceDir: String, + sourceFiles: Seq[String], + sourceDirPath: Path = sourceDirPath + ): Unit = sourceFiles.foreach(file => copyResourceFileToHdfs(s"$testResourceDir/$file", sourceDirPath)) - } - protected def prepareSourceData(sourceFiles: Seq[String]): Unit = { + protected def prepareSourceData(sourceFiles: Seq[String]): Unit = sourceFiles.foreach { file => logger.info(s"copyResourceFileToHdfs $file to ${sourceDirPath.toString}") copyResourceFileToHdfs(s"$file", sourceDirPath) } - } - protected def setupInitialState(targetTable: Table, localDataFile: String, dataReader: FileReader): Unit = { + protected def setupInitialState( + targetTable: Table, + localDataFile: String, + dataReader: FileReader + ): Unit = { val initialDataLocation = resolveResource(localDataFile, withProtocol = true) - targetTable.write(Seq(initialDataLocation), dataReader, LoadMode.OverwritePartitionsWithAddedColumns) + targetTable + .write(Seq(initialDataLocation), dataReader, LoadMode.OverwritePartitionsWithAddedColumns) } - protected def createPartitionedTargetTable(targetPartitions: Seq[String], targetSchema: StructType, tableName: String): Table = { + protected def createPartitionedTargetTable( + targetPartitions: Seq[String], + targetSchema: StructType, + tableName: String + ): Table = { val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - Table.newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) + Table + .newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) .withPartitions(targetPartitions) .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) } - protected def prepareDefaultSourceData(sourceDataFile: String): Unit = { + protected def prepareDefaultSourceData(sourceDataFile: String): Unit = prepareSourceData(Seq(sourceDataFile)) - } } diff --git a/src/test/scala/com/adidas/analytics/integration/FailFastIntegrationTest.scala b/src/test/scala/com/adidas/analytics/integration/FailFastIntegrationTest.scala index 6fcda30..33e4ec3 100644 --- a/src/test/scala/com/adidas/analytics/integration/FailFastIntegrationTest.scala +++ 
b/src/test/scala/com/adidas/analytics/integration/FailFastIntegrationTest.scala @@ -1,40 +1,51 @@ package com.adidas.analytics.integration +import com.adidas.analytics.algo.loads.FullLoad import com.adidas.utils.TestUtils._ -import com.adidas.analytics.algo.FullLoad -import com.adidas.analytics.util.HiveTableAttributeReader +import com.adidas.analytics.util.CatalogTableManager import com.adidas.utils.{FileReader, Table} import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} -import org.scalatest.Matchers._ -import org.scalatest.{Assertion, FeatureSpec} - -import scala.collection.JavaConverters._ +import org.scalatest.matchers.should.Matchers._ +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.Assertion import scala.util.{Failure, Success, Try} -class FailFastIntegrationTest extends FeatureSpec with BaseIntegrationTest { +class FailFastIntegrationTest extends AnyFeatureSpec with BaseIntegrationTest { + + override val sourceDirPath: Path = + new Path(hdfsRootTestPath, s"$sourceDatabase/test/$tableName/data") + + override val targetDirPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/test/$tableName/data") - override val sourceDirPath: Path = new Path(hdfsRootTestPath, s"$sourceDatabase/test/$tableName/data") - override val targetDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/test/$tableName/data") - protected val backupDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/test/$tableName/data_backup") + protected val backupDirPath: Path = + new Path(hdfsRootTestPath, s"$targetDatabase/test/$tableName/data_backup") - feature("FailFast Option should fail safely regarding data and metadata") { + Feature("FailFast Option should fail safely regarding data and metadata") { - scenario("Full Load Algorithm running in FailFast mode and failing safely!") { + Scenario("Full Load Algorithm running in FailFast mode and failing safely!") { val resourceDir = "partitioned" copyResourceFileToHdfs(s"$resourceDir/$paramsFileName", paramsFileHdfsPath) - val targetPath20180110 = new Path(targetDirPath, "year=2018/month=1/day=10") - val targetSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/target_schema.json")).asInstanceOf[StructType] + val targetPath20180110 = new Path("year=2018/month=1/day=10") + val targetSchema = DataType + .fromJson(getResourceAsText(s"$resourceDir/target_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$resourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] + val expectedPartitionsSchema = + DataType + .fromJson(getResourceAsText(s"$resourceDir/expected_partitions_schema.json")) + .asInstanceOf[StructType] val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) - val targetTable = createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName) + val targetTable = + createPartitionedTargetTable(Seq("year", "month", "day"), targetSchema, tableName) // Populate the table with data and Partitions - integrationTestStep(sourceDataFile = "landing/new_data.psv", + integrationTestStep( + sourceDataFile = "landing/new_data.psv", resourceDir = resourceDir, targetPath = targetPath20180110, shouldFail = false, @@ -44,7 +55,8 @@ class FailFastIntegrationTest extends FeatureSpec with BaseIntegrationTest { ) // Wrong Data Should not affect table data and 
partitioning - integrationTestStep(sourceDataFile = "landing/new_data_wrong_format.psv", + integrationTestStep( + sourceDataFile = "landing/new_data_wrong_format.psv", resourceDir = resourceDir, targetPath = targetPath20180110, shouldFail = true, @@ -57,19 +69,19 @@ class FailFastIntegrationTest extends FeatureSpec with BaseIntegrationTest { } - private def integrationTestStep(sourceDataFile: String, - shouldFail: Boolean, - resourceDir: String, - targetPath: Path, - dataReader: FileReader, - metadataReader: FileReader, - targetTable: Table): Assertion = { + private def integrationTestStep( + sourceDataFile: String, + shouldFail: Boolean, + resourceDir: String, + targetPath: Path, + dataReader: FileReader, + metadataReader: FileReader, + targetTable: Table + ): Assertion = { prepareDefaultSourceData(sourceDataFile) // executing load - val isPipelineFailing = Try { - FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() - } match { + val isPipelineFailing = Try(FullLoad(spark, dfs, paramsFileHdfsPath.toString).run()) match { case Failure(_) => true case Success(_) => false } @@ -77,36 +89,29 @@ class FailFastIntegrationTest extends FeatureSpec with BaseIntegrationTest { isPipelineFailing should equal(shouldFail) // validating result - val expectedDataLocation = resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$resourceDir/lake_data_post.psv", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe false - fs.exists(targetPath) shouldBe true - - // check the resulting table location is /data folder - val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation - tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString - - //check backUp dir is empty - fs.listStatus(backupDirPath).length shouldBe 0 + fs.exists( + new Path(CatalogTableManager(targetTable.table, spark).getTableLocation, targetPath) + ) shouldBe true // MetaData Specific Tests - val producedPartitionsNumber: Dataset[String] = spark - .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") - .as(Encoders.STRING) - - val expectedPartitionsLocation = resolveResource(s"$resourceDir/expected_partitions.txt", withProtocol = true) - val expectedPartitions: Dataset[String] = metadataReader - .read(spark, expectedPartitionsLocation) - .as(Encoders.STRING) + val producedPartitionsNumber: Dataset[String] = + spark.sql(s"SHOW PARTITIONS $targetDatabase.$tableName").as(Encoders.STRING) + val expectedPartitionsLocation = + resolveResource(s"$resourceDir/expected_partitions.txt", withProtocol = true) + val expectedPartitions: Dataset[String] = + metadataReader.read(spark, expectedPartitionsLocation).as(Encoders.STRING) expectedPartitions - .collectAsList() - .asScala + .collect() .toSet - .diff(producedPartitionsNumber.collectAsList().asScala.toSet) should equal(Set()) + .diff(producedPartitionsNumber.collect().toSet) should equal(Set()) } } diff --git a/src/test/scala/com/adidas/analytics/integration/RecoverPartitionsCustomIntegrationTest.scala b/src/test/scala/com/adidas/analytics/integration/RecoverPartitionsCustomIntegrationTest.scala index cd0cdb1..4af69e6 100644 --- a/src/test/scala/com/adidas/analytics/integration/RecoverPartitionsCustomIntegrationTest.scala +++ b/src/test/scala/com/adidas/analytics/integration/RecoverPartitionsCustomIntegrationTest.scala @@ -1,34 +1,40 @@ package 
com.adidas.analytics.integration +import com.adidas.analytics.algo.loads.AppendLoad import com.adidas.utils.TestUtils._ -import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} -import org.scalatest.FeatureSpec -import org.scalatest.Matchers._ +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.matchers.should.Matchers._ -import scala.collection.JavaConverters._ +class RecoverPartitionsCustomIntegrationTest extends AnyFeatureSpec with BaseIntegrationTest { + Feature("Partitions can be updated programmatically using custom logic") { -class RecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegrationTest { - - feature("Partitions can be updated programmatically using custom logic") { - - scenario("Using Append Load Algorithm with multiple source files") { + Scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] - val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] + val expectedPartitionsSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = createTargetTable(Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) - prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) + prepareSourceData( + testResourceDir, + Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv") + ) uploadParameters(testResourceDir) // checking pre-conditions @@ -42,35 +48,35 @@ class RecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegr AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) - val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedPartitionsLocation = + resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() - val producedPartitionsNumber: Dataset[String] = spark - .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") - .as(Encoders.STRING) + val producedPartitionsNumber: Dataset[String] = + spark.sql(s"SHOW PARTITIONS $targetDatabase.$tableName").as(Encoders.STRING) // MetaData Specific Tests - val expectedPartitions: 
Dataset[String] = expectedPartitionsDataReader - .read(spark, expectedPartitionsLocation) - .as(Encoders.STRING) + val expectedPartitions: Dataset[String] = + expectedPartitionsDataReader.read(spark, expectedPartitionsLocation).as(Encoders.STRING) - expectedPartitions.collectAsList().asScala.sorted.toSet should - equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) + expectedPartitions.collect().sorted.toSet should + equal(producedPartitionsNumber.collect().sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false spark - .sql(s"DESCRIBE extended ${targetDatabase}.${tableName} PARTITION(year=2018,month=1,day=1)") + .sql(s"DESCRIBE extended $targetDatabase.$tableName PARTITION(year=2018,month=1,day=1)") .filter("col_name == 'Partition Statistics'") .head() - .getAs[String]("data_type").contains("6 rows") shouldBe true + .getAs[String]("data_type") + .contains("6 rows") shouldBe true fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } - } diff --git a/src/test/scala/com/adidas/analytics/integration/RecoverPartitionsNativeIntegrationTest.scala b/src/test/scala/com/adidas/analytics/integration/RecoverPartitionsNativeIntegrationTest.scala index b6c2d96..35ad414 100644 --- a/src/test/scala/com/adidas/analytics/integration/RecoverPartitionsNativeIntegrationTest.scala +++ b/src/test/scala/com/adidas/analytics/integration/RecoverPartitionsNativeIntegrationTest.scala @@ -1,34 +1,40 @@ package com.adidas.analytics.integration +import com.adidas.analytics.algo.loads.AppendLoad import com.adidas.utils.TestUtils._ -import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} -import org.scalatest.FeatureSpec -import org.scalatest.Matchers._ +import org.scalatest.featurespec.AnyFeatureSpec +import org.scalatest.matchers.should.Matchers._ -import scala.collection.JavaConverters._ +class RecoverPartitionsNativeIntegrationTest extends AnyFeatureSpec with BaseIntegrationTest { + Feature("Partitions can be updated with native spark.recoverPartitions()") { -class RecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegrationTest { - - feature("Partitions can be updated with native spark.recoverPartitions()") { - - scenario("Using Append Load Algorithm with multiple source files") { + Scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] - val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] + val targetSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")) + .asInstanceOf[StructType] + val expectedPartitionsSchema = + DataType + .fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")) + .asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) + val targetTable = 
createTargetTable(Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) - prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) + prepareSourceData( + testResourceDir, + Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv") + ) uploadParameters(testResourceDir) // checking pre-conditions @@ -42,35 +48,35 @@ class RecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegr AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) - val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) + val expectedDataLocation = + resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) + val expectedPartitionsLocation = + resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() - val producedPartitionsNumber: Dataset[String] = spark - .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") - .as(Encoders.STRING) + val producedPartitionsNumber: Dataset[String] = + spark.sql(s"SHOW PARTITIONS $targetDatabase.$tableName").as(Encoders.STRING) // MetaData Specific Tests - val expectedPartitions: Dataset[String] = expectedPartitionsDataReader - .read(spark, expectedPartitionsLocation) - .as(Encoders.STRING) + val expectedPartitions: Dataset[String] = + expectedPartitionsDataReader.read(spark, expectedPartitionsLocation).as(Encoders.STRING) - expectedPartitions.collectAsList().asScala.sorted.toSet should - equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) + expectedPartitions.collect().sorted.toSet should + equal(producedPartitionsNumber.collect().sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false spark - .sql(s"DESCRIBE extended ${targetDatabase}.${tableName} PARTITION(year=2018,month=1,day=1)") + .sql(s"DESCRIBE extended $targetDatabase.$tableName PARTITION(year=2018,month=1,day=1)") .filter("col_name == 'Partition Statistics'") .head() - .getAs[String]("data_type").contains("6 rows") shouldBe true + .getAs[String]("data_type") + .contains("6 rows") shouldBe true fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } - } diff --git a/src/test/scala/com/adidas/analytics/integration/SparkRecoverPartitionsCustomIntegrationTest.scala b/src/test/scala/com/adidas/analytics/integration/SparkRecoverPartitionsCustomIntegrationTest.scala deleted file mode 100644 index fbe2721..0000000 --- a/src/test/scala/com/adidas/analytics/integration/SparkRecoverPartitionsCustomIntegrationTest.scala +++ /dev/null @@ -1,70 +0,0 @@ -package com.adidas.analytics.integration - -import com.adidas.utils.TestUtils._ -import com.adidas.analytics.algo.AppendLoad -import com.adidas.utils.FileReader -import org.apache.hadoop.fs.Path -import org.apache.spark.sql.types.{DataType, StructType} -import org.apache.spark.sql.{Dataset, Encoders} -import org.scalatest.FeatureSpec -import org.scalatest.Matchers._ - -import scala.collection.JavaConverters._ - - -class SparkRecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegrationTest { - - feature("Partitions can be updated programmatically using custom logic") { - - scenario("Using Append Load Algorithm with multiple source files") { - val 
testResourceDir = "multiple_source_files" - val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") - val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") - - val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] - val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) - - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) - setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) - prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) - uploadParameters(testResourceDir) - - // checking pre-conditions - spark.read.csv(sourceDirPath.toString).count() shouldBe 7 - targetTable.read().count() shouldBe 19 - - fs.exists(targetPath20180101) shouldBe false - fs.exists(headerPath20180101) shouldBe false - - // executing load - AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() - - // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) - val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) - val expectedDf = dataReader.read(spark, expectedDataLocation) - val actualDf = targetTable.read() - - val producedPartitionsNumber: Dataset[String] = spark - .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") - .as(Encoders.STRING) - - // MetaData Specific Tests - val expectedPartitions: Dataset[String] = expectedPartitionsDataReader - .read(spark, expectedPartitionsLocation) - .as(Encoders.STRING) - - expectedPartitions.collectAsList().asScala.sorted.toSet should - equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) - - actualDf.hasDiff(expectedDf) shouldBe false - - fs.exists(targetPath20180101) shouldBe true - fs.exists(headerPath20180101) shouldBe true - } - } - - -} diff --git a/src/test/scala/com/adidas/analytics/integration/SparkRecoverPartitionsNativeIntegrationTest.scala b/src/test/scala/com/adidas/analytics/integration/SparkRecoverPartitionsNativeIntegrationTest.scala deleted file mode 100644 index d0cd335..0000000 --- a/src/test/scala/com/adidas/analytics/integration/SparkRecoverPartitionsNativeIntegrationTest.scala +++ /dev/null @@ -1,70 +0,0 @@ -package com.adidas.analytics.integration - -import com.adidas.utils.TestUtils._ -import com.adidas.analytics.algo.AppendLoad -import com.adidas.utils.FileReader -import org.apache.hadoop.fs.Path -import org.apache.spark.sql.types.{DataType, StructType} -import org.apache.spark.sql.{Dataset, Encoders} -import org.scalatest.FeatureSpec -import org.scalatest.Matchers._ - -import scala.collection.JavaConverters._ - - -class SparkRecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegrationTest { - - feature("Partitions can be updated with native spark.recoverPartitions()") { - - scenario("Using Append Load Algorithm with multiple source files") { - val testResourceDir = "multiple_source_files" - val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") - val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") - - val targetSchema = 
DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] - val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] - val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) - val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) - - val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) - setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) - prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) - uploadParameters(testResourceDir) - - // checking pre-conditions - spark.read.csv(sourceDirPath.toString).count() shouldBe 7 - targetTable.read().count() shouldBe 19 - - fs.exists(targetPath20180101) shouldBe false - fs.exists(headerPath20180101) shouldBe false - - // executing load - AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() - - // validating result - val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) - val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) - val expectedDf = dataReader.read(spark, expectedDataLocation) - val actualDf = targetTable.read() - - val producedPartitionsNumber: Dataset[String] = spark - .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") - .as(Encoders.STRING) - - // MetaData Specific Tests - val expectedPartitions: Dataset[String] = expectedPartitionsDataReader - .read(spark, expectedPartitionsLocation) - .as(Encoders.STRING) - - expectedPartitions.collectAsList().asScala.sorted.toSet should - equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) - - actualDf.hasDiff(expectedDf) shouldBe false - - fs.exists(targetPath20180101) shouldBe true - fs.exists(headerPath20180101) shouldBe true - } - } - - -} diff --git a/src/test/scala/com/adidas/analytics/unit/DateComponentDerivationTest.scala b/src/test/scala/com/adidas/analytics/unit/DateComponentDerivationTest.scala index a0d71fa..881ee34 100644 --- a/src/test/scala/com/adidas/analytics/unit/DateComponentDerivationTest.scala +++ b/src/test/scala/com/adidas/analytics/unit/DateComponentDerivationTest.scala @@ -4,35 +4,40 @@ import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.shared.DateComponentDerivation import com.adidas.utils.SparkSessionWrapper import org.apache.spark.sql.DataFrame -import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} +import org.scalatest.BeforeAndAfterAll +import org.scalatest.matchers.should.Matchers +import org.scalatest.funsuite.AnyFunSuite -class DateComponentDerivationTest extends FunSuite with SparkSessionWrapper with Matchers with BeforeAndAfterAll { +class DateComponentDerivationTest + extends AnyFunSuite + with SparkSessionWrapper + with Matchers + with BeforeAndAfterAll { class DateComponentDerivationSubClass extends DateComponentDerivation { - def validateWithDateComponents(sourceDateColumnName: String, - sourceDateFormat: String, - targetDateComponentColumnNames: Seq[String])(inputDf: DataFrame): DataFrame = { - super.withDateComponents(sourceDateColumnName, sourceDateFormat, targetDateComponentColumnNames)(inputDf) - } + + def validateWithDateComponents( + sourceDateColumnName: String, + sourceDateFormat: String, + targetDateComponentColumnNames: Seq[String] + )(inputDf: DataFrame): DataFrame = + super + 
.withDateComponents(sourceDateColumnName, sourceDateFormat, targetDateComponentColumnNames)( + inputDf + ) } import spark.implicits._ - override def afterAll(): Unit = { - spark.stop() - } + override def afterAll(): Unit = spark.stop() test("Partition by year/week with formatter yyyyww") { - val sampleDf = Seq( - ("201301"), - ("201531"), - ("202001") - ).toDF("zatpweek") + val sampleDf = Seq(("201301"), ("201531"), ("202001")).toDF("zatpweek") - val dateComponentDerivationTester: DataFrame => DataFrame = new DateComponentDerivationSubClass() - .validateWithDateComponents( + val dateComponentDerivationTester: DataFrame => DataFrame = + new DateComponentDerivationSubClass().validateWithDateComponents( sourceDateColumnName = "zatpweek", sourceDateFormat = "yyyyww", targetDateComponentColumnNames = Seq("year", "week") @@ -40,26 +45,18 @@ class DateComponentDerivationTest extends FunSuite with SparkSessionWrapper with val transformedDf = sampleDf.transform(dateComponentDerivationTester) - val expectedDf = Seq( - ("201301", 2013, 1), - ("201531", 2015, 31), - ("202001", 2020, 1) - ).toDF("zatpweek", "year", "week") + val expectedDf = Seq(("201301", 2013, 1), ("201531", 2015, 31), ("202001", 2020, 1)) + .toDF("zatpweek", "year", "week") transformedDf.hasDiff(expectedDf) shouldBe false } test("Partition by year/month/day with formatter yyyyMMdd") { - val sampleDf = Seq( - ("20130112"), - ("20150815"), - ("20200325"), - ("20180110") - ).toDF("partcol") + val sampleDf = Seq(("20130112"), ("20150815"), ("20200325"), ("20180110")).toDF("partcol") - val dateComponentDerivationTester: DataFrame => DataFrame = new DateComponentDerivationSubClass() - .validateWithDateComponents( + val dateComponentDerivationTester: DataFrame => DataFrame = + new DateComponentDerivationSubClass().validateWithDateComponents( sourceDateColumnName = "partcol", sourceDateFormat = "yyyyMMdd", targetDateComponentColumnNames = Seq("year", "month", "day") @@ -79,14 +76,10 @@ class DateComponentDerivationTest extends FunSuite with SparkSessionWrapper with test("Partition by year/month with formatter yyyyMMdd") { - val sampleDf = Seq( - ("20130112"), - ("20150815"), - ("20200325") - ).toDF("partcol") + val sampleDf = Seq(("20130112"), ("20150815"), ("20200325")).toDF("partcol") - val dateComponentDerivationTester: DataFrame => DataFrame = new DateComponentDerivationSubClass() - .validateWithDateComponents( + val dateComponentDerivationTester: DataFrame => DataFrame = + new DateComponentDerivationSubClass().validateWithDateComponents( sourceDateColumnName = "partcol", sourceDateFormat = "yyyyMMdd", targetDateComponentColumnNames = Seq("year", "month") @@ -94,25 +87,18 @@ class DateComponentDerivationTest extends FunSuite with SparkSessionWrapper with val transformedDf = sampleDf.transform(dateComponentDerivationTester) - val expectedDf = Seq( - ("20130112", 2013, 1), - ("20150815", 2015, 8), - ("20200325", 2020, 3) - ).toDF("partcol", "year", "month") + val expectedDf = Seq(("20130112", 2013, 1), ("20150815", 2015, 8), ("20200325", 2020, 3)) + .toDF("partcol", "year", "month") transformedDf.hasDiff(expectedDf) shouldBe false } test("Partition by year/month with formatter yyyyMMdd - with wrong data") { - val sampleDf = Seq( - ("20130112"), - ("201508151"), - ("20200325") - ).toDF("partcol") + val sampleDf = Seq(("20130112"), ("201508151"), ("20200325")).toDF("partcol") - val dateComponentDerivationTester: DataFrame => DataFrame = new DateComponentDerivationSubClass() - .validateWithDateComponents( + val dateComponentDerivationTester: 
DataFrame => DataFrame = + new DateComponentDerivationSubClass().validateWithDateComponents( sourceDateColumnName = "partcol", sourceDateFormat = "yyyyMMdd", targetDateComponentColumnNames = Seq("year", "month") @@ -120,50 +106,36 @@ class DateComponentDerivationTest extends FunSuite with SparkSessionWrapper with val transformedDf = sampleDf.transform(dateComponentDerivationTester) - val expectedDf = Seq( - ("20130112", 2013, 1), - ("201508151", 9999, 99), - ("20200325", 2020, 3) - ).toDF("partcol", "year", "month") + val expectedDf = Seq(("20130112", 2013, 1), ("201508151", 9999, 99), ("20200325", 2020, 3)) + .toDF("partcol", "year", "month") transformedDf.hasDiff(expectedDf) shouldBe false } test("Partition by year/month with formatter yyyyMM as IntegerType") { - val sampleDf = Seq( - (201301), - (2015233), - (202003) - ).toDF("partcol") + val sampleDf = Seq((201301), (2015233), (202003)).toDF("partcol") - val dateComponentDerivationTester: DataFrame => DataFrame = new DateComponentDerivationSubClass() - .validateWithDateComponents( + val dateComponentDerivationTester: DataFrame => DataFrame = + new DateComponentDerivationSubClass().validateWithDateComponents( sourceDateColumnName = "partcol", sourceDateFormat = "yyyyMM", targetDateComponentColumnNames = Seq("year", "month") ) val transformedDf = sampleDf.transform(dateComponentDerivationTester) - val expectedDf = Seq( - (201301, 2013, 1), - (2015233, 9999, 99), - (202003, 2020, 3) - ).toDF("partcol", "year", "month") + val expectedDf = Seq((201301, 2013, 1), (2015233, 9999, 99), (202003, 2020, 3)) + .toDF("partcol", "year", "month") transformedDf.hasDiff(expectedDf) shouldBe false } test("Partition by year/week/day with formatter yyyywwe as IntegerType") { - val sampleDf = Seq( - (2013014), - (2015233), - (2020037) - ).toDF("partcol") + val sampleDf = Seq((2013014), (2015233), (2020037)).toDF("partcol") - val dateComponentDerivationTester: DataFrame => DataFrame = new DateComponentDerivationSubClass() - .validateWithDateComponents( + val dateComponentDerivationTester: DataFrame => DataFrame = + new DateComponentDerivationSubClass().validateWithDateComponents( sourceDateColumnName = "partcol", sourceDateFormat = "yyyywwe", targetDateComponentColumnNames = Seq("year", "week", "day") @@ -171,11 +143,28 @@ class DateComponentDerivationTest extends FunSuite with SparkSessionWrapper with val transformedDf = sampleDf.transform(dateComponentDerivationTester) - val expectedDf = Seq( - (2013014, 2013, 1, 4), - (2015233, 2015, 23, 3), - (2020037, 2020, 3, 7) - ).toDF("partcol", "year", "week", "day") + val expectedDf = Seq((2013014, 2013, 1, 4), (2015233, 2015, 23, 3), (2020037, 2020, 3, 7)) + .toDF("partcol", "year", "week", "day") + + transformedDf.hasDiff(expectedDf) shouldBe false + } + + test("Partition by year/month/day with formatter MM/dd/yyyy") { + + val sampleDf = Seq(("10/31/2020"), ("05/07/2020"), ("12/15/2020")).toDF("partcol") + + val dateComponentDerivationTester: DataFrame => DataFrame = + new DateComponentDerivationSubClass().validateWithDateComponents( + sourceDateColumnName = "partcol", + sourceDateFormat = "MM/dd/yyyy", + targetDateComponentColumnNames = Seq("year", "month", "day") + ) + + val transformedDf = sampleDf.transform(dateComponentDerivationTester) + + val expectedDf = + Seq(("10/31/2020", 2020, 10, 31), ("05/07/2020", 2020, 5, 7), ("12/15/2020", 2020, 12, 15)) + .toDF("partcol", "year", "month", "day") transformedDf.hasDiff(expectedDf) shouldBe false } diff --git 
a/src/test/scala/com/adidas/analytics/unit/RecoverPartitionsCustomTest.scala b/src/test/scala/com/adidas/analytics/unit/RecoverPartitionsCustomTest.scala index c568ed4..4e7f6b2 100644 --- a/src/test/scala/com/adidas/analytics/unit/RecoverPartitionsCustomTest.scala +++ b/src/test/scala/com/adidas/analytics/unit/RecoverPartitionsCustomTest.scala @@ -5,18 +5,20 @@ import com.adidas.utils.SparkSessionWrapper import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.spark.sql.{Dataset, Row} -import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers, PrivateMethodTester} +import org.scalatest.{BeforeAndAfterAll, PrivateMethodTester} +import org.scalatest.matchers.should.Matchers +import org.scalatest.funsuite.AnyFunSuite -import scala.collection.JavaConverters._ - -class RecoverPartitionsCustomTest extends FunSuite - with SparkSessionWrapper - with PrivateMethodTester - with Matchers - with BeforeAndAfterAll{ +class RecoverPartitionsCustomTest + extends AnyFunSuite + with SparkSessionWrapper + with PrivateMethodTester + with Matchers + with BeforeAndAfterAll { test("test conversion of String Value to HiveQL Partition Parameter") { - val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq()) + val customSparkRecoverPartitions = + RecoverPartitionsCustom(tableName = "", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) val result = customSparkRecoverPartitions invokePrivate createParameterValue("theValue") @@ -24,42 +26,46 @@ class RecoverPartitionsCustomTest extends FunSuite } test("test conversion of Short Value to HiveQL Partition Parameter") { - val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq()) + val customSparkRecoverPartitions = + RecoverPartitionsCustom(tableName = "", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) - val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Short.valueOf("2")) + val result = customSparkRecoverPartitions invokePrivate + createParameterValue(java.lang.Short.valueOf("2")) result should be("2") } test("test conversion of Integer Value to HiveQL Partition Parameter") { - val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq()) + val customSparkRecoverPartitions = + RecoverPartitionsCustom(tableName = "", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) - val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Integer.valueOf("4")) + val result = customSparkRecoverPartitions invokePrivate + createParameterValue(java.lang.Integer.valueOf("4")) result should be("4") } test("test conversion of null Value to HiveQL Partition Parameter") { - val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq()) + val customSparkRecoverPartitions = + RecoverPartitionsCustom(tableName = "", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) - an [Exception] should be thrownBy { + an[Exception] should be thrownBy { customSparkRecoverPartitions invokePrivate createParameterValue(null) } } test("test conversion of not supported Value to HiveQL Partition Parameter") { - val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = 
Seq()) + val customSparkRecoverPartitions = + RecoverPartitionsCustom(tableName = "", targetPartitions = Seq()) val createParameterValue = PrivateMethod[String]('createParameterValue) - an [Exception] should be thrownBy { + an[Exception] should be thrownBy { customSparkRecoverPartitions invokePrivate createParameterValue(false) } } test("test HiveQL statements Generation") { - val customSparkRecoverPartitions = RecoverPartitionsCustom( - tableName="test", - targetPartitions = Seq("country","district") - ) + val customSparkRecoverPartitions = + RecoverPartitionsCustom(tableName = "test", targetPartitions = Seq("country", "district")) val rowsInput = Seq( Row(1, "portugal", "porto"), @@ -85,15 +91,13 @@ class RecoverPartitionsCustomTest extends FunSuite val createParameterValue = PrivateMethod[Dataset[String]]('generateAddPartitionStatements) - val producedStatements: Seq[String] = (customSparkRecoverPartitions invokePrivate createParameterValue(testDataset)) - .collectAsList() - .asScala + val producedStatements: Seq[String] = + (customSparkRecoverPartitions invokePrivate createParameterValue(testDataset)) + .collect() expectedStatements.sorted.toSet should equal(producedStatements.sorted.toSet) } - override def afterAll(): Unit = { - spark.stop() - } + override def afterAll(): Unit = spark.stop() } diff --git a/src/test/scala/com/adidas/analytics/unit/SparkRecoverPartitionsCustomTest.scala b/src/test/scala/com/adidas/analytics/unit/SparkRecoverPartitionsCustomTest.scala deleted file mode 100644 index b23feeb..0000000 --- a/src/test/scala/com/adidas/analytics/unit/SparkRecoverPartitionsCustomTest.scala +++ /dev/null @@ -1,99 +0,0 @@ -package com.adidas.analytics.unit - -import com.adidas.analytics.util.SparkRecoverPartitionsCustom -import com.adidas.utils.SparkSessionWrapper -import org.apache.spark.sql.catalyst.encoders.RowEncoder -import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} -import org.apache.spark.sql.{Dataset, Row} -import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers, PrivateMethodTester} - -import scala.collection.JavaConverters._ - -class SparkRecoverPartitionsCustomTest extends FunSuite - with SparkSessionWrapper - with PrivateMethodTester - with Matchers - with BeforeAndAfterAll{ - - test("test conversion of String Value to HiveQL Partition Parameter") { - val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq()) - val createParameterValue = PrivateMethod[String]('createParameterValue) - val result = customSparkRecoverPartitions invokePrivate createParameterValue("theValue") - - result should be("'theValue'") - } - - test("test conversion of Short Value to HiveQL Partition Parameter") { - val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq()) - val createParameterValue = PrivateMethod[String]('createParameterValue) - val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Short.valueOf("2")) - - result should be("2") - } - - test("test conversion of Integer Value to HiveQL Partition Parameter") { - val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq()) - val createParameterValue = PrivateMethod[String]('createParameterValue) - val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Integer.valueOf("4")) - - result should be("4") - } - - test("test conversion of null Value to HiveQL Partition Parameter") { - val 
customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq()) - val createParameterValue = PrivateMethod[String]('createParameterValue) - an [Exception] should be thrownBy { - customSparkRecoverPartitions invokePrivate createParameterValue(null) - } - } - - test("test conversion of not supported Value to HiveQL Partition Parameter") { - val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq()) - val createParameterValue = PrivateMethod[String]('createParameterValue) - an [Exception] should be thrownBy { - customSparkRecoverPartitions invokePrivate createParameterValue(false) - } - } - - test("test HiveQL statements Generation") { - val customSparkRecoverPartitions = SparkRecoverPartitionsCustom( - tableName="test", - targetPartitions = Seq("country","district") - ) - - val rowsInput = Seq( - Row(1, "portugal", "porto"), - Row(2, "germany", "herzogenaurach"), - Row(3, "portugal", "coimbra") - ) - - val inputSchema = StructType( - List( - StructField("number", IntegerType, nullable = true), - StructField("country", StringType, nullable = true), - StructField("district", StringType, nullable = true) - ) - ) - - val expectedStatements: Seq[String] = Seq( - "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='porto')", - "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='germany',district='herzogenaurach')", - "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='coimbra')" - ) - - val testDataset: Dataset[Row] = spark.createDataset(rowsInput)(RowEncoder(inputSchema)) - - val createParameterValue = PrivateMethod[Dataset[String]]('generateAddPartitionStatements) - - val producedStatements: Seq[String] = (customSparkRecoverPartitions invokePrivate createParameterValue(testDataset)) - .collectAsList() - .asScala - - expectedStatements.sorted.toSet should equal(producedStatements.sorted.toSet) - } - - override def afterAll(): Unit = { - spark.stop() - } - -} diff --git a/src/test/scala/com/adidas/utils/BaseAlgorithmTest.scala b/src/test/scala/com/adidas/utils/BaseAlgorithmTest.scala index 31ca9b5..e0c3041 100644 --- a/src/test/scala/com/adidas/utils/BaseAlgorithmTest.scala +++ b/src/test/scala/com/adidas/utils/BaseAlgorithmTest.scala @@ -1,17 +1,20 @@ package com.adidas.utils import java.util.UUID - import com.adidas.analytics.util.{DFSWrapper, LoadMode} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.StructType import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, Suite} import org.slf4j.{Logger, LoggerFactory} - import scala.io.Source -trait BaseAlgorithmTest extends Suite with BeforeAndAfterAll with BeforeAndAfterEach with HDFSSupport with SparkSupport { +trait BaseAlgorithmTest + extends Suite + with BeforeAndAfterAll + with BeforeAndAfterEach + with HDFSSupport + with SparkSupport { override val logger: Logger = LoggerFactory.getLogger(getClass) override val testAppId: String = UUID.randomUUID().toString @@ -33,24 +36,20 @@ trait BaseAlgorithmTest extends Suite with BeforeAndAfterAll with BeforeAndAfter override def afterEach(): Unit = { spark.sqlContext.clearCache() - spark.sparkContext.getPersistentRDDs.foreach { - case (_, rdd) => rdd.unpersist(true) - } + spark.sparkContext.getPersistentRDDs.foreach { case (_, rdd) => rdd.unpersist(true) } } def resolveResource(fileName: String, withProtocol: Boolean = false): String = { - val resource = s"${getClass.getSimpleName}/$fileName" + val resource = 
+ s"${getClass.getPackage.getName.replace('.', '/')}/${getClass.getSimpleName}Res/$fileName" logger.info(s"Resolving resource $resource") val location = getClass.getClassLoader.getResource(resource).getPath - if (withProtocol) { - s"file://$location" - } else { - location - } + if (withProtocol) s"file://$location" else location } def getResourceAsText(fileName: String): String = { - val resource = s"${getClass.getSimpleName}/$fileName" + val resource = + s"${getClass.getPackage.getName.replace('.', '/')}/${getClass.getSimpleName}Res/$fileName" logger.info(s"Reading resource $resource") val stream = getClass.getClassLoader.getResourceAsStream(resource) Source.fromInputStream(stream).mkString @@ -63,24 +62,35 @@ trait BaseAlgorithmTest extends Suite with BeforeAndAfterAll with BeforeAndAfter fs.copyFromLocalFile(sourcePath, targetPath) } - /* - * Creates (but does not load) a Parquet table for testing purposes - */ - def createParquetTable(database: String, tableName: String, partitionColumns: Option[Seq[String]] = None, schema: StructType): Table = { - val inputTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, s"$database/$tableName")).toString + /* Creates (but does not load) a Parquet table for testing purposes */ + def createParquetTable( + database: String, + tableName: String, + partitionColumns: Option[Seq[String]] = None, + schema: StructType + ): Table = { + val inputTableLocation = + fs.makeQualified(new Path(hdfsRootTestPath, s"$database/$tableName")).toString if (partitionColumns.isEmpty) - Table.newBuilder(tableName, database, inputTableLocation, schema) + Table + .newBuilder(tableName, database, inputTableLocation, schema) .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) else - Table.newBuilder(tableName, database, inputTableLocation, schema) + Table + .newBuilder(tableName, database, inputTableLocation, schema) .withPartitions(partitionColumns.get) .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) } - /* - * Creates and Loads Parquet table for testing purposes - */ - def createAndLoadParquetTable(database: String, tableName: String, partitionColumns: Option[Seq[String]] = None, schema: StructType, filePath: String, reader: FileReader): Table = { + /* Creates and Loads Parquet table for testing purposes */ + def createAndLoadParquetTable( + database: String, + tableName: String, + partitionColumns: Option[Seq[String]] = None, + schema: StructType, + filePath: String, + reader: FileReader + ): Table = { val table = createParquetTable(database, tableName, partitionColumns, schema) val inputTableDataURI = resolveResource(filePath, withProtocol = true) table.write(Seq(inputTableDataURI), reader, LoadMode.OverwritePartitions) diff --git a/src/test/scala/com/adidas/utils/FileReader.scala b/src/test/scala/com/adidas/utils/FileReader.scala index c56ecb2..09d3f79 100644 --- a/src/test/scala/com/adidas/utils/FileReader.scala +++ b/src/test/scala/com/adidas/utils/FileReader.scala @@ -5,44 +5,41 @@ import com.adidas.analytics.util.DataFormat.{DSVFormat, JSONFormat, ParquetForma import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SparkSession} - class FileReader(format: DataFormat, options: Map[String, String]) { def read(spark: SparkSession, location: String, fillNulls: Boolean = false): DataFrame = { val df = format.read(spark.read.options(options), location) - if (fillNulls) { - df.na.fill("") - } else { - df - } + if (fillNulls) df.na.fill("") else df } } - object FileReader { - def 
newDSVFileReader(optionalSchema: Option[StructType] = None, delimiter: Char = '|', header: Boolean = false): FileReader = { - val options = Map("delimiter" -> delimiter.toString, "header" -> header.toString) - if (optionalSchema.isEmpty) { + def newDSVFileReader( + optionalSchema: Option[StructType] = None, + delimiter: Char = '|', + header: Boolean = false, + dateFormat: String = "yyyy-MM-dd" + ): FileReader = { + val options = Map( + "delimiter" -> delimiter.toString, + "header" -> header.toString, + "dateFormat" -> dateFormat + ) + if (optionalSchema.isEmpty) new FileReader(DSVFormat(optionalSchema), options + ("inferSchema" -> "true")) - } else { - new FileReader(DSVFormat(optionalSchema), options) - } + else new FileReader(DSVFormat(optionalSchema), options) } - def newParquetFileReader(): FileReader = { + def newParquetFileReader(): FileReader = new FileReader(ParquetFormat(), Map.empty[String, String]) - } - def newJsonFileReader(optionalSchema: Option[StructType] = None): FileReader = { - new FileReader(JSONFormat(optionalSchema), Map.empty[String, String]) - } + def newJsonFileReader(optionalSchema: Option[StructType] = None): FileReader = + new FileReader(JSONFormat(optionalSchema), Map.empty[String, String]) - def apply(format: DataFormat, options: (String, String)*): FileReader = { + def apply(format: DataFormat, options: (String, String)*): FileReader = new FileReader(format, options.toMap) - } - def apply(format: DataFormat, options: Map[String, String]): FileReader = { + def apply(format: DataFormat, options: Map[String, String]): FileReader = new FileReader(format, options) - } } diff --git a/src/test/scala/com/adidas/utils/HDFSSupport.scala b/src/test/scala/com/adidas/utils/HDFSSupport.scala index 6cb20ad..67eaebe 100644 --- a/src/test/scala/com/adidas/utils/HDFSSupport.scala +++ b/src/test/scala/com/adidas/utils/HDFSSupport.scala @@ -1,7 +1,6 @@ package com.adidas.utils import java.io.File - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{CommonConfigurationKeysPublic, FileSystem} import org.apache.hadoop.hdfs.MiniDFSCluster @@ -27,9 +26,11 @@ trait HDFSSupport { val clusterConf = hadoopConf.fold(new Configuration())(c => new Configuration(c)) clusterConf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsTestDir.getAbsolutePath) - clusterConf.set(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY, s"hdfs://localhost:$defaultPort/") + clusterConf + .set(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY, s"hdfs://localhost:$defaultPort/") - logger.info(s"Starting test DFS cluster with base directory at ${hdfsTestDir.getAbsolutePath} ...") + logger + .info(s"Starting test DFS cluster with base directory at ${hdfsTestDir.getAbsolutePath} ...") new MiniDFSCluster.Builder(clusterConf) .numDataNodes(defaultDataNodesNum) .nameNodePort(defaultPort) diff --git a/src/test/scala/com/adidas/utils/SparkSessionWrapper.scala b/src/test/scala/com/adidas/utils/SparkSessionWrapper.scala index 09de5be..df53e7b 100644 --- a/src/test/scala/com/adidas/utils/SparkSessionWrapper.scala +++ b/src/test/scala/com/adidas/utils/SparkSessionWrapper.scala @@ -3,8 +3,7 @@ package com.adidas.utils import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession - -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters.iterableAsScalaIterableConverter trait SparkSessionWrapper { @@ -14,17 +13,11 @@ trait SparkSessionWrapper { def startSpark(hadoopConf: Option[Configuration]): SparkSession = { - val sparkConf 
= hadoopConf.foldLeft { - new SparkConf(false) - } { (sparkConf, hadoopConf) => - hadoopConf.foldLeft(sparkConf)((sc, entry) => sc.set(entry.getKey, entry.getValue)) + val sparkConf = hadoopConf.foldLeft(new SparkConf(false)) { (sparkConf, hadoopConf) => + hadoopConf.asScala.foldLeft(sparkConf)((sc, entry) => sc.set(entry.getKey, entry.getValue)) } - SparkSession.builder() - .config(sparkConf) - .appName("spark tests") - .master("local[*]") - .getOrCreate() + SparkSession.builder().config(sparkConf).appName("spark tests").master("local[*]").getOrCreate() } } diff --git a/src/test/scala/com/adidas/utils/SparkSupport.scala b/src/test/scala/com/adidas/utils/SparkSupport.scala index 3d0637b..fa0c3c8 100644 --- a/src/test/scala/com/adidas/utils/SparkSupport.scala +++ b/src/test/scala/com/adidas/utils/SparkSupport.scala @@ -1,13 +1,11 @@ package com.adidas.utils import java.io.File - import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.slf4j.Logger - -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters.iterableAsScalaIterableConverter trait SparkSupport extends SparkSessionWrapper { @@ -29,14 +27,18 @@ trait SparkSupport extends SparkSessionWrapper { new SparkConf(false) .set("spark.ui.enabled", "false") .set("spark.sql.warehouse.dir", new File(sparkTestDir, "warehouse").getAbsolutePath) + .set("spark.sql.shuffle.partitions", "8") } { (sparkConf, hadoopConf) => - hadoopConf.foldLeft(sparkConf)((sc, entry) => sc.set(s"spark.hadoop.${entry.getKey}", entry.getValue)) + hadoopConf.asScala.foldLeft(sparkConf)((sc, entry) => + sc.set(s"spark.hadoop.${entry.getKey}", entry.getValue) + ) } System.setProperty("derby.system.home", new File(sparkTestDir, "metastore").getAbsolutePath) - logger.info(s"Staring Spark Session with warehouse dir at ${sparkTestDir.getAbsolutePath} ...") - SparkSession.builder() + logger.info(s"Starting Spark Session with warehouse dir at ${sparkTestDir.getAbsolutePath} ...") + SparkSession + .builder() .config(sparkConf) .appName(s"test-${getClass.getName}") .master("local[*]") @@ -44,14 +46,12 @@ trait SparkSupport extends SparkSessionWrapper { .getOrCreate() } - def addHadoopConfiguration(conf: Configuration): Unit = { - conf.foreach { property => + def addHadoopConfiguration(conf: Configuration): Unit = + conf.asScala.foreach { property => spark.sparkContext.hadoopConfiguration.set(property.getKey, property.getValue) } - } - def addHadoopProperty(key: String, value: String): Unit = { + def addHadoopProperty(key: String, value: String): Unit = spark.sparkContext.hadoopConfiguration.set(key, value) - } } diff --git a/src/test/scala/com/adidas/utils/Table.scala b/src/test/scala/com/adidas/utils/Table.scala index 6651498..8ac457f 100644 --- a/src/test/scala/com/adidas/utils/Table.scala +++ b/src/test/scala/com/adidas/utils/Table.scala @@ -6,14 +6,25 @@ import org.apache.spark.sql._ import org.apache.spark.sql.types.{StructField, StructType} import org.slf4j.{Logger, LoggerFactory} - -case class Table private(dfs: DFSWrapper, spark: SparkSession, table: String, location: String, schema: StructType, targetPartitions: Seq[String], format: DataFormat, options: Map[String, String]) { - - def read(): DataFrame = { - spark.read.table(table) - } - - def write(files: Seq[String], reader: FileReader, loadMode: LoadMode, fillNulls: Boolean = false): Unit = { +case class Table private ( + dfs: DFSWrapper, + spark: SparkSession, + table: String, + location: String, + schema: StructType, + 
targetPartitions: Seq[String], + format: DataFormat, + options: Map[String, String] +) { + + def read(): DataFrame = spark.read.table(table) + + def write( + files: Seq[String], + reader: FileReader, + loadMode: LoadMode, + fillNulls: Boolean = false + ): Unit = { val df = files.map(file => reader.read(spark, file, fillNulls)).reduce(_ union _) createWriter(loadMode).write(dfs, df) invalidateCaches() @@ -40,25 +51,28 @@ case class Table private(dfs: DFSWrapper, spark: SparkSession, table: String, lo invalidateCaches() } - private def invalidateCaches(): Unit = { - if (targetPartitions.nonEmpty) { - spark.catalog.recoverPartitions(table) - } else { - spark.catalog.refreshTable(table) - } - } - - private def createWriter(loadMode: LoadMode): OutputWriter = { - OutputWriter.newFileSystemWriter(location, format, targetPartitions, options + ("emptyValue" -> ""), loadMode) - } + private def invalidateCaches(): Unit = + if (targetPartitions.nonEmpty) spark.catalog.recoverPartitions(table) + else spark.catalog.refreshTable(table) + + private def createWriter(loadMode: LoadMode): OutputWriter = + OutputWriter.newFileSystemWriter( + location, + format, + targetPartitions, + options + ("emptyValue" -> ""), + loadMode + ) } - object Table { - def newBuilder(table: String, database: String, location: String, schema: StructType): TableBuilder = { - new TableBuilder(table, database, location, schema) - } + def newBuilder( + table: String, + database: String, + location: String, + schema: StructType + ): TableBuilder = new TableBuilder(table, database, location, schema) private def mapToRow(rowValues: Map[String, Any], fillNulls: Boolean, schema: StructType): Row = { val record = schema.fields.map { @@ -67,12 +81,10 @@ object Table { Row.fromSeq(record) } - protected def buildColumnDefinitions(fields: Seq[StructField]): String = { - fields.map { - case StructField(name, dataType, _, _) => s"$name ${dataType.typeName}" - }.mkString(", ") - } - + protected def buildColumnDefinitions(fields: Seq[StructField]): String = + fields + .map { case StructField(name, dataType, _, _) => s"$name ${dataType.typeName}" } + .mkString(", ") class TableBuilder(table: String, database: String, location: String, schema: StructType) { @@ -85,7 +97,6 @@ object Table { private val defaultDSVOptions: Map[String, String] = Map("delimiter" -> "|") private var options: Map[String, String] = Map() - def withPartitions(targetPartitions: Seq[String]): TableBuilder = { this.targetPartitions = targetPartitions this @@ -96,20 +107,29 @@ object Table { this } - def buildDSVTable(dfs: DFSWrapper, spark: SparkSession, external: Boolean): Table = { + def buildDSVTable(dfs: DFSWrapper, spark: SparkSession, external: Boolean): Table = buildTable(DSVFormat(Some(schema)), defaultDSVOptions ++ options, dfs, spark, external) - } - def buildParquetTable(dfs: DFSWrapper, spark: SparkSession, external: Boolean): Table = { + def buildParquetTable(dfs: DFSWrapper, spark: SparkSession, external: Boolean): Table = buildTable(ParquetFormat(Some(schema)), options, dfs, spark, external) - } - private def buildTable(format: DataFormat, options: Map[String, String], dfs: DFSWrapper, spark: SparkSession, external: Boolean): Table = { + private def buildTable( + format: DataFormat, + options: Map[String, String], + dfs: DFSWrapper, + spark: SparkSession, + external: Boolean + ): Table = { createHiveTable(format, options, spark, external) new Table(dfs, spark, fullTableName, location, schema, targetPartitions, format, options) } - private def 
createHiveTable(format: DataFormat, options: Map[String, String], spark: SparkSession, external: Boolean): Unit = { + private def createHiveTable( + format: DataFormat, + options: Map[String, String], + spark: SparkSession, + external: Boolean + ): Unit = { val fieldMap = schema.fields.map(f => (f.name, f)).toMap val partitionColumnFields = targetPartitions.map(fieldMap) val columnFields = schema.fields.diff(partitionColumnFields) @@ -119,24 +139,19 @@ object Table { val statementBuilder = Array.newBuilder[String] - if (external) { - statementBuilder += s"CREATE EXTERNAL TABLE $fullTableName($columnDefinitions)" - } else { - statementBuilder += s"CREATE TABLE $fullTableName($columnDefinitions)" - } + if (external) statementBuilder += s"CREATE EXTERNAL TABLE $fullTableName($columnDefinitions)" + else statementBuilder += s"CREATE TABLE $fullTableName($columnDefinitions)" - if (targetPartitions.nonEmpty) { + if (targetPartitions.nonEmpty) statementBuilder += s"PARTITIONED BY ($partitionColumnDefinitions)" - } format match { case _: DataFormat.DSVFormat => val delimiter = options("delimiter") statementBuilder += "ROW FORMAT DELIMITED" statementBuilder += s"FIELDS TERMINATED BY '$delimiter'" - case _: DataFormat.ParquetFormat => - statementBuilder += "STORED AS PARQUET" - case anotherFormat => throw new RuntimeException(s"Unknown file format: $anotherFormat") + case _: DataFormat.ParquetFormat => statementBuilder += "STORED AS PARQUET" + case anotherFormat => throw new RuntimeException(s"Unknown file format: $anotherFormat") } diff --git a/src/test/scala/com/adidas/utils/TestUtils.scala b/src/test/scala/com/adidas/utils/TestUtils.scala index c7ff8d8..b49ba36 100644 --- a/src/test/scala/com/adidas/utils/TestUtils.scala +++ b/src/test/scala/com/adidas/utils/TestUtils.scala @@ -14,7 +14,8 @@ object TestUtils { } val groupedDf = df.groupBy(df.columns.map(col): _*).agg(count(lit(1))).collect().toSet - val groupedAnotherDf = anotherDf.groupBy(anotherDf.columns.map(col): _*).agg(count(lit(1))).collect().toSet + val groupedAnotherDf = + anotherDf.groupBy(anotherDf.columns.map(col): _*).agg(count(lit(1))).collect().toSet groupedDf.diff(groupedAnotherDf).foreach(printDiff(incoming = true)) groupedAnotherDf.diff(groupedDf).foreach(printDiff(incoming = false)) diff --git a/static/images/m3d_logo.png b/static/images/m3d_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..2da69eb7a27f8f1a394f3942efb8bde4d6cdf00d GIT binary patch literal 12043 [binary image data omitted]