-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: CLIN-3202 Add renameCsvFile util function (#240)
- Loading branch information
1 parent
1f6fbe1
commit 2d96fb6
Showing
4 changed files
with
161 additions
and
2 deletions.
There are no files selected for viewing
58 changes: 58 additions & 0 deletions
58
datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/utils/CsvUtils.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
package bio.ferlab.datalake.spark3.utils | ||
|
||
import bio.ferlab.datalake.commons.config.Format.CSV | ||
import bio.ferlab.datalake.commons.config.{Coalesce, Configuration, DatasetConf, FixedRepartition} | ||
import bio.ferlab.datalake.commons.file.FileSystemResolver | ||
import bio.ferlab.datalake.spark3.implicits.DatasetConfImplicits.DatasetConfOperations | ||
import org.apache.spark.sql.{DataFrame, SparkSession} | ||
|
||
object CsvUtils {

  /**
   * Renames the CSV output file if the destination format is CSV and data is repartitioned into a single file.
   *
   * When writing to CSV format, Spark adds the partition information to the filename. This function replaces the
   * partition info with the table name. It also deletes the unnecessary `_SUCCESS` file created by Spark, when
   * one is present.
   *
   * @param mainDestination The mainDestination [[DatasetConf]] of the ETL
   * @param suffix          Optional, adds a suffix to the file name, before the extension
   * @param spark           An instance of [[SparkSession]]
   * @param conf            The ETL [[Configuration]]
   * @return The renamed CSV loaded as a DataFrame
   * @example
   * This function would rename this CSV file :
   * {{{
   * published/nom_du_projet/nom_de_la_table/part-00000-3afd3298-a186-4289-8ba3-3bf55d27953f-c000.csv
   * }}}
   * to :
   * {{{
   * published/nom_du_projet/nom_de_la_table/nom_de_la_table_suffix.csv
   * }}}
   * where suffix could be : `v1_0_0`, `2020_01_01`, etc.
   */
  def renameCsvFile(mainDestination: DatasetConf, suffix: Option[String] = None)
                   (implicit spark: SparkSession, conf: Configuration): DataFrame = {
    // Only a single-file CSV output can be renamed meaningfully; anything else is left untouched.
    val isSingleFileCsv = mainDestination.format == CSV && (mainDestination.repartition match {
      case Some(FixedRepartition(1)) | Some(Coalesce(1)) => true
      case _ => false
    })

    if (isSingleFileCsv) {
      val fs = FileSystemResolver.resolve(conf.getStorage(mainDestination.storageid).filesystem)
      val files = fs.list(mainDestination.location, recursive = false)

      // Target name comes from the last segment of the dataset path, e.g. ".../nom_de_la_table" -> "nom_de_la_table.csv"
      val tableName = mainDestination.path.split("/").last
      val newPath = mainDestination.location + "/" + tableName + suffix.map("_" + _).getOrElse("") + ".csv"

      // `find(...).foreach(...)` instead of `filter(...).head`: the original threw
      // NoSuchElementException when no part file was produced by the write.
      files.find(_.name.startsWith("part-")).foreach(csvFile => fs.move(csvFile.path, newPath, overwrite = true))

      // The `_SUCCESS` marker is committer-dependent and may be absent; remove it only when present.
      // (The original resolved it with `.head` BEFORE the rename, so a missing marker aborted everything.)
      files.find(_.name == "_SUCCESS").foreach(successFile => fs.remove(successFile.path))
    }
    mainDestination.read
  }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
87 changes: 87 additions & 0 deletions
87
datalake-spark3/src/test/scala/bio/ferlab/datalake/spark3/utils/CsvUtilsSpec.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
package bio.ferlab.datalake.spark3.utils | ||
|
||
import bio.ferlab.datalake.commons.config.{DatasetConf, FixedRepartition, SimpleConfiguration} | ||
import bio.ferlab.datalake.commons.config.Format.{CSV, DELTA} | ||
import bio.ferlab.datalake.commons.config.LoadType.{Insert, OverWrite, OverWritePartition} | ||
import bio.ferlab.datalake.commons.file.FileSystemResolver | ||
import bio.ferlab.datalake.spark3.loader.LoadResolver | ||
import bio.ferlab.datalake.spark3.testutils.{AirportInput, WithTestConfig} | ||
import bio.ferlab.datalake.testutils.SparkSpec | ||
import org.apache.spark.sql.types.DateType | ||
|
||
/**
 * Integration tests for [[CsvUtils.renameCsvFile]], exercised against a real local file system
 * through the project's Spark test harness.
 */
class CsvUtilsSpec extends SparkSpec with WithTestConfig {
  import spark.implicits._

  // Destination configured as a single-file CSV output (FixedRepartition(1)) so that
  // renameCsvFile's rename branch is actually triggered.
  val destinationDs: DatasetConf = DatasetConf("destination", alias, "/destination", CSV, OverWrite, repartition = Some(FixedRepartition(1)), readoptions = Map("inferSchema" -> "true", "header" -> "true"), writeoptions = Map("header" -> "true"))
  override lazy implicit val conf: SimpleConfiguration = sc.copy(datalake = sc.datalake.copy(sources = List(destinationDs)))

  // Two-row fixture; small on purpose to keep the round-trip through disk fast.
  val inputData = Seq(
    AirportInput("1", "YYC", "Calgary Int airport"),
    AirportInput("2", "YUL", "Montreal Int airport")
  )

  "renameCsvFile" should "rename file if the format is CSV and data is repartitioned into a single file" in {
    withOutputFolder("root") { root =>
      // Point the configured storages at a temp folder so the test writes under `root`.
      val updatedConf = updateConfStorages(conf, root)

      // No suffix passed below, so the expected name is just "<table>.csv".
      val fileName = "destination.csv"
      LoadResolver
        .write(spark, updatedConf)(destinationDs.format -> destinationDs.loadtype)
        .apply(destinationDs, destinationDs.repartition.get.repartition(inputData.toDF()))

      // The returned DataFrame must still contain the written data after the rename.
      val resultDf = CsvUtils.renameCsvFile(destinationDs)(spark, updatedConf)
      resultDf
        .as[AirportInput]
        .collect() should contain theSameElementsAs inputData

      val files = FileSystemResolver
        .resolve(updatedConf.getStorage(destinationDs.storageid).filesystem)
        .list(destinationDs.location(updatedConf), recursive = true)

      // Exactly one file remains: the renamed CSV (part file renamed, _SUCCESS removed).
      files.size shouldBe 1
      files.head.name shouldBe fileName
    }
  }

  "renameCsvFile" should "preserve the existing files in the destination if load type is insert" in {
    withOutputFolder("root") { root =>
      val updatedConf = updateConfStorages(conf, root)

      // Insert load type appends: a second write must not clobber the first renamed file.
      val insertDestinationDs = destinationDs.copy(loadtype = Insert)
      val fs = FileSystemResolver.resolve(updatedConf.getStorage(insertDestinationDs.storageid).filesystem)
      val load = LoadResolver.write(spark, updatedConf)(insertDestinationDs.format -> insertDestinationDs.loadtype)

      // Existing data
      val existingData = inputData
      load(insertDestinationDs, insertDestinationDs.repartition.get.repartition(existingData.toDF()))

      // First rename uses suffix "v1" -> "destination_v1.csv".
      val existingDf = CsvUtils.renameCsvFile(insertDestinationDs, suffix = Some("v1"))(spark, updatedConf)
      existingDf
        .as[AirportInput]
        .collect() should contain theSameElementsAs existingData

      val existingFiles = fs.list(insertDestinationDs.location(updatedConf), recursive = true)
      existingFiles.size shouldBe 1
      existingFiles.head.name shouldBe "destination_v1.csv"

      // New data
      val newData = Seq(
        AirportInput("3", "YYZ", "Toronto Int airport"),
        AirportInput("4", "YVR", "Vancouver Int airport")
      )
      load(insertDestinationDs, insertDestinationDs.repartition.get.repartition(newData.toDF()))

      // Second rename uses suffix "v2"; reading the dataset must now return both batches.
      val newDf = CsvUtils.renameCsvFile(insertDestinationDs, suffix = Some("v2"))(spark, updatedConf)
      newDf
        .as[AirportInput]
        .collect() should contain theSameElementsAs existingData ++ newData

      // Both suffixed files coexist: the v1 file was preserved by the insert + second rename.
      val newFiles = fs.list(insertDestinationDs.location(updatedConf), recursive = true)
      newFiles.size shouldBe 2
      newFiles.count(_.name === "destination_v1.csv") shouldBe 1
      newFiles.count(_.name === "destination_v2.csv") shouldBe 1
    }
  }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters