Skip to content

Commit

Permalink
Support read unstructured excel file
Browse files Browse the repository at this point in the history
  • Loading branch information
hare committed Oct 1, 2024
1 parent cab218c commit 2b3361f
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,11 @@ private fun setWorkbookTempDirectory() {
* @param skipRows number of rows before header
* @param rowsCount number of rows to read.
* @param nameRepairStrategy handling of column names.
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
* @param withDefaultHeader if `true`, the sheet is read as unstructured data without a header row,
* and columns get default Excel-style names (A, B, C, ...).
* Note on [nameRepairStrategy]: its default is [NameRepairStrategy.CHECK_UNIQUE],
* but when [withDefaultHeader] is `true` it is overridden with [NameRepairStrategy.MAKE_UNIQUE],
* ensuring unique column names are generated for unstructured data.
*/
public fun DataFrame.Companion.readExcel(
url: URL,
Expand All @@ -103,11 +107,12 @@ public fun DataFrame.Companion.readExcel(
stringColumns: StringColumns? = null,
rowsCount: Int? = null,
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
withDefaultHeader: Boolean = false,
): AnyFrame {
setWorkbookTempDirectory()
val wb = WorkbookFactory.create(url.openStream())
return wb.use {
readExcel(wb, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
readExcel(wb, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy, withDefaultHeader)
}
}

Expand All @@ -119,7 +124,11 @@ public fun DataFrame.Companion.readExcel(
* @param skipRows number of rows before header
* @param rowsCount number of rows to read.
* @param nameRepairStrategy handling of column names.
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
* @param withDefaultHeader if `true`, the sheet is read as unstructured data without a header row,
* and columns get default Excel-style names (A, B, C, ...).
* Note on [nameRepairStrategy]: its default is [NameRepairStrategy.CHECK_UNIQUE],
* but when [withDefaultHeader] is `true` it is overridden with [NameRepairStrategy.MAKE_UNIQUE],
* ensuring unique column names are generated for unstructured data.
*/
public fun DataFrame.Companion.readExcel(
file: File,
Expand All @@ -129,11 +138,12 @@ public fun DataFrame.Companion.readExcel(
stringColumns: StringColumns? = null,
rowsCount: Int? = null,
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
withDefaultHeader: Boolean = false,
): AnyFrame {
setWorkbookTempDirectory()
val wb = WorkbookFactory.create(file)
return wb.use {
readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy, withDefaultHeader)
}
}

Expand All @@ -145,7 +155,11 @@ public fun DataFrame.Companion.readExcel(
* @param skipRows number of rows before header
* @param rowsCount number of rows to read.
* @param nameRepairStrategy handling of column names.
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
* @param withDefaultHeader if `true`, the sheet is read as unstructured data without a header row,
* and columns get default Excel-style names (A, B, C, ...).
* Note on [nameRepairStrategy]: its default is [NameRepairStrategy.CHECK_UNIQUE],
* but when [withDefaultHeader] is `true` it is overridden with [NameRepairStrategy.MAKE_UNIQUE],
* ensuring unique column names are generated for unstructured data.
*/
@Refine
@Interpretable("ReadExcel")
Expand All @@ -157,7 +171,8 @@ public fun DataFrame.Companion.readExcel(
stringColumns: StringColumns? = null,
rowsCount: Int? = null,
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, stringColumns, rowsCount, nameRepairStrategy)
withDefaultHeader: Boolean = false,
): AnyFrame = readExcel(asURL(fileOrUrl), sheetName, skipRows, columns, stringColumns, rowsCount, nameRepairStrategy, withDefaultHeader)

/**
* @param sheetName sheet to read. By default, the first sheet in the document
Expand All @@ -167,7 +182,11 @@ public fun DataFrame.Companion.readExcel(
* @param skipRows number of rows before header
* @param rowsCount number of rows to read.
* @param nameRepairStrategy handling of column names.
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
* @param withDefaultHeader if `true`, the sheet is read as unstructured data without a header row,
* and columns get default Excel-style names (A, B, C, ...).
* Note on [nameRepairStrategy]: its default is [NameRepairStrategy.CHECK_UNIQUE],
* but when [withDefaultHeader] is `true` it is overridden with [NameRepairStrategy.MAKE_UNIQUE],
* ensuring unique column names are generated for unstructured data.
*/
public fun DataFrame.Companion.readExcel(
inputStream: InputStream,
Expand All @@ -177,11 +196,12 @@ public fun DataFrame.Companion.readExcel(
stringColumns: StringColumns? = null,
rowsCount: Int? = null,
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
withDefaultHeader: Boolean = false,
): AnyFrame {
setWorkbookTempDirectory()
val wb = WorkbookFactory.create(inputStream)
return wb.use {
readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy)
readExcel(it, sheetName, skipRows, columns, stringColumns?.toFormattingOptions(), rowsCount, nameRepairStrategy, withDefaultHeader)
}
}

Expand All @@ -194,7 +214,11 @@ public fun DataFrame.Companion.readExcel(
* @param skipRows number of rows before header
* @param rowsCount number of rows to read.
* @param nameRepairStrategy handling of column names.
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
* @param withDefaultHeader if `true`, the sheet is read as unstructured data without a header row,
* and columns get default Excel-style names (A, B, C, ...).
* Note on [nameRepairStrategy]: its default is [NameRepairStrategy.CHECK_UNIQUE],
* but when [withDefaultHeader] is `true` it is overridden with [NameRepairStrategy.MAKE_UNIQUE],
* ensuring unique column names are generated for unstructured data.
*/
public fun DataFrame.Companion.readExcel(
wb: Workbook,
Expand All @@ -204,11 +228,12 @@ public fun DataFrame.Companion.readExcel(
formattingOptions: FormattingOptions? = null,
rowsCount: Int? = null,
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
withDefaultHeader: Boolean = false,
): AnyFrame {
val sheet: Sheet = sheetName
?.let { wb.getSheet(it) ?: error("Sheet with name $sheetName not found") }
?: wb.getSheetAt(0)
return readExcel(sheet, columns, formattingOptions, skipRows, rowsCount, nameRepairStrategy)
return readExcel(sheet, columns, formattingOptions, skipRows, rowsCount, nameRepairStrategy, withDefaultHeader)
}

/**
Expand Down Expand Up @@ -239,7 +264,11 @@ public class FormattingOptions(range: String, public val formatter: DataFormatte
* @param skipRows number of rows before header
* @param rowsCount number of rows to read.
* @param nameRepairStrategy handling of column names.
* The default behavior is [NameRepairStrategy.CHECK_UNIQUE]
* @param withDefaultHeader if `true`, the sheet is read as unstructured data without a header row,
* and columns get default Excel-style names (A, B, C, ...).
* Note on [nameRepairStrategy]: its default is [NameRepairStrategy.CHECK_UNIQUE],
* but when [withDefaultHeader] is `true` it is overridden with [NameRepairStrategy.MAKE_UNIQUE],
* ensuring unique column names are generated for unstructured data.
*/
public fun DataFrame.Companion.readExcel(
sheet: Sheet,
Expand All @@ -248,21 +277,36 @@ public fun DataFrame.Companion.readExcel(
skipRows: Int = 0,
rowsCount: Int? = null,
nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
withDefaultHeader: Boolean = false,
): AnyFrame {
val columnIndexes: Iterable<Int> = if (columns != null) {
getColumnIndices(columns)
} else {
val headerRow = checkNotNull(sheet.getRow(skipRows)) {
"Row number ${skipRows + 1} (1-based index) is not defined on the sheet ${sheet.sheetName}"
val columnIndexes: Iterable<Int> = when{
withDefaultHeader -> {
val notEmptyRow = sheet.rowIterator().asSequence().find { it != null }
checkNotNull(notEmptyRow){
"There are no defined cells"
}
notEmptyRow.firstCellNum until notEmptyRow.lastCellNum
}
val firstCellNum = headerRow.firstCellNum
check(firstCellNum != (-1).toShort()) {
"There are no defined cells on header row number ${skipRows + 1} (1-based index). Pass `columns` argument to specify what columns to read or make sure the index is correct"
columns != null -> getColumnIndices(columns)
else -> {
val headerRow = checkNotNull(sheet.getRow(skipRows)) {
"Row number ${skipRows + 1} (1-based index) is not defined on the sheet ${sheet.sheetName}"
}
val firstCellNum = headerRow.firstCellNum
check(firstCellNum != (-1).toShort()) {
"There are no defined cells on header row number ${skipRows + 1} (1-based index). Pass `columns` argument to specify what columns to read or make sure the index is correct"
}
headerRow.firstCellNum until headerRow.lastCellNum
}
headerRow.firstCellNum until headerRow.lastCellNum
}

val headerRow: Row? = sheet.getRow(skipRows)
val headerRow: Row? = if(withDefaultHeader){
sheet.shiftRows(0, sheet.lastRowNum, 1)
sheet.createRow(0)
}else{
sheet.getRow(skipRows)
}

val first = skipRows + 1
val last = rowsCount?.let { first + it - 1 } ?: sheet.lastRowNum
val valueRowsRange = (first..last)
Expand All @@ -277,7 +321,11 @@ public fun DataFrame.Companion.readExcel(
?: CellReference.convertNumToColString(index) // Use Excel column names if no data
}

val name = repairNameIfRequired(nameFromCell, columnNameCounters, nameRepairStrategy)
val name = repairNameIfRequired(
nameFromCell,
columnNameCounters,
if (withDefaultHeader) NameRepairStrategy.MAKE_UNIQUE else nameRepairStrategy
)
columnNameCounters[nameFromCell] =
columnNameCounters.getOrDefault(nameFromCell, 0) + 1 // increase the counter for specific column name
val getCellValue: (Cell?) -> Any? = when {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,4 +199,45 @@ class XlsxTest {
df["col1"].type() shouldBe typeOf<String>()
df shouldBe dataFrameOf("col1")("100", "A100", "B100", "C100")
}

@Test
fun `read with default header unstructured excel file`() {
    // With withDefaultHeader = true the columns are expected to be named
    // with the Excel-style letters A through I.
    val expectedNames = ('A'..'I').map { it.toString() }
    val source = testResource("unstructured_example.xlsx")

    val df = DataFrame.readExcel(source, withDefaultHeader = true)

    df.columnNames() shouldBe expectedNames
}

@Test
fun `should work read with default header unstructured excel file with skipRow params`() {
    // Passing skipRows alongside withDefaultHeader is expected to leave the
    // generated column names unchanged: still the letters A through I.
    val expectedNames = ('A'..'I').map { it.toString() }
    val source = testResource("unstructured_example.xlsx")

    val df = DataFrame.readExcel(
        source,
        withDefaultHeader = true,
        skipRows = 1,
    )

    df.columnNames() shouldBe expectedNames
}
}
Binary file not shown.

0 comments on commit 2b3361f

Please sign in to comment.