Skip to content

Commit

Permalink
introducing new dataframe-csv module; preprocess KDocs is enabled. in…
Browse files Browse the repository at this point in the history
…troduces `read(Csv|Tsv|Delim)(Str)` based on Deephaven, `write(Csv|Tsv|Delim)`, and `to(Csv|Tsv|Delim)Str` based on Apache commons csv. Fixes almost all cases of the umbrella issue and has many tests.
  • Loading branch information
Jolanrensen committed Nov 5, 2024
1 parent e26e51a commit 68c8210
Show file tree
Hide file tree
Showing 66 changed files with 98,284 additions and 33 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/generated-sources-master.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
run: |
git config --global user.name 'github-actions[bot]'
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
git add './core/generated-sources' './docs/StardustDocs/snippets' './docs/StardustDocs/topics'
git add './core/generated-sources' './dataframe-csv/generated-sources' './docs/StardustDocs/snippets' './docs/StardustDocs/topics'
git diff --staged --quiet || git commit -m "Automated commit of generated code"
git push
env:
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/generated-sources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,18 +38,18 @@ jobs:
git config --global user.name "GitHub Actions"
- name: Run Gradle task
run: ./gradlew :core:processKDocsMain korro
run: ./gradlew processKDocsMain korro

- name: Check for changes in generated sources
id: git-diff
run: echo "changed=$(if git diff --quiet './core/generated-sources' './docs/StardustDocs/snippets' './docs/StardustDocs/topics'; then echo 'false'; else echo 'true'; fi)" >> $GITHUB_OUTPUT
run: echo "changed=$(if git diff --quiet './core/generated-sources' './dataframe-csv/generated-sources' './docs/StardustDocs/snippets' './docs/StardustDocs/topics'; then echo 'false'; else echo 'true'; fi)" >> $GITHUB_OUTPUT

- name: Commit and push if changes
id: git-commit
if: steps.git-diff.outputs.changed == 'true'
run: |
git checkout -b generated-sources/docs-update-${{ github.run_number }}
git add './core/generated-sources' './docs/StardustDocs/snippets' './docs/StardustDocs/topics'
git add './core/generated-sources' './dataframe-csv/generated-sources' './docs/StardustDocs/snippets' './docs/StardustDocs/topics'
git commit -m "Update generated sources with recent changes"
git push origin generated-sources/docs-update-${{ github.run_number }}
echo "commit=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT
Expand Down
3 changes: 3 additions & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,15 @@ dependencies {
api(project(":dataframe-excel"))
api(project(":dataframe-openapi"))
api(project(":dataframe-jdbc"))
// TODO enable when it leaves the experimental phase
// api(project(":dataframe-csv"))

kover(project(":core"))
kover(project(":dataframe-arrow"))
kover(project(":dataframe-excel"))
kover(project(":dataframe-openapi"))
kover(project(":dataframe-jdbc"))
kover(project(":dataframe-csv"))
kover(project(":plugins:kotlin-dataframe"))
}

Expand Down
9 changes: 9 additions & 0 deletions core/api/core.api
Original file line number Diff line number Diff line change
Expand Up @@ -10289,6 +10289,8 @@ public final class org/jetbrains/kotlinx/dataframe/io/ColType : java/lang/Enum {
public static final field BigDecimal Lorg/jetbrains/kotlinx/dataframe/io/ColType;
public static final field Boolean Lorg/jetbrains/kotlinx/dataframe/io/ColType;
public static final field Char Lorg/jetbrains/kotlinx/dataframe/io/ColType;
public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/ColType$Companion;
public static final field DEFAULT Ljava/lang/String;
public static final field Double Lorg/jetbrains/kotlinx/dataframe/io/ColType;
public static final field Duration Lorg/jetbrains/kotlinx/dataframe/io/ColType;
public static final field Instant Lorg/jetbrains/kotlinx/dataframe/io/ColType;
Expand All @@ -10306,11 +10308,18 @@ public final class org/jetbrains/kotlinx/dataframe/io/ColType : java/lang/Enum {
public static fun values ()[Lorg/jetbrains/kotlinx/dataframe/io/ColType;
}

public final class org/jetbrains/kotlinx/dataframe/io/ColType$Companion {
}

public final class org/jetbrains/kotlinx/dataframe/io/CommonKt {
public static final fun asFileOrNull (Ljava/net/URL;)Ljava/io/File;
public static final fun asUrl (Ljava/lang/String;)Ljava/net/URL;
public static final fun catchHttpResponse (Ljava/net/URL;Lkotlin/jvm/functions/Function1;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun isFile (Ljava/net/URL;)Z
public static final fun isProtocolSupported (Ljava/net/URL;)Z
public static final fun isURL (Ljava/lang/String;)Z
public static final fun isUrl (Ljava/lang/String;)Z
public static final fun skippingBomCharacters (Ljava/io/InputStream;)Ljava/io/InputStream;
public static final fun toDataFrame (Ljava/util/List;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun toDataFrame$default (Ljava/util/List;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun urlAsFile (Ljava/net/URL;)Ljava/io/File;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ import org.jetbrains.kotlinx.dataframe.impl.catchSilent
import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType
import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser
import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse
import org.jetbrains.kotlinx.dataframe.io.isURL
import org.jetbrains.kotlinx.dataframe.io.isUrl
import org.jetbrains.kotlinx.dataframe.io.readJsonStr
import org.jetbrains.kotlinx.dataframe.values
import java.math.BigDecimal
Expand Down Expand Up @@ -210,7 +210,7 @@ internal object Parsers : GlobalParserOptions {
toJavaLocalDateTimeOrNull(formatter) // since we accept a Java DateTimeFormatter
?.toKotlinLocalDateTime()

private fun String.toUrlOrNull(): URL? = if (isURL(this)) catchSilent { URL(this) } else null
private fun String.toUrlOrNull(): URL? = if (isUrl(this)) catchSilent { URL(this) } else null

private fun String.toBooleanOrNull() =
when (uppercase(Locale.getDefault())) {
Expand Down
70 changes: 58 additions & 12 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/common.kt
Original file line number Diff line number Diff line change
@@ -1,28 +1,46 @@
package org.jetbrains.kotlinx.dataframe.io

import com.github.kittinunf.fuel.httpGet
import org.apache.commons.io.input.BOMInputStream
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
import org.jetbrains.kotlinx.dataframe.util.IS_URL
import org.jetbrains.kotlinx.dataframe.util.IS_URL_IMPORT
import org.jetbrains.kotlinx.dataframe.util.IS_URL_REPLACE
import java.io.File
import java.io.IOException
import java.io.InputStream
import java.net.HttpURLConnection
import java.net.URL

internal fun catchHttpResponse(url: URL, body: (InputStream) -> AnyFrame): AnyFrame {
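/**
* Opens a connection to [url] and creates a [DataFrame] from it using [body].
* If the connection is not an HTTP connection (for example a file URL), its stream is passed to [body] directly.
* If it is an HTTP connection and the server answers with status code 200, the response stream is passed to [body].
* For any other status code, the error response is read as JSON and parsed as a [DataFrame] instead;
* if that fails too, a [RuntimeException] containing the status code and response message is thrown.
*
* Public so it may be used in other modules.
*/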
public fun catchHttpResponse(url: URL, body: (InputStream) -> AnyFrame): AnyFrame {
val connection = url.openConnection()
if (connection !is HttpURLConnection) {
return connection.inputStream.use(body)
}
try {
return url.openStream().use(body)
} catch (e: IOException) {
if (e.message?.startsWith("Server returned HTTP response code") == true) {
val (_, response, _) = url.toString().httpGet().responseString()
connection.connect()
val code = connection.responseCode
if (code != 200) {
val response = connection.responseMessage
try {
return DataFrame.readJsonStr(response.data.decodeToString())
} catch (e2: Exception) {
throw e
// attempt to read error response as JSON
return DataFrame.readJson(connection.errorStream)
} catch (_: Exception) {
throw RuntimeException("Server returned HTTP response code: $code. Response: $response")
}
}
throw e
return connection.inputStream.use(body)
} finally {
connection.disconnect()
}
}

Expand Down Expand Up @@ -55,7 +73,14 @@ public fun <T> List<List<T>>.toDataFrame(containsColumns: Boolean = false): AnyF
}
}

public fun isURL(path: String): Boolean = listOf("http:", "https:", "ftp:").any { path.startsWith(it) }
/** Deprecated spelling of [isUrl]; forwards [path] to [isUrl] and returns its result. */
@Deprecated(
message = IS_URL,
replaceWith = ReplaceWith(IS_URL_REPLACE, IS_URL_IMPORT),
level = DeprecationLevel.WARNING,
)
public fun isURL(path: String): Boolean = isUrl(path)

/**
 * Tells whether [path] begins with one of the prefixes `http:`, `https:`, or `ftp:`.
 * The comparison is case-sensitive.
 */
public fun isUrl(path: String): Boolean {
    val schemePrefixes = listOf("http:", "https:", "ftp:")
    return schemePrefixes.any { prefix -> path.startsWith(prefix) }
}

/** Tells whether the protocol of [url] is `file`. */
public fun isFile(url: URL): Boolean {
    val scheme = url.protocol
    return scheme == "file"
}

Expand All @@ -64,3 +89,24 @@ public fun asFileOrNull(url: URL): File? = if (isFile(url)) File(url.path) else
/** Converts [url] to a [File] by way of its [java.net.URI] representation. */
public fun urlAsFile(url: URL): File {
    val location = url.toURI()
    return File(location)
}

/** Tells whether the protocol of [url] is one of `http`, `https`, or `ftp`. */
public fun isProtocolSupported(url: URL): Boolean =
    when (url.protocol) {
        "http", "https", "ftp" -> true
        else -> false
    }

/**
 * Resolves [fileOrUrl] to a [URL].
 *
 * Strings accepted by [isUrl] are parsed as URLs. Anything else is treated as a file path,
 * which must point to something that exists and is a file (so not, for instance, a directory).
 *
 * @throws IllegalArgumentException if [fileOrUrl] is treated as a file path and it doesn't exist or isn't a file.
 */
public fun asUrl(fileOrUrl: String): URL {
    val uri = if (isUrl(fileOrUrl)) {
        // Going through URI mirrors the file branch, so both end in the same `toURL()` call.
        URL(fileOrUrl).toURI()
    } else {
        val file = File(fileOrUrl)
        require(file.exists()) { "File not found: \"$fileOrUrl\"" }
        require(file.isFile) { "Not a file: \"$fileOrUrl\"" }
        file.toURI()
    }
    return uri.toURL()
}

/**
 * Wraps this stream in a [BOMInputStream] built with `include = false`,
 * so that BOM characters, if present, are skipped when reading.
 */
public fun InputStream.skippingBomCharacters(): InputStream =
    BOMInputStream.builder().run {
        setInputStream(this@skippingBomCharacters)
        setInclude(false)
        get()
    }
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ public fun DataRow.Companion.read(url: URL, header: List<String> = emptyList()):
@OptInRefine
@Interpretable("Read0")
public fun DataFrame.Companion.read(path: String, header: List<String> = emptyList()): AnyFrame =
read(asURL(path), header)
read(asUrl(path), header)

public fun DataRow.Companion.read(path: String, header: List<String> = emptyList()): AnyRow =
DataFrame.read(path, header).single()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ public fun DataFrame.Companion.readJson(
header: List<String> = emptyList(),
keyValuePaths: List<JsonPath> = emptyList(),
typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS,
): AnyFrame = readJson(asURL(path), header, keyValuePaths, typeClashTactic)
): AnyFrame = readJson(asUrl(path), header, keyValuePaths, typeClashTactic)

/**
* @param path URL or file path from where to fetch the Json as [InputStream] to be converted to a [DataRow].
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public fun DataFrame.Companion.readTSV(
charset: Charset = Charsets.UTF_8,
parserOptions: ParserOptions? = null,
): DataFrame<*> =
catchHttpResponse(asURL(fileOrUrl)) {
catchHttpResponse(asUrl(fileOrUrl)) {
readDelim(
it,
TAB_CHAR,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,14 @@ internal const val PARSER_OPTIONS = "This constructor is only here for binary co

internal const val PARSER_OPTIONS_COPY = "This function is only here for binary compatibility. $MESSAGE_0_16"

internal const val AS_URL = "This function is replaced by `asUrl()`. $MESSAGE_0_16"
internal const val AS_URL_REPLACE = "asUrl(fileOrUrl)"
internal const val AS_URL_IMPORT = "org.jetbrains.kotlinx.dataframe.io.asUrl"

internal const val IS_URL = "This function is replaced by `isUrl()`. $MESSAGE_0_16"
internal const val IS_URL_REPLACE = "isUrl(path)"
internal const val IS_URL_IMPORT = "org.jetbrains.kotlinx.dataframe.io.isUrl"

// endregion

// region WARNING in 0.16, ERROR in 0.17
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ public fun DataFrame.Companion.readArrowIPC(
path: String,
nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame =
if (isURL(path)) {
if (isUrl(path)) {
readArrowIPC(URL(path), nullability)
} else {
readArrowIPC(File(path), nullability)
Expand Down Expand Up @@ -165,7 +165,7 @@ public fun DataFrame.Companion.readArrowFeather(
path: String,
nullability: NullabilityOptions = NullabilityOptions.Infer,
): AnyFrame =
if (isURL(path)) {
if (isUrl(path)) {
readArrowFeather(URL(path), nullability)
} else {
readArrowFeather(File(path), nullability)
Expand Down
Loading

0 comments on commit 68c8210

Please sign in to comment.