From 77dfa5ae7c8217f3b41c0134584c544df73e1ba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20D=C4=99bowczyk?= Date: Thu, 31 Aug 2023 16:43:34 +0200 Subject: [PATCH 01/34] [runners-flink] #28258 Wait infinitely if the duration value is less than 1ms in FlinkDetachedRunnerResult --- .../apache/beam/runners/flink/FlinkDetachedRunnerResult.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkDetachedRunnerResult.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkDetachedRunnerResult.java index ddf8a36a4f4ab..77d0e7d3434ca 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkDetachedRunnerResult.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/FlinkDetachedRunnerResult.java @@ -92,7 +92,7 @@ public State waitUntilFinish(Duration duration) { long start = System.currentTimeMillis(); long durationInMillis = duration.getMillis(); State state = State.UNKNOWN; - while ((System.currentTimeMillis() - start) < durationInMillis) { + while (durationInMillis < 1 || (System.currentTimeMillis() - start) < durationInMillis) { state = getState(); if (state.isTerminal()) { return state; From 4184f5ea521941fcba6b90769b2d7f35b7262471 Mon Sep 17 00:00:00 2001 From: Ritesh Ghorse Date: Tue, 19 Sep 2023 22:40:15 -0400 Subject: [PATCH 02/34] Update HuggingFace api doc and add text2audio pipeline task (#28474) * update api doc and add text2audio pipeline task * update doc * update doc * fix indent * correct example snippet --- .../ml/inference/huggingface_inference.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/huggingface_inference.py b/sdks/python/apache_beam/ml/inference/huggingface_inference.py index aee6133637813..3ec063808ae32 100644 --- a/sdks/python/apache_beam/ml/inference/huggingface_inference.py +++ b/sdks/python/apache_beam/ml/inference/huggingface_inference.py @@ -98,6 +98,7 @@ class PipelineTask(str, Enum): TextClassification = 'text-classification' TextGeneration = 'text-generation' Text2TextGeneration = 'text2text-generation' + TextToAudio = 'text-to-audio' TokenClassification = 'token-classification' Translation = 'translation' VideoClassification = 'video-classification' @@ -570,7 +571,7 @@ class HuggingFacePipelineModelHandler(ModelHandler[str, def __init__( self, task: Union[str, PipelineTask] = "", - model=None, + model: str = "", *, inference_fn: PipelineInferenceFn = _default_pipeline_inference_fn, load_pipeline_args: Optional[Dict[str, Any]] = None, @@ -593,9 +594,18 @@ def __init__( Args: task (str or enum.Enum): task supported by HuggingFace Pipelines. Accepts a string task or an enum.Enum from PipelineTask. - model : path to pretrained model on Hugging Face Models Hub to use custom - model for the chosen task. If the model already defines the task then - no need to specify the task parameter. + model (str): path to the pretrained *model-id* on Hugging Face Models Hub + to use custom model for the chosen task. If the `model` already defines + the task then no need to specify the `task` parameter. + Use the *model-id* string instead of an actual model here. + Model-specific kwargs for `from_pretrained(..., **model_kwargs)` can be + specified with `model_kwargs` using `load_pipeline_args`. 
+ + Example Usage:: + model_handler = HuggingFacePipelineModelHandler( + task="text-generation", model="meta-llama/Llama-2-7b-hf", + load_pipeline_args={'model_kwargs':{'quantization_map':config}}) + inference_fn: the inference function to use during RunInference. Default is _default_pipeline_inference_fn. load_pipeline_args (Dict[str, Any]): keyword arguments to provide load From 79d0a8d11095522a036a6b3007f5fed4f6f46b3b Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Tue, 19 Sep 2023 21:36:23 -0700 Subject: [PATCH 03/34] tooltips, footer, humanize (#28538) Co-authored-by: lostluck <13907733+lostluck@users.noreply.github.com> --- .../prism/internal/web/assets/style.css | 58 ++++++++++++++++++- .../beam/runners/prism/internal/web/debugz.go | 25 +++++--- .../runners/prism/internal/web/debugz.html | 10 ++-- .../runners/prism/internal/web/index.html | 52 +++++++++-------- 4 files changed, 108 insertions(+), 37 deletions(-) diff --git a/sdks/go/pkg/beam/runners/prism/internal/web/assets/style.css b/sdks/go/pkg/beam/runners/prism/internal/web/assets/style.css index 74f4a6958d29a..d252dc020e639 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/web/assets/style.css +++ b/sdks/go/pkg/beam/runners/prism/internal/web/assets/style.css @@ -101,10 +101,19 @@ footer { color: var(--beam-white); } +#page-container { + position: relative; + min-height: 100vh; +} + +#content-wrap { + padding-bottom: 2.5rem; /* Footer height */ +} + .container { width: 100%; margin: 0 auto; - padding: 80px 20px 40px; + padding: 40px 20px 0px; } .child { @@ -132,6 +141,53 @@ footer { padding: 12px 15px; } +/* Tooltip container */ +.tooltip { + display: inline-block; + border-bottom: 1px dotted var(--beam-black); +} + +/* Tooltip text */ +.tooltip .tooltiptext { + visibility: hidden; + width: max-content; + max-width: 400px; + background-color: var(--dark-grey); + color: var(--beam-white); + text-align: left; + padding: 5px 10px; + border-radius: 6px; + + /* Position the tooltip text */ + position: absolute; + z-index: 1; + bottom: 125%; + left: 50%; + margin-left: -60px; + + /* Fade in tooltip */ + opacity: 0; + transition: opacity 0.3s; +} + +/* Tooltip arrow */ +.tooltip .tooltiptext::after { + content: ""; + position: absolute; + top: 100%; + left: 18%; + margin-left: -5px; + border-width: 5px; + border-style: solid; + border-color: var(--dark-grey) transparent transparent transparent; +} + +/* Show the tooltip text when you mouse over the tooltip container */ +.tooltip:hover .tooltiptext { + visibility: visible; + opacity: 1; +} + @media screen and (max-width: 550px) { header { flex-direction: column; diff --git a/sdks/go/pkg/beam/runners/prism/internal/web/debugz.go b/sdks/go/pkg/beam/runners/prism/internal/web/debugz.go index b34547e927521..015a9103134aa 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/web/debugz.go +++ b/sdks/go/pkg/beam/runners/prism/internal/web/debugz.go @@ -21,6 +21,9 @@ import ( "runtime/metrics" "runtime/pprof" "strings" + "time" + + "github.com/dustin/go-humanize" ) type debugzData struct { @@ -54,16 +57,24 @@ func dumpMetrics() debugzData { name, value := sample.Name, sample.Value m := goRuntimeMetric{ - Name: name, + Name: strings.TrimSpace(name), Description: descs[i].Description, } // Handle each sample. 
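       // (Comment added for this write-up, not part of the original hunk:
       // the branches below key off runtime/metrics name suffixes, rendering
       // "...bytes" counters via go-humanize and "...seconds" floats as
       // time.Durations.)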
     switch value.Kind() {
     case metrics.KindUint64:
-      m.Value = fmt.Sprintf("%d", value.Uint64())
+      if strings.HasSuffix(name, "bytes") {
+        m.Value = humanize.Bytes(value.Uint64())
+      } else {
+        m.Value = humanize.FormatInteger("", int(value.Uint64()))
+      }
     case metrics.KindFloat64:
-      m.Value = fmt.Sprintf("%f", value.Float64())
+      if strings.HasSuffix(name, "seconds") {
+        m.Value = time.Duration(float64(time.Second) * value.Float64()).String()
+      } else {
+        m.Value = humanize.FormatFloat("", value.Float64())
+      }
     case metrics.KindFloat64Histogram:
       m.Value = fmt.Sprintf("%f", medianBucket(value.Float64Histogram()))
       // The histogram may be quite large, so let's just pull out
@@ -88,16 +99,16 @@ func dumpMetrics() debugzData {
 
   data.Metrics = append(data.Metrics, goRuntimeMetric{
     Name:        "BUILD INFO",
-    Value:       "n/a",
-    Description: b.String(),
+    Value:       b.String(),
+    Description: "result from runtime/debug.ReadBuildInfo()",
   })
 
   b.Reset()
   goroutineDump(&b)
   data.Metrics = append(data.Metrics, goRuntimeMetric{
     Name:        "GOROUTINES",
-    Value:       "n/a",
-    Description: b.String(),
+    Value:       b.String(),
+    Description: "consolidated active goroutines",
   })
 
   b.Reset()
diff --git a/sdks/go/pkg/beam/runners/prism/internal/web/debugz.html b/sdks/go/pkg/beam/runners/prism/internal/web/debugz.html
index ebf37f129ae38..175f44da7447d 100644
--- a/sdks/go/pkg/beam/runners/prism/internal/web/debugz.html
+++ b/sdks/go/pkg/beam/runners/prism/internal/web/debugz.html
@@ -30,14 +30,16 @@
[hunk body garbled during extraction: the HTML tags were stripped, leaving only
template fragments. The surviving pieces show the separate Description column
being dropped from the metrics table and each {{ .Name }} cell gaining a
tooltip (the div.tooltip / span.tooltiptext pair styled in style.css above)
that reveals {{ .Description }} on hover, while {{ .Value }} keeps its own
column.]
diff --git a/sdks/go/pkg/beam/runners/prism/internal/web/index.html b/sdks/go/pkg/beam/runners/prism/internal/web/index.html
index fe9bb056e51ce..1aa0ed719d87b 100644
--- a/sdks/go/pkg/beam/runners/prism/internal/web/index.html
+++ b/sdks/go/pkg/beam/runners/prism/internal/web/index.html
@@ -22,31 +22,33 @@
[hunk body garbled during extraction: the HTML tags were stripped. The
surviving fragments show the jobs table (ID / Name / State header over
{{ .JobId }} / {{ .JobName }} / {{ .State }} rows, with a "No jobs have been
run." fallback) and the {{ if .Error}}{{.Error}}{{end}} banner being re-wrapped
in the #page-container and #content-wrap divs introduced in style.css, so the
footer stays pinned to the bottom of the viewport.]
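A minimal sketch of the formatting behavior the debugz.go hunk above adopts.
The sample values and the printed strings in the comments are illustrative
assumptions, not taken from the patch:

    package main

    import (
        "fmt"
        "time"

        "github.com/dustin/go-humanize"
    )

    func main() {
        // Uint64 samples whose names end in "bytes" are humanized.
        fmt.Println(humanize.Bytes(82854982)) // "83 MB"

        // Float64 samples whose names end in "seconds" print as durations.
        fmt.Println(time.Duration(float64(time.Second) * 0.00025)) // "250µs"

        // Everything else gains the library's default thousands-separator
        // format, mirroring the humanize.FormatInteger("", ...) call in the hunk.
        fmt.Println(humanize.FormatInteger("", 1234567))
    }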
From 635f9ef5a0e2f1821df3f442bc348d32557acd4b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 09:26:47 -0400 Subject: [PATCH 04/34] Bump github.com/testcontainers/testcontainers-go in /sdks (#28550) Bumps [github.com/testcontainers/testcontainers-go](https://github.com/testcontainers/testcontainers-go) from 0.23.0 to 0.24.0. - [Release notes](https://github.com/testcontainers/testcontainers-go/releases) - [Commits](https://github.com/testcontainers/testcontainers-go/compare/v0.23.0...v0.24.0) --- updated-dependencies: - dependency-name: github.com/testcontainers/testcontainers-go dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 19 +++++++++++++++---- sdks/go.sum | 41 ++++++++++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 169f882b1eafd..005a711a32524 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -47,7 +47,7 @@ require ( github.com/linkedin/goavro/v2 v2.12.0 github.com/proullon/ramsql v0.1.2 github.com/spf13/cobra v1.7.0 - github.com/testcontainers/testcontainers-go v0.23.0 + github.com/testcontainers/testcontainers-go v0.24.0 github.com/tetratelabs/wazero v1.5.0 github.com/xitongsys/parquet-go v1.6.2 github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c @@ -71,7 +71,18 @@ require ( golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 ) -require dario.cat/mergo v1.0.0 // indirect +require ( + dario.cat/mergo v1.0.0 // indirect + github.com/Microsoft/hcsshim v0.11.0 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect + github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect + github.com/shirou/gopsutil/v3 v3.23.7 // indirect + github.com/shoenig/go-m1cpu v0.1.6 // indirect + github.com/tklauser/go-sysconf v0.3.11 // indirect + github.com/tklauser/numcpus v0.6.0 // indirect + github.com/yusufpapurcu/wmi v1.2.3 // indirect +) require ( cloud.google.com/go v0.110.7 // indirect @@ -104,10 +115,10 @@ require ( github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cncf/udpa/go v0.0.0-20220112060539-c52dc94e7fbe // indirect github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4 // indirect - github.com/containerd/containerd v1.7.3 // indirect + github.com/containerd/containerd v1.7.6 // indirect github.com/cpuguy83/dockercfg v0.3.1 // indirect github.com/docker/distribution v2.8.2+incompatible // indirect - github.com/docker/docker v24.0.5+incompatible // but required to resolve issue docker has with go1.20 + github.com/docker/docker v24.0.6+incompatible // but required to resolve issue docker has with go1.20 github.com/docker/go-units v0.5.0 // indirect github.com/envoyproxy/go-control-plane v0.11.1 // indirect github.com/envoyproxy/protoc-gen-validate v1.0.2 // indirect diff --git a/sdks/go.sum b/sdks/go.sum index 13d194fa01944..d498a8bc7ec26 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -48,7 +48,7 @@ cloud.google.com/go/storage v1.33.0/go.mod h1:Hhh/dogNRGca7IWv1RC2YqEn0c0G77ctA/ dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= 
-github.com/AdaLogics/go-fuzz-headers v0.0.0-20230106234847-43070de90fa1 h1:EKPd1INOIyr5hWOWhvpmQpY6tKjeG0hT1s3AMC/9fic= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 h1:bvDV9vkmnHYOMsOr4WLk+Vo07yKIzd94sVoIqshQ4bU= github.com/Azure/azure-pipeline-go v0.2.3/go.mod h1:x841ezTBIMG6O3lAcl8ATHnsOPVl2bqk7S3ta6S6u4k= github.com/Azure/azure-storage-blob-go v0.14.0/go.mod h1:SMqIBi+SuiQH32bvyjngEewEeXoPfKMgWlBDaYf6fck= github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 h1:UQHMgLO+TxOElx5B5HZ4hJQsoJ/PvUvKRhJHDQXO8P8= @@ -64,7 +64,8 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c h1:RGWPOewvKIROun94nF7v2cua9qP+thov/7M50KEoeSU= github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= -github.com/Microsoft/hcsshim v0.10.0-rc.8 h1:YSZVvlIIDD1UxQpJp0h+dnpLUw+TrY0cx8obKsp3bek= +github.com/Microsoft/hcsshim v0.11.0 h1:7EFNIY4igHEXUdj1zXgAyU3fLc7QfOKHbkldRVTBdiM= +github.com/Microsoft/hcsshim v0.11.0/go.mod h1:OEthFdQv/AD2RAdzR6Mm1N1KPCztGKDurW1Z8b8VGMM= github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY= github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= github.com/apache/arrow/go/arrow v0.0.0-20200730104253-651201b0f516 h1:byKBBF2CKWBjjA4J1ZL2JXttJULvWSl50LegTyRZ728= @@ -151,8 +152,8 @@ github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4 h1:/inchEIKaYC1Akx+H+g github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/colinmarc/hdfs/v2 v2.1.1/go.mod h1:M3x+k8UKKmxtFu++uAZ0OtDU8jR3jnaZIAc6yK4Ue0c= github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U= -github.com/containerd/containerd v1.7.3 h1:cKwYKkP1eTj54bP3wCdXXBymmKRQMrWjkLSWZZJDa8o= -github.com/containerd/containerd v1.7.3/go.mod h1:32FOM4/O0RkNg7AjQj3hDzN9cUGtu+HMvaKUNiqCZB8= +github.com/containerd/containerd v1.7.6 h1:oNAVsnhPoy4BTPQivLgTzI9Oleml9l/+eYIDYXRCYo8= +github.com/containerd/containerd v1.7.6/go.mod h1:SY6lrkkuJT40BVNO37tlYTSnKJnP5AXBc0fhx0q+TJ4= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E= github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= @@ -160,15 +161,14 @@ github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:ma github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= -github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI= github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= github.com/docker/distribution v2.8.2+incompatible/go.mod 
h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= -github.com/docker/docker v24.0.5+incompatible h1:WmgcE4fxyI6EEXxBRxsHnZXrO1pQ3smi0k/jho4HLeY= -github.com/docker/docker v24.0.5+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/docker v24.0.6+incompatible h1:hceabKCtUgDqPu+qm0NgsaXf28Ljf4/pWFL7xjWWDgE= +github.com/docker/docker v24.0.6+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ= github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= @@ -197,6 +197,8 @@ github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9 github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gorp/gorp v2.2.0+incompatible h1:xAUh4QgEeqPPhK3vxZN+bzrim1z5Av6q837gtjUlshc= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= @@ -332,6 +334,8 @@ github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/linkedin/goavro/v2 v2.12.0 h1:rIQQSj8jdAUlKQh6DttK8wCRv4t4QO09g1C4aBWXslg= github.com/linkedin/goavro/v2 v2.12.0/go.mod h1:KXx+erlq+RPlGSPmLF7xGo6SAbh8sCQ53x064+ioxhk= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/mattn/go-ieproxy v0.0.1/go.mod h1:pYabZ6IHcRpFh7vIaLfK7rdcWgFEb3SFJ6/gNWuh88E= @@ -375,6 +379,8 @@ github.com/pkg/xattr v0.4.9 h1:5883YPCtkSd8LFbs13nXplj9g9tlrwoJRjgpgMu1/fE= github.com/pkg/xattr v0.4.9/go.mod h1:di8WF84zAKk8jzR1UBTEWh9AUlIZZ7M/JNt8e9B6ktU= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/proullon/ramsql v0.1.2 h1:PTtsy2iml/CW3Lsopyr86dlIs7JyYEmfLrfYvQVXD2U= github.com/proullon/ramsql v0.1.2/go.mod h1:CFGqeQHQpdRfWqYmWD3yXqPTEaHkF4zgXy1C6qDWc9E= @@ -390,6 +396,12 @@ github.com/ryszard/goskiplist v0.0.0-20150312221310-2dfbae5fcf46/go.mod h1:uAQ5P github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646/go.mod 
h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= github.com/shabbyrobe/gocovmerge v0.0.0-20180507124511-f6ea450bfb63 h1:J6qvD6rbmOil46orKqJaRPG+zTpoGlBTUdyv8ki63L0= github.com/shabbyrobe/gocovmerge v0.0.0-20180507124511-f6ea450bfb63/go.mod h1:n+VKSARF5y/tS9XFSP7vWDfS+GUC5vs/YT7M5XDTUEM= +github.com/shirou/gopsutil/v3 v3.23.7 h1:C+fHO8hfIppoJ1WdsVm1RoI0RwXoNdfTK7yWXV0wVj4= +github.com/shirou/gopsutil/v3 v3.23.7/go.mod h1:c4gnmoRC0hQuaLqvxnx1//VXQ0Ms/X9UnJF8pddY5z4= +github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= +github.com/shoenig/go-m1cpu v0.1.6/go.mod h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= +github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= +github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnjqq0k= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= @@ -414,11 +426,16 @@ github.com/stretchr/testify v1.7.5/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= -github.com/testcontainers/testcontainers-go v0.23.0 h1:ERYTSikX01QczBLPZpqsETTBO7lInqEP349phDOVJVs= -github.com/testcontainers/testcontainers-go v0.23.0/go.mod h1:3gzuZfb7T9qfcH2pHpV4RLlWrPjeWNQah6XlYQ32c4I= +github.com/testcontainers/testcontainers-go v0.24.0 h1:eqkq6nNIPVrqpXNyn/s5jDBqPGuWtND2hOMEBrUULIw= +github.com/testcontainers/testcontainers-go v0.24.0/go.mod h1:MGBiAkCm86yXQoCiipmQCqZLVdk1uFqtMqaU1Or0MRk= github.com/tetratelabs/wazero v1.5.0 h1:Yz3fZHivfDiZFUXnWMPUoiW7s8tC1sjdBtlJn08qYa0= github.com/tetratelabs/wazero v1.5.0/go.mod h1:0U0G41+ochRKoPKCJlh0jMg1CHkyfK8kDqiirMmKY8A= +github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM= +github.com/tklauser/go-sysconf v0.3.11/go.mod h1:GqXfhXY3kiPa0nAXPDIQIWzJbMCB7AmcWpGR8lSZfqI= +github.com/tklauser/numcpus v0.6.0 h1:kebhY2Qt+3U6RNK7UqpYNA+tJ23IBEGKkB7JQBfDYms= +github.com/tklauser/numcpus v0.6.0/go.mod h1:FEZLMke0lhOUG6w2JadTzp0a+Nl8PF/GFkQ5UVIcaL4= github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE= github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU= @@ -440,6 +457,8 @@ github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7Jul github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/yusufpapurcu/wmi v1.2.3 h1:E1ctvB7uKFMOJw3fdOW32DwGE9I7t++CRUEMKvFoFiw= +github.com/yusufpapurcu/wmi v1.2.3/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/xxh3 
v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= @@ -551,6 +570,7 @@ golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191112214154-59a1497f0cea/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -565,6 +585,7 @@ golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200828194041-157a740278f4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -576,7 +597,9 @@ golang.org/x/sys v0.0.0-20220408201424-a24fb2fb8a0f/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= From a70a5845230a5d3ebdbe4ed1a23de92b363b8a97 Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Wed, 20 Sep 2023 06:39:51 -0700 Subject: [PATCH 05/34] Add schema-aware text file reading and writing. (#28486) --- sdks/python/apache_beam/yaml/standard_io.yaml | 2 ++ sdks/python/apache_beam/yaml/yaml_io.py | 26 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/sdks/python/apache_beam/yaml/standard_io.yaml b/sdks/python/apache_beam/yaml/standard_io.yaml index e60f0026fd25e..1738110539cee 100644 --- a/sdks/python/apache_beam/yaml/standard_io.yaml +++ b/sdks/python/apache_beam/yaml/standard_io.yaml @@ -51,6 +51,8 @@ 'ReadFromBigQuery': 'apache_beam.yaml.yaml_io.read_from_bigquery' # Disable until https://github.com/apache/beam/issues/28162 is resolved. 
# 'WriteToBigQuery': 'apache_beam.yaml.yaml_io.write_to_bigquery' + 'ReadFromText': 'apache_beam.yaml.yaml_io.read_from_text' + 'WriteToText': 'apache_beam.yaml.yaml_io.write_to_text' # Declared as a renaming transform to avoid exposing all # (implementation-specific) pandas arguments and aligning with possible Java diff --git a/sdks/python/apache_beam/yaml/yaml_io.py b/sdks/python/apache_beam/yaml/yaml_io.py index 2a9d1be62c6dd..297c07e9abb56 100644 --- a/sdks/python/apache_beam/yaml/yaml_io.py +++ b/sdks/python/apache_beam/yaml/yaml_io.py @@ -28,12 +28,38 @@ import yaml import apache_beam as beam +import apache_beam.io as beam_io from apache_beam.io import ReadFromBigQuery from apache_beam.io import WriteToBigQuery from apache_beam.io.gcp.bigquery import BigQueryDisposition +from apache_beam.typehints.schemas import named_fields_from_element_type from apache_beam.yaml import yaml_provider +def read_from_text(path: str): + # TODO(yaml): Consider passing the filename and offset, possibly even + # by default. + return beam_io.ReadFromText(path) | beam.Map(lambda s: beam.Row(line=s)) + + +@beam.ptransform_fn +def write_to_text(pcoll, path: str): + try: + field_names = [ + name for name, _ in named_fields_from_element_type(pcoll.element_type) + ] + except Exception as exn: + raise ValueError( + "WriteToText requires an input schema with exactly one field.") from exn + if len(field_names) != 1: + raise ValueError( + "WriteToText requires an input schema with exactly one field, got %s" % + field_names) + sole_field_name, = field_names + return pcoll | beam.Map( + lambda x: str(getattr(x, sole_field_name))) | beam.io.WriteToText(path) + + def read_from_bigquery( query=None, table=None, row_restriction=None, fields=None): if query is None: From 275b177fa410f0f54b39e008033fb42a83e22f14 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 10:23:43 -0400 Subject: [PATCH 06/34] Bump google.golang.org/api from 0.141.0 to 0.142.0 in /sdks (#28549) Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.141.0 to 0.142.0. - [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.141.0...v0.142.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 4 ++-- sdks/go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 005a711a32524..53596c5d207d9 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -57,7 +57,7 @@ require ( golang.org/x/sync v0.3.0 golang.org/x/sys v0.12.0 golang.org/x/text v0.13.0 - google.golang.org/api v0.141.0 + google.golang.org/api v0.142.0 google.golang.org/genproto v0.0.0-20230821184602-ccc8af3d0e93 google.golang.org/grpc v1.58.1 google.golang.org/protobuf v1.31.0 @@ -171,5 +171,5 @@ require ( golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20230911183012-2d3300fd4832 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20230913181813-007df8e322eb // indirect ) diff --git a/sdks/go.sum b/sdks/go.sum index d498a8bc7ec26..502fdf1e88925 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -670,8 +670,8 @@ google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsb google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.141.0 h1:Df6vfMgDoIM6ss0m7H4MPwFwY87WNXHfBIda/Bmfl4E= -google.golang.org/api v0.141.0/go.mod h1:iZqLkdPlXKyG0b90eu6KxVSE4D/ccRF2e/doKD2CnQQ= +google.golang.org/api v0.142.0 h1:mf+7EJ94fi5ZcnpPy+m0Yv2dkz8bKm+UL0snTCuwXlY= +google.golang.org/api v0.142.0/go.mod h1:zJAN5o6HRqR7O+9qJUFOWrZkYE66RH+efPBdTLA4xBA= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -701,8 +701,8 @@ google.golang.org/genproto v0.0.0-20230821184602-ccc8af3d0e93 h1:zv6ieVm8jNcN33A google.golang.org/genproto v0.0.0-20230821184602-ccc8af3d0e93/go.mod h1:yZTlhN0tQnXo3h00fuXNCxJdLdIdnVFVBaRJ5LWBbw4= google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5 h1:nIgk/EEq3/YlnmVVXVnm14rC2oxgs1o0ong4sD/rd44= google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5/go.mod h1:5DZzOUPCLYL3mNkQ0ms0F3EuUNZ7py1Bqeq6sxzI7/Q= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230911183012-2d3300fd4832 h1:o4LtQxebKIJ4vkzyhtD2rfUNZ20Zf0ik5YVP5E7G7VE= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230911183012-2d3300fd4832/go.mod h1:+Bk1OCOj40wS2hwAMA+aCW9ypzm63QTBBHp6lQ3p+9M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230913181813-007df8e322eb h1:Isk1sSH7bovx8Rti2wZK0UZF6oraBDK74uoyLEEVFN0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230913181813-007df8e322eb/go.mod h1:+Bk1OCOj40wS2hwAMA+aCW9ypzm63QTBBHp6lQ3p+9M= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= From 93c3224be6d93c5960d50c287a27d696d5e9d66a Mon Sep 17 00:00:00 2001 From: Andrey Devyatkin Date: Wed, 20 Sep 2023 16:49:43 +0200 Subject: [PATCH 07/34] Enable remote gradle 
cache and build scan for GitHub Actions workflows (#28539) * publish gradle build scans publish gradle build scans publish gradle build scans * added missing gradle configs publish gradle build scans --- .github/actions/setup-action/action.yml | 2 + .../action.yml | 53 +++++++++---------- .../beam_PostCommit_Go_Dataflow_ARM.yml | 16 ++---- .../beam_PostCommit_Java_Avro_Versions.yml | 5 ++ ..._PostCommit_Java_Examples_Dataflow_ARM.yml | 15 +----- .../beam_PostCommit_Java_Examples_Flink.yml | 16 ++---- ...m_PostCommit_Java_IO_Performance_Tests.yml | 11 ++-- ..._PostCommit_Java_ValidatesRunner_Flink.yml | 16 ++---- ...am_PostCommit_Python_Examples_Dataflow.yml | 14 ++--- ...beam_PostCommit_Python_Examples_Direct.yml | 14 ++--- .../beam_PostCommit_Python_Examples_Flink.yml | 14 ++--- .../beam_PostCommit_Python_Examples_Spark.yml | 14 ++--- .../beam_PostCommit_Python_MongoDBIO_IT.yml | 14 ++--- ...mit_Python_ValidatesContainer_Dataflow.yml | 14 ++--- ...on_ValidatesContainer_Dataflow_With_RC.yml | 16 ++---- ...Commit_Python_ValidatesRunner_Dataflow.yml | 14 ++--- ...ostCommit_Python_ValidatesRunner_Flink.yml | 14 ++--- ...ostCommit_Python_ValidatesRunner_Samza.yml | 14 ++--- ...ostCommit_Python_ValidatesRunner_Spark.yml | 14 ++--- .../beam_PreCommit_CommunityMetrics.yml | 11 ++-- .github/workflows/beam_PreCommit_Go.yml | 16 ++---- .../workflows/beam_PreCommit_GoPortable.yml | 8 +-- .github/workflows/beam_PreCommit_GoPrism.yml | 8 +-- .../workflows/beam_PreCommit_ItFramework.yml | 12 ++--- .github/workflows/beam_PreCommit_Java.yml | 5 ++ ...beam_PreCommit_Java_Debezium_IO_Direct.yml | 5 ++ ...PreCommit_Java_ElasticSearch_IO_Direct.yml | 5 ++ .../beam_PreCommit_Java_Examples_Dataflow.yml | 12 ++--- ...reCommit_Java_Examples_Dataflow_Java17.yml | 5 ++ ...t_Java_File-schema-transform_IO_Direct.yml | 5 ++ .../beam_PreCommit_Java_Flink_Versions.yml | 16 ++---- .../beam_PreCommit_Java_PVR_Flink_Batch.yml | 5 ++ .github/workflows/beam_PreCommit_Python.yml | 14 ++--- .../workflows/beam_PreCommit_PythonDocker.yml | 19 ++----- .../workflows/beam_PreCommit_PythonDocs.yml | 16 ++---- .../beam_PreCommit_PythonFormatter.yml | 16 ++---- .../workflows/beam_PreCommit_PythonLint.yml | 21 ++------ .../beam_PreCommit_Python_Coverage.yml | 16 ++---- .../beam_PreCommit_Python_Dataframes.yml | 14 ++--- .../beam_PreCommit_Python_Examples.yml | 14 ++--- .../beam_PreCommit_Python_Integration.yml | 14 ++--- .../beam_PreCommit_Python_PVR_Flink.yml | 5 ++ .../beam_PreCommit_Python_Runners.yml | 14 ++--- .../beam_PreCommit_Python_Transforms.yml | 14 ++--- .github/workflows/beam_PreCommit_RAT.yml | 11 ++-- .github/workflows/beam_PreCommit_Spotless.yml | 5 ++ .../workflows/beam_PreCommit_Typescript.yml | 12 ++--- .github/workflows/beam_PreCommit_Website.yml | 11 ++-- .../beam_PreCommit_Website_Stage_GCS.yml | 12 ++--- .../workflows/beam_PreCommit_Whitespace.yml | 16 ++---- ...Python_ValidatesContainer_Dataflow_ARM.yml | 8 +-- .../beam_Release_NightlySnapshot.yml | 11 ++-- .../beam_Release_Python_NightlySnapshot.yml | 16 ++---- .github/workflows/java_tests.yml | 27 +++++----- .../playground_backend_precommit.yml | 15 ++---- .../tour_of_beam_backend_integration.yml | 9 ++-- .../workflows/update_python_dependencies.yml | 8 ++- 57 files changed, 240 insertions(+), 501 deletions(-) rename .github/actions/{setup-self-hosted-action => setup-environment-action}/action.yml (50%) diff --git a/.github/actions/setup-action/action.yml b/.github/actions/setup-action/action.yml index cb24a065f98c9..da69dd9a97ddc 100644 --- 
a/.github/actions/setup-action/action.yml +++ b/.github/actions/setup-action/action.yml @@ -70,3 +70,5 @@ runs: shell: bash run: | echo KUBELET_GCLOUD_CONFIG_PATH=/var/lib/kubelet/pods/$POD_UID/volumes/kubernetes.io~empty-dir/gcloud >> $GITHUB_ENV + - name: Setup environment + uses: ./.github/actions/setup-environment-action diff --git a/.github/actions/setup-self-hosted-action/action.yml b/.github/actions/setup-environment-action/action.yml similarity index 50% rename from .github/actions/setup-self-hosted-action/action.yml rename to .github/actions/setup-environment-action/action.yml index ba3bf8d0d5d81..3452a16c132c2 100644 --- a/.github/actions/setup-self-hosted-action/action.yml +++ b/.github/actions/setup-environment-action/action.yml @@ -15,47 +15,42 @@ # specific language governing permissions and limitations # under the License. -name: 'Setup environment for self-hosted runners' -description: 'Setup action to run jobs in a self-hosted runner' +name: 'Setup environment action' +description: 'Setup environment to run jobs' inputs: - requires-py-38: + python-version: required: false - description: 'Set as false if does not require py38 setup' - default: 'true' - requires-py-39: + description: 'Install Python version' + default: '' + java-version: required: false - description: 'Set as false if does not require py39 setup' - default: 'true' - requires-java-8: + description: 'Install Java version' + default: '' + go-version: required: false - description: 'Set as false if does not require java-8 setup' - default: 'true' - requires-go: - required: false - description: 'Set as false if does not require go setup' - default: 'true' + description: 'Install Go version' + default: '' runs: using: "composite" steps: - - name: Install python 3.8 - if: ${{ inputs.requires-py-38 == 'true' }} - uses: actions/setup-python@v4 - with: - python-version: "3.8" - - name: Install python 3.9 - if: ${{ inputs.requires-py-39 == 'true' }} + - name: Install Python + if: ${{ inputs.python-version != '' }} uses: actions/setup-python@v4 with: - python-version: "3.9" - - name: Set Java Version - if: ${{ inputs.requires-java-8 == 'true' }} + python-version: ${{ inputs.python-version }} + - name: Install Java + if: ${{ inputs.java-version != '' }} uses: actions/setup-java@v3 with: distribution: 'temurin' - java-version: 8 - - name: Set Go Version - if: ${{ inputs.requires-go == 'true' }} + java-version: ${{ inputs.java-version }} + - name: Setup Gradle + uses: gradle/gradle-build-action@v2 + with: + cache-read-only: false + - name: Install Go + if: ${{ inputs.go-version != '' }} uses: actions/setup-go@v3 with: - go-version: '1.21' # never set patch, to get latest patch releases. + go-version: ${{ inputs.go-version }} # never set patch, to get latest patch releases. 
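The workflow diffs that follow all converge on the same call shape; a
representative invocation of the renamed action (the step name and version
values here are illustrative, not taken from any one workflow in this patch):

    steps:
      - name: Setup environment
        uses: ./.github/actions/setup-environment-action
        with:
          java-version: 8      # each input is optional; empty inputs are skipped
          python-version: 3.8
          go-version: 1.21     # minor-only, so jobs pick up the latest patch release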
diff --git a/.github/workflows/beam_PostCommit_Go_Dataflow_ARM.yml b/.github/workflows/beam_PostCommit_Go_Dataflow_ARM.yml index 64b935888cf04..8e5651e292796 100644 --- a/.github/workflows/beam_PostCommit_Go_Dataflow_ARM.yml +++ b/.github/workflows/beam_PostCommit_Go_Dataflow_ARM.yml @@ -73,19 +73,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Install Go - uses: actions/setup-go@v4 - with: - go-version: '1.21' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 + go-version: 1.21 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v1 - name: Authenticate on GCP diff --git a/.github/workflows/beam_PostCommit_Java_Avro_Versions.yml b/.github/workflows/beam_PostCommit_Java_Avro_Versions.yml index 1d551dcebf227..cf3064a6a7de5 100644 --- a/.github/workflows/beam_PostCommit_Java_Avro_Versions.yml +++ b/.github/workflows/beam_PostCommit_Java_Avro_Versions.yml @@ -45,6 +45,11 @@ permissions: security-events: read statuses: read +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + jobs: beam_PostCommit_Java_Avro_Versions: name: ${{matrix.job_name}} (${{matrix.job_phrase}}) diff --git a/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml b/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml index ec771bd4cefd7..5a42b9f95237d 100644 --- a/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml +++ b/.github/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml @@ -85,21 +85,10 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{matrix.java_version}} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{matrix.java_version}}) - - name: Setup self-hosted - uses: ./.github/actions/setup-self-hosted-action + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - requires-py-38: false - requires-py-39: false - requires-go: false - - name: Set up Java${{ matrix.java_version }} - uses: actions/setup-java@v3.8.0 - with: - distribution: 'temurin' java-version: ${{ matrix.java_version }} - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 - name: Authenticate on GCP diff --git a/.github/workflows/beam_PostCommit_Java_Examples_Flink.yml b/.github/workflows/beam_PostCommit_Java_Examples_Flink.yml index 71e761d32ae74..3aec68316f81e 100644 --- a/.github/workflows/beam_PostCommit_Java_Examples_Flink.yml +++ b/.github/workflows/beam_PostCommit_Java_Examples_Flink.yml @@ -71,19 +71,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Install Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: 
- cache-read-only: false + java-version: 8 + python-version: 3.8 - name: run examplesIntegrationTest script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Java_IO_Performance_Tests.yml b/.github/workflows/beam_PostCommit_Java_IO_Performance_Tests.yml index eb1f43c51ed80..471782621fa77 100644 --- a/.github/workflows/beam_PostCommit_Java_IO_Performance_Tests.yml +++ b/.github/workflows/beam_PostCommit_Java_IO_Performance_Tests.yml @@ -80,15 +80,10 @@ jobs: with: ref: v2.50.0 #TODO(https://github.com/apache/beam/issues/28330) automate updating this repository: apache/beam - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 - name: Authenticate on GCP uses: google-github-actions/setup-gcloud@v0 with: diff --git a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml index dbee3bc30782d..20aac8a30608c 100644 --- a/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml +++ b/.github/workflows/beam_PostCommit_Java_ValidatesRunner_Flink.yml @@ -69,19 +69,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Install Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 + python-version: 3.8 - name: run validatesRunner script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Python_Examples_Dataflow.yml b/.github/workflows/beam_PostCommit_Python_Examples_Dataflow.yml index c7122523b79b4..14cdecb356bde 100644 --- a/.github/workflows/beam_PostCommit_Python_Examples_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_Python_Examples_Dataflow.yml @@ -69,19 +69,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: 3.11 - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Run examplesPostCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Python_Examples_Direct.yml b/.github/workflows/beam_PostCommit_Python_Examples_Direct.yml index 719d764781f02..7c792d7a3c273 100644 --- a/.github/workflows/beam_PostCommit_Python_Examples_Direct.yml +++ b/.github/workflows/beam_PostCommit_Python_Examples_Direct.yml @@ -71,19 +71,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install 
Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PostCommit_Python_Examples_Flink.yml b/.github/workflows/beam_PostCommit_Python_Examples_Flink.yml index be523408d72b1..ccf03918f29d5 100644 --- a/.github/workflows/beam_PostCommit_Python_Examples_Flink.yml +++ b/.github/workflows/beam_PostCommit_Python_Examples_Flink.yml @@ -71,19 +71,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PostCommit_Python_Examples_Spark.yml b/.github/workflows/beam_PostCommit_Python_Examples_Spark.yml index 01579d5950070..073ed0aeda643 100644 --- a/.github/workflows/beam_PostCommit_Python_Examples_Spark.yml +++ b/.github/workflows/beam_PostCommit_Python_Examples_Spark.yml @@ -71,19 +71,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml b/.github/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml index 7778be56f58e2..be8f0e10dc18a 100644 --- a/.github/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml +++ b/.github/workflows/beam_PostCommit_Python_MongoDBIO_IT.yml @@ -69,19 +69,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: 3.11 - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Run mongodbioIT script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml b/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml index 
9ea80ea0576a7..ca5753010d1c1 100644 --- a/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow.yml @@ -71,19 +71,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml b/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml index c4354b072c52b..ded9ff0a4bd50 100644 --- a/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml +++ b/.github/workflows/beam_PostCommit_Python_ValidatesContainer_Dataflow_With_RC.yml @@ -71,19 +71,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | @@ -97,7 +89,7 @@ jobs: with: gradle-command: :sdks:python:test-suites:dataflow:py${{steps.set_py_ver_clean.outputs.py_ver_clean}}:validatesContainer arguments: | - -PtestRCDependencies=true \ + -PtestRCDependencies=true -PpythonVersion=${{ matrix.python_version }} \ - name: Archive code coverage results uses: actions/upload-artifact@v3 diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml index f91b17f456e48..4119ddd560202 100644 --- a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Dataflow.yml @@ -71,19 +71,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml index 8e8205f4a80cd..608bba248b3b1 100644 --- 
a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml +++ b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Flink.yml @@ -71,19 +71,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml index 8a2b42d59e5b8..bd0bbe1d6ff14 100644 --- a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml +++ b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Samza.yml @@ -71,19 +71,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml index ba6336052fa4b..6fda3a210aaf6 100644 --- a/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml +++ b/.github/workflows/beam_PostCommit_Python_ValidatesRunner_Spark.yml @@ -71,19 +71,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PreCommit_CommunityMetrics.yml b/.github/workflows/beam_PreCommit_CommunityMetrics.yml index 7606cc7227e23..7a93873168e09 100644 --- a/.github/workflows/beam_PreCommit_CommunityMetrics.yml +++ b/.github/workflows/beam_PreCommit_CommunityMetrics.yml @@ -78,15 +78,10 @@ jobs: comment_phrase: ${{matrix.job_phrase}} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: 
gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 - name: Remove default github maven configuration diff --git a/.github/workflows/beam_PreCommit_Go.yml b/.github/workflows/beam_PreCommit_Go.yml index 4626552d748aa..227f3c7648ab7 100644 --- a/.github/workflows/beam_PreCommit_Go.yml +++ b/.github/workflows/beam_PreCommit_Go.yml @@ -78,19 +78,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Install Go - uses: actions/setup-go@v4 - with: - go-version: '1.21' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 + go-version: 1.21 - name: run goPreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_GoPortable.yml b/.github/workflows/beam_PreCommit_GoPortable.yml index 5d18d7e1bb190..8156df15133c0 100644 --- a/.github/workflows/beam_PreCommit_GoPortable.yml +++ b/.github/workflows/beam_PreCommit_GoPortable.yml @@ -78,11 +78,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Setup self-hosted - uses: ./.github/actions/setup-self-hosted-action + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - requires-py-39: false - requires-go: false + python-version: 3.8 + java-version: 8 - name: Run goPortablePreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_GoPrism.yml b/.github/workflows/beam_PreCommit_GoPrism.yml index 073a5d7c4a62a..1a669c157007e 100644 --- a/.github/workflows/beam_PreCommit_GoPrism.yml +++ b/.github/workflows/beam_PreCommit_GoPrism.yml @@ -78,11 +78,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Setup self-hosted - uses: ./.github/actions/setup-self-hosted-action + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - requires-py-39: false - requires-go: false + python-version: 3.8 + java-version: 8 - name: Run goPrismPreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_ItFramework.yml b/.github/workflows/beam_PreCommit_ItFramework.yml index 3d976e4d8f95d..e8ec1287be51f 100644 --- a/.github/workflows/beam_PreCommit_ItFramework.yml +++ b/.github/workflows/beam_PreCommit_ItFramework.yml @@ -81,16 +81,10 @@ jobs: comment_phrase: ${{matrix.job_phrase}} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Setup self-hosted - uses: ./.github/actions/setup-self-hosted-action + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - requires-py-38: false - requires-py-39: false - requires-go: false - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 - name: run ItFrameworkPrecommit script run: ./gradlew -p it build - name: Archive JUnit Test Results diff --git 
a/.github/workflows/beam_PreCommit_Java.yml b/.github/workflows/beam_PreCommit_Java.yml index 22910f6f99fd7..ab5c17c55f4c9 100644 --- a/.github/workflows/beam_PreCommit_Java.yml +++ b/.github/workflows/beam_PreCommit_Java.yml @@ -141,6 +141,11 @@ permissions: security-events: read statuses: read +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + jobs: beam_PreCommit_Java: name: ${{matrix.job_name}} (${{matrix.job_phrase}}) diff --git a/.github/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml index 67c746daccc68..db348a7684afa 100644 --- a/.github/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_Debezium_IO_Direct.yml @@ -53,6 +53,11 @@ concurrency: group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login}}' cancel-in-progress: true +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + jobs: beam_PreCommit_Java_Debezium_IO_Direct: name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) diff --git a/.github/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml index 93baa9518613e..27a3e175e7e11 100644 --- a/.github/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_ElasticSearch_IO_Direct.yml @@ -55,6 +55,11 @@ concurrency: group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login}}' cancel-in-progress: true +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + jobs: beam_PreCommit_Java_ElasticSearch_IO_Direct: name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) diff --git a/.github/workflows/beam_PreCommit_Java_Examples_Dataflow.yml b/.github/workflows/beam_PreCommit_Java_Examples_Dataflow.yml index ff1dc5172c290..1dfd9ea1eb4d7 100644 --- a/.github/workflows/beam_PreCommit_Java_Examples_Dataflow.yml +++ b/.github/workflows/beam_PreCommit_Java_Examples_Dataflow.yml @@ -92,16 +92,10 @@ jobs: comment_phrase: ${{matrix.job_phrase}} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Setup self-hosted - uses: ./.github/actions/setup-self-hosted-action + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - requires-py-38: false - requires-py-39: false - requires-go: false - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 - name: Authenticate on GCP uses: google-github-actions/setup-gcloud@v0 with: diff --git a/.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml b/.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml index d31d1a2c2c163..bc5c457eb7610 100644 --- 
a/.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml +++ b/.github/workflows/beam_PreCommit_Java_Examples_Dataflow_Java17.yml @@ -63,6 +63,11 @@ concurrency: group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login}}' cancel-in-progress: true +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + jobs: beam_PreCommit_Java_Examples_Dataflow_Java17: name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) diff --git a/.github/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml b/.github/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml index d8b4b1a4983c4..d256bca9ebaad 100644 --- a/.github/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml +++ b/.github/workflows/beam_PreCommit_Java_File-schema-transform_IO_Direct.yml @@ -53,6 +53,11 @@ concurrency: group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login}}' cancel-in-progress: true +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + jobs: beam_PreCommit_Java_File-schema-transform_IO_Direct: name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) diff --git a/.github/workflows/beam_PreCommit_Java_Flink_Versions.yml b/.github/workflows/beam_PreCommit_Java_Flink_Versions.yml index 855e06c827d12..e4a04839cef44 100644 --- a/.github/workflows/beam_PreCommit_Java_Flink_Versions.yml +++ b/.github/workflows/beam_PreCommit_Java_Flink_Versions.yml @@ -79,19 +79,11 @@ jobs: with: comment_phrase: 'Run Java_Flink_Versions PreCommit' github_token: ${{ secrets.GITHUB_TOKEN }} - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Install Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 + python-version: 3.8 - name: run Java Flink Versions PreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml b/.github/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml index 10652edb94206..9679b1825cf50 100644 --- a/.github/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml +++ b/.github/workflows/beam_PreCommit_Java_PVR_Flink_Batch.yml @@ -59,6 +59,11 @@ permissions: security-events: read statuses: read +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + jobs: beam_PreCommit_Java_PVR_Flink_Batch: name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) diff --git a/.github/workflows/beam_PreCommit_Python.yml b/.github/workflows/beam_PreCommit_Python.yml index 743da499e81a4..80c69afce6e2d 100644 --- 
a/.github/workflows/beam_PreCommit_Python.yml +++ b/.github/workflows/beam_PreCommit_Python.yml @@ -79,19 +79,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PreCommit_PythonDocker.yml b/.github/workflows/beam_PreCommit_PythonDocker.yml index 19ecf593a67c9..669f316549c56 100644 --- a/.github/workflows/beam_PreCommit_PythonDocker.yml +++ b/.github/workflows/beam_PreCommit_PythonDocker.yml @@ -78,23 +78,12 @@ jobs: comment_phrase: ${{matrix.job_phrase}} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Install Go - uses: actions/setup-go@v4 - with: - go-version: '1.16' - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + go-version: 1.16 - name: Setup Buildx uses: docker/setup-buildx-action@v2 with: diff --git a/.github/workflows/beam_PreCommit_PythonDocs.yml b/.github/workflows/beam_PreCommit_PythonDocs.yml index e3ffd5a70cd71..a67e8afa3a4eb 100644 --- a/.github/workflows/beam_PreCommit_PythonDocs.yml +++ b/.github/workflows/beam_PreCommit_PythonDocs.yml @@ -78,19 +78,11 @@ jobs: comment_phrase: ${{matrix.job_phrase}} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Install Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 + python-version: 3.8 - name: run pythonDocsPreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_PythonFormatter.yml b/.github/workflows/beam_PreCommit_PythonFormatter.yml index 08fa9dabc6626..1a3335b370e91 100644 --- a/.github/workflows/beam_PreCommit_PythonFormatter.yml +++ b/.github/workflows/beam_PreCommit_PythonFormatter.yml @@ -77,19 +77,11 @@ jobs: comment_phrase: ${{matrix.job_phrase}} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Install Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - - name: Setup Gradle - uses: 
gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 + python-version: 3.8 - name: run pythonFormatterPreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_PythonLint.yml b/.github/workflows/beam_PreCommit_PythonLint.yml index f26f423101d9c..9d290f1ba86d0 100644 --- a/.github/workflows/beam_PreCommit_PythonLint.yml +++ b/.github/workflows/beam_PreCommit_PythonLint.yml @@ -77,23 +77,12 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Install Go - uses: actions/setup-go@v4 - with: - go-version: '1.16' - - name: Install Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 + python-version: 3.8 + go-version: 1.16 - name: run pythonLintPreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_Python_Coverage.yml b/.github/workflows/beam_PreCommit_Python_Coverage.yml index 9bb0b83c7a634..65002f9da8940 100644 --- a/.github/workflows/beam_PreCommit_Python_Coverage.yml +++ b/.github/workflows/beam_PreCommit_Python_Coverage.yml @@ -77,19 +77,11 @@ jobs: comment_phrase: ${{matrix.job_phrase}} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - python-version: '3.8' - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 + python-version: 3.8 - name: Run preCommitPyCoverage uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_Python_Dataframes.yml b/.github/workflows/beam_PreCommit_Python_Dataframes.yml index ab7d07cc72ad6..a7a0ec1836ce3 100644 --- a/.github/workflows/beam_PreCommit_Python_Dataframes.yml +++ b/.github/workflows/beam_PreCommit_Python_Dataframes.yml @@ -79,19 +79,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase}} ${{ matrix.python_version}} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name}} (${{ matrix.job_phrase}} ${{ matrix.python_version}}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PreCommit_Python_Examples.yml b/.github/workflows/beam_PreCommit_Python_Examples.yml index f82aef86de524..1b03b4c0a35a3 100644 --- a/.github/workflows/beam_PreCommit_Python_Examples.yml +++ b/.github/workflows/beam_PreCommit_Python_Examples.yml @@ -79,19 +79,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} 
github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PreCommit_Python_Integration.yml b/.github/workflows/beam_PreCommit_Python_Integration.yml index 2ef26516d15c8..5a1b7bc720f37 100644 --- a/.github/workflows/beam_PreCommit_Python_Integration.yml +++ b/.github/workflows/beam_PreCommit_Python_Integration.yml @@ -79,19 +79,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3.8.0 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PreCommit_Python_PVR_Flink.yml b/.github/workflows/beam_PreCommit_Python_PVR_Flink.yml index 131ec8c634a82..a0011354749aa 100644 --- a/.github/workflows/beam_PreCommit_Python_PVR_Flink.yml +++ b/.github/workflows/beam_PreCommit_Python_PVR_Flink.yml @@ -71,6 +71,11 @@ concurrency: group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login}}' cancel-in-progress: true +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + jobs: beam_PreCommit_Python_PVR_Flink: name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) diff --git a/.github/workflows/beam_PreCommit_Python_Runners.yml b/.github/workflows/beam_PreCommit_Python_Runners.yml index c9f462d385a95..775af7f39d240 100644 --- a/.github/workflows/beam_PreCommit_Python_Runners.yml +++ b/.github/workflows/beam_PreCommit_Python_Runners.yml @@ -79,19 +79,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PreCommit_Python_Transforms.yml b/.github/workflows/beam_PreCommit_Python_Transforms.yml index 
0627d8d5393af..291dcde8665a0 100644 --- a/.github/workflows/beam_PreCommit_Python_Transforms.yml +++ b/.github/workflows/beam_PreCommit_Python_Transforms.yml @@ -79,19 +79,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: + java-version: 8 python-version: ${{ matrix.python_version }} - - name: Install Java - uses: actions/setup-java@v3 - with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set PY_VER_CLEAN id: set_py_ver_clean run: | diff --git a/.github/workflows/beam_PreCommit_RAT.yml b/.github/workflows/beam_PreCommit_RAT.yml index 5c87644858b74..390413fb7aca9 100644 --- a/.github/workflows/beam_PreCommit_RAT.yml +++ b/.github/workflows/beam_PreCommit_RAT.yml @@ -76,15 +76,10 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 - name: run RAT script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_Spotless.yml b/.github/workflows/beam_PreCommit_Spotless.yml index ba748cd7b136b..552a92e104a1e 100644 --- a/.github/workflows/beam_PreCommit_Spotless.yml +++ b/.github/workflows/beam_PreCommit_Spotless.yml @@ -62,6 +62,11 @@ permissions: security-events: read statuses: read +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + jobs: beam_PreCommit_Spotless: name: ${{matrix.job_name}} (${{matrix.job_phrase}}) diff --git a/.github/workflows/beam_PreCommit_Typescript.yml b/.github/workflows/beam_PreCommit_Typescript.yml index e83e04a1bae4f..21c760f2e5254 100644 --- a/.github/workflows/beam_PreCommit_Typescript.yml +++ b/.github/workflows/beam_PreCommit_Typescript.yml @@ -79,15 +79,11 @@ jobs: comment_phrase: ${{matrix.job_phrase}} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Setup self-hosted - uses: ./.github/actions/setup-self-hosted-action + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - requires-py-39: false - requires-go: false - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + python-version: 3.8 + java-version: 8 - name: run typescriptPreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_Website.yml b/.github/workflows/beam_PreCommit_Website.yml index 81fc578bc3457..87218dc280334 100644 --- a/.github/workflows/beam_PreCommit_Website.yml +++ b/.github/workflows/beam_PreCommit_Website.yml @@ -78,15 +78,10 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: 
Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 - name: run websitePreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_PreCommit_Website_Stage_GCS.yml b/.github/workflows/beam_PreCommit_Website_Stage_GCS.yml index cf9423f81670b..f910f9a88da39 100644 --- a/.github/workflows/beam_PreCommit_Website_Stage_GCS.yml +++ b/.github/workflows/beam_PreCommit_Website_Stage_GCS.yml @@ -82,15 +82,11 @@ jobs: github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - name: Echo PR number run: echo "ghprbPullId=${{ github.event.pull_request.number || github.event.issue.number }}" >> $GITHUB_ENV - - name: Setup self-hosted - uses: ./.github/actions/setup-self-hosted-action + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - requires-py-39: false - requires-go: false - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + python-version: 3.8 + java-version: 8 - name: Authenticate on GCP uses: google-github-actions/setup-gcloud@v0 with: diff --git a/.github/workflows/beam_PreCommit_Whitespace.yml b/.github/workflows/beam_PreCommit_Whitespace.yml index b8ef21c8b0777..03a976cfe444e 100644 --- a/.github/workflows/beam_PreCommit_Whitespace.yml +++ b/.github/workflows/beam_PreCommit_Whitespace.yml @@ -77,19 +77,11 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Install Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 + python-version: 3.8 - name: run whitespacePreCommit script uses: ./.github/actions/gradle-command-self-hosted-action with: diff --git a/.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml b/.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml index b56d81f17ca8f..d4c9178e6c91e 100644 --- a/.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml +++ b/.github/workflows/beam_Python_ValidatesContainer_Dataflow_ARM.yml @@ -76,14 +76,10 @@ jobs: comment_phrase: ${{ matrix.job_phrase }} ${{ matrix.python_version }} github_token: ${{ secrets.GITHUB_TOKEN }} github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }} ${{ matrix.python_version }}) - - name: Install Python - uses: actions/setup-python@v4 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: python-version: ${{ matrix.python_version }} - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 - name: Authenticate on GCP diff --git a/.github/workflows/beam_Release_NightlySnapshot.yml b/.github/workflows/beam_Release_NightlySnapshot.yml index a80105d2a7cb3..a4be830cd3c50 100644 --- a/.github/workflows/beam_Release_NightlySnapshot.yml +++ b/.github/workflows/beam_Release_NightlySnapshot.yml @@ -61,15 +61,10 @@ jobs: github_job: ${{matrix.job_name}} github_token: ${{ 
secrets.GITHUB_TOKEN }} comment_phrase: "Release Nightly Snapshot" - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + java-version: 8 - name: Auth on snapshot repository run: | mkdir -p ${HOME}/.m2 diff --git a/.github/workflows/beam_Release_Python_NightlySnapshot.yml b/.github/workflows/beam_Release_Python_NightlySnapshot.yml index 58f879d831482..62019c536969f 100644 --- a/.github/workflows/beam_Release_Python_NightlySnapshot.yml +++ b/.github/workflows/beam_Release_Python_NightlySnapshot.yml @@ -60,19 +60,11 @@ jobs: github_job: ${{matrix.job_name}} github_token: ${{ secrets.GITHUB_TOKEN }} comment_phrase: ${{matrix.job_phrase}} - - name: Install Java - uses: actions/setup-java@v3.8.0 + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - distribution: 'zulu' - java-version: '8' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' + java-version: 8 + python-version: 3.8 - name: Authenticate on GCP uses: google-github-actions/setup-gcloud@v0 with: diff --git a/.github/workflows/java_tests.yml b/.github/workflows/java_tests.yml index d1180eebdf812..ceff29b50d4f5 100644 --- a/.github/workflows/java_tests.yml +++ b/.github/workflows/java_tests.yml @@ -77,11 +77,11 @@ jobs: with: persist-credentials: false submodules: recursive - - name: Setup self-hosted - uses: ./.github/actions/setup-self-hosted-action + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - requires-py-38: false - requires-py-39: false + java-version: 8 + go-version: 1.21 - name: Remove default github maven configuration # This step is a workaround to avoid a decryption issue of Beam's # net.linguica.gradle.maven.settings plugin and github's provided maven @@ -136,12 +136,11 @@ jobs: with: persist-credentials: false submodules: recursive - - name: Setup self-hosted - uses: ./.github/actions/setup-self-hosted-action + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: - requires-py-38: false - requires-py-39: false - + java-version: 8 + go-version: 1.21 - name: Remove default github maven configuration # This step is a workaround to avoid a decryption issue of Beam's # net.linguica.gradle.maven.settings plugin and github's provided maven @@ -180,11 +179,11 @@ jobs: with: persist-credentials: false submodules: recursive - - name: Setup self-hosted - uses: ./.github/actions/setup-self-hosted-action - with: - requires-py-38: false - requires-py-39: false + - name: Setup environment + uses: ./.github/actions/setup-environment-action + with: + java-version: 8 + go-version: 1.21 - name: Authenticate on GCP uses: google-github-actions/setup-gcloud@v0 with: diff --git a/.github/workflows/playground_backend_precommit.yml b/.github/workflows/playground_backend_precommit.yml index de9edd4f2a940..114ca4aac1cb3 100644 --- a/.github/workflows/playground_backend_precommit.yml +++ b/.github/workflows/playground_backend_precommit.yml @@ -38,19 +38,12 @@ jobs: steps: - name: Check out the repo uses: actions/checkout@v4 - - - uses: actions/setup-python@v4 - with: - python-version: '${{ env.PYTHON_VERSION }}' - - uses: actions/setup-java@v3.8.0 + + - name: Setup environment + uses: 
./.github/actions/setup-environment-action with: - distribution: 'zulu' java-version: '${{ env.JAVA_VERSION }}' - - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false + python-version: '${{ env.PYTHON_VERSION }}' - name: Add GOPATH/bin to PATH run: echo "PATH=$PATH:$(go env GOPATH)/bin" >> $GITHUB_ENV diff --git a/.github/workflows/tour_of_beam_backend_integration.yml b/.github/workflows/tour_of_beam_backend_integration.yml index 1eb4b66f58683..8f56d3f2e2fae 100644 --- a/.github/workflows/tour_of_beam_backend_integration.yml +++ b/.github/workflows/tour_of_beam_backend_integration.yml @@ -75,16 +75,13 @@ jobs: working-directory: ./learning/tour-of-beam/backend steps: - uses: actions/checkout@v4 - - uses: actions/setup-go@v4 + + - name: Setup environment + uses: ./.github/actions/setup-environment-action with: # pin to the biggest Go version supported by Cloud Functions runtime go-version: '1.16' - - name: Setup Gradle - uses: gradle/gradle-build-action@v2 - with: - cache-read-only: false - - name: Build Playground router image run: ./gradlew -i playground:backend:containers:router:docker working-directory: ${{ env.GITHUB_WORKSPACE }} diff --git a/.github/workflows/update_python_dependencies.yml b/.github/workflows/update_python_dependencies.yml index 43bed87a42f3c..b4b839c3204c9 100644 --- a/.github/workflows/update_python_dependencies.yml +++ b/.github/workflows/update_python_dependencies.yml @@ -50,7 +50,13 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - name: Setup environment - uses: ./.github/actions/setup-self-hosted-action + uses: ./.github/actions/setup-environment-action + with: + python-version: | 3.8 3.9 + java-version: 8 + go-version: 1.21 - name: Update Python Dependencies uses: ./.github/actions/gradle-command-self-hosted-action with: From a7335cb0b3c65174b5eb04b072fa48f28d1ff7cf Mon Sep 17 00:00:00 2001 From: caneff Date: Wed, 20 Sep 2023 11:26:38 -0400 Subject: [PATCH 08/34] Remove deprecated week and weekofyear for Pandas 2 (#28492) --- sdks/python/apache_beam/dataframe/frames.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py index 7929c879bdd27..80af501cfacba 100644 --- a/sdks/python/apache_beam/dataframe/frames.py +++ b/sdks/python/apache_beam/dataframe/frames.py @@ -5377,11 +5377,12 @@ def func(df, *args, **kwargs): 'second', 'time', 'timetz', - 'week', 'weekday', - 'weekofyear', 'year', ] +# Pandas 2 removed these. +if PD_VERSION < (2, 0): + ELEMENTWISE_DATETIME_PROPERTIES += ['week', 'weekofyear'] for method in ELEMENTWISE_DATETIME_PROPERTIES: setattr(_DeferredDatetimeMethods, method, From 932744801e67c7d03560114a78881be62aadfa19 Mon Sep 17 00:00:00 2001 From: caneff Date: Wed, 20 Sep 2023 11:28:22 -0400 Subject: [PATCH 09/34] Change pd.core.strings.StringMethods for Pandas 2 compatibility. 
(#28455) --- sdks/python/apache_beam/dataframe/frames.py | 34 +++++++++---------- .../apache_beam/dataframe/frames_test.py | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py index 80af501cfacba..a74ccbba041ae 100644 --- a/sdks/python/apache_beam/dataframe/frames.py +++ b/sdks/python/apache_beam/dataframe/frames.py @@ -4931,9 +4931,9 @@ def __setitem__(self, index, value): class _DeferredStringMethods(frame_base.DeferredBase): - @frame_base.with_docs_from(pd.core.strings.StringMethods) - @frame_base.args_to_kwargs(pd.core.strings.StringMethods) - @frame_base.populate_defaults(pd.core.strings.StringMethods) + @frame_base.with_docs_from(pd.Series.str) + @frame_base.args_to_kwargs(pd.Series.str) + @frame_base.populate_defaults(pd.Series.str) def cat(self, others, join, **kwargs): """If defined, ``others`` must be a :class:`DeferredSeries` or a ``list`` of ``DeferredSeries``.""" @@ -4973,8 +4973,8 @@ def func(*args): requires_partition_by=requires, preserves_partition_by=partitionings.Arbitrary())) - @frame_base.with_docs_from(pd.core.strings.StringMethods) - @frame_base.args_to_kwargs(pd.core.strings.StringMethods) + @frame_base.with_docs_from(pd.Series.str) + @frame_base.args_to_kwargs(pd.Series.str) def repeat(self, repeats): """``repeats`` must be an ``int`` or a :class:`DeferredSeries`. Lists are not supported because they make this operation order-sensitive.""" @@ -5011,8 +5011,8 @@ def repeat(self, repeats): raise TypeError("str.repeat(repeats=) value must be an int or a " f"DeferredSeries (encountered {type(repeats)}).") - @frame_base.with_docs_from(pd.core.strings.StringMethods) - @frame_base.args_to_kwargs(pd.core.strings.StringMethods) + @frame_base.with_docs_from(pd.Series.str) + @frame_base.args_to_kwargs(pd.Series.str) def get_dummies(self, **kwargs): """ Series must be categorical dtype. Please cast to ``CategoricalDtype`` @@ -5094,9 +5094,9 @@ def func(s): requires_partition_by=partitionings.Arbitrary(), preserves_partition_by=partitionings.Arbitrary())) - @frame_base.with_docs_from(pd.core.strings.StringMethods) - @frame_base.args_to_kwargs(pd.core.strings.StringMethods) - @frame_base.populate_defaults(pd.core.strings.StringMethods) + @frame_base.with_docs_from(pd.Series.str) + @frame_base.args_to_kwargs(pd.Series.str) + @frame_base.populate_defaults(pd.Series.str) def split(self, **kwargs): """ Like other non-deferred methods, dtype must be CategoricalDtype. @@ -5105,9 +5105,9 @@ def split(self, **kwargs): """ return self._split_helper(rsplit=False, **kwargs) - @frame_base.with_docs_from(pd.core.strings.StringMethods) - @frame_base.args_to_kwargs(pd.core.strings.StringMethods) - @frame_base.populate_defaults(pd.core.strings.StringMethods) + @frame_base.with_docs_from(pd.Series.str) + @frame_base.args_to_kwargs(pd.Series.str) + @frame_base.populate_defaults(pd.Series.str) def rsplit(self, **kwargs): """ Like other non-deferred methods, dtype must be CategoricalDtype. 
@@ -5185,17 +5185,17 @@ def func(df, *args, **kwargs): return func for method in ELEMENTWISE_STRING_METHODS: - if not hasattr(pd.core.strings.StringMethods, method): + if not hasattr(pd.Series.str, method): # older versions (1.0.x) don't support some of these methods continue setattr(_DeferredStringMethods, method, frame_base._elementwise_method(make_str_func(method), name=method, - base=pd.core.strings.StringMethods)) + base=pd.Series.str)) for method in NON_ELEMENTWISE_STRING_METHODS: - if not hasattr(pd.core.strings.StringMethods, method): + if not hasattr(pd.Series.str, method): # older versions (1.0.x) don't support some of these methods continue setattr(_DeferredStringMethods, @@ -5203,7 +5203,7 @@ def func(df, *args, **kwargs): frame_base._proxy_method( make_str_func(method), name=method, - base=pd.core.strings.StringMethods, + base=pd.Series.str, requires_partition_by=partitionings.Arbitrary(), preserves_partition_by=partitionings.Singleton())) diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py index 257d77e0a6b3b..4998683461b9b 100644 --- a/sdks/python/apache_beam/dataframe/frames_test.py +++ b/sdks/python/apache_beam/dataframe/frames_test.py @@ -2986,7 +2986,7 @@ class DocstringTest(unittest.TestCase): (frames.DeferredDataFrame, pd.DataFrame), (frames.DeferredSeries, pd.Series), #(frames._DeferredIndex, pd.Index), - (frames._DeferredStringMethods, pd.core.strings.StringMethods), + (frames._DeferredStringMethods, pd.Series.str), ( frames._DeferredCategoricalMethods, pd.core.arrays.categorical.CategoricalAccessor), From 840c85883fcf58834c4c0ff5060e13c5d3500184 Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Wed, 20 Sep 2023 08:30:29 -0700 Subject: [PATCH 10/34] [#28187][prism] Basic cross language support. (#28545) * Move wk.Stop() to context cancel. * [prism] Basic Xlang support. 
--------- Co-authored-by: lostluck <13907733+lostluck@users.noreply.github.com> --- .../pkg/beam/core/runtime/exec/fullvalue.go | 6 +++ .../runners/prism/internal/environments.go | 2 + .../beam/runners/prism/internal/execute.go | 52 +++++++++++-------- .../prism/internal/jobservices/management.go | 20 ++++--- .../pkg/beam/runners/prism/internal/stage.go | 16 +++--- .../beam/runners/prism/internal/urns/urns.go | 2 + .../runners/prism/internal/worker/bundle.go | 2 +- .../runners/prism/internal/worker/worker.go | 24 ++++----- .../prism/internal/worker/worker_test.go | 14 +---- sdks/go/test/integration/integration.go | 4 +- 10 files changed, 76 insertions(+), 66 deletions(-) diff --git a/sdks/go/pkg/beam/core/runtime/exec/fullvalue.go b/sdks/go/pkg/beam/core/runtime/exec/fullvalue.go index aaa049510f525..0a9343199a1ca 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/fullvalue.go +++ b/sdks/go/pkg/beam/core/runtime/exec/fullvalue.go @@ -251,6 +251,9 @@ func (s *decodeStream) Read() (*FullValue, error) { } err := s.d.DecodeTo(s.r, &s.ret) if err != nil { + if err == io.EOF { + return nil, io.EOF + } return nil, errors.Wrap(err, "decodeStream value decode failed") } s.next++ @@ -342,6 +345,9 @@ func (s *decodeMultiChunkStream) Read() (*FullValue, error) { if s.chunk == 0 && s.next == 0 { chunk, err := coder.DecodeVarInt(s.r.reader) if err != nil { + if err == io.EOF { + return nil, io.EOF + } return nil, errors.Wrap(err, "decodeMultiChunkStream chunk size decoding failed") } s.chunk = chunk diff --git a/sdks/go/pkg/beam/runners/prism/internal/environments.go b/sdks/go/pkg/beam/runners/prism/internal/environments.go index d4fb6ad5b3e1b..7d54cb366ffeb 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/environments.go +++ b/sdks/go/pkg/beam/runners/prism/internal/environments.go @@ -99,6 +99,7 @@ func externalEnvironment(ctx context.Context, ep *pipepb.ExternalPayload, wk *wo pool.StopWorker(context.Background(), &fnpb.StopWorkerRequest{ WorkerId: wk.ID, }) + wk.Stop() } func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.DockerPayload, wk *worker.W, artifactEndpoint string) error { @@ -170,6 +171,7 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock // Start goroutine to wait on container state. go func() { defer cli.Close() + defer wk.Stop() statusCh, errCh := cli.ContainerWait(ctx, containerID, container.WaitConditionNotRunning) select { diff --git a/sdks/go/pkg/beam/runners/prism/internal/execute.go b/sdks/go/pkg/beam/runners/prism/internal/execute.go index cf04381b9cbe3..c1ac6ea4488c2 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/execute.go +++ b/sdks/go/pkg/beam/runners/prism/internal/execute.go @@ -20,6 +20,7 @@ import ( "fmt" "io" "sort" + "sync/atomic" "time" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/mtime" @@ -46,15 +47,14 @@ func RunPipeline(j *jobservices.Job) { // here, we only want and need the go one, operating // in loopback mode. envs := j.Pipeline.GetComponents().GetEnvironments() - if len(envs) != 1 { - j.Failed(fmt.Errorf("unable to execute multi-environment pipelines;\npipeline has environments: %+v", envs)) - return - } - env, _ := getOnlyPair(envs) - wk, err := makeWorker(env, j) - if err != nil { - j.Failed(err) - return + wks := map[string]*worker.W{} + for envID := range envs { + wk, err := makeWorker(envID, j) + if err != nil { + j.Failed(err) + return + } + wks[envID] = wk } // When this function exits, we cancel the context to clear // any related job resources. 
@@ -65,15 +65,12 @@ j.SendMsg("running " + j.String()) j.Running() - if err := executePipeline(j.RootCtx, wk, j); err != nil { + if err := executePipeline(j.RootCtx, wks, j); err != nil { j.Failed(err) return } j.SendMsg("pipeline completed " + j.String()) - // Stop the worker. - wk.Stop() - j.SendMsg("terminating " + j.String()) j.Done() } @@ -95,7 +92,7 @@ func makeWorker(env string, j *jobservices.Job) (*worker.W, error) { // Check for connection succeeding after we've created the environment successfully. timeout := 1 * time.Minute time.AfterFunc(timeout, func() { - if wk.Connected() { + if wk.Connected() || wk.Stopped() { return } err := fmt.Errorf("prism %v didn't get control connection to %v after %v", wk, wk.Endpoint(), timeout) @@ -115,7 +112,7 @@ type processor struct { transformExecuters map[string]transformExecuter } -func executePipeline(ctx context.Context, wk *worker.W, j *jobservices.Job) error { +func executePipeline(ctx context.Context, wks map[string]*worker.W, j *jobservices.Job) error { pipeline := j.Pipeline comps := proto.Clone(pipeline.GetComponents()).(*pipepb.Components) @@ -158,7 +155,12 @@ func executePipeline(ctx context.Context, wk *worker.W, j *jobservices.Job) erro // TODO move this loop and code into the preprocessor instead. stages := map[string]*stage{} var impulses []string - for _, stage := range topo { + + // Initialize the "dataservice cache" to support side inputs. + // TODO(https://github.com/apache/beam/issues/28543), remove this concept. + ds := &worker.DataService{} + + for i, stage := range topo { tid := stage.transforms[0] t := ts[tid] urn := t.GetSpec().GetUrn() @@ -169,11 +171,11 @@ func executePipeline(ctx context.Context, wk *worker.W, j *jobservices.Job) erro if stage.exe != nil { stage.envID = stage.exe.ExecuteWith(t) } - stage.ID = wk.NextStage() + stage.ID = fmt.Sprintf("stage-%03d", i) + wk := wks[stage.envID] switch stage.envID { case "": // Runner Transforms - var onlyOut string for _, out := range t.GetOutputs() { onlyOut = out @@ -232,10 +234,8 @@ func executePipeline(ctx context.Context, wk *worker.W, j *jobservices.Job) erro em.AddStage(stage.ID, inputs, nil, []string{getOnlyValue(t.GetOutputs())}) } stages[stage.ID] = stage - wk.Descriptors[stage.ID] = stage.desc case wk.Env: - // Great! this is for this environment. // Broken abstraction. 
- if err := buildDescriptor(stage, comps, wk); err != nil { + if err := buildDescriptor(stage, comps, wk, ds); err != nil { return fmt.Errorf("prism error building stage %v: \n%w", stage.ID, err) } stages[stage.ID] = stage @@ -259,7 +259,12 @@ func executePipeline(ctx context.Context, wk *worker.W, j *jobservices.Job) erro maxParallelism := make(chan struct{}, 8) // Execute stages here bundleFailed := make(chan error) - bundles := em.Bundles(ctx, wk.NextInst) + + var instID uint64 + bundles := em.Bundles(ctx, func() string { + return fmt.Sprintf("inst%03d", atomic.AddUint64(&instID, 1)) + }) + for { select { case <-ctx.Done(): @@ -273,7 +278,8 @@ func executePipeline(ctx context.Context, wk *worker.W, j *jobservices.Job) erro go func(rb engine.RunBundle) { defer func() { <-maxParallelism }() s := stages[rb.StageID] - if err := s.Execute(ctx, j, wk, comps, em, rb); err != nil { + wk := wks[s.envID] + if err := s.Execute(ctx, j, wk, ds, comps, em, rb); err != nil { // Ensure we clean up on bundle failure em.FailBundle(rb) bundleFailed <- err diff --git a/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go b/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go index 213e33a783795..0fd7381e17f4b 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go +++ b/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go @@ -93,14 +93,18 @@ func (s *Server) Prepare(ctx context.Context, req *jobpb.PrepareJobRequest) (*jo return nil, err } var errs []error - check := func(feature string, got, want any) { - if got != want { - err := unimplementedError{ - feature: feature, - value: got, + check := func(feature string, got any, wants ...any) { + for _, want := range wants { + if got == want { + return } - errs = append(errs, err) } + + err := unimplementedError{ + feature: feature, + value: got, + } + errs = append(errs, err) } // Inspect Transforms for unsupported features. @@ -114,6 +118,8 @@ func (s *Server) Prepare(ctx context.Context, req *jobpb.PrepareJobRequest) (*jo urns.TransformGBK, urns.TransformFlatten, urns.TransformCombinePerKey, + urns.TransformCombineGlobally, // Used by Java SDK + urns.TransformCombineGroupedValues, // Used by Java SDK urns.TransformAssignWindows: // Very few expected transforms types for submitted pipelines. // Most URNs are for the runner to communicate back to the SDK for execution. @@ -154,7 +160,7 @@ func (s *Server) Prepare(ctx context.Context, req *jobpb.PrepareJobRequest) (*jo check("WindowingStrategy.MergeStatus", ws.GetMergeStatus(), pipepb.MergeStatus_NON_MERGING) } if !bypassedWindowingStrategies[wsID] { - check("WindowingStrategy.OnTimeBehavior", ws.GetOnTimeBehavior(), pipepb.OnTimeBehavior_FIRE_IF_NONEMPTY) + check("WindowingStrategy.OnTimeBehavior", ws.GetOnTimeBehavior(), pipepb.OnTimeBehavior_FIRE_IF_NONEMPTY, pipepb.OnTimeBehavior_FIRE_ALWAYS) check("WindowingStrategy.OutputTime", ws.GetOutputTime(), pipepb.OutputTime_END_OF_WINDOW) // Non nil triggers should fail. 
if ws.GetTrigger().GetDefault() == nil { diff --git a/sdks/go/pkg/beam/runners/prism/internal/stage.go b/sdks/go/pkg/beam/runners/prism/internal/stage.go index 4d8d4621168de..4ce3ce7ffeb6e 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/stage.go +++ b/sdks/go/pkg/beam/runners/prism/internal/stage.go @@ -75,7 +75,7 @@ type stage struct { OutputsToCoders map[string]engine.PColInfo } -func (s *stage) Execute(ctx context.Context, j *jobservices.Job, wk *worker.W, comps *pipepb.Components, em *engine.ElementManager, rb engine.RunBundle) error { +func (s *stage) Execute(ctx context.Context, j *jobservices.Job, wk *worker.W, ds *worker.DataService, comps *pipepb.Components, em *engine.ElementManager, rb engine.RunBundle) error { slog.Debug("Execute: starting bundle", "bundle", rb) var b *worker.B @@ -204,8 +204,8 @@ progress: md := wk.MonitoringMetadata(ctx, unknownIDs) j.AddMetricShortIDs(md) } - // TODO handle side input data properly. - wk.D.Commit(b.OutputData) + // TODO(https://github.com/apache/beam/issues/28543) handle side input data properly. + ds.Commit(b.OutputData) var residualData [][]byte var minOutputWatermark map[string]mtime.Time for _, rr := range resp.GetResidualRoots() { @@ -270,7 +270,7 @@ func portFor(wInCid string, wk *worker.W) []byte { // It assumes that the side inputs are not sourced from PCollections generated by any transform in this stage. // // Because we need the local ids for routing the sources/sinks information. -func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W) error { +func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W, ds *worker.DataService) error { // Assume stage has an indicated primary input coders := map[string]*pipepb.Coder{} @@ -327,7 +327,7 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W) error { // Update side inputs to point to new PCollection with any replaced coders. transforms[si.transform].GetInputs()[si.local] = newGlobal } - prepSide, err := handleSideInput(si.transform, si.local, si.global, comps, coders, wk) + prepSide, err := handleSideInput(si.transform, si.local, si.global, comps, coders, ds) if err != nil { slog.Error("buildDescriptor: handleSideInputs", err, slog.String("transformID", si.transform)) return err @@ -392,7 +392,7 @@ func buildDescriptor(stg *stage, comps *pipepb.Components, wk *worker.W) error { } // handleSideInput returns a closure that will look up the data for a side input appropriate for the given watermark. -func handleSideInput(tid, local, global string, comps *pipepb.Components, coders map[string]*pipepb.Coder, wk *worker.W) (func(b *worker.B, watermark mtime.Time), error) { +func handleSideInput(tid, local, global string, comps *pipepb.Components, coders map[string]*pipepb.Coder, ds *worker.DataService) (func(b *worker.B, watermark mtime.Time), error) { t := comps.GetTransforms()[tid] sis, err := getSideInputs(t) if err != nil { @@ -412,7 +412,7 @@ func handleSideInput(tid, local, global string, comps *pipepb.Components, coders global, local := global, local return func(b *worker.B, watermark mtime.Time) { - data := wk.D.GetAllData(global) + data := ds.GetAllData(global) if b.IterableSideInputData == nil { b.IterableSideInputData = map[string]map[string]map[typex.Window][][]byte{} @@ -447,7 +447,7 @@ func handleSideInput(tid, local, global string, comps *pipepb.Components, coders global, local := global, local return func(b *worker.B, watermark mtime.Time) { // May be of zero length, but that's OK. Side inputs can be empty. 
- data := wk.D.GetAllData(global) + data := ds.GetAllData(global) if b.MultiMapSideInputData == nil { b.MultiMapSideInputData = map[string]map[string]map[typex.Window]map[string][][]byte{} } diff --git a/sdks/go/pkg/beam/runners/prism/internal/urns/urns.go b/sdks/go/pkg/beam/runners/prism/internal/urns/urns.go index 9fc2c1a923c5d..bf1e36656661b 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/urns/urns.go +++ b/sdks/go/pkg/beam/runners/prism/internal/urns/urns.go @@ -57,7 +57,9 @@ var ( // SDK transforms. TransformParDo = ptUrn(pipepb.StandardPTransforms_PAR_DO) TransformCombinePerKey = ctUrn(pipepb.StandardPTransforms_COMBINE_PER_KEY) + TransformCombineGlobally = ctUrn(pipepb.StandardPTransforms_COMBINE_GLOBALLY) TransformReshuffle = ctUrn(pipepb.StandardPTransforms_RESHUFFLE) + TransformCombineGroupedValues = cmbtUrn(pipepb.StandardPTransforms_COMBINE_GROUPED_VALUES) TransformPreCombine = cmbtUrn(pipepb.StandardPTransforms_COMBINE_PER_KEY_PRECOMBINE) TransformMerge = cmbtUrn(pipepb.StandardPTransforms_COMBINE_PER_KEY_MERGE_ACCUMULATORS) TransformExtract = cmbtUrn(pipepb.StandardPTransforms_COMBINE_PER_KEY_EXTRACT_OUTPUTS) diff --git a/sdks/go/pkg/beam/runners/prism/internal/worker/bundle.go b/sdks/go/pkg/beam/runners/prism/internal/worker/bundle.go index 98479e3db0710..573bdf4aeb9db 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/worker/bundle.go +++ b/sdks/go/pkg/beam/runners/prism/internal/worker/bundle.go @@ -93,7 +93,7 @@ func (b *B) Respond(resp *fnpb.InstructionResponse) { } b.responded = true if resp.GetError() != "" { - b.BundleErr = fmt.Errorf("bundle %v failed:%v", resp.GetInstructionId(), resp.GetError()) + b.BundleErr = fmt.Errorf("bundle %v %v failed:%v", resp.GetInstructionId(), b.PBDID, resp.GetError()) close(b.Resp) return } diff --git a/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go b/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go index f33ff178c46d4..4968c9eb433e3 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go +++ b/sdks/go/pkg/beam/runners/prism/internal/worker/worker.go @@ -67,7 +67,7 @@ type W struct { server *grpc.Server // These are the ID sources - inst, bund uint64 + inst uint64 connected, stopped atomic.Bool InstReqs chan *fnpb.InstructionRequest @@ -76,8 +76,6 @@ type W struct { mu sync.Mutex activeInstructions map[string]controlResponder // Active instructions keyed by InstructionID Descriptors map[string]*fnpb.ProcessBundleDescriptor // Stages keyed by PBDID - - D *DataService } type controlResponder interface { @@ -104,8 +102,6 @@ func New(id, env string) *W { activeInstructions: make(map[string]controlResponder), Descriptors: make(map[string]*fnpb.ProcessBundleDescriptor), - - D: &DataService{}, } slog.Debug("Serving Worker components", slog.String("endpoint", wk.Endpoint())) fnpb.RegisterBeamFnControlServer(wk.server, wk) @@ -149,11 +145,7 @@ func (wk *W) Stop() { } func (wk *W) NextInst() string { - return fmt.Sprintf("inst%03d", atomic.AddUint64(&wk.inst, 1)) -} - -func (wk *W) NextStage() string { - return fmt.Sprintf("stage%03d", atomic.AddUint64(&wk.bund, 1)) + return fmt.Sprintf("inst-%v-%03d", wk.Env, atomic.AddUint64(&wk.inst, 1)) } // TODO set logging level. @@ -263,6 +255,11 @@ func (wk *W) Connected() bool { return wk.connected.Load() } +// Stopped indicates that the worker has stopped. +func (wk *W) Stopped() bool { + return wk.stopped.Load() +} + // Control relays instructions to SDKs and back again, coordinated via unique instructionIDs. 
// // Requests come from the runner, and are sent to the client in the SDK. @@ -312,10 +309,12 @@ func (wk *W) Control(ctrl fnpb.BeamFnControl_ControlServer) error { wk.mu.Lock() // Fail extant instructions slog.Debug("SDK Disconnected", "worker", wk, "ctx_error", ctrl.Context().Err(), "outstanding_instructions", len(wk.activeInstructions)) + + msg := fmt.Sprintf("SDK worker disconnected: %v, %v active instructions", wk.String(), len(wk.activeInstructions)) for instID, b := range wk.activeInstructions { b.Respond(&fnpb.InstructionResponse{ InstructionId: instID, - Error: "SDK Disconnected", + Error: msg, }) } wk.mu.Unlock() @@ -536,7 +535,7 @@ func (wk *W) sendInstruction(ctx context.Context, req *fnpb.InstructionRequest) req.InstructionId = progInst - if wk.stopped.Load() { + if wk.Stopped() { return nil } wk.InstReqs <- req @@ -566,6 +565,7 @@ func (wk *W) MonitoringMetadata(ctx context.Context, unknownIDs []string) *fnpb. // DataService is slated to be deleted in favour of stage based state // management for side inputs. +// TODO(https://github.com/apache/beam/issues/28543), remove this concept. type DataService struct { mu sync.Mutex // TODO actually quick process the data to windows here as well. diff --git a/sdks/go/pkg/beam/runners/prism/internal/worker/worker_test.go b/sdks/go/pkg/beam/runners/prism/internal/worker/worker_test.go index ed61f484481ca..6a90b463c45d8 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/worker/worker_test.go +++ b/sdks/go/pkg/beam/runners/prism/internal/worker/worker_test.go @@ -50,18 +50,6 @@ func TestWorker_NextInst(t *testing.T) { } } -func TestWorker_NextStage(t *testing.T) { - w := New("test", "testEnv") - - stageIDs := map[string]struct{}{} - for i := 0; i < 100; i++ { - stageIDs[w.NextStage()] = struct{}{} - } - if got, want := len(stageIDs), 100; got != want { - t.Errorf("calling w.NextStage() got %v unique ids, want %v", got, want) - } -} - func TestWorker_GetProcessBundleDescriptor(t *testing.T) { w := New("test", "testEnv") @@ -189,7 +177,7 @@ func TestWorker_Data_HappyPath(t *testing.T) { b := &B{ InstID: instID, - PBDID: wk.NextStage(), + PBDID: "teststageID", InputData: [][]byte{ {1, 1, 1, 1, 1, 1}, }, diff --git a/sdks/go/test/integration/integration.go b/sdks/go/test/integration/integration.go index bb7f5275a1638..f3cffd1761109 100644 --- a/sdks/go/test/integration/integration.go +++ b/sdks/go/test/integration/integration.go @@ -140,8 +140,8 @@ var portableFilters = []string{ } var prismFilters = []string{ - // The prism runner does not yet support cross-language. - "TestXLang.*", + // The prism runner does not yet support Java's CoGBK. 
+ "TestXLang_CoGroupBy", // The prism runner does not support the TestStream primitive "TestTestStream.*", // The trigger and pane tests uses TestStream From 7dfc0c03b1b793d9f798dc2dd93b0f81547568b3 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Wed, 20 Sep 2023 11:45:37 -0400 Subject: [PATCH 11/34] Sync it framework (#28541) * Sync it framework * spotless and checkstyle; exclude not-sync file * change to json extension to bypass RAT check --- .../cassandra/matchers/CassandraAsserts.java | 2 +- .../beam/it/common/utils/PipelineUtils.java | 21 +++++++- .../it/common/utils/PipelineUtilsTest.java | 8 ++- .../elasticsearch/ElasticsearchUtilsTest.java | 2 +- .../org/apache/beam/it/gcp/LoadTestBase.java | 53 ++++++++++++------ .../gcp/bigquery/BigQueryResourceManager.java | 1 + .../gcp/bigtable/BigtableResourceManager.java | 33 ++++++++++++ .../BigtableResourceManagerUtils.java | 2 +- .../dataflow/AbstractPipelineLauncher.java | 13 ++++- .../gcp/dataflow/DefaultPipelineLauncher.java | 6 +-- .../it/gcp/dataflow/DirectRunnerClient.java | 6 +-- .../it/gcp/datagenerator/DataGenerator.java | 54 ++++++++++--------- .../datastore/matchers/DatastoreAsserts.java | 3 +- .../beam/it/gcp/dlp/DlpResourceManager.java | 5 +- .../beam/it/gcp/kms/KMSResourceManager.java | 5 +- .../it/gcp/monitoring/MonitoringClient.java | 12 ++--- .../it/gcp/pubsub/PubsubResourceManager.java | 45 ++++++++++++---- .../src/main/resources/test-artifact.json | 1 + .../beam/it/gcp/bigquery/BigQueryIOLT.java | 18 ++++--- .../beam/it/gcp/bigtable/BigTableIOLT.java | 22 ++++---- .../bigtable/BigtableResourceManagerTest.java | 1 + .../dataflow/ClassicTemplateClientTest.java | 14 ++++- .../gcp/dataflow/FlexTemplateClientTest.java | 14 ++++- .../beam/it/gcp/storage/FileBasedIOLT.java | 14 ++--- .../gcp/storage/GcsResourceManagerTest.java | 2 +- .../src/test/resources/test-artifact.txt | 1 - .../beam/it/jdbc/MSSQLResourceManager.java | 7 +-- .../beam/it/jdbc/MySQLResourceManager.java | 4 +- .../beam/it/jdbc/OracleResourceManager.java | 8 +-- .../beam/it/jdbc/PostgresResourceManager.java | 9 ++-- .../beam/it/kafka/KafkaResourceManager.java | 11 ++-- .../org/apache/beam/it/kafka/KafkaIOLT.java | 4 +- .../it/mongodb/MongoDBResourceManager.java | 10 ++-- .../it/mongodb/matchers/MongoDBAsserts.java | 2 +- .../beam/it/neo4j/Neo4jResourceManager.java | 11 ++-- .../beam/it/splunk/SplunkResourceManager.java | 10 ++-- .../TestContainerResourceManager.java | 15 +++--- .../it/truthmatchers/LaunchInfoSubject.java | 2 +- .../beam/it/truthmatchers/RecordsSubject.java | 2 +- 39 files changed, 303 insertions(+), 150 deletions(-) create mode 100644 it/google-cloud-platform/src/main/resources/test-artifact.json delete mode 100644 it/google-cloud-platform/src/test/resources/test-artifact.txt diff --git a/it/cassandra/src/main/java/org/apache/beam/it/cassandra/matchers/CassandraAsserts.java b/it/cassandra/src/main/java/org/apache/beam/it/cassandra/matchers/CassandraAsserts.java index 61f730bf3579d..6aecc6609cfb4 100644 --- a/it/cassandra/src/main/java/org/apache/beam/it/cassandra/matchers/CassandraAsserts.java +++ b/it/cassandra/src/main/java/org/apache/beam/it/cassandra/matchers/CassandraAsserts.java @@ -31,7 +31,7 @@ public class CassandraAsserts { /** - * Convert Cassandra {@link Row} list to a list of maps. + * Convert Cassandra {@link com.datastax.oss.driver.api.core.cql.Row} list to a list of maps. * * @param rows Rows to parse. * @return List of maps to use in {@link RecordsSubject}. 
diff --git a/it/common/src/main/java/org/apache/beam/it/common/utils/PipelineUtils.java b/it/common/src/main/java/org/apache/beam/it/common/utils/PipelineUtils.java
index c696457bbdd99..d249d43d3789d 100644
--- a/it/common/src/main/java/org/apache/beam/it/common/utils/PipelineUtils.java
+++ b/it/common/src/main/java/org/apache/beam/it/common/utils/PipelineUtils.java
@@ -27,6 +27,7 @@
 import org.apache.beam.sdk.options.PipelineOptions;
 import org.apache.beam.sdk.options.PipelineOptionsFactory;
 import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.CaseFormat;
+import org.apache.commons.lang3.RandomStringUtils;
 
 /** Utilities to make working with Dataflow easier. */
 public class PipelineUtils {
@@ -73,6 +74,15 @@ public static boolean waitUntil(
     }
   }
 
+  /**
+   * Creates a job name. This method uses {@link #createJobName(String, int)} without a random suffix.
+   *
+   * @see #createJobName(String, int)
+   */
+  public static String createJobName(String prefix) {
+    return createJobName(prefix, 0);
+  }
+
   /**
    * Creates a job name.
    *
@@ -83,17 +93,24 @@ public static boolean waitUntil(
    * same prefix are requested in a short period of time.
    *
    * @param prefix a prefix for the job
+   * @param randomChars number of random characters to append to the name, to increase the
+   *     likelihood of it being unique.
    * @return the prefix plus some way of identifying it separate from other jobs with the same
    *     prefix
    */
-  public static String createJobName(String prefix) {
+  public static String createJobName(String prefix, int randomChars) {
     String convertedPrefix =
         CaseFormat.UPPER_CAMEL.converterTo(CaseFormat.LOWER_HYPHEN).convert(prefix);
     String formattedTimestamp =
         DateTimeFormatter.ofPattern("yyyyMMddHHmmssSSS")
             .withZone(ZoneId.of("UTC"))
             .format(Instant.now());
-    return String.format("%s-%s", convertedPrefix, formattedTimestamp);
+
+    String suffix = "";
+    if (randomChars > 0) {
+      suffix = "-" + RandomStringUtils.randomAlphanumeric(randomChars).toLowerCase();
+    }
+    return String.format("%s-%s%s", convertedPrefix, formattedTimestamp, suffix);
   }
 
   /** Get raw job name (without prefix) from a jobName generated by createJobName.
*/ diff --git a/it/common/src/test/java/org/apache/beam/it/common/utils/PipelineUtilsTest.java b/it/common/src/test/java/org/apache/beam/it/common/utils/PipelineUtilsTest.java index acf203b06e6e9..316283cdf7d28 100644 --- a/it/common/src/test/java/org/apache/beam/it/common/utils/PipelineUtilsTest.java +++ b/it/common/src/test/java/org/apache/beam/it/common/utils/PipelineUtilsTest.java @@ -37,7 +37,13 @@ public void testCreateJobName() { @Test public void testCreateJobNameWithUppercase() { - assertThat(createJobName("testWithUpperCase")).matches("test-with-upper-case" + "-\\d{17}"); + assertThat(createJobName("testWithUpperCase")).matches("test-with-upper-case-\\d{17}"); + } + + @Test + public void testCreateJobNameWithUppercaseSuffix() { + assertThat(createJobName("testWithUpperCase", 8)) + .matches("test-with-upper-case-\\d{17}-[a-z0-9]{8}"); } @Test diff --git a/it/elasticsearch/src/test/java/org/apache/beam/it/elasticsearch/ElasticsearchUtilsTest.java b/it/elasticsearch/src/test/java/org/apache/beam/it/elasticsearch/ElasticsearchUtilsTest.java index eb250a1c5f828..61d6b5d57c2c4 100644 --- a/it/elasticsearch/src/test/java/org/apache/beam/it/elasticsearch/ElasticsearchUtilsTest.java +++ b/it/elasticsearch/src/test/java/org/apache/beam/it/elasticsearch/ElasticsearchUtilsTest.java @@ -34,7 +34,7 @@ public class ElasticsearchUtilsTest { @Test public void testGenerateIndexNameShouldReplaceForwardSlash() { String testBaseString = "Test/DB/Name"; - String actual = generateIndexName(testBaseString); + String actual = ElasticsearchUtils.generateIndexName(testBaseString); assertThat(actual).matches("test-db-name-\\d{8}-\\d{6}-\\d{6}"); } diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java index f6e359fed9639..d9c1990ef079a 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java @@ -18,6 +18,7 @@ package org.apache.beam.it.gcp; import static org.apache.beam.it.common.logging.LogStrings.formatForLogging; +import static org.apache.beam.it.gcp.dataflow.AbstractPipelineLauncher.RUNNER_V2; import com.google.api.gax.core.CredentialsProvider; import com.google.api.gax.core.FixedCredentialsProvider; @@ -49,6 +50,7 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; import org.junit.After; import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Rule; import org.junit.rules.TestRule; import org.junit.rules.TestWatcher; @@ -107,11 +109,14 @@ protected void starting(Description description) { } }; - @Before - @SuppressFBWarnings("ST_WRITE_TO_STATIC_FROM_INSTANCE_METHOD") - public void setUp() throws IOException { + @BeforeClass + public static void setUpClass() { project = TestProperties.project(); region = TestProperties.region(); + } + + @Before + public void setUp() throws IOException { monitoringClient = MonitoringClient.builder(CREDENTIALS_PROVIDER).build(); pipelineLauncher = launcher(); pipelineOperator = new PipelineOperator(pipelineLauncher); @@ -239,7 +244,7 @@ private void computeDataflowMetrics( metrics.put("EstimatedDataProcessedGB", dataProcessed / 1e9d); } metrics.putAll(getCpuUtilizationMetrics(launchInfo.jobId(), workerTimeInterval)); - metrics.putAll(getThroughputMetrics(launchInfo.jobId(), config, workerTimeInterval)); + metrics.putAll(getThroughputMetrics(launchInfo, config, workerTimeInterval)); } 
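// Illustrative sketch, not part of the patch: computeDataflowMetrics now receives the
// full LaunchInfo rather than a bare jobId so the throughput query below can select
// the PCollection name that matches the runner. Runner V2 reports rewritten transform
// output names (e.g. "Map records/ParMultiDo(MapKVToV).out0" instead of
// "Map records.out0"), which is what the new *PCollectionV2 fields capture.
// The selection boils down to:
//
//   String inputColl =
//       RUNNER_V2.equals(launchInfo.runner())
//           ? config.inputPCollectionV2()
//           : config.inputPCollection();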
   /**
@@ -349,25 +354,30 @@ protected Map getCpuUtilizationMetrics(String jobId, TimeInterva
   /**
    * Computes throughput metrics of the given PCollection in the Dataflow job.
    *
-   * @param jobId dataflow job id
+   * @param jobInfo dataflow job LaunchInfo
    * @param config the {@link MetricsConfiguration}
    * @param timeInterval interval for the monitoring query
    * @return throughput metrics of the PCollection
    */
   protected Map<String, Double> getThroughputMetrics(
-      String jobId, MetricsConfiguration config, TimeInterval timeInterval) {
+      LaunchInfo jobInfo, MetricsConfiguration config, TimeInterval timeInterval) {
+    String jobId = jobInfo.jobId();
+    String iColl =
+        RUNNER_V2.equals(jobInfo.runner())
+            ? config.inputPCollectionV2()
+            : config.inputPCollection();
+    String oColl =
+        RUNNER_V2.equals(jobInfo.runner())
+            ? config.outputPCollectionV2()
+            : config.outputPCollection();
     List<Double> inputThroughputBytesPerSec =
-        monitoringClient.getThroughputBytesPerSecond(
-            project, jobId, config.inputPCollection(), timeInterval);
+        monitoringClient.getThroughputBytesPerSecond(project, jobId, iColl, timeInterval);
     List<Double> inputThroughputElementsPerSec =
-        monitoringClient.getThroughputElementsPerSecond(
-            project, jobId, config.inputPCollection(), timeInterval);
+        monitoringClient.getThroughputElementsPerSecond(project, jobId, iColl, timeInterval);
     List<Double> outputThroughputBytesPerSec =
-        monitoringClient.getThroughputBytesPerSecond(
-            project, jobId, config.outputPCollection(), timeInterval);
+        monitoringClient.getThroughputBytesPerSecond(project, jobId, oColl, timeInterval);
     List<Double> outputThroughputElementsPerSec =
-        monitoringClient.getThroughputElementsPerSecond(
-            project, jobId, config.outputPCollection(), timeInterval);
+        monitoringClient.getThroughputElementsPerSecond(project, jobId, oColl, timeInterval);
     return getThroughputMetrics(
         inputThroughputBytesPerSec,
         inputThroughputElementsPerSec,
@@ -495,22 +505,31 @@ public abstract static class MetricsConfiguration {
      */
     public abstract @Nullable String inputPCollection();
 
+    /** Input PCollection name under Dataflow runner v2. */
+    public abstract @Nullable String inputPCollectionV2();
+
     /**
      * Output PCollection of the Dataflow job to query additional metrics. If not provided, the
      * metrics for outputPCollection will not be calculated.
*/ public abstract @Nullable String outputPCollection(); - public static Builder builder() { + public abstract @Nullable String outputPCollectionV2(); + + public static MetricsConfiguration.Builder builder() { return new AutoValue_LoadTestBase_MetricsConfiguration.Builder(); } @AutoValue.Builder public abstract static class Builder { - public abstract Builder setInputPCollection(@Nullable String value); + public abstract MetricsConfiguration.Builder setInputPCollection(@Nullable String value); + + public abstract MetricsConfiguration.Builder setInputPCollectionV2(@Nullable String value); + + public abstract MetricsConfiguration.Builder setOutputPCollection(@Nullable String value); - public abstract Builder setOutputPCollection(@Nullable String value); + public abstract MetricsConfiguration.Builder setOutputPCollectionV2(@Nullable String value); public abstract MetricsConfiguration build(); } diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigquery/BigQueryResourceManager.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigquery/BigQueryResourceManager.java index 80bf5cfd93821..d6d348f524b29 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigquery/BigQueryResourceManager.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigquery/BigQueryResourceManager.java @@ -461,6 +461,7 @@ public synchronized void cleanupAll() throws BigQueryResourceManagerException { projectId, dataset.getDatasetId().getDataset(), table.getTableId().getTable())); } bigQuery.delete(dataset.getDatasetId()); + dataset = null; } } catch (Exception e) { throw new BigQueryResourceManagerException("Failed to delete resources.", e); diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManager.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManager.java index 1e6750cc81e41..713880229281e 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManager.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManager.java @@ -33,6 +33,7 @@ import com.google.cloud.bigtable.admin.v2.models.AppProfile.MultiClusterRoutingPolicy; import com.google.cloud.bigtable.admin.v2.models.AppProfile.RoutingPolicy; import com.google.cloud.bigtable.admin.v2.models.AppProfile.SingleClusterRoutingPolicy; +import com.google.cloud.bigtable.admin.v2.models.Cluster; import com.google.cloud.bigtable.admin.v2.models.CreateAppProfileRequest; import com.google.cloud.bigtable.admin.v2.models.CreateInstanceRequest; import com.google.cloud.bigtable.admin.v2.models.CreateTableRequest; @@ -54,6 +55,8 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; import javax.annotation.Nullable; import org.apache.beam.it.common.ResourceManager; import org.apache.commons.lang3.StringUtils; @@ -93,6 +96,8 @@ public class BigtableResourceManager implements ResourceManager { private final Set cdcEnabledTables; private boolean hasInstance; + private Iterable clusters; + private final boolean usingStaticInstance; private BigtableResourceManager(Builder builder) throws IOException { @@ -111,6 +116,7 @@ private BigtableResourceManager(Builder builder) throws IOException { this.createdTables = new ArrayList<>(); this.createdAppProfiles = new ArrayList<>(); this.cdcEnabledTables = new HashSet<>(); + this.clusters = new 
ArrayList<>(); // Check if RM was configured to use static Bigtable instance. if (builder.useStaticInstance) { @@ -223,6 +229,7 @@ public synchronized void createInstance(Iterable "Failed to create instance " + instanceId + ".", e); } hasInstance = true; + this.clusters = clusters; LOG.info("Successfully created instance {}.", instanceId); } @@ -544,6 +551,32 @@ public synchronized ImmutableList readTable(String tableId, @Nullable Long return tableRows; } + /** Get all the cluster names of the current instance. */ + public List getClusterNames() { + return StreamSupport.stream(getClusters().spliterator(), false) + .map(BigtableResourceManagerCluster::clusterId) + .collect(Collectors.toList()); + } + + private Iterable getClusters() { + if (usingStaticInstance && this.clusters == null) { + try (BigtableInstanceAdminClient instanceAdminClient = + bigtableResourceManagerClientFactory.bigtableInstanceAdminClient()) { + List managedClusters = new ArrayList<>(); + for (Cluster cluster : instanceAdminClient.listClusters(instanceId)) { + managedClusters.add( + BigtableResourceManagerCluster.create( + cluster.getId(), + cluster.getZone(), + cluster.getServeNodes(), + cluster.getStorageType())); + } + this.clusters = managedClusters; + } + } + return this.clusters; + } + /** * Deletes all created resources (instance and tables) and cleans up all Bigtable clients, making * the manager object unusable. diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerUtils.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerUtils.java index eb2323e529744..a893493d766ea 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerUtils.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerUtils.java @@ -34,7 +34,7 @@ public final class BigtableResourceManagerUtils { private static final Pattern ILLEGAL_INSTANCE_ID_CHARS = Pattern.compile("[^a-z0-9-]"); private static final String REPLACE_INSTANCE_ID_CHAR = "-"; private static final int MIN_TABLE_ID_LENGTH = 1; - private static final int MAX_TABLE_ID_LENGTH = 30; + private static final int MAX_TABLE_ID_LENGTH = 40; private static final Pattern ILLEGAL_TABLE_CHARS = Pattern.compile("[^a-zA-Z0-9-_.]"); private static final String REPLACE_TABLE_ID_CHAR = "-"; diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/AbstractPipelineLauncher.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/AbstractPipelineLauncher.java index 08688d88b104d..b5c9535953b78 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/AbstractPipelineLauncher.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/AbstractPipelineLauncher.java @@ -58,6 +58,11 @@ public abstract class AbstractPipelineLauncher implements PipelineLauncher { private static final Logger LOG = LoggerFactory.getLogger(AbstractPipelineLauncher.class); private static final Pattern CURRENT_METRICS = Pattern.compile(".*Current.*"); + public static final String LEGACY_RUNNER = "Dataflow Legacy Runner"; + public static final String RUNNER_V2 = "Dataflow Runner V2"; + public static final String PARAM_RUNNER = "runner"; + public static final String PARAM_JOB_TYPE = "jobType"; + public static final String PARAM_JOB_ID = "jobId"; protected final List launchedJobs = new ArrayList<>(); @@ -244,12 +249,12 @@ protected JobState 
handleJobState(Job job) { */ protected LaunchInfo.Builder getJobInfoBuilder(LaunchConfig options, JobState state, Job job) { Map labels = job.getLabels(); - String runner = "Dataflow Legacy Runner"; + String runner = LEGACY_RUNNER; Environment environment = job.getEnvironment(); if (environment != null && environment.getExperiments() != null && environment.getExperiments().contains("use_runner_v2")) { - runner = "Dataflow Runner V2"; + runner = RUNNER_V2; } LaunchInfo.Builder builder = LaunchInfo.builder() @@ -266,6 +271,10 @@ protected LaunchInfo.Builder getJobInfoBuilder(LaunchConfig options, JobState st // tests Map parameters = new HashMap<>(options.parameters()); options.environment().forEach((key, val) -> parameters.put(key, val.toString())); + // attach basic job info to parameters so that these are exported for load tests + parameters.put(PARAM_RUNNER, runner); + parameters.put(PARAM_JOB_TYPE, job.getType()); + parameters.put(PARAM_JOB_ID, job.getId()); builder.setParameters(ImmutableMap.copyOf(parameters)); if (labels != null && !labels.isEmpty()) { // template job diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DefaultPipelineLauncher.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DefaultPipelineLauncher.java index 7918dd6227d9d..ad2dcafc007bd 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DefaultPipelineLauncher.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DefaultPipelineLauncher.java @@ -99,7 +99,7 @@ public class DefaultPipelineLauncher extends AbstractPipelineLauncher { .put(PipelineResult.State.UNRECOGNIZED, JobState.UNKNOWN) .build(); - private DefaultPipelineLauncher(Builder builder) { + private DefaultPipelineLauncher(DefaultPipelineLauncher.Builder builder) { super( new Dataflow( Utils.getDefaultTransport(), @@ -109,8 +109,8 @@ private DefaultPipelineLauncher(Builder builder) { : new HttpCredentialsAdapter(builder.getCredentials()))); } - public static Builder builder(Credentials credentials) { - return new Builder(credentials); + public static DefaultPipelineLauncher.Builder builder(Credentials credentials) { + return new DefaultPipelineLauncher.Builder(credentials); } @Override diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DirectRunnerClient.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DirectRunnerClient.java index 8017009ff3787..57f8ad40c1b6e 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DirectRunnerClient.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dataflow/DirectRunnerClient.java @@ -53,8 +53,8 @@ public class DirectRunnerClient implements PipelineLauncher { this.mainClass = builder.getMainClass(); } - public static Builder builder(Class mainClass) { - return new Builder(mainClass); + public static DirectRunnerClient.Builder builder(Class mainClass) { + return new DirectRunnerClient.Builder(mainClass); } @Override @@ -172,7 +172,7 @@ public Class getMainClass() { return mainClass; } - public Builder setCredentials(Credentials value) { + public DirectRunnerClient.Builder setCredentials(Credentials value) { credentials = value; return this; } diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datagenerator/DataGenerator.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datagenerator/DataGenerator.java index 99016b5dd3a46..832a75defd95b 100644 --- 
a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datagenerator/DataGenerator.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datagenerator/DataGenerator.java @@ -61,14 +61,16 @@ private DataGenerator(Builder builder) { .build(); } - public static Builder builderWithSchemaLocation(String testName, String schemaLocation) { - return new Builder(testName + "-data-generator") + public static DataGenerator.Builder builderWithSchemaLocation( + String testName, String schemaLocation) { + return new DataGenerator.Builder(testName + "-data-generator") .setSchemaLocation(schemaLocation) .setAutoscalingAlgorithm(AutoscalingAlgorithmType.THROUGHPUT_BASED); } - public static Builder builderWithSchemaTemplate(String testName, String schemaTemplate) { - return new Builder(testName + "-data-generator") + public static DataGenerator.Builder builderWithSchemaTemplate( + String testName, String schemaTemplate) { + return new DataGenerator.Builder(testName + "-data-generator") .setSchemaTemplate(schemaTemplate) .setAutoscalingAlgorithm(AutoscalingAlgorithmType.THROUGHPUT_BASED); } @@ -129,27 +131,27 @@ public Map getParameters() { return parameters; } - public Builder setSchemaTemplate(String value) { + public DataGenerator.Builder setSchemaTemplate(String value) { parameters.put("schemaTemplate", value); return this; } - public Builder setSchemaLocation(String value) { + public DataGenerator.Builder setSchemaLocation(String value) { parameters.put("schemaLocation", value); return this; } - public Builder setMessagesLimit(String value) { + public DataGenerator.Builder setMessagesLimit(String value) { parameters.put(MESSAGES_LIMIT, value); return this; } - public Builder setQPS(String value) { + public DataGenerator.Builder setQPS(String value) { parameters.put("qps", value); return this; } - public Builder setSinkType(String value) { + public DataGenerator.Builder setSinkType(String value) { parameters.put("sinkType", value); return this; } @@ -164,87 +166,87 @@ public Builder setNumWorkers(String value) { return this; } - public Builder setMaxNumWorkers(String value) { + public DataGenerator.Builder setMaxNumWorkers(String value) { parameters.put("maxNumWorkers", value); return this; } - public Builder setAutoscalingAlgorithm(AutoscalingAlgorithmType value) { + public DataGenerator.Builder setAutoscalingAlgorithm(AutoscalingAlgorithmType value) { parameters.put("autoscalingAlgorithm", value.toString()); return this; } - public Builder setOutputDirectory(String value) { + public DataGenerator.Builder setOutputDirectory(String value) { parameters.put("outputDirectory", value); return this; } - public Builder setOutputType(String value) { + public DataGenerator.Builder setOutputType(String value) { parameters.put("outputType", value); return this; } - public Builder setNumShards(String value) { + public DataGenerator.Builder setNumShards(String value) { parameters.put("numShards", value); return this; } - public Builder setAvroSchemaLocation(String value) { + public DataGenerator.Builder setAvroSchemaLocation(String value) { parameters.put("avroSchemaLocation", value); return this; } - public Builder setTopic(String value) { + public DataGenerator.Builder setTopic(String value) { parameters.put("topic", value); return this; } - public Builder setProjectId(String value) { + public DataGenerator.Builder setProjectId(String value) { parameters.put("projectId", value); return this; } - public Builder setSpannerInstanceName(String value) { + public DataGenerator.Builder 
setSpannerInstanceName(String value) { parameters.put("spannerInstanceName", value); return this; } - public Builder setSpannerDatabaseName(String value) { + public DataGenerator.Builder setSpannerDatabaseName(String value) { parameters.put("spannerDatabaseName", value); return this; } - public Builder setSpannerTableName(String value) { + public DataGenerator.Builder setSpannerTableName(String value) { parameters.put("spannerTableName", value); return this; } - public Builder setDriverClassName(String value) { + public DataGenerator.Builder setDriverClassName(String value) { parameters.put("driverClassName", value); return this; } - public Builder setConnectionUrl(String value) { + public DataGenerator.Builder setConnectionUrl(String value) { parameters.put("connectionUrl", value); return this; } - public Builder setUsername(String value) { + public DataGenerator.Builder setUsername(String value) { parameters.put("username", value); return this; } - public Builder setPassword(String value) { + public DataGenerator.Builder setPassword(String value) { parameters.put("password", value); return this; } - public Builder setConnectionProperties(String value) { + public DataGenerator.Builder setConnectionProperties(String value) { parameters.put("connectionProperties", value); return this; } - public Builder setStatement(String value) { + public DataGenerator.Builder setStatement(String value) { parameters.put("statement", value); return this; } diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datastore/matchers/DatastoreAsserts.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datastore/matchers/DatastoreAsserts.java index ef67a5a5c4fb0..78fa7543150fd 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datastore/matchers/DatastoreAsserts.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/datastore/matchers/DatastoreAsserts.java @@ -61,7 +61,8 @@ public static List> datastoreResultsToRecords(Collection results) { diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dlp/DlpResourceManager.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dlp/DlpResourceManager.java index f59794af3e1ff..de818a1bbff18 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dlp/DlpResourceManager.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/dlp/DlpResourceManager.java @@ -113,8 +113,9 @@ public void cleanupAll() { * @param project the GCP project ID * @return a new instance of Builder */ - public static Builder builder(String project, CredentialsProvider credentialsProvider) { - return new Builder(project, credentialsProvider); + public static DlpResourceManager.Builder builder( + String project, CredentialsProvider credentialsProvider) { + return new DlpResourceManager.Builder(project, credentialsProvider); } /** A builder class for creating instances of {@link DlpResourceManager}. 
*/ diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/kms/KMSResourceManager.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/kms/KMSResourceManager.java index 7e1a403c73525..2cad6d0b9faba 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/kms/KMSResourceManager.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/kms/KMSResourceManager.java @@ -72,8 +72,9 @@ private KMSResourceManager(Builder builder) { this.keyRing = null; } - public static Builder builder(String projectId, CredentialsProvider credentialsProvider) { - return new Builder(projectId, credentialsProvider); + public static KMSResourceManager.Builder builder( + String projectId, CredentialsProvider credentialsProvider) { + return new KMSResourceManager.Builder(projectId, credentialsProvider); } /** diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/monitoring/MonitoringClient.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/monitoring/MonitoringClient.java index 0fc5614a36300..06591ea4fe0ae 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/monitoring/MonitoringClient.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/monitoring/MonitoringClient.java @@ -150,8 +150,8 @@ public List listTimeSeriesAsLong(ListTimeSeriesRequest request) { Aggregation aggregation = Aggregation.newBuilder() .setAlignmentPeriod(Duration.newBuilder().setSeconds(60).build()) - .setPerSeriesAligner(Aligner.ALIGN_MEAN) - .setCrossSeriesReducer(Reducer.REDUCE_MEAN) + .setPerSeriesAligner(Aggregation.Aligner.ALIGN_MEAN) + .setCrossSeriesReducer(Aggregation.Reducer.REDUCE_MEAN) .addGroupByFields("resource.instance_id") .build(); ListTimeSeriesRequest request = @@ -188,7 +188,7 @@ public List listTimeSeriesAsLong(ListTimeSeriesRequest request) { Aggregation aggregation = Aggregation.newBuilder() .setAlignmentPeriod(Duration.newBuilder().setSeconds(60).build()) - .setPerSeriesAligner(Aligner.ALIGN_MEAN) + .setPerSeriesAligner(Aggregation.Aligner.ALIGN_MEAN) .setCrossSeriesReducer(Reducer.REDUCE_MAX) .build(); ListTimeSeriesRequest request = @@ -225,7 +225,7 @@ public List listTimeSeriesAsLong(ListTimeSeriesRequest request) { Aggregation aggregation = Aggregation.newBuilder() .setAlignmentPeriod(Duration.newBuilder().setSeconds(60).build()) - .setPerSeriesAligner(Aligner.ALIGN_MEAN) + .setPerSeriesAligner(Aggregation.Aligner.ALIGN_MEAN) .setCrossSeriesReducer(Reducer.REDUCE_MAX) .build(); ListTimeSeriesRequest request = @@ -269,7 +269,7 @@ public List listTimeSeriesAsLong(ListTimeSeriesRequest request) { Aggregation aggregation = Aggregation.newBuilder() .setAlignmentPeriod(Duration.newBuilder().setSeconds(60).build()) - .setPerSeriesAligner(Aligner.ALIGN_RATE) + .setPerSeriesAligner(Aggregation.Aligner.ALIGN_RATE) .build(); ListTimeSeriesRequest request = ListTimeSeriesRequest.newBuilder() @@ -312,7 +312,7 @@ public List listTimeSeriesAsLong(ListTimeSeriesRequest request) { Aggregation aggregation = Aggregation.newBuilder() .setAlignmentPeriod(Duration.newBuilder().setSeconds(60).build()) - .setPerSeriesAligner(Aligner.ALIGN_RATE) + .setPerSeriesAligner(Aggregation.Aligner.ALIGN_RATE) .build(); ListTimeSeriesRequest request = ListTimeSeriesRequest.newBuilder() diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/pubsub/PubsubResourceManager.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/pubsub/PubsubResourceManager.java index 
3a684d34c045f..738620c15b7ea 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/pubsub/PubsubResourceManager.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/pubsub/PubsubResourceManager.java @@ -20,6 +20,7 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import com.google.api.gax.core.CredentialsProvider; +import com.google.api.gax.rpc.DeadlineExceededException; import com.google.cloud.pubsub.v1.Publisher; import com.google.cloud.pubsub.v1.SchemaServiceClient; import com.google.cloud.pubsub.v1.SchemaServiceSettings; @@ -42,12 +43,16 @@ import com.google.pubsub.v1.Topic; import com.google.pubsub.v1.TopicName; import com.google.pubsub.v1.UpdateTopicRequest; +import dev.failsafe.Failsafe; +import dev.failsafe.RetryPolicy; import java.io.IOException; +import java.time.Duration; import java.util.Collections; import java.util.HashSet; import java.util.Map; import java.util.Set; import org.apache.beam.it.common.ResourceManager; +import org.apache.beam.it.common.utils.ExceptionUtils; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.slf4j.Logger; @@ -66,6 +71,12 @@ public final class PubsubResourceManager implements ResourceManager { private static final int DEFAULT_ACK_DEADLINE_SECONDS = 600; private static final String RESOURCE_NAME_SEPARATOR = "-"; + // Retry settings for client operations + private static final int FAILSAFE_MAX_RETRIES = 5; + private static final Duration FAILSAFE_RETRY_DELAY = Duration.ofSeconds(10); + private static final Duration FAILSAFE_RETRY_MAX_DELAY = Duration.ofSeconds(60); + private static final double FAILSAFE_RETRY_JITTER = 0.1; + private final String testId; private final String projectId; private final PubsubPublisherFactory publisherFactory; @@ -184,11 +195,14 @@ public SubscriptionName createSubscription(TopicName topicName, String subscript LOG.info("Creating subscription '{}' for topic '{}'", subscriptionName, topicName); Subscription subscription = - subscriptionAdminClient.createSubscription( - getSubscriptionName(subscriptionName), - topicName, - PushConfig.getDefaultInstance(), - DEFAULT_ACK_DEADLINE_SECONDS); + Failsafe.with(retryOnDeadlineExceeded()) + .get( + () -> + subscriptionAdminClient.createSubscription( + getSubscriptionName(subscriptionName), + topicName, + PushConfig.getDefaultInstance(), + DEFAULT_ACK_DEADLINE_SECONDS)); SubscriptionName reference = PubsubUtils.toSubscriptionName(subscription); createdSubscriptions.add(getSubscriptionName(subscriptionName)); @@ -299,17 +313,19 @@ public synchronized void cleanupAll() { try { for (SubscriptionName subscription : createdSubscriptions) { LOG.info("Deleting subscription '{}'", subscription); - subscriptionAdminClient.deleteSubscription(subscription); + Failsafe.with(retryOnDeadlineExceeded()) + .run(() -> subscriptionAdminClient.deleteSubscription(subscription)); } for (TopicName topic : createdTopics) { LOG.info("Deleting topic '{}'", topic); - topicAdminClient.deleteTopic(topic); + Failsafe.with(retryOnDeadlineExceeded()).run(() -> topicAdminClient.deleteTopic(topic)); } for (SchemaName schemaName : createdSchemas) { LOG.info("Deleting schema '{}'", schemaName); - schemaServiceClient.deleteSchema(schemaName); + Failsafe.with(retryOnDeadlineExceeded()) + .run(() -> schemaServiceClient.deleteSchema(schemaName)); } } finally { 
subscriptionAdminClient.close(); @@ -342,7 +358,8 @@ private void checkIsUsable() throws IllegalStateException { private TopicName createTopicInternal(TopicName topicName) { LOG.info("Creating topic '{}'...", topicName.toString()); - Topic topic = topicAdminClient.createTopic(topicName); + Topic topic = + Failsafe.with(retryOnDeadlineExceeded()).get(() -> topicAdminClient.createTopic(topicName)); TopicName reference = PubsubUtils.toTopicName(topic); createdTopics.add(reference); @@ -355,6 +372,16 @@ private boolean isNotUsable() { return topicAdminClient.isShutdown() || subscriptionAdminClient.isShutdown(); } + private static RetryPolicy retryOnDeadlineExceeded() { + return RetryPolicy.builder() + .handleIf( + exception -> ExceptionUtils.containsType(exception, DeadlineExceededException.class)) + .withMaxRetries(FAILSAFE_MAX_RETRIES) + .withBackoff(FAILSAFE_RETRY_DELAY, FAILSAFE_RETRY_MAX_DELAY) + .withJitter(FAILSAFE_RETRY_JITTER) + .build(); + } + /** Builder for {@link PubsubResourceManager}. */ public static final class Builder { diff --git a/it/google-cloud-platform/src/main/resources/test-artifact.json b/it/google-cloud-platform/src/main/resources/test-artifact.json new file mode 100644 index 0000000000000..551c80d14a660 --- /dev/null +++ b/it/google-cloud-platform/src/main/resources/test-artifact.json @@ -0,0 +1 @@ +["This is a test artifact."] \ No newline at end of file diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigquery/BigQueryIOLT.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigquery/BigQueryIOLT.java index 03f6e8abfd414..a9ae68142778e 100644 --- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigquery/BigQueryIOLT.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigquery/BigQueryIOLT.java @@ -99,12 +99,8 @@ public final class BigQueryIOLT extends IOLoadTestBase { private static final String READ_ELEMENT_METRIC_NAME = "read_count"; private Configuration configuration; private String tempLocation; - private TableSchema schema; - private static final String READ_PCOLLECTION = "Counting element.out0"; - private static final String WRITE_PCOLLECTION = "Map records.out0"; - @Rule public TestPipeline writePipeline = TestPipeline.create(); @Rule public TestPipeline readPipeline = TestPipeline.create(); @@ -268,7 +264,7 @@ private void testWrite(BigQueryIO.Write writeIO) throws IOException { .withCustomGcsTempLocation(ValueProvider.StaticValueProvider.of(tempLocation))); PipelineLauncher.LaunchConfig options = - PipelineLauncher.LaunchConfig.builder("test-bigquery-write") + PipelineLauncher.LaunchConfig.builder("write-bigquery") .setSdk(PipelineLauncher.Sdk.JAVA) .setPipeline(writePipeline) .addParameter("runner", configuration.runner) @@ -284,7 +280,10 @@ private void testWrite(BigQueryIO.Write writeIO) throws IOException { // export metrics MetricsConfiguration metricsConfig = - MetricsConfiguration.builder().setInputPCollection(WRITE_PCOLLECTION).build(); + MetricsConfiguration.builder() + .setInputPCollection("Map records.out0") + .setInputPCollectionV2("Map records/ParMultiDo(MapKVToV).out0") + .build(); try { exportMetricsToBigQuery(launchInfo, getMetrics(launchInfo, metricsConfig)); } catch (ParseException | InterruptedException e) { @@ -301,7 +300,7 @@ private void testRead() throws IOException { .apply("Counting element", ParDo.of(new CountingFn<>(READ_ELEMENT_METRIC_NAME))); PipelineLauncher.LaunchConfig options = - PipelineLauncher.LaunchConfig.builder("test-bigquery-read") + 
PipelineLauncher.LaunchConfig.builder("read-bigquery") .setSdk(PipelineLauncher.Sdk.JAVA) .setPipeline(readPipeline) .addParameter("runner", configuration.runner) @@ -326,7 +325,10 @@ private void testRead() throws IOException { // export metrics MetricsConfiguration metricsConfig = - MetricsConfiguration.builder().setOutputPCollection(READ_PCOLLECTION).build(); + MetricsConfiguration.builder() + .setOutputPCollection("Counting element.out0") + .setOutputPCollectionV2("Counting element/ParMultiDo(Counting).out0") + .build(); try { exportMetricsToBigQuery(launchInfo, getMetrics(launchInfo, metricsConfig)); } catch (ParseException | InterruptedException e) { diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigTableIOLT.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigTableIOLT.java index fc7bd87707fc7..e232ed31cb5a3 100644 --- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigTableIOLT.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigTableIOLT.java @@ -115,8 +115,6 @@ public void teardown() { /** Run integration test with configurations specified by TestProperties. */ @Test public void testWriteAndRead() throws IOException { - final String readPCollection = "Counting element.out0"; - final String writePCollection = "Map records.out0"; tableId = generateTableId(testName); resourceManager.createTable( @@ -149,8 +147,10 @@ public void testWriteAndRead() throws IOException { // export metrics MetricsConfiguration metricsConfig = MetricsConfiguration.builder() - .setInputPCollection(writePCollection) - .setOutputPCollection(readPCollection) + .setInputPCollection("Map records.out0") + .setInputPCollectionV2("Map records/ParMultiDo(MapToBigTableFormat).out0") + .setOutputPCollection("Counting element.out0") + .setOutputPCollectionV2("Counting element/ParMultiDo(Counting).out0") .build(); try { exportMetricsToBigQuery(writeInfo, getMetrics(writeInfo, metricsConfig)); @@ -174,7 +174,7 @@ private PipelineLauncher.LaunchInfo testWrite() throws IOException { .apply("Write to BigTable", writeIO); PipelineLauncher.LaunchConfig options = - PipelineLauncher.LaunchConfig.builder("test-bigtable-write") + PipelineLauncher.LaunchConfig.builder("write-bigtable") .setSdk(PipelineLauncher.Sdk.JAVA) .setPipeline(writePipeline) .addParameter("runner", configuration.getRunner()) @@ -196,7 +196,7 @@ private PipelineLauncher.LaunchInfo testRead() throws IOException { .apply("Counting element", ParDo.of(new CountingFn<>(READ_ELEMENT_METRIC_NAME))); PipelineLauncher.LaunchConfig options = - PipelineLauncher.LaunchConfig.builder("test-bigtable-read") + PipelineLauncher.LaunchConfig.builder("read-bigtable") .setSdk(PipelineLauncher.Sdk.JAVA) .setPipeline(readPipeline) .addParameter("runner", configuration.getRunner()) @@ -227,18 +227,18 @@ static Configuration of(long numRows, int pipelineTimeout, String runner, int va @AutoValue.Builder abstract static class Builder { - abstract Builder setNumRows(long numRows); + abstract Configuration.Builder setNumRows(long numRows); - abstract Builder setPipelineTimeout(int timeOutMinutes); + abstract Configuration.Builder setPipelineTimeout(int timeOutMinutes); - abstract Builder setRunner(String runner); + abstract Configuration.Builder setRunner(String runner); - abstract Builder setValueSizeBytes(int valueSizeBytes); + abstract Configuration.Builder setValueSizeBytes(int valueSizeBytes); abstract Configuration build(); } - abstract Builder toBuilder(); 
+ abstract Configuration.Builder toBuilder(); } /** Maps long number to the BigTable format record. */ diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerTest.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerTest.java index 65745aea49be5..f8673ed696ccc 100644 --- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerTest.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerTest.java @@ -442,6 +442,7 @@ public void testCleanupAllShouldWorkWhenBigtableDoesNotThrowAnyError() { setupReadyTable(); testManager.createTable(TABLE_ID, ImmutableList.of("cf1")); + when(bigtableResourceManagerClientFactory.bigtableTableAdminClient().exists(anyString())) .thenReturn(true); testManager.readTable(TABLE_ID); diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/dataflow/ClassicTemplateClientTest.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/dataflow/ClassicTemplateClientTest.java index cfd56e596e525..88c35589f2be8 100644 --- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/dataflow/ClassicTemplateClientTest.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/dataflow/ClassicTemplateClientTest.java @@ -18,6 +18,10 @@ package org.apache.beam.it.gcp.dataflow; import static com.google.common.truth.Truth.assertThat; +import static org.apache.beam.it.gcp.dataflow.AbstractPipelineLauncher.LEGACY_RUNNER; +import static org.apache.beam.it.gcp.dataflow.AbstractPipelineLauncher.PARAM_JOB_ID; +import static org.apache.beam.it.gcp.dataflow.AbstractPipelineLauncher.PARAM_JOB_TYPE; +import static org.apache.beam.it.gcp.dataflow.AbstractPipelineLauncher.PARAM_RUNNER; import static org.junit.Assert.assertThrows; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; @@ -153,8 +157,14 @@ public void testLaunchNewJob() throws IOException { .setSdk("Apache Beam Java") .setVersion("2.42.0") .setJobType("JOB_TYPE_BATCH") - .setRunner("Dataflow Legacy Runner") - .setParameters(ImmutableMap.of(PARAM_KEY, PARAM_VALUE)) + .setRunner(AbstractPipelineLauncher.LEGACY_RUNNER) + .setParameters( + ImmutableMap.builder() + .put(PARAM_KEY, PARAM_VALUE) + .put(PARAM_JOB_ID, JOB_ID) + .put(PARAM_RUNNER, LEGACY_RUNNER) + .put(PARAM_JOB_TYPE, "JOB_TYPE_BATCH") + .build()) .build(); assertThat(actual).isEqualTo(expected); } diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/dataflow/FlexTemplateClientTest.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/dataflow/FlexTemplateClientTest.java index 4088efe675143..06f44437414a3 100644 --- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/dataflow/FlexTemplateClientTest.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/dataflow/FlexTemplateClientTest.java @@ -18,6 +18,10 @@ package org.apache.beam.it.gcp.dataflow; import static com.google.common.truth.Truth.assertThat; +import static org.apache.beam.it.gcp.dataflow.AbstractPipelineLauncher.LEGACY_RUNNER; +import static org.apache.beam.it.gcp.dataflow.AbstractPipelineLauncher.PARAM_JOB_ID; +import static org.apache.beam.it.gcp.dataflow.AbstractPipelineLauncher.PARAM_JOB_TYPE; +import static org.apache.beam.it.gcp.dataflow.AbstractPipelineLauncher.PARAM_RUNNER; import static org.junit.Assert.assertThrows; import static org.mockito.ArgumentMatchers.any; import 
static org.mockito.Mockito.mock; @@ -158,8 +162,14 @@ public void testLaunchNewJob() throws IOException { .setSdk("Apache Beam Java") .setVersion("2.42.0") .setJobType("JOB_TYPE_BATCH") - .setRunner("Dataflow Legacy Runner") - .setParameters(ImmutableMap.of(PARAM_KEY, PARAM_VALUE)) + .setRunner(LEGACY_RUNNER) + .setParameters( + ImmutableMap.builder() + .put(PARAM_KEY, PARAM_VALUE) + .put(PARAM_JOB_ID, JOB_ID) + .put(PARAM_RUNNER, LEGACY_RUNNER) + .put(PARAM_JOB_TYPE, "JOB_TYPE_BATCH") + .build()) .build(); assertThat(actual).isEqualTo(expected); } diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/storage/FileBasedIOLT.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/storage/FileBasedIOLT.java index fd1bc1772f2db..704f8337c66ff 100644 --- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/storage/FileBasedIOLT.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/storage/FileBasedIOLT.java @@ -90,7 +90,7 @@ public class FileBasedIOLT extends IOLoadTestBase { @Rule public TestPipeline readPipeline = TestPipeline.create(); - private static final Map TEST_CONFIGS_PRESET; + private static final Map TEST_CONFIGS_PRESET; static { try { @@ -160,8 +160,6 @@ public void setup() { @Test public void testTextIOWriteThenRead() throws IOException { - final String readPCollection = "Counting element.out0"; - final String writePCollection = "Map records.out0"; TextIO.TypedWrite write = TextIO.write() @@ -182,7 +180,7 @@ public void testTextIOWriteThenRead() throws IOException { .apply("Counting element", ParDo.of(new CountingFn<>(READ_ELEMENT_METRIC_NAME))); PipelineLauncher.LaunchConfig writeOptions = - PipelineLauncher.LaunchConfig.builder("test-textio-write") + PipelineLauncher.LaunchConfig.builder("write-textio") .setSdk(PipelineLauncher.Sdk.JAVA) .setPipeline(writePipeline) .addParameter("runner", configuration.runner) @@ -196,7 +194,7 @@ public void testTextIOWriteThenRead() throws IOException { assertThatResult(writeResult).isLaunchFinished(); PipelineLauncher.LaunchConfig readOptions = - PipelineLauncher.LaunchConfig.builder("test-textio-read") + PipelineLauncher.LaunchConfig.builder("read-textio") .setSdk(PipelineLauncher.Sdk.JAVA) .setPipeline(readPipeline) .addParameter("runner", configuration.runner) @@ -222,8 +220,10 @@ public void testTextIOWriteThenRead() throws IOException { // export metrics MetricsConfiguration metricsConfig = MetricsConfiguration.builder() - .setInputPCollection(writePCollection) - .setOutputPCollection(readPCollection) + .setInputPCollection("Map records.out0") + .setInputPCollectionV2("Map records/ParMultiDo(MapKVToString).out0") + .setOutputPCollection("Counting element.out0") + .setOutputPCollectionV2("Counting element/ParMultiDo(Counting).out0") .build(); try { exportMetricsToBigQuery(writeInfo, getMetrics(writeInfo, metricsConfig)); diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/storage/GcsResourceManagerTest.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/storage/GcsResourceManagerTest.java index 3ec96da810073..0153573feaed3 100644 --- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/storage/GcsResourceManagerTest.java +++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/storage/GcsResourceManagerTest.java @@ -71,7 +71,7 @@ public final class GcsResourceManagerTest { @Mock private Blob blob; private GcsResourceManager gcsClient; - private static final String ARTIFACT_NAME = "test-artifact.txt"; + private 
static final String ARTIFACT_NAME = "test-artifact.json"; private static final Path LOCAL_PATH; private static final byte[] TEST_ARTIFACT_CONTENTS; diff --git a/it/google-cloud-platform/src/test/resources/test-artifact.txt b/it/google-cloud-platform/src/test/resources/test-artifact.txt deleted file mode 100644 index 22c4e1d122a70..0000000000000 --- a/it/google-cloud-platform/src/test/resources/test-artifact.txt +++ /dev/null @@ -1 +0,0 @@ -This is a test artifact. \ No newline at end of file diff --git a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MSSQLResourceManager.java b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MSSQLResourceManager.java index 0bcb16c610952..c515b2c4844f7 100644 --- a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MSSQLResourceManager.java +++ b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MSSQLResourceManager.java @@ -61,13 +61,14 @@ private MSSQLResourceManager(Builder builder) { } @VisibleForTesting - > MSSQLResourceManager(T container, Builder builder) { + > MSSQLResourceManager( + T container, Builder builder) { super(container, builder); initialized = true; } - public static Builder builder(String testId) { - return new Builder(testId); + public static MSSQLResourceManager.Builder builder(String testId) { + return new MSSQLResourceManager.Builder(testId); } private synchronized void createDatabase(String databaseName) { diff --git a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MySQLResourceManager.java b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MySQLResourceManager.java index e1bf3640b53d8..688c26dfb56da 100644 --- a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MySQLResourceManager.java +++ b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/MySQLResourceManager.java @@ -49,8 +49,8 @@ private MySQLResourceManager(Builder builder) { super(container, builder); } - public static Builder builder(String testId) { - return new Builder(testId); + public static MySQLResourceManager.Builder builder(String testId) { + return new MySQLResourceManager.Builder(testId); } @Override diff --git a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/OracleResourceManager.java b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/OracleResourceManager.java index f44e939936d28..8054d26c33f70 100644 --- a/it/jdbc/src/main/java/org/apache/beam/it/jdbc/OracleResourceManager.java +++ b/it/jdbc/src/main/java/org/apache/beam/it/jdbc/OracleResourceManager.java @@ -45,7 +45,7 @@ public class OracleResourceManager extends AbstractJDBCResourceManager( DockerImageName.parse(builder.containerImageName).withTag(builder.containerImageTag)), @@ -46,12 +46,13 @@ private PostgresResourceManager(Builder builder) { } @VisibleForTesting - PostgresResourceManager(PostgreSQLContainer container, Builder builder) { + PostgresResourceManager( + PostgreSQLContainer container, PostgresResourceManager.Builder builder) { super(container, builder); } - public static Builder builder(String testId) { - return new Builder(testId); + public static PostgresResourceManager.Builder builder(String testId) { + return new PostgresResourceManager.Builder(testId); } @Override diff --git a/it/kafka/src/main/java/org/apache/beam/it/kafka/KafkaResourceManager.java b/it/kafka/src/main/java/org/apache/beam/it/kafka/KafkaResourceManager.java index d9a647dbeebdd..7f7fb5b695698 100644 --- a/it/kafka/src/main/java/org/apache/beam/it/kafka/KafkaResourceManager.java +++ b/it/kafka/src/main/java/org/apache/beam/it/kafka/KafkaResourceManager.java @@ -71,13 +71,16 @@ public class KafkaResourceManager extends TestContainerResourceManager 0; 
@@ -102,8 +105,8 @@ private KafkaResourceManager(Builder builder) { : AdminClient.create(ImmutableMap.of("bootstrap.servers", this.connectionString)); } - public static Builder builder(String testId) { - return new Builder(testId); + public static KafkaResourceManager.Builder builder(String testId) { + return new KafkaResourceManager.Builder(testId); } /** Returns the kafka bootstrap server connection string. */ diff --git a/it/kafka/src/test/java/org/apache/beam/it/kafka/KafkaIOLT.java b/it/kafka/src/test/java/org/apache/beam/it/kafka/KafkaIOLT.java index a03030664de4c..ce6ad877c375f 100644 --- a/it/kafka/src/test/java/org/apache/beam/it/kafka/KafkaIOLT.java +++ b/it/kafka/src/test/java/org/apache/beam/it/kafka/KafkaIOLT.java @@ -175,7 +175,7 @@ private PipelineLauncher.LaunchInfo testWrite() throws IOException { .apply("Write to Kafka", writeIO.withTopic(kafkaTopic)); PipelineLauncher.LaunchConfig options = - PipelineLauncher.LaunchConfig.builder("test-kafka-write") + PipelineLauncher.LaunchConfig.builder("write-kafka") .setSdk(PipelineLauncher.Sdk.JAVA) .setPipeline(writePipeline) .addParameter("runner", configuration.getRunner()) @@ -195,7 +195,7 @@ private PipelineLauncher.LaunchInfo testRead() throws IOException { .apply("Counting element", ParDo.of(new CountingFn<>(READ_ELEMENT_METRIC_NAME))); PipelineLauncher.LaunchConfig options = - PipelineLauncher.LaunchConfig.builder("test-kafka-read") + PipelineLauncher.LaunchConfig.builder("read-kafka") .setSdk(PipelineLauncher.Sdk.JAVA) .setPipeline(readPipeline) .addParameter("runner", configuration.getRunner()) diff --git a/it/mongodb/src/main/java/org/apache/beam/it/mongodb/MongoDBResourceManager.java b/it/mongodb/src/main/java/org/apache/beam/it/mongodb/MongoDBResourceManager.java index ed0e556bf0df4..80216b14ac0e6 100644 --- a/it/mongodb/src/main/java/org/apache/beam/it/mongodb/MongoDBResourceManager.java +++ b/it/mongodb/src/main/java/org/apache/beam/it/mongodb/MongoDBResourceManager.java @@ -69,7 +69,7 @@ public class MongoDBResourceManager extends TestContainerResourceManager( @@ -79,7 +79,10 @@ private Neo4jResourceManager(Builder builder) { @VisibleForTesting @SuppressWarnings("nullness") - Neo4jResourceManager(@Nullable Driver neo4jDriver, Neo4jContainer container, Builder builder) { + Neo4jResourceManager( + @Nullable Driver neo4jDriver, + Neo4jContainer container, + Neo4jResourceManager.Builder builder) { super(container, builder); this.adminPassword = builder.adminPassword; @@ -98,8 +101,8 @@ private Neo4jResourceManager(Builder builder) { } } - public static Builder builder(String testId) { - return new Builder(testId); + public static Neo4jResourceManager.Builder builder(String testId) { + return new Neo4jResourceManager.Builder(testId); } /** Returns the URI connection string to the Neo4j Database. */ diff --git a/it/splunk/src/main/java/org/apache/beam/it/splunk/SplunkResourceManager.java b/it/splunk/src/main/java/org/apache/beam/it/splunk/SplunkResourceManager.java index 0115a791eefe6..1ef4726df43aa 100644 --- a/it/splunk/src/main/java/org/apache/beam/it/splunk/SplunkResourceManager.java +++ b/it/splunk/src/main/java/org/apache/beam/it/splunk/SplunkResourceManager.java @@ -85,7 +85,7 @@ public class SplunkResourceManager extends TestContainerResourceManagerOptionally, a static resource can be specified by calling the useStaticContainer() method in - * the {@link Builder} class. A static resource is a pre-configured database or other resource that - * is ready to be connected to by the resource manager. 
This could be a pre-existing TestContainer - * that has not been closed, a local database instance, a remote VM, or any other source that can be - * connected to. If a static container is used, the host and port must also be configured using the - * Builder's setHost() and setPort() methods, respectively. + * the {@link TestContainerResourceManager.Builder} class. A static resource is a pre-configured + * database or other resource that is ready to be connected to by the resource manager. This could + * be a pre-existing TestContainer that has not been closed, a local database instance, a remote VM, + * or any other source that can be connected to. If a static container is used, the host and port + * must also be configured using the Builder's setHost() and setPort() methods, respectively. */ public abstract class TestContainerResourceManager> implements ResourceManager { @@ -48,11 +48,12 @@ public abstract class TestContainerResourceManager private final String host; protected int port; - protected > TestContainerResourceManager(T container, B builder) { + protected > TestContainerResourceManager( + T container, B builder) { this(container, builder, null); } - protected > TestContainerResourceManager( + protected > TestContainerResourceManager( T container, B builder, @Nullable Callable setup) { this.container = container; this.usingStaticContainer = builder.useStaticContainer; diff --git a/it/truthmatchers/src/main/java/org/apache/beam/it/truthmatchers/LaunchInfoSubject.java b/it/truthmatchers/src/main/java/org/apache/beam/it/truthmatchers/LaunchInfoSubject.java index a496ecce94483..30a27c9ad2592 100644 --- a/it/truthmatchers/src/main/java/org/apache/beam/it/truthmatchers/LaunchInfoSubject.java +++ b/it/truthmatchers/src/main/java/org/apache/beam/it/truthmatchers/LaunchInfoSubject.java @@ -43,7 +43,7 @@ public static Factory launchInfo() { } /** - * Check if the subject reflects succeeded states. A successfully {@link LaunchInfo} does not mean + * Check if the subject reflects succeeded states. A successful {@link LaunchInfo} does not mean * that the pipeline finished and no errors happened, it just means that the job was able to get * itself into an active state (RUNNING, UPDATED). */ diff --git a/it/truthmatchers/src/main/java/org/apache/beam/it/truthmatchers/RecordsSubject.java b/it/truthmatchers/src/main/java/org/apache/beam/it/truthmatchers/RecordsSubject.java index 75d5ce3a67cd5..39a0c0cebedcd 100644 --- a/it/truthmatchers/src/main/java/org/apache/beam/it/truthmatchers/RecordsSubject.java +++ b/it/truthmatchers/src/main/java/org/apache/beam/it/truthmatchers/RecordsSubject.java @@ -81,7 +81,7 @@ public void hasRecordSubset(Map subset) { Map expected = convertMapToTreeMap(subset); for (Map candidate : actual) { boolean match = true; - for (Entry entry : subset.entrySet()) { + for (Map.Entry entry : subset.entrySet()) { if (!candidate.containsKey(entry.getKey()) || !candidate.get(entry.getKey()).equals(entry.getValue())) { match = false; From 93de970ea664d4219d1658c05f220ee747709e1a Mon Sep 17 00:00:00 2001 From: "gabry.wu" Date: Wed, 20 Sep 2023 23:56:01 +0800 Subject: [PATCH 12/34] add setJoinSubsetType to inject joinSubsetType to BeamSqlSeekableTable (#28477) * add setJoinSubsetType to inject joinSubsetType to BeamSqlSeekableTable * 1. delete setJoinSubsetType 2. 
move joinSubsetType to setUp method * add comments to Breaking Changes section * Update sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/BeamSqlSeekableTable.java --- CHANGES.md | 1 + .../beam/sdk/extensions/sql/BeamSqlSeekableTable.java | 9 +++++++-- .../sql/impl/transform/BeamJoinTransforms.java | 2 +- .../sql/impl/rel/BeamSideInputLookupJoinRelTest.java | 9 +++++++++ 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index bbe9d539531b7..a990a5fd73043 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -75,6 +75,7 @@ * Removed fastjson library dependency for Beam SQL. Table property is changed to be based on jackson ObjectNode (Java) ([#24154](https://github.com/apache/beam/issues/24154)). * Removed TensorFlow from Beam Python container images [PR](https://github.com/apache/beam/pull/28424). If you have been negatively affected by this change, please comment on [#20605](https://github.com/apache/beam/issues/20605). * Removed the parameter `t reflect.Type` from `parquetio.Write`. The element type is derived from the input PCollection (Go) ([#28490](https://github.com/apache/beam/issues/28490)) +* Refactor BeamSqlSeekableTable.setUp adding a parameter joinSubsetType. [#28283](https://github.com/apache/beam/issues/28283) ## Deprecations diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/BeamSqlSeekableTable.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/BeamSqlSeekableTable.java index 7b924cf6b6da2..4dc9bd5777ff6 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/BeamSqlSeekableTable.java +++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/BeamSqlSeekableTable.java @@ -20,6 +20,7 @@ import java.io.Serializable; import java.util.List; import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.values.Row; @@ -28,8 +29,12 @@ * FROM FACT_TABLE JOIN LOOKUP_TABLE ON ...}. */ public interface BeamSqlSeekableTable extends Serializable { - /** prepare the instance. */ - default void setUp() {} + /** + * prepare the instance. 
+ * + * @param joinSubsetType joining subset schema + */ + default void setUp(Schema joinSubsetType) {} default void startBundle( DoFn.StartBundleContext context, PipelineOptions pipelineOptions) {} diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/transform/BeamJoinTransforms.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/transform/BeamJoinTransforms.java index e4d62c2b5de7e..d25f98729bd4c 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/transform/BeamJoinTransforms.java +++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/transform/BeamJoinTransforms.java @@ -153,7 +153,7 @@ public PCollection expand(PCollection input) { new DoFn() { @Setup public void setup() { - seekableTable.setUp(); + seekableTable.setUp(joinSubsetType); } @StartBundle diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamSideInputLookupJoinRelTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamSideInputLookupJoinRelTest.java index 2e2971ebd6e95..b5fd03045cbc1 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamSideInputLookupJoinRelTest.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamSideInputLookupJoinRelTest.java @@ -34,6 +34,7 @@ import org.apache.beam.sdk.values.POutput; import org.apache.beam.sdk.values.Row; import org.hamcrest.core.StringContains; +import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; @@ -47,11 +48,18 @@ public class BeamSideInputLookupJoinRelTest extends BaseRelTest { /** Test table for JOIN-AS-LOOKUP. 
*/ public static class SiteLookupTable extends SchemaBaseBeamTable implements BeamSqlSeekableTable { + private Schema joinSubsetType; public SiteLookupTable(Schema schema) { super(schema); } + @Override + public void setUp(Schema joinSubsetType) { + this.joinSubsetType = joinSubsetType; + Assert.assertNotNull(joinSubsetType); + } + @Override public PCollection.IsBounded isBounded() { return PCollection.IsBounded.BOUNDED; @@ -69,6 +77,7 @@ public POutput buildIOWriter(PCollection input) { @Override public List seekRow(Row lookupSubRow) { + Assert.assertEquals(joinSubsetType, lookupSubRow.getSchema()); if (lookupSubRow.getInt32("site_id") == 2) { return Arrays.asList(Row.withSchema(getSchema()).addValues(2, "SITE1").build()); } From f676d93030cf9d0c849337f1e3a4efff1d8f2509 Mon Sep 17 00:00:00 2001 From: caneff Date: Wed, 20 Sep 2023 12:34:41 -0400 Subject: [PATCH 13/34] When comparing Series, sort the values in Dataframe tests (#28557) --- sdks/python/apache_beam/dataframe/frames_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py index 4998683461b9b..30d9924805153 100644 --- a/sdks/python/apache_beam/dataframe/frames_test.py +++ b/sdks/python/apache_beam/dataframe/frames_test.py @@ -193,6 +193,9 @@ def _run_test( if expected.index.is_unique: expected = expected.sort_index() actual = actual.sort_index() + elif isinstance(expected, pd.Series): + expected = expected.sort_values() + actual = actual.sort_values() else: expected = expected.sort_values(list(expected.columns)) actual = actual.sort_values(list(actual.columns)) From ef0d8d4041cd55f153ebb3503486fb8d558e812a Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com> Date: Wed, 20 Sep 2023 17:30:55 +0000 Subject: [PATCH 14/34] Label Python external SchemaTransform with its URN (#28540) --- sdks/python/apache_beam/transforms/external.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/transforms/external.py b/sdks/python/apache_beam/transforms/external.py index 4b8e708bfc5cd..44bf2398a6dd4 100644 --- a/sdks/python/apache_beam/transforms/external.py +++ b/sdks/python/apache_beam/transforms/external.py @@ -185,6 +185,14 @@ def __init__(self, identifier, **kwargs): self._identifier = identifier self._kwargs = kwargs + def identifier(self): + """ + The URN referencing this SchemaTransform + + :return: str + """ + return self._identifier + def build(self): schema_proto, payload = self._get_schema_proto_and_payload(**self._kwargs) payload = external_transforms_pb2.SchemaTransformPayload( @@ -194,7 +202,7 @@ def build(self): return payload -class ExplicitSchemaTransformPayloadBuilder(PayloadBuilder): +class ExplicitSchemaTransformPayloadBuilder(SchemaTransformPayloadBuilder): def __init__(self, identifier, schema_proto, **kwargs): self._identifier = identifier self._schema_proto = schema_proto @@ -414,7 +422,7 @@ def __init__( def expand(self, pcolls): # Expand the transform using the expansion service. 
- return pcolls | ExternalTransform( + return pcolls | self._payload_builder.identifier() >> ExternalTransform( common_urns.schematransform_based_expand.urn, self._payload_builder, self._expansion_service) From 955cd8920f4ce28964a2f6ab54b5a4a705531062 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 13:52:32 -0400 Subject: [PATCH 15/34] Bump actions/checkout from 3 to 4 (#28552) Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml | 2 +- .github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml | 2 +- .../workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml | 2 +- .github/workflows/beam_PostCommit_Java_Nexmark_Direct.yml | 2 +- .github/workflows/beam_PostCommit_Java_Nexmark_Flink.yml | 2 +- .github/workflows/beam_PostCommit_Java_Nexmark_Spark.yml | 2 +- .github/workflows/beam_PostCommit_Javadoc.yml | 2 +- .github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml b/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml index cf51623070784..ef90fbad5bf09 100644 --- a/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml +++ b/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow.yml @@ -90,7 +90,7 @@ jobs: github.event_name == 'schedule' || github.event.comment.body == 'Run Dataflow Runner Nexmark Tests' steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup repository uses: ./.github/actions/setup-action with: diff --git a/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml b/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml index 0ee017ecce220..3eb93e6687f88 100644 --- a/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml +++ b/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2.yml @@ -90,7 +90,7 @@ jobs: github.event_name == 'schedule' || github.event.comment.body == 'Run Dataflow Runner V2 Nexmark Tests' steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup repository uses: ./.github/actions/setup-action with: diff --git a/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml b/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml index 001e37775048a..06438510400bd 100644 --- a/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml +++ b/.github/workflows/beam_PostCommit_Java_Nexmark_Dataflow_V2_Java.yml @@ -92,7 +92,7 @@ jobs: (contains(github.event.comment.body, 'Run Dataflow Runner V2 Java') && contains(github.event.comment.body, 'Nexmark Tests')) steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup repository uses: ./.github/actions/setup-action with: diff --git a/.github/workflows/beam_PostCommit_Java_Nexmark_Direct.yml b/.github/workflows/beam_PostCommit_Java_Nexmark_Direct.yml index 00727ba64d489..2386d7e26f380 100644 --- 
a/.github/workflows/beam_PostCommit_Java_Nexmark_Direct.yml
+++ b/.github/workflows/beam_PostCommit_Java_Nexmark_Direct.yml
@@ -85,7 +85,7 @@ jobs:
       github.event_name == 'schedule' ||
       github.event.comment.body == 'Run Direct Runner Nexmark Tests'
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Setup repository
         uses: ./.github/actions/setup-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_Nexmark_Flink.yml b/.github/workflows/beam_PostCommit_Java_Nexmark_Flink.yml
index fce3ee045065c..9123c90796052 100644
--- a/.github/workflows/beam_PostCommit_Java_Nexmark_Flink.yml
+++ b/.github/workflows/beam_PostCommit_Java_Nexmark_Flink.yml
@@ -84,7 +84,7 @@ jobs:
       github.event_name == 'schedule' ||
       github.event.comment.body == 'Run Flink Runner Nexmark Tests'
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Setup repository
         uses: ./.github/actions/setup-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Java_Nexmark_Spark.yml b/.github/workflows/beam_PostCommit_Java_Nexmark_Spark.yml
index a3735a00738fa..7492eb9b82621 100644
--- a/.github/workflows/beam_PostCommit_Java_Nexmark_Spark.yml
+++ b/.github/workflows/beam_PostCommit_Java_Nexmark_Spark.yml
@@ -84,7 +84,7 @@ jobs:
       github.event_name == 'schedule' ||
       github.event.comment.body == 'Run Spark Runner Nexmark Tests'
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Setup repository
         uses: ./.github/actions/setup-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Javadoc.yml b/.github/workflows/beam_PostCommit_Javadoc.yml
index f60f9dafd535c..240b0e43d2716 100644
--- a/.github/workflows/beam_PostCommit_Javadoc.yml
+++ b/.github/workflows/beam_PostCommit_Javadoc.yml
@@ -64,7 +64,7 @@ jobs:
       github.event_name == 'schedule' ||
       github.event.comment.body == 'Run Javadoc PostCommit'
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - name: Setup repository
         uses: ./.github/actions/setup-action
         with:
diff --git a/.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml b/.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml
index a493d6a1656bb..81c9b4a8b4848 100644
--- a/.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml
+++ b/.github/workflows/beam_PostCommit_Python_Nexmark_Direct.yml
@@ -107,7 +107,7 @@ jobs:
       github.event_name == 'schedule' ||
       github.event.comment.body == 'Run Python Direct Runner Nexmark Tests'
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Setup repository
         uses: ./.github/actions/setup-action
         with:

From 3b7e1dcc4e621f977eab9d19dae739328f9169c0 Mon Sep 17 00:00:00 2001
From: Robert Bradshaw
Date: Wed, 20 Sep 2023 10:53:40 -0700
Subject: [PATCH 16/34] Typescript changes for Beam 2.51.0 (#28553)

* Revert dependabot changes.

If we upgrade proto, we need to regenerate the protos.

* Add needed dataflow experiments.

This is probably related to BEAM-28399; best to fix here to be explicit.

* Pin to patch version.
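
A minimal sketch of the resulting usage (the import path and option names
below are illustrative assumptions, not taken from this diff; only the
`dataflowRunner` entry point itself appears in the change):

    import { dataflowRunner } from "apache-beam/runners/dataflow";

    // "use_runner_v2" and "use_portable_job_submission" are now appended
    // to the experiments list automatically, so callers only supply their
    // own options when constructing the runner.
    const runner = dataflowRunner({
      project: "my-project",          // illustrative values
      region: "us-central1",
      tempLocation: "gs://my-bucket/tmp",
    });

Pinning "@grpc/grpc-js" to ~1.4.6 and "protobufjs" to ~6.11.3 keeps the
checked-in generated protos compatible until they are regenerated against
a newer protobuf release.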
--- sdks/typescript/package-lock.json | 238 ++---------------- sdks/typescript/package.json | 4 +- .../src/apache_beam/runners/dataflow.ts | 2 + 3 files changed, 22 insertions(+), 222 deletions(-) diff --git a/sdks/typescript/package-lock.json b/sdks/typescript/package-lock.json index 51a10d2b4a8fd..e4556449fde47 100644 --- a/sdks/typescript/package-lock.json +++ b/sdks/typescript/package-lock.json @@ -1,15 +1,15 @@ { "name": "apache-beam", - "version": "2.50.0-SNAPSHOT", + "version": "2.50.0", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "apache-beam", - "version": "2.50.0-SNAPSHOT", + "version": "2.50.0", "dependencies": { "@google-cloud/pubsub": "^2.19.4", - "@grpc/grpc-js": "^1.8.8", + "@grpc/grpc-js": "^1.4.6", "@protobuf-ts/grpc-transport": "^2.1.0", "@protobuf-ts/plugin": "^2.1.0", "bson": "^4.6.0", @@ -19,7 +19,7 @@ "fast-deep-equal": "^3.1.3", "find-git-root": "^1.0.4", "long": "^4.0.0", - "protobufjs": "^7.2.4", + "protobufjs": "^6.11.3", "queue-typescript": "^1.0.1", "serialize-closures": "^0.2.7", "ts-closure-transform": "^0.1.7", @@ -190,73 +190,17 @@ } }, "node_modules/@grpc/grpc-js": { - "version": "1.8.8", - "resolved": "https://registry.npmjs.org/@grpc/grpc-js/-/grpc-js-1.8.8.tgz", - "integrity": "sha512-4gfDqMLXTrorvYTKA1jL22zLvVwiHJ73t6Re1OHwdCFRjdGTDOVtSJuaWhtHaivyeDGg0LeCkmU77MTKoV3wPA==", + "version": "1.4.6", + "resolved": "https://registry.npmjs.org/@grpc/grpc-js/-/grpc-js-1.4.6.tgz", + "integrity": "sha512-Byau4xiXfIixb1PnW30V/P9mkrZ05lknyNqiK+cVY9J5hj3gecxd/anwaUbAM8j834zg1x78NvAbwGnMfWEu7A==", "dependencies": { - "@grpc/proto-loader": "^0.7.0", + "@grpc/proto-loader": "^0.6.4", "@types/node": ">=12.12.47" }, "engines": { "node": "^8.13.0 || >=10.10.0" } }, - "node_modules/@grpc/grpc-js/node_modules/@grpc/proto-loader": { - "version": "0.7.7", - "resolved": "https://registry.npmjs.org/@grpc/proto-loader/-/proto-loader-0.7.7.tgz", - "integrity": "sha512-1TIeXOi8TuSCQprPItwoMymZXxWT0CPxUhkrkeCUH+D8U7QDwQ6b7SUz2MaLuWM2llT+J/TVFLmQI5KtML3BhQ==", - "dependencies": { - "@types/long": "^4.0.1", - "lodash.camelcase": "^4.3.0", - "long": "^4.0.0", - "protobufjs": "^7.0.0", - "yargs": "^17.7.2" - }, - "bin": { - "proto-loader-gen-types": "build/bin/proto-loader-gen-types.js" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/@grpc/grpc-js/node_modules/cliui": { - "version": "8.0.1", - "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", - "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==", - "dependencies": { - "string-width": "^4.2.0", - "strip-ansi": "^6.0.1", - "wrap-ansi": "^7.0.0" - }, - "engines": { - "node": ">=12" - } - }, - "node_modules/@grpc/grpc-js/node_modules/yargs": { - "version": "17.7.2", - "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz", - "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==", - "dependencies": { - "cliui": "^8.0.1", - "escalade": "^3.1.1", - "get-caller-file": "^2.0.5", - "require-directory": "^2.1.1", - "string-width": "^4.2.3", - "y18n": "^5.0.5", - "yargs-parser": "^21.1.1" - }, - "engines": { - "node": ">=12" - } - }, - "node_modules/@grpc/grpc-js/node_modules/yargs-parser": { - "version": "21.1.1", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz", - "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==", - "engines": { - "node": ">=12" - } - }, 
"node_modules/@grpc/proto-loader": { "version": "0.6.9", "resolved": "https://registry.npmjs.org/@grpc/proto-loader/-/proto-loader-0.6.9.tgz", @@ -275,31 +219,6 @@ "node": ">=6" } }, - "node_modules/@grpc/proto-loader/node_modules/protobufjs": { - "version": "6.11.3", - "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.3.tgz", - "integrity": "sha512-xL96WDdCZYdU7Slin569tFX712BxsxslWwAfAhCYjQKGTq7dAU91Lomy6nLLhh/dyGhk/YH4TwTSRxTzhuHyZg==", - "hasInstallScript": true, - "dependencies": { - "@protobufjs/aspromise": "^1.1.2", - "@protobufjs/base64": "^1.1.2", - "@protobufjs/codegen": "^2.0.4", - "@protobufjs/eventemitter": "^1.1.0", - "@protobufjs/fetch": "^1.1.0", - "@protobufjs/float": "^1.0.2", - "@protobufjs/inquire": "^1.1.0", - "@protobufjs/path": "^1.1.2", - "@protobufjs/pool": "^1.1.0", - "@protobufjs/utf8": "^1.1.0", - "@types/long": "^4.0.1", - "@types/node": ">=13.7.0", - "long": "^4.0.0" - }, - "bin": { - "pbjs": "bin/pbjs", - "pbts": "bin/pbts" - } - }, "node_modules/@humanwhocodes/config-array": { "version": "0.9.5", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.9.5.tgz", @@ -3293,7 +3212,7 @@ "protobufjs": "^6.11.2" } }, - "node_modules/proto3-json-serializer/node_modules/protobufjs": { + "node_modules/protobufjs": { "version": "6.11.3", "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.3.tgz", "integrity": "sha512-xL96WDdCZYdU7Slin569tFX712BxsxslWwAfAhCYjQKGTq7dAU91Lomy6nLLhh/dyGhk/YH4TwTSRxTzhuHyZg==", @@ -3318,34 +3237,6 @@ "pbts": "bin/pbts" } }, - "node_modules/protobufjs": { - "version": "7.2.4", - "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.4.tgz", - "integrity": "sha512-AT+RJgD2sH8phPmCf7OUZR8xGdcJRga4+1cOaXJ64hvcSkVhNcRHOwIxUatPH15+nj59WAGTDv3LSGZPEQbJaQ==", - "hasInstallScript": true, - "dependencies": { - "@protobufjs/aspromise": "^1.1.2", - "@protobufjs/base64": "^1.1.2", - "@protobufjs/codegen": "^2.0.4", - "@protobufjs/eventemitter": "^1.1.0", - "@protobufjs/fetch": "^1.1.0", - "@protobufjs/float": "^1.0.2", - "@protobufjs/inquire": "^1.1.0", - "@protobufjs/path": "^1.1.2", - "@protobufjs/pool": "^1.1.0", - "@protobufjs/utf8": "^1.1.0", - "@types/node": ">=13.7.0", - "long": "^5.0.0" - }, - "engines": { - "node": ">=12.0.0" - } - }, - "node_modules/protobufjs/node_modules/long": { - "version": "5.2.3", - "resolved": "https://registry.npmjs.org/long/-/long-5.2.3.tgz", - "integrity": "sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==" - }, "node_modules/punycode": { "version": "1.4.1", "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz", @@ -4296,55 +4187,12 @@ } }, "@grpc/grpc-js": { - "version": "1.8.8", - "resolved": "https://registry.npmjs.org/@grpc/grpc-js/-/grpc-js-1.8.8.tgz", - "integrity": "sha512-4gfDqMLXTrorvYTKA1jL22zLvVwiHJ73t6Re1OHwdCFRjdGTDOVtSJuaWhtHaivyeDGg0LeCkmU77MTKoV3wPA==", + "version": "1.4.6", + "resolved": "https://registry.npmjs.org/@grpc/grpc-js/-/grpc-js-1.4.6.tgz", + "integrity": "sha512-Byau4xiXfIixb1PnW30V/P9mkrZ05lknyNqiK+cVY9J5hj3gecxd/anwaUbAM8j834zg1x78NvAbwGnMfWEu7A==", "requires": { - "@grpc/proto-loader": "^0.7.0", + "@grpc/proto-loader": "^0.6.4", "@types/node": ">=12.12.47" - }, - "dependencies": { - "@grpc/proto-loader": { - "version": "0.7.7", - "resolved": "https://registry.npmjs.org/@grpc/proto-loader/-/proto-loader-0.7.7.tgz", - "integrity": "sha512-1TIeXOi8TuSCQprPItwoMymZXxWT0CPxUhkrkeCUH+D8U7QDwQ6b7SUz2MaLuWM2llT+J/TVFLmQI5KtML3BhQ==", - "requires": { - 
"@types/long": "^4.0.1", - "lodash.camelcase": "^4.3.0", - "long": "^4.0.0", - "protobufjs": "^7.0.0", - "yargs": "^17.7.2" - } - }, - "cliui": { - "version": "8.0.1", - "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", - "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==", - "requires": { - "string-width": "^4.2.0", - "strip-ansi": "^6.0.1", - "wrap-ansi": "^7.0.0" - } - }, - "yargs": { - "version": "17.7.2", - "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz", - "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==", - "requires": { - "cliui": "^8.0.1", - "escalade": "^3.1.1", - "get-caller-file": "^2.0.5", - "require-directory": "^2.1.1", - "string-width": "^4.2.3", - "y18n": "^5.0.5", - "yargs-parser": "^21.1.1" - } - }, - "yargs-parser": { - "version": "21.1.1", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz", - "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==" - } } }, "@grpc/proto-loader": { @@ -4357,28 +4205,6 @@ "long": "^4.0.0", "protobufjs": "^6.10.0", "yargs": "^16.2.0" - }, - "dependencies": { - "protobufjs": { - "version": "6.11.3", - "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.3.tgz", - "integrity": "sha512-xL96WDdCZYdU7Slin569tFX712BxsxslWwAfAhCYjQKGTq7dAU91Lomy6nLLhh/dyGhk/YH4TwTSRxTzhuHyZg==", - "requires": { - "@protobufjs/aspromise": "^1.1.2", - "@protobufjs/base64": "^1.1.2", - "@protobufjs/codegen": "^2.0.4", - "@protobufjs/eventemitter": "^1.1.0", - "@protobufjs/fetch": "^1.1.0", - "@protobufjs/float": "^1.0.2", - "@protobufjs/inquire": "^1.1.0", - "@protobufjs/path": "^1.1.2", - "@protobufjs/pool": "^1.1.0", - "@protobufjs/utf8": "^1.1.0", - "@types/long": "^4.0.1", - "@types/node": ">=13.7.0", - "long": "^4.0.0" - } - } } }, "@humanwhocodes/config-array": { @@ -6583,34 +6409,12 @@ "integrity": "sha512-A60IisqvnuI45qNRygJjrnNjX2TMdQGMY+57tR3nul3ZgO2zXkR9OGR8AXxJhkqx84g0FTnrfi3D5fWMSdANdQ==", "requires": { "protobufjs": "^6.11.2" - }, - "dependencies": { - "protobufjs": { - "version": "6.11.3", - "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.3.tgz", - "integrity": "sha512-xL96WDdCZYdU7Slin569tFX712BxsxslWwAfAhCYjQKGTq7dAU91Lomy6nLLhh/dyGhk/YH4TwTSRxTzhuHyZg==", - "requires": { - "@protobufjs/aspromise": "^1.1.2", - "@protobufjs/base64": "^1.1.2", - "@protobufjs/codegen": "^2.0.4", - "@protobufjs/eventemitter": "^1.1.0", - "@protobufjs/fetch": "^1.1.0", - "@protobufjs/float": "^1.0.2", - "@protobufjs/inquire": "^1.1.0", - "@protobufjs/path": "^1.1.2", - "@protobufjs/pool": "^1.1.0", - "@protobufjs/utf8": "^1.1.0", - "@types/long": "^4.0.1", - "@types/node": ">=13.7.0", - "long": "^4.0.0" - } - } } }, "protobufjs": { - "version": "7.2.4", - "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.4.tgz", - "integrity": "sha512-AT+RJgD2sH8phPmCf7OUZR8xGdcJRga4+1cOaXJ64hvcSkVhNcRHOwIxUatPH15+nj59WAGTDv3LSGZPEQbJaQ==", + "version": "6.11.3", + "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.3.tgz", + "integrity": "sha512-xL96WDdCZYdU7Slin569tFX712BxsxslWwAfAhCYjQKGTq7dAU91Lomy6nLLhh/dyGhk/YH4TwTSRxTzhuHyZg==", "requires": { "@protobufjs/aspromise": "^1.1.2", "@protobufjs/base64": "^1.1.2", @@ -6622,15 +6426,9 @@ "@protobufjs/path": "^1.1.2", "@protobufjs/pool": "^1.1.0", "@protobufjs/utf8": "^1.1.0", + "@types/long": "^4.0.1", "@types/node": ">=13.7.0", - 
"long": "^5.0.0" - }, - "dependencies": { - "long": { - "version": "5.2.3", - "resolved": "https://registry.npmjs.org/long/-/long-5.2.3.tgz", - "integrity": "sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==" - } + "long": "^4.0.0" } }, "punycode": { diff --git a/sdks/typescript/package.json b/sdks/typescript/package.json index 041121bd41ea7..35a1e8134e295 100644 --- a/sdks/typescript/package.json +++ b/sdks/typescript/package.json @@ -36,7 +36,7 @@ }, "dependencies": { "@google-cloud/pubsub": "^2.19.4", - "@grpc/grpc-js": "^1.8.8", + "@grpc/grpc-js": "~1.4.6", "@protobuf-ts/grpc-transport": "^2.1.0", "@protobuf-ts/plugin": "^2.1.0", "bson": "^4.6.0", @@ -46,7 +46,7 @@ "fast-deep-equal": "^3.1.3", "find-git-root": "^1.0.4", "long": "^4.0.0", - "protobufjs": "^7.2.4", + "protobufjs": "~6.11.3", "queue-typescript": "^1.0.1", "serialize-closures": "^0.2.7", "ts-closure-transform": "^0.1.7", diff --git a/sdks/typescript/src/apache_beam/runners/dataflow.ts b/sdks/typescript/src/apache_beam/runners/dataflow.ts index 950e630d82d9a..e7da1f7ada51a 100644 --- a/sdks/typescript/src/apache_beam/runners/dataflow.ts +++ b/sdks/typescript/src/apache_beam/runners/dataflow.ts @@ -33,6 +33,8 @@ export function dataflowRunner(runnerOptions: { options: Object = {} ): Promise { var augmentedOptions = { experiments: [] as string[], ...options }; + augmentedOptions.experiments.push("use_runner_v2"); + augmentedOptions.experiments.push("use_portable_job_submission"); augmentedOptions.experiments.push("use_sibling_sdk_workers"); return new PortableRunner( runnerOptions as any, From 534f93acd18fc18e7c56a7495ca0a8434676cbc7 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Wed, 20 Sep 2023 13:55:52 -0400 Subject: [PATCH 17/34] Introduce PeriodicImpulse.stopAfter() (#28503) * Use it in streaming BigQueryIO integration test --- .../beam/runners/dataflow/DataflowRunner.java | 2 +- .../beam/sdk/transforms/PeriodicImpulse.java | 98 ++++++++++++++++--- .../beam/sdk/transforms/PeriodicSequence.java | 3 + .../bigquery/BigQueryIOStorageWriteIT.java | 42 ++++++-- 4 files changed, 123 insertions(+), 22 deletions(-) diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java index 02f6f9acd7a64..17aea34045ff1 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java @@ -1755,7 +1755,7 @@ void maybeRecordPCollectionWithAutoSharding(PCollection pcol) { options.isEnableStreamingEngine(), "Runner determined sharding not available in Dataflow for GroupIntoBatches for" + " non-Streaming-Engine jobs. 
In order to use runner determined sharding, please use"
-              + " --streaming --enable_streaming_engine");
+              + " --streaming --experiments=enable_streaming_engine");
       pCollectionsPreservedKeys.add(pcol);
       pcollectionsRequiringAutoSharding.add(pcol);
     }
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/PeriodicImpulse.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/PeriodicImpulse.java
index 3679c3eb10f5f..db4f141ee624f 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/PeriodicImpulse.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/PeriodicImpulse.java
@@ -17,11 +17,15 @@
  */
 package org.apache.beam.sdk.transforms;
 
+import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument;
+
+import org.apache.beam.sdk.annotations.Internal;
 import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
 import org.apache.beam.sdk.transforms.windowing.FixedWindows;
 import org.apache.beam.sdk.transforms.windowing.Window;
 import org.apache.beam.sdk.values.PBegin;
 import org.apache.beam.sdk.values.PCollection;
+import org.checkerframework.checker.nullness.qual.Nullable;
 import org.joda.time.Duration;
 import org.joda.time.Instant;
 
@@ -34,28 +38,58 @@
  */
 public class PeriodicImpulse extends PTransform<PBegin, PCollection<Instant>> {
 
-  Instant startTimestamp = Instant.now();
-  Instant stopTimestamp = BoundedWindow.TIMESTAMP_MAX_VALUE;
-  Duration fireInterval = Duration.standardMinutes(1);
+  Instant startTimestamp;
+  Instant stopTimestamp;
+  @Nullable Duration stopDuration;
+  Duration fireInterval;
   boolean applyWindowing = false;
   boolean catchUpToNow = true;
 
-  private PeriodicImpulse() {}
+  private PeriodicImpulse() {
+    this.startTimestamp = Instant.now();
+    this.stopTimestamp = BoundedWindow.TIMESTAMP_MAX_VALUE;
+    this.fireInterval = Duration.standardMinutes(1);
+  }
 
   public static PeriodicImpulse create() {
     return new PeriodicImpulse();
   }
 
+  /**
+   * Assign a timestamp when the pipeline starts to produce data.
+   *
+   * <p>Cannot be used along with {@link #stopAfter}.
+   */
   public PeriodicImpulse startAt(Instant startTime) {
+    checkArgument(stopDuration == null, "startAt and stopAfter cannot be set at the same time");
     this.startTimestamp = startTime;
     return this;
   }
 
+  /**
+   * Assign a timestamp when the pipeline stops producing data.
+   *
+   * <p>Cannot be used along with {@link #stopAfter}.
+   */
   public PeriodicImpulse stopAt(Instant stopTime) {
+    checkArgument(stopDuration == null, "stopAt and stopAfter cannot be set at the same time");
     this.stopTimestamp = stopTime;
     return this;
   }
 
+  /**
+   * For internal use only; no backwards-compatibility guarantees.
+   *
+   * <p>Assign a time interval at which the pipeline produces data. This is different from setting
+   * {@link #startAt} and {@link #stopAt}, as the first timestamp is determined at run time (when
+   * the pipeline starts processing).
+   */
+  @Internal
+  public PeriodicImpulse stopAfter(Duration duration) {
+    this.stopDuration = duration;
+    return this;
+  }
+
   public PeriodicImpulse withInterval(Duration interval) {
     this.fireInterval = interval;
     return this;
@@ -67,10 +101,13 @@ public PeriodicImpulse applyWindowing() {
   }
 
   /**
-   * The default behavior is that PeriodicImpulse emits all instants until Instant.now(), then
+   * For internal use only; no backwards-compatibility guarantees.
+   *
+   * <p>The default behavior is that PeriodicImpulse emits all instants until Instant.now(), then
    * starts firing at the specified interval. If this is set to false, the PeriodicImpulse will
    * perform the interval wait before firing each instant.
    */
+  @Internal
   public PeriodicImpulse catchUpToNow(boolean catchUpToNow) {
     this.catchUpToNow = catchUpToNow;
     return this;
@@ -78,20 +115,51 @@ public PeriodicImpulse catchUpToNow(boolean catchUpToNow) {
 
   @Override
   public PCollection<Instant> expand(PBegin input) {
-    PCollection<Instant> result =
-        input
-            .apply(
-                Create.of(
-                    new PeriodicSequence.SequenceDefinition(
-                        startTimestamp, stopTimestamp, fireInterval, catchUpToNow)))
-            .apply(PeriodicSequence.create());
+    PCollection<PeriodicSequence.SequenceDefinition> seqDef;
+    if (stopDuration != null) {
+      // nonnull guaranteed
+      Duration d = stopDuration;
+      seqDef =
+          input
+              .apply(Impulse.create())
+              .apply(ParDo.of(new RuntimeSequenceFn(d, fireInterval, catchUpToNow)));
+    } else {
+      seqDef =
+          input.apply(
+              Create.of(
+                  new PeriodicSequence.SequenceDefinition(
+                      startTimestamp, stopTimestamp, fireInterval, catchUpToNow)));
+    }
+    PCollection<Instant> result = seqDef.apply(PeriodicSequence.create());
     if (this.applyWindowing) {
       result =
-          result.apply(
-              Window.into(FixedWindows.of(Duration.millis(fireInterval.getMillis()))));
+          result.apply(Window.into(FixedWindows.of(Duration.millis(fireInterval.getMillis()))));
     }
-
     return result;
   }
+
+  /**
+   * A DoFn that generates a SequenceDefinition at run time. This enables setting the timestamp of
+   * the first element when the pipeline starts processing data.
+   */
+  private static class RuntimeSequenceFn extends DoFn<byte[], PeriodicSequence.SequenceDefinition> {
+    Duration stopDuration;
+    Duration fireInterval;
+    boolean catchUpToNow;
+
+    RuntimeSequenceFn(Duration stopDuration, Duration fireInterval, boolean catchUpToNow) {
+      this.stopDuration = stopDuration;
+      this.fireInterval = fireInterval;
+      this.catchUpToNow = catchUpToNow;
+    }
+
+    @ProcessElement
+    public void process(ProcessContext c) {
+      Instant now = Instant.now();
+      c.output(
+          new PeriodicSequence.SequenceDefinition(
+              now, now.plus(stopDuration), fireInterval, catchUpToNow));
+    }
+  }
 }
diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/PeriodicSequence.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/PeriodicSequence.java
index b3cd2afde697d..12cbecd04b02d 100644
--- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/PeriodicSequence.java
+++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/PeriodicSequence.java
@@ -22,6 +22,7 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState;
 
 import java.util.Objects;
+import org.apache.beam.sdk.annotations.Internal;
 import org.apache.beam.sdk.io.range.OffsetRange;
 import org.apache.beam.sdk.schemas.JavaFieldSchema;
 import org.apache.beam.sdk.schemas.annotations.DefaultSchema;
@@ -67,6 +68,8 @@ public SequenceDefinition(Instant first, Instant last, Duration duration) {
       this.catchUpToNow = true;
     }
 
+    /** catchUpToNow is experimental; no backwards-compatibility guarantees.
*/ + @Internal public SequenceDefinition( Instant first, Instant last, Duration duration, boolean catchUpToNow) { this.first = first; diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageWriteIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageWriteIT.java index 81de67f385028..fc3ce0be4b691 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageWriteIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOStorageWriteIT.java @@ -33,9 +33,16 @@ import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.PeriodicImpulse; +import org.apache.beam.sdk.transforms.SimpleFunction; +import org.apache.beam.sdk.values.PBegin; +import org.apache.beam.sdk.values.PCollection; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.joda.time.Duration; +import org.joda.time.Instant; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -81,11 +88,32 @@ public void processElement(ProcessContext c) { } } - private GenerateSequence stream(int rowCount) { - int timestampIntervalInMilliseconds = 10; - return GenerateSequence.from(0) - .to(rowCount) - .withRate(1, Duration.millis(timestampIntervalInMilliseconds)); + static class UnboundedStream extends PTransform> { + + private final int rowCount; + + public UnboundedStream(int rowCount) { + this.rowCount = rowCount; + } + + @Override + public PCollection expand(PBegin input) { + int timestampIntervalInMillis = 10; + PeriodicImpulse impulse = + PeriodicImpulse.create() + .stopAfter(Duration.millis((long) timestampIntervalInMillis * rowCount - 1)) + .withInterval(Duration.millis(timestampIntervalInMillis)); + return input + .apply(impulse) + .apply( + MapElements.via( + new SimpleFunction() { + @Override + public Long apply(Instant input) { + return input.getMillis(); + } + })); + } } private void runBigQueryIOStorageWritePipeline( @@ -102,7 +130,9 @@ private void runBigQueryIOStorageWritePipeline( new TableFieldSchema().setName("str").setType("STRING"))); Pipeline p = Pipeline.create(bqOptions); - p.apply("Input", isStreaming ? stream(rowCount) : GenerateSequence.from(0).to(rowCount)) + p.apply( + "Input", + isStreaming ? 
new UnboundedStream(rowCount) : GenerateSequence.from(0).to(rowCount)) .apply("GenerateMessage", ParDo.of(new FillRowFn())) .apply( "WriteToBQ", From 0b131c9ae7cafd7a43f875b0df1fb714683bdcda Mon Sep 17 00:00:00 2001 From: caneff Date: Wed, 20 Sep 2023 15:16:51 -0400 Subject: [PATCH 18/34] Change handling of copy=None defaults for Pandas 2 (#28523) --- sdks/python/apache_beam/dataframe/frame_base.py | 8 ++++++++ .../apache_beam/dataframe/frame_base_test.py | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py index 48a4c29d0589a..4e89e473b7301 100644 --- a/sdks/python/apache_beam/dataframe/frame_base.py +++ b/sdks/python/apache_beam/dataframe/frame_base.py @@ -674,11 +674,19 @@ def wrap(func): if removed_args: defaults_to_populate -= set(removed_args) + # In pandas 2, many methods rely on the default copy=None + # to mean that copy is the value of copy_on_write. Since + # copy_on_write will always be true for Beam, just fill it + # in here. In pandas 1, the default was True anyway. + if 'copy' in arg_to_default and arg_to_default['copy'] is None: + arg_to_default['copy'] = True + @functools.wraps(func) def wrapper(**kwargs): for name in defaults_to_populate: if name not in kwargs: kwargs[name] = arg_to_default[name] + return func(**kwargs) return wrapper diff --git a/sdks/python/apache_beam/dataframe/frame_base_test.py b/sdks/python/apache_beam/dataframe/frame_base_test.py index b3077320720fe..0a73905339fd7 100644 --- a/sdks/python/apache_beam/dataframe/frame_base_test.py +++ b/sdks/python/apache_beam/dataframe/frame_base_test.py @@ -174,6 +174,21 @@ def func(self, a, **kwargs): 'a': 2, 'b': 4, 'c': 6, 'kw_only': 8 }) + def test_populate_defaults_overwrites_copy(self): + class Base(object): + def func(self, a=1, b=2, c=3, *, copy=None): + pass + + class Proxy(object): + @frame_base.args_to_kwargs(Base) + @frame_base.populate_defaults(Base) + def func(self, a, copy, **kwargs): + return dict(kwargs, a=a, copy=copy) + + proxy = Proxy() + self.assertEqual(proxy.func(), {'a': 1, 'copy': True}) + self.assertEqual(proxy.func(copy=False), {'a': 1, 'copy': False}) + if __name__ == '__main__': unittest.main() From 5fb13e05d545d1277af6a5049f5978504e509b36 Mon Sep 17 00:00:00 2001 From: Pranav Bhandari Date: Wed, 20 Sep 2023 16:59:03 -0400 Subject: [PATCH 19/34] Make launcher method public in LoadTestBase. 
(#28568) --- .../src/main/java/org/apache/beam/it/gcp/IOLoadTestBase.java | 2 +- .../src/main/java/org/apache/beam/it/gcp/LoadTestBase.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/IOLoadTestBase.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/IOLoadTestBase.java index 32f262f2eac11..6b728a6a60db8 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/IOLoadTestBase.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/IOLoadTestBase.java @@ -62,7 +62,7 @@ public void tearDownBase() throws IOException { } @Override - PipelineLauncher launcher() { + public PipelineLauncher launcher() { return DefaultPipelineLauncher.builder(CREDENTIALS).build(); } diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java index d9c1990ef079a..14bb05394de26 100644 --- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java +++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java @@ -128,7 +128,7 @@ public void tearDownLoadTestBase() throws IOException { monitoringClient.cleanupAll(); } - abstract PipelineLauncher launcher(); + public abstract PipelineLauncher launcher(); /** * Exports the metrics of given dataflow job to BigQuery. From 17db78c10dcb732a1ddcd29049edddfea6357aac Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Wed, 20 Sep 2023 14:40:38 -0700 Subject: [PATCH 20/34] [prism] Auto remove containers after stop. (#28570) Co-authored-by: lostluck <13907733+lostluck@users.noreply.github.com> --- sdks/go/pkg/beam/runners/prism/internal/environments.go | 1 + 1 file changed, 1 insertion(+) diff --git a/sdks/go/pkg/beam/runners/prism/internal/environments.go b/sdks/go/pkg/beam/runners/prism/internal/environments.go index 7d54cb366ffeb..3a429920fb289 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/environments.go +++ b/sdks/go/pkg/beam/runners/prism/internal/environments.go @@ -155,6 +155,7 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock }, &container.HostConfig{ NetworkMode: "host", Mounts: mounts, + AutoRemove: true, }, nil, nil, "") if err != nil { cli.Close() From 1d94f5ffb84e3acbd5850639c740df5fb04a0080 Mon Sep 17 00:00:00 2001 From: caneff Date: Wed, 20 Sep 2023 18:18:14 -0400 Subject: [PATCH 21/34] Fix test for new group keys behavior for Pandas 2 (#28566) --- sdks/python/apache_beam/dataframe/frames_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py index 30d9924805153..e3555b50187b7 100644 --- a/sdks/python/apache_beam/dataframe/frames_test.py +++ b/sdks/python/apache_beam/dataframe/frames_test.py @@ -1837,8 +1837,8 @@ def test_groupby_apply_preserves_column_order(self): df = GROUPBY_DF self._run_test( - lambda df: df[['foo', 'group', 'bar']].groupby('group').apply( - lambda x: x), + lambda df: df[['foo', 'group', 'bar']].groupby( + 'group', group_keys=False).apply(lambda x: x), df) def test_groupby_transform(self): From c5e6c7962e60ee8366bfc92edf0812341e940020 Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Wed, 20 Sep 2023 16:37:17 -0700 Subject: [PATCH 22/34] Refactor and cleanup yaml MapToFields. (#28462) * Avoid the use of MetaProviders, which was always kind of hacky. 
We may want to remove this infrastructure altogether as it does not play
nicely with provider inference.

* Split MapToFields into separate mapping, filtering, and exploding operations.

* Allow MapToFields to act on non-schema'd PCollections.

The various language flavors of these UDFs are now handled by a preprocessing
step. This will make it easier to extend to other languages, including in
particular possible multiple (equivalent) implementations of javascript to
minimize cross-language boundary crossings.

---------

Co-authored-by: Danny McCormick
---
 sdks/python/apache_beam/transforms/core.py    |   8 +
 sdks/python/apache_beam/yaml/readme_test.py   |  23 +-
 sdks/python/apache_beam/yaml/yaml_mapping.md  |  35 +-
 sdks/python/apache_beam/yaml/yaml_mapping.py  | 367 ++++++++++--------
 .../apache_beam/yaml/yaml_mapping_test.py     |  32 +-
 sdks/python/apache_beam/yaml/yaml_provider.py |   7 +-
 .../python/apache_beam/yaml/yaml_transform.py |  15 +
 .../apache_beam/yaml/yaml_transform_test.py   |  14 +-
 sdks/python/apache_beam/yaml/yaml_udf_test.py |  38 +-
 9 files changed, 311 insertions(+), 228 deletions(-)

diff --git a/sdks/python/apache_beam/transforms/core.py b/sdks/python/apache_beam/transforms/core.py
index 66ac8fbad9673..671af54e47be3 100644
--- a/sdks/python/apache_beam/transforms/core.py
+++ b/sdks/python/apache_beam/transforms/core.py
@@ -2258,6 +2258,10 @@ def __init__(self, pcoll, exception_handling_args, upstream_errors=()):
     self._exception_handling_args = exception_handling_args
     self._upstream_errors = upstream_errors
 
+  @property
+  def element_type(self):
+    return self._pcoll.element_type
+
   def main_output_tag(self):
     return self._exception_handling_args.get('main_tag', 'good')
 
@@ -2309,6 +2313,10 @@ def __init__(self, pvalue, exception_handling_args=None):
     else:
       self._pvalue = _PValueWithErrors(pvalue, exception_handling_args)
 
+  @property
+  def element_type(self):
+    return self._pvalue.element_type
+
   def __or__(self, transform):
     return self.apply(transform)
 
diff --git a/sdks/python/apache_beam/yaml/readme_test.py b/sdks/python/apache_beam/yaml/readme_test.py
index 958d9cb5783a7..d918d18e11dd0 100644
--- a/sdks/python/apache_beam/yaml/readme_test.py
+++ b/sdks/python/apache_beam/yaml/readme_test.py
@@ -32,6 +32,7 @@
 import apache_beam as beam
 from apache_beam.options.pipeline_options import PipelineOptions
 from apache_beam.typehints import trivial_inference
+from apache_beam.yaml import yaml_mapping
 from apache_beam.yaml import yaml_provider
 from apache_beam.yaml import yaml_transform
 
@@ -85,13 +86,16 @@ def guess_name_and_type(expr):
         typ, = [t for t in typ.__args__ if t is not type(None)]
       return name, typ
 
-    output_schema = [
-        guess_name_and_type(expr) for expr in m.group(1).split(',')
-    ]
-    output_element = beam.Row(**{name: typ() for name, typ in output_schema})
-    return next(iter(inputs.values())) | beam.Map(
-        lambda _: output_element).with_output_types(
-            trivial_inference.instance_to_type(output_element))
+    if m.group(1) == '*':
+      return inputs['PCOLLECTION'] | beam.Filter(lambda _: True)
+    else:
+      output_schema = [
+          guess_name_and_type(expr) for expr in m.group(1).split(',')
+      ]
+      output_element = beam.Row(**{name: typ() for name, typ in output_schema})
+      return next(iter(inputs.values())) | beam.Map(
+          lambda _: output_element).with_output_types(
+              trivial_inference.instance_to_type(output_element))
 
 
 class FakeReadFromPubSub(beam.PTransform):
@@ -204,12 +208,13 @@ def test(self):
     ]
     options['render_leaf_composite_nodes'] = ['.*']
     test_provider = TestProvider(TEST_TRANSFORMS)
test_sql_mapping_provider = yaml_mapping.SqlMappingProvider(test_provider) p = beam.Pipeline(options=PipelineOptions(**options)) yaml_transform.expand_pipeline( p, modified_yaml, - {t: test_provider - for t in test_provider.provided_transforms()}) + yaml_provider.merge_providers( + [test_provider, test_sql_mapping_provider])) if test_type == 'BUILD': return p.run().wait_until_finish() diff --git a/sdks/python/apache_beam/yaml/yaml_mapping.md b/sdks/python/apache_beam/yaml/yaml_mapping.md index b5e84e1a80542..653b4abe8b89e 100644 --- a/sdks/python/apache_beam/yaml/yaml_mapping.md +++ b/sdks/python/apache_beam/yaml/yaml_mapping.md @@ -131,7 +131,7 @@ Currently, in addition to Python, SQL expressions are supported as well Sometimes it may be desirable to emit more (or less) than one record for each input record. This can be accomplished by mapping to an iterable type and -noting that the specific field should be exploded, e.g. +following the mapping with an Explode operation, e.g. ``` - type: MapToFields @@ -140,7 +140,9 @@ noting that the specific field should be exploded, e.g. fields: new_col: "[col1.upper(), col1.lower(), col1.title()]" another_col: "col2 + col3" - explode: new_col +- type: Explode + config: + fields: new_col ``` will result in three output records for every input record. @@ -155,7 +157,9 @@ product over all fields should be taken. For example fields: new_col: "[col1.upper(), col1.lower(), col1.title()]" another_col: "[col2 - 1, col2, col2 + 1]" - explode: [new_col, another_col] +- type: Explode + config: + fields: [new_col, another_col] cross_product: true ``` @@ -168,38 +172,27 @@ will emit nine records whereas fields: new_col: "[col1.upper(), col1.lower(), col1.title()]" another_col: "[col2 - 1, col2, col2 + 1]" - explode: [new_col, another_col] +- type: Explode + config: + fields: [new_col, another_col] cross_product: false ``` will only emit three. -If one is only exploding existing fields, a simpler `Explode` transform may be -used instead +The `Explode` operation can be used on its own if the field in question is +already an iterable type. ``` - type: Explode config: - explode: [col1] + fields: [col1] ``` ## Filtering Sometimes it can be desirable to only keep records that satisfy a certain -criteria. This can be accomplished by specifying a keep parameter, e.g. - -``` -- type: MapToFields - config: - language: python - fields: - new_col: "col1.upper()" - another_col: "col2 + col3" - keep: "col2 > 0" -``` - -Like explode, there is a simpler `Filter` transform useful when no mapping is -being done +criteria. This can be accomplished with a `Filter` transform, e.g. 
``` - type: Filter diff --git a/sdks/python/apache_beam/yaml/yaml_mapping.py b/sdks/python/apache_beam/yaml/yaml_mapping.py index b6dea894b3e9c..221c6f018d67d 100644 --- a/sdks/python/apache_beam/yaml/yaml_mapping.py +++ b/sdks/python/apache_beam/yaml/yaml_mapping.py @@ -17,6 +17,14 @@ """This module defines the basic MapToFields operation.""" import itertools +from typing import Any +from typing import Callable +from typing import Collection +from typing import Dict +from typing import Iterable +from typing import Mapping +from typing import Optional +from typing import Union import js2py @@ -139,18 +147,73 @@ def _as_callable(original_fields, expr, transform_name, language): 'Supported languages are "javascript" and "python."') +def exception_handling_args(error_handling_spec): + if error_handling_spec: + return { + 'dead_letter_tag' if k == 'output' else k: v + for (k, v) in error_handling_spec.items() + } + else: + return None + + +def _map_errors_to_standard_format(): + # TODO(https://github.com/apache/beam/issues/24755): Switch to MapTuple. + return beam.Map( + lambda x: beam.Row(element=x[0], msg=str(x[1][1]), stack=str(x[1][2]))) + + +def maybe_with_exception_handling(inner_expand): + def expand(self, pcoll): + wrapped_pcoll = beam.core._MaybePValueWithErrors( + pcoll, self._exception_handling_args) + return inner_expand(self, wrapped_pcoll).as_result( + _map_errors_to_standard_format()) + + return expand + + +def maybe_with_exception_handling_transform_fn(transform_fn): + def expand(pcoll, error_handling=None, **kwargs): + wrapped_pcoll = beam.core._MaybePValueWithErrors( + pcoll, exception_handling_args(error_handling)) + return transform_fn(wrapped_pcoll, + **kwargs).as_result(_map_errors_to_standard_format()) + + return expand + + # TODO(yaml): This should be available in all environments, in which case # we choose the one that matches best. class _Explode(beam.PTransform): - def __init__(self, fields, cross_product): + def __init__( + self, + fields: Union[str, Collection[str]], + cross_product: Optional[bool] = None, + error_handling: Optional[Mapping[str, Any]] = None): + if isinstance(fields, str): + fields = [fields] + if cross_product is None: + if len(fields) > 1: + raise ValueError( + 'cross_product must be specified true or false ' + 'when exploding multiple fields') + else: + # Doesn't matter. + cross_product = True self._fields = fields self._cross_product = cross_product - self._exception_handling_args = None + # TODO(yaml): Support standard error handling argument. 
+ self._exception_handling_args = exception_handling_args(error_handling) + @maybe_with_exception_handling def expand(self, pcoll): all_fields = [ x for x, _ in named_fields_from_element_type(pcoll.element_type) ] + for field in self._fields: + if field not in all_fields: + raise ValueError(f'Exploding unknown field "{field}"') to_explode = self._fields def explode_cross_product(base, fields): @@ -171,12 +234,12 @@ def explode_zip(base, fields): yield beam.Row(**copy) return ( - beam.core._MaybePValueWithErrors(pcoll, self._exception_handling_args) + pcoll | beam.FlatMap( lambda row: (explode_cross_product if self._cross_product else explode_zip) ({name: getattr(row, name) - for name in all_fields}, to_explode))).as_result() + for name in all_fields}, to_explode))) def infer_output_type(self, input_type): return row_type.RowTypeConstraint.from_fields([( @@ -190,189 +253,171 @@ def with_exception_handling(self, **kwargs): return self -# TODO(yaml): Should Filter and Explode be distinct operations from Project? -# We'll want these per-language. @beam.ptransform.ptransform_fn -def _PythonProjectionTransform( - pcoll, - *, - fields, - transform_name, - language, - keep=None, - explode=(), - cross_product=True, - error_handling=None): - original_fields = [ - name for (name, _) in named_fields_from_element_type(pcoll.element_type) - ] +@maybe_with_exception_handling_transform_fn +def _PyJsFilter( + pcoll, keep: Union[str, Dict[str, str]], language: Optional[str] = None): - if error_handling is None: - error_handling_args = None + input_schema = dict(named_fields_from_element_type(pcoll.element_type)) + if isinstance(keep, str) and keep in input_schema: + keep_fn = lambda row: getattr(row, keep) else: - error_handling_args = { - 'dead_letter_tag' if k == 'output' else k: v - for (k, v) in error_handling.items() - } + keep_fn = _as_callable(list(input_schema.keys()), keep, "keep", language) + return pcoll | beam.Filter(keep_fn) - pcoll = beam.core._MaybePValueWithErrors(pcoll, error_handling_args) - if keep: - if isinstance(keep, str) and keep in original_fields: - keep_fn = lambda row: getattr(row, keep) - else: - keep_fn = _as_callable(original_fields, keep, transform_name, language) - filtered = pcoll | beam.Filter(keep_fn) - else: - filtered = pcoll +def is_expr(v): + return isinstance(v, str) or (isinstance(v, dict) and 'expression' in v) - projected = filtered | beam.Select( - **{ - name: _as_callable(original_fields, expr, transform_name, language) - for (name, expr) in fields.items() - }) - if explode: - result = projected | _Explode(explode, cross_product=cross_product) - else: - result = projected - - return result.as_result( - # TODO(https://github.com/apache/beam/issues/24755): Switch to MapTuple. - beam.Map( - lambda x: beam.Row( - element=x[0], msg=str(x[1][1]), stack=str(x[1][2])))) - - -@beam.ptransform.ptransform_fn -def MapToFields( - pcoll, - yaml_create_transform, - *, - fields, - keep=None, - explode=(), - cross_product=None, - append=False, - drop=(), - language=None, - error_handling=None, - transform_name="MapToFields", - **language_keywords): - if isinstance(explode, str): - explode = [explode] - if cross_product is None: - if len(explode) > 1: - # TODO(robertwb): Consider if true is an OK default. - raise ValueError( - 'cross_product must be specified true or false ' - 'when exploding multiple fields') - else: - # Doesn't matter. 
- cross_product = True +def normalize_fields(pcoll, fields, drop=(), append=False, language='generic'): + try: + input_schema = dict(named_fields_from_element_type(pcoll.element_type)) + except ValueError as exn: + if drop: + raise ValueError("Can only drop fields on a schema'd input.") from exn + if append: + raise ValueError("Can only append fields on a schema'd input.") from exn + elif any(is_expr(x) for x in fields.values()): + raise ValueError("Can only use expressions on a schema'd input.") from exn + input_schema = {} - input_schema = dict(named_fields_from_element_type(pcoll.element_type)) + if isinstance(drop, str): + drop = [drop] if drop and not append: raise ValueError("Can only drop fields if append is true.") for name in drop: if name not in input_schema: raise ValueError(f'Dropping unknown field "{name}"') - for name in explode: - if not (name in fields or (append and name in input_schema)): - raise ValueError(f'Exploding unknown field "{name}"') if append: for name in fields: if name in input_schema and name not in drop: - raise ValueError(f'Redefinition of field "{name}"') + raise ValueError( + f'Redefinition of field "{name}". ' + 'Cannot append a field that already exists in original input.') + + if language == 'generic': + for expr in fields.values(): + if not isinstance(expr, str): + raise ValueError( + "Missing language specification. " + "Must specify a language when using a map with custom logic.") + missing = set(fields.values()) - set(input_schema.keys()) + if missing: + raise ValueError( + f"Missing language specification or unknown input fields: {missing}") if append: - fields = { + return input_schema, { **{name: name for name in input_schema.keys() if name not in drop}, **fields } + else: + return input_schema, fields - if language is None: - for name, expr in fields.items(): - if not isinstance(expr, str) or expr not in input_schema: - # TODO(robertw): Could consider defaulting to SQL, or another - # lowest-common-denominator expression language. - raise ValueError("Missing language specification.") - - # We should support this for all languages. - language = "python" - - if language in ("sql", "calcite"): - if error_handling: - raise ValueError('Error handling unsupported for sql.') - selects = [f'{expr} AS {name}' for (name, expr) in fields.items()] - query = "SELECT " + ", ".join(selects) + " FROM PCOLLECTION" - if keep: - query += " WHERE " + keep - - result = pcoll | yaml_create_transform({ - 'type': 'Sql', - 'config': { - 'query': query, **language_keywords - }, - }, [pcoll]) - if explode: - # TODO(yaml): Implement via unnest. - result = result | _Explode(explode, cross_product) - - return result - - elif language == 'python' or language == 'javascript': - return pcoll | yaml_create_transform({ - 'type': 'PyTransform', - 'config': { - 'constructor': __name__ + '._PythonProjectionTransform', - 'kwargs': { - 'fields': fields, - 'transform_name': transform_name, - 'language': language, - 'keep': keep, - 'explode': explode, - 'cross_product': cross_product, - 'error_handling': error_handling, - }, - **language_keywords - }, - }, [pcoll]) - else: - # TODO(yaml): Support javascript expressions and UDFs. - # TODO(yaml): Support java by fully qualified name. - # TODO(yaml): Maybe support java lambdas? - raise ValueError( - f'Unknown language: {language}. 
' - 'Supported languages are "sql" (alias calcite) and "python."') +@beam.ptransform.ptransform_fn +@maybe_with_exception_handling_transform_fn +def _PyJsMapToFields(pcoll, language='generic', **mapping_args): + input_schema, fields = normalize_fields( + pcoll, language=language, **mapping_args) + original_fields = list(input_schema.keys()) + + return pcoll | beam.Select( + **{ + name: _as_callable(original_fields, expr, name, language) + for (name, expr) in fields.items() + }) + + +class SqlMappingProvider(yaml_provider.Provider): + def __init__(self, sql_provider=None): + if sql_provider is None: + sql_provider = yaml_provider.beam_jar( + urns={'Sql': 'beam:external:java:sql:v1'}, + gradle_target='sdks:java:extensions:sql:expansion-service:shadowJar') + self._sql_provider = sql_provider + + def available(self): + return self._sql_provider.available() + + def cache_artifacts(self): + return self._sql_provider.cache_artifacts() + + def provided_transforms(self) -> Iterable[str]: + return [ + 'Filter-sql', + 'Filter-calcite', + 'MapToFields-sql', + 'MapToFields-calcite' + ] + + def create_transform( + self, + typ: str, + args: Mapping[str, Any], + yaml_create_transform: Callable[ + [Mapping[str, Any], Iterable[beam.PCollection]], beam.PTransform] + ) -> beam.PTransform: + if typ.startswith('Filter-'): + return _SqlFilterTransform( + self._sql_provider, yaml_create_transform, **args) + if typ.startswith('MapToFields-'): + return _SqlMapToFieldsTransform( + self._sql_provider, yaml_create_transform, **args) + else: + raise NotImplementedError(typ) + + def underlying_provider(self): + return self._sql_provider + + def to_json(self): + return {'type': "SqlMappingProvider"} + + +@beam.ptransform.ptransform_fn +def _SqlFilterTransform( + pcoll, sql_provider, yaml_create_transform, keep, language): + return pcoll | sql_provider.create_transform( + 'Sql', {'query': f'SELECT * FROM PCOLLECTION WHERE {keep}'}, + yaml_create_transform) -def create_mapping_provider(): +@beam.ptransform.ptransform_fn +def _SqlMapToFieldsTransform( + pcoll, sql_provider, yaml_create_transform, **mapping_args): + _, fields = normalize_fields(pcoll, **mapping_args) + + def extract_expr(name, v): + if isinstance(v, str): + return v + elif 'expression' in v: + return v['expression'] + else: + raise ValueError("Only expressions allowed in SQL at {name}.") + + selects = [ + f'({extract_expr(name, expr)}) AS {name}' + for (name, expr) in fields.items() + ] + query = "SELECT " + ", ".join(selects) + " FROM PCOLLECTION" + return pcoll | sql_provider.create_transform( + 'Sql', {'query': query}, yaml_create_transform) + + +def create_mapping_providers(): # These are MetaInlineProviders because their expansion is in terms of other # YamlTransforms, but in a way that needs to be deferred until the input # schema is known. 
- return yaml_provider.MetaInlineProvider({ - 'MapToFields': MapToFields, - 'Filter': ( - lambda yaml_create_transform, - keep, - **kwargs: MapToFields( - yaml_create_transform, - keep=keep, - fields={}, - append=True, - transform_name='Filter', - **kwargs)), - 'Explode': ( - lambda yaml_create_transform, - explode, - **kwargs: MapToFields( - yaml_create_transform, - explode=explode, - fields={}, - append=True, - transform_name='Explode', - **kwargs)), - }) + return [ + yaml_provider.InlineProvider({ + 'Explode': _Explode, + 'Filter-python': _PyJsFilter, + 'Filter-javascript': _PyJsFilter, + 'MapToFields-python': _PyJsMapToFields, + 'MapToFields-javascript': _PyJsMapToFields, + 'MapToFields-generic': _PyJsMapToFields, + }), + SqlMappingProvider(), + ] diff --git a/sdks/python/apache_beam/yaml/yaml_mapping_test.py b/sdks/python/apache_beam/yaml/yaml_mapping_test.py index 728476b1fd5d7..55032aeae52e9 100644 --- a/sdks/python/apache_beam/yaml/yaml_mapping_test.py +++ b/sdks/python/apache_beam/yaml/yaml_mapping_test.py @@ -82,18 +82,18 @@ def test_filter(self): elements = p | beam.Create(DATA) result = elements | YamlTransform( ''' - type: MapToFields + type: Filter input: input config: language: python - fields: - label: label keep: "rank > 0" ''') assert_that( - result, equal_to([ - beam.Row(label='37a'), - beam.Row(label='389a'), + result + | beam.Map(lambda named_tuple: beam.Row(**named_tuple._asdict())), + equal_to([ + beam.Row(label='37a', conductor=37, rank=1), + beam.Row(label='389a', conductor=389, rank=2), ])) def test_explode(self): @@ -105,15 +105,19 @@ def test_explode(self): ]) result = elements | YamlTransform( ''' - type: MapToFields + type: chain input: input - config: - language: python - append: true - fields: - range: "range(a)" - explode: [range, b] - cross_product: true + transforms: + - type: MapToFields + config: + language: python + append: true + fields: + range: "range(a)" + - type: Explode + config: + fields: [range, b] + cross_product: true ''') assert_that( result, diff --git a/sdks/python/apache_beam/yaml/yaml_provider.py b/sdks/python/apache_beam/yaml/yaml_provider.py index d01852a69c396..0cd9bdcadcc34 100644 --- a/sdks/python/apache_beam/yaml/yaml_provider.py +++ b/sdks/python/apache_beam/yaml/yaml_provider.py @@ -209,6 +209,7 @@ def provider_from_spec(cls, spec): def register_provider_type(cls, type_name): def apply(constructor): cls._provider_types[type_name] = constructor + return constructor return apply @@ -709,19 +710,21 @@ def merge_providers(*provider_sets): transform_type: [provider] for transform_type in provider.provided_transforms() } + elif isinstance(provider_set, list): + provider_set = merge_providers(*provider_set) for transform_type, providers in provider_set.items(): result[transform_type].extend(providers) return result def standard_providers(): - from apache_beam.yaml.yaml_mapping import create_mapping_provider + from apache_beam.yaml.yaml_mapping import create_mapping_providers from apache_beam.yaml.yaml_io import io_providers with open(os.path.join(os.path.dirname(__file__), 'standard_providers.yaml')) as fin: standard_providers = yaml.load(fin, Loader=SafeLoader) return merge_providers( create_builtin_provider(), - create_mapping_provider(), + create_mapping_providers(), io_providers(), parse_providers(standard_providers)) diff --git a/sdks/python/apache_beam/yaml/yaml_transform.py b/sdks/python/apache_beam/yaml/yaml_transform.py index da9bf526cd596..78546aa28cb12 100644 --- a/sdks/python/apache_beam/yaml/yaml_transform.py +++ 
b/sdks/python/apache_beam/yaml/yaml_transform.py @@ -879,8 +879,23 @@ def ensure_transforms_have_providers(spec): f'Unknown type or missing provider for {identify_object(spec)}') return spec + def preprocess_langauges(spec): + if spec['type'] in ('Filter', 'MapToFields'): + language = spec.get('config', {}).get('language', 'generic') + new_type = spec['type'] + '-' + language + if known_transforms and new_type not in known_transforms: + if language == 'generic': + raise ValueError(f'Missing language for {identify_object(spec)}') + else: + raise ValueError( + f'Unknown language {language} for {identify_object(spec)}') + return dict(spec, type=new_type, name=spec.get('name', spec['type'])) + else: + return spec + for phase in [ ensure_transforms_have_types, + preprocess_langauges, ensure_transforms_have_providers, preprocess_source_sink, preprocess_chain, diff --git a/sdks/python/apache_beam/yaml/yaml_transform_test.py b/sdks/python/apache_beam/yaml/yaml_transform_test.py index 993f9ea6639ba..ebf12710d3f23 100644 --- a/sdks/python/apache_beam/yaml/yaml_transform_test.py +++ b/sdks/python/apache_beam/yaml/yaml_transform_test.py @@ -419,21 +419,27 @@ def test_mapping_errors(self): input: Create config: fn: "lambda x: beam.Row(num=x, str='a' * x or 'bbb')" + - type: Filter + input: ToRow + config: + language: python + keep: + str[1] >= 'a' + error_handling: + output: errors - type: MapToFields name: MapWithErrorHandling - input: ToRow + input: Filter config: language: python fields: num: num inverse: float(1 / num) - keep: - str[1] >= 'a' error_handling: output: errors - type: PyMap name: TrimErrors - input: MapWithErrorHandling.errors + input: [MapWithErrorHandling.errors, Filter.errors] config: fn: "lambda x: x.msg" - type: MapToFields diff --git a/sdks/python/apache_beam/yaml/yaml_udf_test.py b/sdks/python/apache_beam/yaml/yaml_udf_test.py index bb15cd494757e..5e9faa08253cd 100644 --- a/sdks/python/apache_beam/yaml/yaml_udf_test.py +++ b/sdks/python/apache_beam/yaml/yaml_udf_test.py @@ -28,6 +28,10 @@ from apache_beam.yaml.yaml_transform import YamlTransform +def AsRows(): + return beam.Map(lambda named_tuple: beam.Row(**named_tuple._asdict())) + + class YamlUDFMappingTest(unittest.TestCase): def __init__(self, method_name='runYamlMappingTest'): super().__init__(method_name) @@ -59,12 +63,11 @@ def test_map_to_fields_filter_inline_js(self): callable: "function label_map(x) {return x.label + 'x'}" conductor: callable: "function conductor_map(x) {return x.conductor + 1}" - keep: - callable: "function filter(x) {return x.rank > 0}" ''') assert_that( result, equal_to([ + beam.Row(label='11ax', conductor=12), beam.Row(label='37ax', conductor=38), beam.Row(label='389ax', conductor=390), ])) @@ -84,12 +87,11 @@ def test_map_to_fields_filter_inline_py(self): callable: "lambda x: x.label + 'x'" conductor: callable: "lambda x: x.conductor + 1" - keep: - callable: "lambda x: x.rank > 0" ''') assert_that( result, equal_to([ + beam.Row(label='11ax', conductor=12), beam.Row(label='37ax', conductor=38), beam.Row(label='389ax', conductor=390), ])) @@ -104,11 +106,11 @@ def test_filter_inline_js(self): input: input config: language: javascript - keep: + keep: callable: "function filter(x) {return x.rank > 0}" ''') assert_that( - result, + result | AsRows(), equal_to([ beam.Row(label='37a', conductor=37, rank=1), beam.Row(label='389a', conductor=389, rank=2), @@ -124,11 +126,11 @@ def test_filter_inline_py(self): input: input config: language: python - keep: + keep: callable: "lambda x: x.rank > 0" ''') 
assert_that( - result, + result | AsRows(), equal_to([ beam.Row(label='37a', conductor=37, rank=1), beam.Row(label='389a', conductor=389, rank=2), @@ -144,11 +146,12 @@ def test_filter_expression_js(self): input: input config: language: javascript - keep: + keep: expression: "label.toUpperCase().indexOf('3') == -1 && conductor" ''') assert_that( - result, equal_to([ + result | AsRows(), + equal_to([ beam.Row(label='11a', conductor=11, rank=0), ])) @@ -162,11 +165,12 @@ def test_filter_expression_py(self): input: input config: language: python - keep: + keep: expression: "'3' not in label" ''') assert_that( - result, equal_to([ + result | AsRows(), + equal_to([ beam.Row(label='11a', conductor=11, rank=0), ])) @@ -175,7 +179,7 @@ def test_filter_inline_js_file(self): function f(x) { return x.rank > 0 } - + function g(x) { return x.rank > 1 } @@ -193,12 +197,12 @@ def test_filter_inline_js_file(self): input: input config: language: javascript - keep: + keep: path: {path} name: "f" ''') assert_that( - result, + result | AsRows(), equal_to([ beam.Row(label='37a', conductor=37, rank=1), beam.Row(label='389a', conductor=389, rank=2), @@ -225,12 +229,12 @@ def g(x): input: input config: language: python - keep: + keep: path: {path} name: "f" ''') assert_that( - result, + result | AsRows(), equal_to([ beam.Row(label='37a', conductor=37, rank=1), beam.Row(label='389a', conductor=389, rank=2), From 451561784c6877b40567b26c6855bf3115b3aefa Mon Sep 17 00:00:00 2001 From: kennknowles Date: Thu, 21 Sep 2023 00:44:14 +0000 Subject: [PATCH 23/34] Moving to 2.52.0-SNAPSHOT on master branch. --- .asf.yaml | 1 + .../apache/beam/gradle/BeamModulePlugin.groovy | 2 +- gradle.properties | 4 ++-- release/src/main/scripts/jenkins_jobs.txt | 16 +--------------- sdks/go/pkg/beam/core/core.go | 2 +- sdks/python/apache_beam/version.py | 2 +- sdks/typescript/package.json | 2 +- 7 files changed, 8 insertions(+), 21 deletions(-) diff --git a/.asf.yaml b/.asf.yaml index 9ef2ea74a3f01..0657d888cb2c6 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -49,6 +49,7 @@ github: protected_branches: master: {} + release-2.51.0: {} release-2.50.0: {} release-2.49.0: {} release-2.48.0: {} diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index 9f341c5673fd9..4f26534c533ad 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -491,7 +491,7 @@ class BeamModulePlugin implements Plugin { // Automatically use the official release version if we are performing a release // otherwise append '-SNAPSHOT' - project.version = '2.51.0' + project.version = '2.52.0' if (!isRelease(project)) { project.version += '-SNAPSHOT' } diff --git a/gradle.properties b/gradle.properties index b74e9694de3a1..6bad220e641ba 100644 --- a/gradle.properties +++ b/gradle.properties @@ -30,8 +30,8 @@ signing.gnupg.useLegacyGpg=true # buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy. # To build a custom Beam version make sure you change it in both places, see # https://github.com/apache/beam/issues/21302. 
-version=2.51.0-SNAPSHOT -sdk_version=2.51.0.dev +version=2.52.0-SNAPSHOT +sdk_version=2.52.0.dev javaVersion=1.8 diff --git a/release/src/main/scripts/jenkins_jobs.txt b/release/src/main/scripts/jenkins_jobs.txt index e85c8c3bfbae4..65ccc4d9dec2e 100644 --- a/release/src/main/scripts/jenkins_jobs.txt +++ b/release/src/main/scripts/jenkins_jobs.txt @@ -1,7 +1,5 @@ -Run Beam Metrics deployment,beam_PostCommit_BeamMetrics_Publish_PR Run Chicago Taxi on Dataflow,beam_PostCommit_Python_Chicago_Taxi_Dataflow_PR Run Chicago Taxi on Flink,beam_PostCommit_Python_Chicago_Taxi_Flink_PR -Run CommunityMetrics PreCommit,beam_PreCommit_CommunityMetrics_Phrase Run Dataflow Runner Nexmark Tests,beam_PostCommit_Java_Nexmark_Dataflow_PR Run Dataflow Runner Tpcds Tests,beam_PostCommit_Java_Tpcds_Dataflow_PR Run Dataflow Runner V2 Java 11 Nexmark Tests,beam_PostCommit_Java_Nexmark_DataflowV2_Java11_PR @@ -14,7 +12,6 @@ Run Dataflow ValidatesRunner,beam_PostCommit_Java_ValidatesRunner_Dataflow_PR Run Direct Runner Nexmark Tests,beam_PostCommit_Java_Nexmark_Direct_PR Run Direct ValidatesRunner Java 11,beam_PostCommit_Java_ValidatesRunner_Direct_Java11_PR Run Direct ValidatesRunner Java 17,beam_PostCommit_Java_ValidatesRunner_Direct_Java17_PR -Run Direct ValidatesRunner in Java 11,beam_PostCommit_Java11_ValidatesRunner_Direct_PR Run Direct ValidatesRunner,beam_PostCommit_Java_ValidatesRunner_Direct_PR Run Flink Runner Nexmark Tests,beam_PostCommit_Java_Nexmark_Flink_PR Run Flink Runner Tpcds Tests,beam_PostCommit_Java_Tpcds_Flink_PR @@ -22,10 +19,8 @@ Run Flink ValidatesRunner Java 11,beam_PostCommit_Java_ValidatesRunner_Flink_Jav Run Flink ValidatesRunner,beam_PostCommit_Java_ValidatesRunner_Flink_PR Run Go Flink ValidatesRunner,beam_PostCommit_Go_VR_Flink_PR Run Go PostCommit,beam_PostCommit_Go_PR -Run Go PreCommit,beam_PreCommit_Go_Phrase Run Go Samza ValidatesRunner,beam_PostCommit_Go_VR_Samza_PR Run Go Spark ValidatesRunner,beam_PostCommit_Go_VR_Spark_PR -Run GoPortable PreCommit,beam_PreCommit_GoPortable_Phrase Run Java 11 Examples on Dataflow Runner V2,beam_PostCommit_Java_Examples_Dataflow_V2_java11_PR Run Java 17 Examples on Dataflow Runner V2,beam_PostCommit_Java_Examples_Dataflow_V2_java17_PR Run Java Dataflow V2 ValidatesRunner Streaming,beam_PostCommit_Java_VR_Dataflow_V2_Streaming_PR @@ -36,7 +31,6 @@ Run Java Examples_Flink,beam_PostCommit_Java_Examples_Flink_PR Run Java Examples_Spark,beam_PostCommit_Java_Examples_Spark_PR Run Java Flink PortableValidatesRunner Streaming,beam_PostCommit_Java_PVR_Flink_Streaming_PR Run Java InfluxDbIO_IT,beam_PostCommit_Java_InfluxDbIO_IT_PR -Run Java Portability examples on Dataflow with Java 11,beam_PostCommit_Java11_Examples_Dataflow_Portability_PR Run Java PostCommit,beam_PostCommit_Java_PR Run Java PreCommit,beam_PreCommit_Java_Phrase Run Java Samza PortableValidatesRunner,beam_PostCommit_Java_PVR_Samza_PR @@ -46,7 +40,6 @@ Run Java Spark PortableValidatesRunner Batch,beam_PostCommit_Java_PVR_Spark_Batc Run Java Spark v3 PortableValidatesRunner Streaming,beam_PostCommit_Java_PVR_Spark3_Streaming_PR Run Java examples on Dataflow Java 11,beam_PostCommit_Java_Examples_Dataflow_Java11_PR Run Java examples on Dataflow Java 17,beam_PostCommit_Java_Examples_Dataflow_Java17_PR -Run Java examples on Dataflow with Java 11,beam_PostCommit_Java11_Examples_Dataflow_PR Run Java_Amazon-Web-Services2_IO_Direct PreCommit,beam_PreCommit_Java_Amazon-Web-Services2_IO_Direct_Phrase Run Java_Amazon-Web-Services_IO_Direct 
PreCommit,beam_PreCommit_Java_Amazon-Web-Services_IO_Direct_Phrase Run Java_Amqp_IO_Direct PreCommit,beam_PreCommit_Java_Amqp_IO_Direct_Phrase @@ -96,7 +89,6 @@ Run Jpms Direct Java 11 PostCommit,beam_PostCommit_Java_Jpms_Direct_Java11_PR Run Jpms Direct Java 17 PostCommit,beam_PostCommit_Java_Jpms_Direct_Java17_PR Run Jpms Flink Java 11 PostCommit,beam_PostCommit_Java_Jpms_Flink_Java11_PR Run Jpms Spark Java 11 PostCommit,beam_PostCommit_Java_Jpms_Spark_Java11_PR -Run Kotlin_Examples PreCommit,beam_PreCommit_Kotlin_Examples_Phrase Run PortableJar_Flink PostCommit,beam_PostCommit_PortableJar_Flink_PR Run PortableJar_Spark PostCommit,beam_PostCommit_PortableJar_Spark_PR Run Portable_Python PreCommit,beam_PreCommit_Portable_Python_Phrase @@ -126,9 +118,6 @@ Run Python RC Dataflow ValidatesContainer,beam_PostCommit_Py_ValCont_with_RC_PR Run Python Samza ValidatesRunner,beam_PostCommit_Python_VR_Samza_PR Run Python Spark ValidatesRunner,beam_PostCommit_Python_VR_Spark_PR Run PythonDocker PreCommit,beam_PreCommit_PythonDocker_Phrase -Run PythonDocs PreCommit,beam_PreCommit_PythonDocs_Phrase -Run PythonFormatter PreCommit,beam_PreCommit_PythonFormatter_Phrase -Run PythonLint PreCommit,beam_PreCommit_PythonLint_Phrase Run Python_Coverage PreCommit,beam_PreCommit_Python_Coverage_Phrase Run Python_Dataframes PreCommit,beam_PreCommit_Python_Dataframes_Phrase Run Python_Examples PreCommit,beam_PreCommit_Python_Examples_Phrase @@ -139,7 +128,6 @@ Run Python_Transforms PreCommit,beam_PreCommit_Python_Transforms_Phrase Run Python_Xlang_Gcp_Dataflow PostCommit,beam_PostCommit_Python_Xlang_Gcp_Dataflow_PR Run Python_Xlang_Gcp_Direct PostCommit,beam_PostCommit_Python_Xlang_Gcp_Direct_PR Run Python_Xlang_IO_Dataflow PostCommit,beam_PostCommit_Python_Xlang_IO_Dataflow_PR -Run RAT PreCommit,beam_PreCommit_RAT_Phrase Run SQL PostCommit,beam_PostCommit_SQL_PR Run SQL PreCommit,beam_PreCommit_SQL_Phrase Run SQL_Java11 PreCommit,beam_PreCommit_SQL_Java11_Phrase @@ -150,11 +138,9 @@ Run Spark Runner Tpcds Tests,beam_PostCommit_Java_Tpcds_Spark_PR Run Spark StructuredStreaming ValidatesRunner,beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming_PR Run Spark ValidatesRunner Java 11,beam_PostCommit_Java_ValidatesRunner_Spark_Java11_PR Run Spark ValidatesRunner,beam_PostCommit_Java_ValidatesRunner_Spark_PR -Run Spotless PreCommit,beam_PreCommit_Spotless_Phrase +Run TransformService_Direct PostCommit,beam_PostCommit_TransformService_Direct_PR Run Twister2 ValidatesRunner,beam_PostCommit_Java_ValidatesRunner_Twister2_PR -Run Typescript PreCommit,beam_PreCommit_Typescript_Phrase Run ULR Loopback ValidatesRunner,beam_PostCommit_Java_ValidatesRunner_ULR_PR -Run Whitespace PreCommit,beam_PreCommit_Whitespace_Phrase Run XVR_Direct PostCommit,beam_PostCommit_XVR_Direct_PR Run XVR_Flink PostCommit,beam_PostCommit_XVR_Flink_PR Run XVR_GoUsingJava_Dataflow PostCommit,beam_PostCommit_XVR_GoUsingJava_Dataflow_PR diff --git a/sdks/go/pkg/beam/core/core.go b/sdks/go/pkg/beam/core/core.go index 9f5ee60083794..ed62a2e9eac01 100644 --- a/sdks/go/pkg/beam/core/core.go +++ b/sdks/go/pkg/beam/core/core.go @@ -27,7 +27,7 @@ const ( // SdkName is the human readable name of the SDK for UserAgents. SdkName = "Apache Beam SDK for Go" // SdkVersion is the current version of the SDK. - SdkVersion = "2.51.0.dev" + SdkVersion = "2.52.0.dev" // DefaultDockerImage represents the associated image for this release. 
DefaultDockerImage = "apache/beam_go_sdk:" + SdkVersion diff --git a/sdks/python/apache_beam/version.py b/sdks/python/apache_beam/version.py index d13356ac2c6b2..a69e3839fff3e 100644 --- a/sdks/python/apache_beam/version.py +++ b/sdks/python/apache_beam/version.py @@ -17,4 +17,4 @@ """Apache Beam SDK version information and utilities.""" -__version__ = '2.51.0.dev' +__version__ = '2.52.0.dev' diff --git a/sdks/typescript/package.json b/sdks/typescript/package.json index 35a1e8134e295..b582b3d5c07f2 100644 --- a/sdks/typescript/package.json +++ b/sdks/typescript/package.json @@ -1,6 +1,6 @@ { "name": "apache-beam", - "version": "2.51.0-SNAPSHOT", + "version": "2.52.0-SNAPSHOT", "devDependencies": { "@google-cloud/bigquery": "^5.12.0", "@types/mocha": "^9.0.0", From bb24ca71b62e88be17f91810f3793df73b2e2be3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:17:31 -0400 Subject: [PATCH 24/34] Bump github.com/testcontainers/testcontainers-go in /sdks (#28576) Bumps [github.com/testcontainers/testcontainers-go](https://github.com/testcontainers/testcontainers-go) from 0.24.0 to 0.24.1. - [Release notes](https://github.com/testcontainers/testcontainers-go/releases) - [Commits](https://github.com/testcontainers/testcontainers-go/compare/v0.24.0...v0.24.1) --- updated-dependencies: - dependency-name: github.com/testcontainers/testcontainers-go dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 53596c5d207d9..ac7db57479178 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -47,7 +47,7 @@ require ( github.com/linkedin/goavro/v2 v2.12.0 github.com/proullon/ramsql v0.1.2 github.com/spf13/cobra v1.7.0 - github.com/testcontainers/testcontainers-go v0.24.0 + github.com/testcontainers/testcontainers-go v0.24.1 github.com/tetratelabs/wazero v1.5.0 github.com/xitongsys/parquet-go v1.6.2 github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c diff --git a/sdks/go.sum b/sdks/go.sum index 502fdf1e88925..2b404990716ae 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -428,8 +428,8 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= -github.com/testcontainers/testcontainers-go v0.24.0 h1:eqkq6nNIPVrqpXNyn/s5jDBqPGuWtND2hOMEBrUULIw= -github.com/testcontainers/testcontainers-go v0.24.0/go.mod h1:MGBiAkCm86yXQoCiipmQCqZLVdk1uFqtMqaU1Or0MRk= +github.com/testcontainers/testcontainers-go v0.24.1 h1:gJdZuQIVWnMJTo+CmQMEP7/CAagNk/0jbcUPn3OWvD8= +github.com/testcontainers/testcontainers-go v0.24.1/go.mod h1:MGBiAkCm86yXQoCiipmQCqZLVdk1uFqtMqaU1Or0MRk= github.com/tetratelabs/wazero v1.5.0 h1:Yz3fZHivfDiZFUXnWMPUoiW7s8tC1sjdBtlJn08qYa0= github.com/tetratelabs/wazero v1.5.0/go.mod h1:0U0G41+ochRKoPKCJlh0jMg1CHkyfK8kDqiirMmKY8A= github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM= From 4d1983c3b0e2d50b57fc685616bb7e1e405e825a Mon Sep 17 00:00:00 2001 From: Aleksandr Dudko 
<116064902+aleksandr-dudko@users.noreply.github.com> Date: Thu, 21 Sep 2023 18:53:13 +0400 Subject: [PATCH 25/34] Add GitHub Workflow Replacements for job_PerformanceTests_BiqQueryIO_Read_Python and job_PerformanceTests_BiqQueryIO_Write_Python_Batch (#28497) * Add GitHub Workflow Replacements for job_PerformanceTests_BiqQueryIO_Read_Python and job_PerformanceTests_BiqQueryIO_Write_Python_Batch * Add comments for set_config --- ...erformanceTests_BiqQueryIO_Read_Python.yml | 97 +++++++++++++++++++ ...nceTests_BiqQueryIO_Write_Python_Batch.yml | 97 +++++++++++++++++++ ...erformanceTests_BiqQueryIO_Read_Python.txt | 33 +++++++ ...rformanceTests_BiqQueryIO_Write_Python.txt | 33 +++++++ 4 files changed, 260 insertions(+) create mode 100644 .github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml create mode 100644 .github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml create mode 100644 .github/workflows/performance-tests-job-configs/config_PerformanceTests_BiqQueryIO_Read_Python.txt create mode 100644 .github/workflows/performance-tests-job-configs/config_PerformanceTests_BiqQueryIO_Write_Python.txt diff --git a/.github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml b/.github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml new file mode 100644 index 0000000000000..224689ee90867 --- /dev/null +++ b/.github/workflows/beam_PerformanceTests_BiqQueryIO_Read_Python.yml @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Performance BigQueryIO Read Python + +on: + issue_comment: + types: [created] + schedule: + - cron: '0 2 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_PerformanceTests_BiqQueryIO_Read_Python: + if: | + github.event_name == 'workflow_dispatch' || + github.event_name == 'schedule' || + github.event.comment.body == 'Run BigQueryIO Read Performance Test Python' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 240 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_PerformanceTests_BiqQueryIO_Read_Python"] + job_phrase: ["Run BigQueryIO Read Performance Test Python"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Install Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + # This code block prepares configuration data for a task. + - name: Prepare config + id: set_config + shell: bash + run: | + # Capture the current date and time in a specific format + CURDATE=$(date '+%m%d%H%M%S' --utc) + # Reads the configurations file and exclude lines starting with # and then remove line breaks + CURCONFIG=$(grep -v "^#.*" ./.github/workflows/performance-tests-job-configs/config_PerformanceTests_BiqQueryIO_Read_Python.txt | tr '\n' ' ') + # Appends the value of CURDATE to the original configuration + CONFIGWITHDATE=$(echo "${CURCONFIG/bqio_read_python_/bqio_read_python_$CURDATE}") + # Assigns $CONFIGWITHDATE to a variable named prepared_config + echo "prepared_config=$CONFIGWITHDATE" >> $GITHUB_OUTPUT + - name: run BigQueryIO Read Python Performance Test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.io.gcp.bigquery_read_perf_test \ + -PpythonVersion=3.8 \ + -Prunner=DataflowRunner \ + '-PloadTest.args=${{steps.set_config.outputs.prepared_config}}' \ No newline at end of file diff --git a/.github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml b/.github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml new file mode 100644 index 0000000000000..24cd1b25ca73e --- /dev/null +++ b/.github/workflows/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch.yml @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Performance BigQueryIO Write Python Batch + +on: + issue_comment: + types: [created] + schedule: + - cron: '0 1 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: write + checks: write + contents: read + deployments: read + id-token: none + issues: write + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_PerformanceTests_BiqQueryIO_Write_Python_Batch: + if: | + github.event_name == 'workflow_dispatch' || + github.event_name == 'schedule' || + github.event.comment.body == 'Run BigQueryIO Write Performance Test Python' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 240 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_PerformanceTests_BiqQueryIO_Write_Python_Batch"] + job_phrase: ["Run BigQueryIO Write Performance Test Python"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Install Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + # This code block prepares configuration data for a task. 
+ - name: Prepare config + id: set_config + shell: bash + run: | + # capture the current date and time in a specific format + CURDATE=$(date '+%m%d%H%M%S' --utc) + # reads the configurations file and exclude lines starting with # and then remove line breaks + CURCONFIG=$(grep -v "^#.*" ./.github/workflows/performance-tests-job-configs/config_PerformanceTests_BiqQueryIO_Write_Python.txt | tr '\n' ' ') + # appends the value of CURDATE to the original configuration + CONFIGWITHDATE=$(echo "${CURCONFIG/bqio_write_python_/bqio_write_python_$CURDATE}") + # assigns $CONFIGWITHDATE to a variable named prepared_config + echo "prepared_config=$CONFIGWITHDATE" >> $GITHUB_OUTPUT + - name: run BigQueryIO Write Batch Python Performance Test + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.io.gcp.bigquery_write_perf_test \ + -PpythonVersion=3.8 \ + -Prunner=DataflowRunner \ + '-PloadTest.args=${{steps.set_config.outputs.prepared_config}}' \ No newline at end of file diff --git a/.github/workflows/performance-tests-job-configs/config_PerformanceTests_BiqQueryIO_Read_Python.txt b/.github/workflows/performance-tests-job-configs/config_PerformanceTests_BiqQueryIO_Read_Python.txt new file mode 100644 index 0000000000000..facf8b91ddb4d --- /dev/null +++ b/.github/workflows/performance-tests-job-configs/config_PerformanceTests_BiqQueryIO_Read_Python.txt @@ -0,0 +1,33 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +--job_name=performance-tests-bqio-read-python-10gb0917155348 +--project=apache-beam-testing +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--input_dataset=beam_performance +--input_table=bqio_read_10GB +--publish_to_big_query=true +--metrics_dataset=beam_performance +--metrics_table=bqio_read_10GB_results +--influx_measurement=python_bqio_read_10GB_results +--influx_db_name=beam_test_metrics +--influx_hostname=http://10.128.0.96:8086 +--input_options=''{\\"num_records\\":10485760,\\"key_size\\":1,\\"value_size\\":1024,\\"algorithm\\":\\"lcg\\"}'' +--num_workers=5 +--autoscaling_algorithm=NONE +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/performance-tests-job-configs/config_PerformanceTests_BiqQueryIO_Write_Python.txt b/.github/workflows/performance-tests-job-configs/config_PerformanceTests_BiqQueryIO_Write_Python.txt new file mode 100644 index 0000000000000..b8c6bd9265f07 --- /dev/null +++ b/.github/workflows/performance-tests-job-configs/config_PerformanceTests_BiqQueryIO_Write_Python.txt @@ -0,0 +1,33 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +--job_name=performance-tests-bqio-write-python-batch-10gb0917155348 +--project=apache-beam-testing +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--output_dataset=beam_performance +--output_table=bqio_write_10GB +--publish_to_big_query=true +--metrics_dataset=beam_performance +--metrics_table=bqio_write_10GB_results +--influx_measurement=python_bqio_write_10GB_results +--influx_db_name=beam_test_metrics +--influx_hostname=http://10.128.0.96:8086 +--input_options=''{\\"num_records\\":10485760,\\"key_size\\":1,\\"value_size\\":1024,\\"algorithm\\":\\"lcg\\"}'' +--num_workers=5 +--autoscaling_algorithm=NONE +--runner=DataflowRunner \ No newline at end of file From 42c80fd8a9f6b4e19ab65ccae2fb218e803e7aff Mon Sep 17 00:00:00 2001 From: Vitaly Terentyev Date: Thu, 21 Sep 2023 18:55:52 +0400 Subject: [PATCH 26/34] Add Python Load Tests Combine Dataflow Batch github action (#28431) * Add Python Load Tests Combine Dataflow Batch github action * Change cron * Refactoring --- .github/workflows/README.md | 1 + ...oadTests_Python_Combine_Dataflow_Batch.yml | 115 ++++++++++++++++++ .../config_Combine_Python_Batch_2GB_10b.txt | 32 +++++ ...nfig_Combine_Python_Batch_2GB_Fanout_4.txt | 33 +++++ ...nfig_Combine_Python_Batch_2GB_Fanout_8.txt | 33 +++++ 5 files changed, 214 insertions(+) create mode 100644 .github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml create mode 100644 .github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_10b.txt create mode 100644 .github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_4.txt create mode 100644 .github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_8.txt diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 0d0277bd478d0..4b1bf01b5e3dd 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -181,6 +181,7 @@ Please note that jobs with matrix need to have matrix element in the comment. 
Ex | Workflow name | Matrix | Trigger Phrase | Cron Status | |:-------------:|:------:|:--------------:|:-----------:| | [ Load Tests CoGBK Dataflow Streaming Java ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml) | N/A |`Run Load Tests Java CoGBK Dataflow Streaming`| [![.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml) +| [ Load Tests Combine Dataflow Batch Python ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml) | N/A |`Run Load Tests Python Combine Dataflow Batch`| [![.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml) | [ Performance Tests BigQueryIO Batch Java Avro ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml) | N/A |`Run BigQueryIO Batch Performance Test Java Avro`| [![.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml) | [ Performance Tests BigQueryIO Batch Java Json ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml) | N/A |`Run BigQueryIO Batch Performance Test Java Json`| [![.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml) | [ Performance Tests BigQueryIO Streaming Java ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml) | N/A |`Run BigQueryIO Streaming Performance Test Java`| [![.github/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml) diff --git a/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml new file mode 100644 index 0000000000000..f7d7a056d5953 --- /dev/null +++ b/.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml @@ -0,0 +1,115 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Load Tests Combine Dataflow Batch Python + +on: + issue_comment: + types: [created] + schedule: + - cron: '40 5 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_LoadTests_Python_Combine_Dataflow_Batch: + if: | + github.event_name == 'workflow_dispatch' || + github.event_name == 'schedule' || + github.event.comment.body == 'Run Load Tests Python Combine Dataflow Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Python_Combine_Dataflow_Batch"] + job_phrase: ["Run Load Tests Python Combine Dataflow Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Install Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + - name: Prepare configs + #Reads config files, excludes comments, appends current date to the job_name parameter + id: set_configs + shell: bash + run: | + CURDATE=$(date '+%m%d%H%M%S' --utc) + CONFIG_ARR=('config_Combine_Python_Batch_2GB_10b.txt' 'config_Combine_Python_Batch_2GB_Fanout_4.txt' 'config_Combine_Python_Batch_2GB_Fanout_8.txt') + for INDEX in ${!CONFIG_ARR[@]} + do + CURCONFIG=$(grep -v "^#.*" ./.github/workflows/load-tests-job-configs/${CONFIG_ARR[INDEX]} | tr '\n' ' ') + CURCONFIG=$(echo "${CURCONFIG/load-tests-python-dataflow-batch-combine-$((INDEX + 1))-/load-tests-python-dataflow-batch-combine-$((INDEX + 1))-$CURDATE}") + echo "prepared_config_$((INDEX + 1))=$CURCONFIG" >> $GITHUB_OUTPUT + done + - name: run Combine Dataflow Batch Python Load Test 1 (10 bytes records) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_1 }}' \ + - name: run Combine Dataflow Batch Python Load Test 2 (fanout 4) + 
uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_2 }}' \ + - name: run Combine Dataflow Batch Python Load Test 3 (fanout 8) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:python:apache_beam:testing:load_tests:run + arguments: | + -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \ + -Prunner=DataflowRunner \ + -PpythonVersion=3.8 \ + '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_3 }}' \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_10b.txt b/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_10b.txt new file mode 100644 index 0000000000000..a6dabb5e50868 --- /dev/null +++ b/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_10b.txt @@ -0,0 +1,32 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +--job_name=load-tests-python-dataflow-batch-combine-1- +--project=apache-beam-testing +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/smoketests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_combine_1 +--influx_measurement=python_batch_combine_1 +--input_options=''{\\"num_records\\":200000000,\\"key_size\\":1,\\"value_size\\":9,\\"algorithm\\":\\"lcg\\"}'' +--num_workers=5 +--autoscaling_algorithm=NONE +--top_count=20 +--influxDatabase=beam_test_metrics +--influxHost=http://10.128.0.96:8086 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_4.txt b/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_4.txt new file mode 100644 index 0000000000000..7639456296b6d --- /dev/null +++ b/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_4.txt @@ -0,0 +1,33 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +--job_name=load-tests-python-dataflow-batch-combine-2- +--project=apache-beam-testing +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/smoketests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_combine_2 +--influx_measurement=python_batch_combine_2 +--input_options=''{\\"num_records\\":5000000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--num_workers=16 +--autoscaling_algorithm=NONE +--fanout=4 +--top_count=20 +--influxDatabase=beam_test_metrics +--influxHost=http://10.128.0.96:8086 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_8.txt b/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_8.txt new file mode 100644 index 0000000000000..e5d46791a83c1 --- /dev/null +++ b/.github/workflows/load-tests-job-configs/config_Combine_Python_Batch_2GB_Fanout_8.txt @@ -0,0 +1,33 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +--job_name=load-tests-python-dataflow-batch-combine-3- +--project=apache-beam-testing +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/smoketests +--publish_to_big_query=true +--metrics_dataset=load_test +--metrics_table=python_dataflow_batch_combine_3 +--influx_measurement=python_batch_combine_3 +--input_options=''{\\"num_records\\":2500000,\\"key_size\\":10,\\"value_size\\":90,\\"algorithm\\":\\"lcg\\"}'' +--num_workers=16 +--autoscaling_algorithm=NONE +--fanout=8 +--top_count=20 +--influxDatabase=beam_test_metrics +--influxHost=http://10.128.0.96:8086 +--runner=DataflowRunner \ No newline at end of file From af877ff369c1e6ad7697d00f3e5f02aca893f642 Mon Sep 17 00:00:00 2001 From: Vitaly Terentyev Date: Thu, 21 Sep 2023 18:59:34 +0400 Subject: [PATCH 27/34] Add Load Tests Combine Dataflow Batch Go workflow (#28527) * Add Load Tests Combine Dataflow Batch Go workflow * Refactoring --- ...am_LoadTests_Go_Combine_Dataflow_Batch.yml | 108 ++++++++++++++++++ .../config_Combine_Go_Batch_10b.txt | 34 ++++++ .../config_Combine_Go_Batch_Fanout_4.txt | 34 ++++++ .../config_Combine_Go_Batch_Fanout_8.txt | 34 ++++++ 4 files changed, 210 insertions(+) create mode 100644 .github/workflows/beam_LoadTests_Go_Combine_Dataflow_Batch.yml create mode 100644 .github/workflows/load-tests-job-configs/config_Combine_Go_Batch_10b.txt create mode 100644 .github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_4.txt create mode 100644 .github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_8.txt diff --git a/.github/workflows/beam_LoadTests_Go_Combine_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Go_Combine_Dataflow_Batch.yml new file mode 100644 index 0000000000000..423290d3fdc6a --- /dev/null +++ b/.github/workflows/beam_LoadTests_Go_Combine_Dataflow_Batch.yml @@ -0,0 +1,108 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Load Tests Combine Dataflow Batch Go + +on: + issue_comment: + types: [created] + schedule: + - cron: '40 23 * * *' + workflow_dispatch: + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +jobs: + beam_LoadTests_Go_Combine_Dataflow_Batch: + if: | + github.event_name == 'workflow_dispatch' || + github.event_name == 'schedule' || + github.event.comment.body == 'Run Load Tests Go Combine Dataflow Batch' + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 720 + name: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + strategy: + matrix: + job_name: ["beam_LoadTests_Go_Combine_Dataflow_Batch"] + job_phrase: ["Run Load Tests Go Combine Dataflow Batch"] + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Prepare configs + #Reads config files, excludes comments, appends current date to the job_name parameter + id: set_configs + shell: bash + run: | + CURDATE=$(date '+%m%d%H%M%S' --utc) + CONFIG_ARR=('config_Combine_Go_Batch_10b.txt' 'config_Combine_Go_Batch_Fanout_4.txt' 'config_Combine_Go_Batch_Fanout_8.txt') + for INDEX in ${!CONFIG_ARR[@]} + do + CURCONFIG=$(grep -v "^#.*" ./.github/workflows/load-tests-job-configs/${CONFIG_ARR[INDEX]} | tr '\n' ' ') + CURCONFIG=$(echo "${CURCONFIG/load-tests-go-dataflow-batch-combine-$((INDEX + 1))-/load-tests-go-dataflow-batch-combine-$((INDEX + 1))-$CURDATE}") + echo "prepared_config_$((INDEX + 1))=$CURCONFIG" >> $GITHUB_OUTPUT + done + - name: run Combine Dataflow Batch Go Load Test 1 (10 bytes records) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:go:test:load:run + arguments: | + -PloadTest.mainClass=combine \ + -Prunner=DataflowRunner \ + '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_1 }}' \ + - name: run Combine Dataflow Batch Go Load Test 2 (fanout 4) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:go:test:load:run + arguments: | + -PloadTest.mainClass=combine \ + -Prunner=DataflowRunner \ + '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_2 }}' \ + - name: run Combine Dataflow Batch Go Load Test 3 (fanout 8) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:go:test:load:run + arguments: | + -PloadTest.mainClass=combine \ + -Prunner=DataflowRunner \ + '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_3 }}' \ No newline at end of file diff --git 
a/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_10b.txt b/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_10b.txt new file mode 100644 index 0000000000000..b9ad28105903b --- /dev/null +++ b/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_10b.txt @@ -0,0 +1,34 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +--job_name=load-tests-go-dataflow-batch-combine-1- +--project=apache-beam-testing +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--staging_location=gs://temp-storage-for-perf-tests/loadtests +--influx_namespace=dataflow +--influx_measurement=go_batch_combine_1 +--input_options=''{\"num_records\":200000000,\"key_size\":1,\"value_size\":9}'' +--fanout=1 +--top_count=20 +--num_workers=5 +--autoscaling_algorithm=NONE +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest +--influx_db_name=beam_test_metrics +--influx_hostname=http://10.128.0.96:8086 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_4.txt b/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_4.txt new file mode 100644 index 0000000000000..5f3a185832703 --- /dev/null +++ b/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_4.txt @@ -0,0 +1,34 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+###############################################################################
+--job_name=load-tests-go-dataflow-batch-combine-2-
+--project=apache-beam-testing
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--staging_location=gs://temp-storage-for-perf-tests/loadtests
+--influx_namespace=dataflow
+--influx_measurement=go_batch_combine_2
+--input_options=''{\"num_records\":5000000,\"key_size\":10,\"value_size\":90}''
+--fanout=4
+--top_count=20
+--num_workers=16
+--autoscaling_algorithm=NONE
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest
+--influx_db_name=beam_test_metrics
+--influx_hostname=http://10.128.0.96:8086
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_8.txt b/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_8.txt
new file mode 100644
index 0000000000000..eba65b666a257
--- /dev/null
+++ b/.github/workflows/load-tests-job-configs/config_Combine_Go_Batch_Fanout_8.txt
@@ -0,0 +1,34 @@
+###############################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+--job_name=load-tests-go-dataflow-batch-combine-3-
+--project=apache-beam-testing
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--staging_location=gs://temp-storage-for-perf-tests/loadtests
+--influx_namespace=dataflow
+--influx_measurement=go_batch_combine_3
+--input_options=''{\"num_records\":2500000,\"key_size\":10,\"value_size\":90}''
+--fanout=8
+--top_count=20
+--num_workers=16
+--autoscaling_algorithm=NONE
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest
+--influx_db_name=beam_test_metrics
+--influx_hostname=http://10.128.0.96:8086
+--runner=DataflowRunner
\ No newline at end of file

From b5b69b134cfde5a7778014216c3d0e3f779dc914 Mon Sep 17 00:00:00 2001
From: Robert Bradshaw
Date: Thu, 21 Sep 2023 08:52:44 -0700
Subject: [PATCH 28/34] [YAML] Transform schema introspection. (#28478)

This allows one to enumerate the set of provided transforms together with
their config schemas. Future work would be to pull out documentation as well.
It would also be valuable, if possible, to trace through forwarding of *args
and **kwargs.
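As a rough sketch of how this introspection might be driven (hypothetical
caller code, not part of this change; the shape of the provider map and the
printed fields are assumptions):

    # Hypothetical: enumerate provided transforms and their config schemas.
    from apache_beam.yaml import yaml_provider

    for name, providers in yaml_provider.standard_providers().items():
      for provider in providers:
        schema = provider.config_schema(name)  # may be None
        if schema is not None:
          print(name, [(f.name, f.type.WhichOneof('type_info'))
                       for f in schema.fields])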
--- sdks/python/apache_beam/typehints/schemas.py | 6 ++ sdks/python/apache_beam/yaml/yaml_provider.py | 92 ++++++++++++++++++- 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/typehints/schemas.py b/sdks/python/apache_beam/typehints/schemas.py index 155ea86219dee..5b900f2966886 100644 --- a/sdks/python/apache_beam/typehints/schemas.py +++ b/sdks/python/apache_beam/typehints/schemas.py @@ -72,6 +72,7 @@ from typing import ByteString from typing import Dict from typing import Generic +from typing import Iterable from typing import List from typing import Mapping from typing import NamedTuple @@ -308,6 +309,11 @@ def typing_to_runner_api(self, type_: type) -> schema_pb2.FieldType: return schema_pb2.FieldType( map_type=schema_pb2.MapType(key_type=key_type, value_type=value_type)) + elif _safe_issubclass(type_, Iterable) and not _safe_issubclass(type_, str): + element_type = self.typing_to_runner_api(_get_args(type_)[0]) + return schema_pb2.FieldType( + array_type=schema_pb2.ArrayType(element_type=element_type)) + try: logical_type = LogicalType.from_typing(type_) except ValueError: diff --git a/sdks/python/apache_beam/yaml/yaml_provider.py b/sdks/python/apache_beam/yaml/yaml_provider.py index 0cd9bdcadcc34..6f760f359b061 100644 --- a/sdks/python/apache_beam/yaml/yaml_provider.py +++ b/sdks/python/apache_beam/yaml/yaml_provider.py @@ -21,6 +21,7 @@ import collections import hashlib +import inspect import json import os import subprocess @@ -45,8 +46,12 @@ from apache_beam.transforms import external from apache_beam.transforms import window from apache_beam.transforms.fully_qualified_named_transform import FullyQualifiedNamedTransform +from apache_beam.typehints import native_type_compatibility from apache_beam.typehints import schemas from apache_beam.typehints import trivial_inference +from apache_beam.typehints.schemas import named_tuple_to_schema +from apache_beam.typehints.schemas import typing_from_runner_api +from apache_beam.typehints.schemas import typing_to_runner_api from apache_beam.utils import python_callable from apache_beam.utils import subprocess_server from apache_beam.version import __version__ as beam_version @@ -65,6 +70,9 @@ def provided_transforms(self) -> Iterable[str]: """Returns a list of transform type names this provider can handle.""" raise NotImplementedError(type(self)) + def config_schema(self, type): + return None + def requires_inputs(self, typ: str, args: Mapping[str, Any]) -> bool: """Returns whether this transform requires inputs. 
@@ -140,6 +148,8 @@ def provided_transforms(self):
     return self._urns.keys()
 
   def schema_transforms(self):
+    if callable(self._service):
+      self._service = self._service()
     if self._schema_transforms is None:
       try:
         self._schema_transforms = {
@@ -152,6 +162,11 @@
           self._schema_transforms = {}
     return self._schema_transforms
 
+  def config_schema(self, type):
+    if self._urns[type] in self.schema_transforms():
+      return named_tuple_to_schema(
+          self.schema_transforms()[self._urns[type]].configuration_schema)
+
   def requires_inputs(self, typ, args):
     if self._urns[typ] in self.schema_transforms():
       return bool(self.schema_transforms()[self._urns[typ]].inputs)
@@ -392,6 +407,31 @@ def cache_artifacts(self):
   def provided_transforms(self):
     return self._transform_factories.keys()
 
+  def config_schema(self, typ):
+    factory = self._transform_factories[typ]
+    if isinstance(factory, type) and issubclass(factory, beam.PTransform):
+      # https://bugs.python.org/issue40897
+      params = dict(inspect.signature(factory.__init__).parameters)
+      del params['self']
+    else:
+      params = inspect.signature(factory).parameters
+
+    def type_of(p):
+      t = p.annotation
+      if t == p.empty:
+        return Any
+      else:
+        return t
+
+    names_and_types = [
+        (name, typing_to_runner_api(type_of(p))) for name, p in params.items()
+    ]
+    return schema_pb2.Schema(
+        fields=[
+            schema_pb2.Field(name=name, type=type) for name,
+            type in names_and_types
+        ])
+
   def create_transform(self, type, args, yaml_create_transform):
     return self._transform_factories[type](**args)
 
@@ -490,7 +530,10 @@ def extract_field(x, name):
 
   # Or should this be posargs, args?
   # pylint: disable=dangerous-default-value
-  def fully_qualified_named_transform(constructor, args=(), kwargs={}):
+  def fully_qualified_named_transform(
+      constructor: str,
+      args: Iterable[Any] = (),
+      kwargs: Mapping[str, Any] = {}):
     with FullyQualifiedNamedTransform.with_filter('*'):
       return constructor >> FullyQualifiedNamedTransform(
           constructor, args, kwargs)
@@ -662,6 +705,19 @@ def available(self) -> bool:
   def provided_transforms(self) -> Iterable[str]:
     return self._transforms.keys()
 
+  def config_schema(self, type):
+    underlying_schema = self._underlying_provider.config_schema(
+        self._transforms[type])
+    if underlying_schema is None:
+      return None
+    underlying_schema_types = {f.name: f.type for f in underlying_schema.fields}
+    return schema_pb2.Schema(
+        fields=[
+            schema_pb2.Field(name=src, type=underlying_schema_types[dest])
+            for src,
+            dest in self._mappings[type].items()
+        ])
+
   def requires_inputs(self, typ, args):
     return self._underlying_provider.requires_inputs(typ, args)
 
@@ -723,8 +779,42 @@ def standard_providers():
   with open(os.path.join(os.path.dirname(__file__),
                          'standard_providers.yaml')) as fin:
     standard_providers = yaml.load(fin, Loader=SafeLoader)
+
   return merge_providers(
       create_builtin_provider(),
       create_mapping_providers(),
       io_providers(),
       parse_providers(standard_providers))
+
+
+def list_providers():
+  def pretty_type(field_type):
+    if field_type.WhichOneof('type_info') == 'row_type':
+      return pretty_schema(field_type.row_type.schema)
+    else:
+      t = typing_from_runner_api(field_type)
+      optional_base = native_type_compatibility.extract_optional_type(t)
+      if optional_base:
+        t = optional_base
+        suffix = '?'
+      else:
+        suffix = ''
+      s = str(t)
+      if s.startswith('<class '):

Date: Thu, 21 Sep 2023 12:05:27 -0400
Subject: [PATCH 29/34] Fix remaining tests for pandas 2 compatibility (#28524)

---
 .../apache_beam/dataframe/frames_test.py      |  4 ++
 .../dataframe/pandas_doctests_test.py         | 61 ++++++++++++++-----
 .../dataframe/pandas_top_level_functions.py   |  1 +
 .../interactive/interactive_runner_test.py    |  2 +-
 4 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index e3555b50187b7..6f7a63c291642 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -1936,6 +1936,8 @@ def test_groupby_sum_min_count(self):
 
     self._run_test(lambda df: df.groupby('group').sum(min_count=2), df)
 
+  @unittest.skipIf(
+      PD_VERSION >= (2, 0), "dtypes on groups is deprecated in Pandas 2.")
   def test_groupby_dtypes(self):
     self._run_test(
         lambda df: df.groupby('group').dtypes, GROUPBY_DF, check_proxy=False)
@@ -2159,6 +2161,7 @@ def test_dataframe_agg_level(self):
             level=1, numeric_only=True),
         GROUPBY_DF)
 
+  @unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2")
   def test_series_agg_multifunc_level(self):
     # level= is ignored for multiple agg fns
     self._run_test(
@@ -2181,6 +2184,7 @@ def test_series_mean_skipna(self):
     self._run_test(lambda df: df.two.mean(skipna=True), df)
     self._run_test(lambda df: df.three.mean(skipna=True), df)
 
+  @unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2")
   def test_dataframe_agg_multifunc_level(self):
     # level= is ignored for multiple agg fns
     self._run_test(
diff --git a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py
index 56eddd3cfb928..4fb05780fbec6 100644
--- a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py
+++ b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py
@@ -164,6 +164,9 @@ def test_ndframe_tests(self):
                 ' key=lambda x: np.argsort(index_natsorted(df["time"]))\n'
                 ')'
             ],
+            # TODO(https://github.com/apache/beam/issues/28559): Re-enable when
+            # bug is fixed.
+ 'pandas.core.generic.NDFrame.xs': ['*'], **skip_writes }) self.assertEqual(result.failed, 0) @@ -296,13 +299,19 @@ def test_dataframe_tests(self): 'pandas.core.frame.DataFrame.value_counts': [ 'df.value_counts(dropna=False)' ], + + 'pandas.core.frame.DataFrame.to_timestamp': ['*'] }, skip={ - # DataFrame construction from a dictionary and - # Series requires using the len() function, which - # is a non-deferred operation that we do not allow + # DataFrame construction from a dictionary, Series, or other + # DataFrame requires using the len() function, which is a + # non-deferred operation that we do not allow 'pandas.core.frame.DataFrame': [ 'pd.DataFrame(data=d, index=[0, 1, 2, 3])', + 'df = pd.DataFrame(data=ser, index=["a", "c"])', + 'df', + 'df2 = pd.DataFrame(data=df1, index=["a", "c"])', + 'df2', ], # s2 created with reindex 'pandas.core.frame.DataFrame.dot': [ @@ -361,15 +370,17 @@ def test_dataframe_tests(self): # actually raise NotImplementedError 'pandas.core.frame.DataFrame.pivot_table': ['*'], # Expected to raise a ValueError, but we raise NotImplementedError + # pylint: disable=line-too-long 'pandas.core.frame.DataFrame.pivot': [ "df.pivot(index='foo', columns='bar', values='baz')", "df.pivot(index='foo', columns='bar')['baz']", "df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])", - # pylint: disable=line-too-long 'df.pivot(index="lev1", columns=["lev2", "lev3"],values="values")', - # pylint: disable=line-too-long - 'df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")' + 'df.pivot(index=["lev1", "lev2"], columns=["lev3"],values="values")', + 'df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")', + 'df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")', ], + # pylint: enable=line-too-long 'pandas.core.frame.DataFrame.append': [ 'df', # pylint: disable=line-too-long @@ -511,6 +522,8 @@ def test_series_tests(self): 'ser.groupby(["a", "b", "a", np.nan]).mean()', 'ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()', ], + 'pandas.core.series.Series.to_period': ['*'], + 'pandas.core.series.Series.to_timestamp': ['*'], }, skip={ # Relies on setting values with iloc @@ -535,6 +548,8 @@ def test_series_tests(self): 'pandas.core.series.Series.idxmin': ['s.idxmin()'], 'pandas.core.series.Series.idxmax': ['s.idxmax()'], 'pandas.core.series.Series.duplicated': ['*'], + # Relies on setting index. 
+ 'pandas.core.series.Series.rename_axis': ['*'], 'pandas.core.series.Series.set_axis': ['*'], 'pandas.core.series.Series.nonzero': ['*'], 'pandas.core.series.Series.pop': ['ser'], # testing side effect @@ -710,6 +725,7 @@ def test_groupby_tests(self): 'pandas.core.groupby.groupby.GroupBy.nth': ['*'], 'pandas.core.groupby.groupby.GroupBy.cumcount': ['*'], 'pandas.core.groupby.groupby.GroupBy.resample': ['*'], + 'pandas.core.groupby.groupby.GroupBy.rolling': ['*'], }, not_implemented_ok={ 'pandas.core.groupby.groupby.GroupBy.first': ['*'], @@ -764,16 +780,21 @@ def test_groupby_tests(self): 'df.fillna(method=\'ffill\')', 'df.fillna(method="ffill")', 'df.fillna(value=values, limit=1)', + 'df.groupby("key").fillna(method="ffill")', + 'df.groupby("key").fillna(method="bfill")', + 'df.groupby("key").fillna(method="ffill", limit=1)', ], 'pandas.core.groupby.generic.SeriesGroupBy.fillna': [ 'df.fillna(method=\'ffill\')', 'df.fillna(method="ffill")', 'df.fillna(value=values, limit=1)', ], + 'pandas.core.groupby.groupby.GroupBy.tail': ['*'], }, not_implemented_ok={ 'pandas.core.groupby.generic.DataFrameGroupBy.idxmax': ['*'], 'pandas.core.groupby.generic.DataFrameGroupBy.idxmin': ['*'], + 'pandas.core.groupby.generic.DataFrameGroupBy.transform': ['*'], 'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'], 'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['*'], 'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['*'], @@ -794,14 +815,6 @@ def test_groupby_tests(self): # These examples rely on grouping by a list 'pandas.core.groupby.generic.SeriesGroupBy.aggregate': ['*'], 'pandas.core.groupby.generic.DataFrameGroupBy.aggregate': ['*'], - 'pandas.core.groupby.generic.SeriesGroupBy.transform': [ - # Dropping invalid columns during a transform is unsupported. - 'grouped.transform(lambda x: (x - x.mean()) / x.std())' - ], - 'pandas.core.groupby.generic.DataFrameGroupBy.transform': [ - # Dropping invalid columns during a transform is unsupported. - 'grouped.transform(lambda x: (x - x.mean()) / x.std())' - ], # Skipped idxmax/idxmin due an issue with the test framework 'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['s.idxmin()'], 'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['s.idxmax()'], @@ -811,7 +824,24 @@ def test_groupby_tests(self): # pylint: disable=line-too-long "df.groupby('gender', as_index=False).value_counts(normalize=True)", ], - }) + # These examples rely on grouping by a list + 'pandas.core.groupby.generic.SeriesGroupBy.fillna': ['*'], + # These examples rely on grouping by a list + 'pandas.core.groupby.generic.DataFrameGroupBy.fillna': ['*'], + # These examples rely on grouping by a list + 'pandas.core.groupby.generic.SeriesGroupBy.take': ['*'], + # These examples rely on grouping by a list + 'pandas.core.groupby.generic.DataFrameGroupBy.take': ['*'], + # Named aggregation not supported yet. 
+          'pandas.core.groupby.generic.NamedAgg': [
+              'df.groupby("key").agg(result_a=agg_a, result_1=agg_1)'
+          ],
+          # These examples rely on grouping by a list
+          'pandas.core.groupby.generic.DataFrameGroupBy.transform': ['*'],
+          # These examples rely on grouping by a list
+          'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'],
+      },
+  )
     self.assertEqual(result.failed, 0)
 
   def test_top_level(self):
@@ -843,7 +873,6 @@ def test_top_level(self):
             'pivot_table': ['*'],
             'qcut': ['*'],
             'reset_option': ['*'],
-            'set_eng_float_format': ['*'],
             'set_option': ['*'],
             'to_numeric': ['*'],
             'to_timedelta': ['*'],
diff --git a/sdks/python/apache_beam/dataframe/pandas_top_level_functions.py b/sdks/python/apache_beam/dataframe/pandas_top_level_functions.py
index 39df3f25a2e85..ce36dbeb09ad7 100644
--- a/sdks/python/apache_beam/dataframe/pandas_top_level_functions.py
+++ b/sdks/python/apache_beam/dataframe/pandas_top_level_functions.py
@@ -162,6 +162,7 @@ def concat(
   period_range = _defer_to_pandas('period_range')
   pivot = _call_on_first_arg('pivot')
   pivot_table = _call_on_first_arg('pivot_table')
+  set_eng_float_format = _defer_to_pandas('set_eng_float_format')
   show_versions = _defer_to_pandas('show_versions')
   test = frame_base.wont_implement_method(
       pd,
diff --git a/sdks/python/apache_beam/runners/interactive/interactive_runner_test.py b/sdks/python/apache_beam/runners/interactive/interactive_runner_test.py
index 5ffa6224edb0e..1da20fb2dfa94 100644
--- a/sdks/python/apache_beam/runners/interactive/interactive_runner_test.py
+++ b/sdks/python/apache_beam/runners/interactive/interactive_runner_test.py
@@ -321,7 +321,7 @@ def test_dataframes_with_grouped_index(self):
         Record('c', 18, 150)
     ]
 
-    aggregate = lambda df: df.groupby('height').mean()
+    aggregate = lambda df: df.groupby('height').mean(numeric_only=True)
     deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
     df_expected = aggregate(pd.DataFrame(data))
 

From 94327fdac529903a8e0fae59172ca7ccee739b1a Mon Sep 17 00:00:00 2001
From: Yi Hu
Date: Thu, 21 Sep 2023 13:28:22 -0400
Subject: [PATCH 30/34] Fix sync it (#28588)

---
 .../beam/it/gcp/bigtable/BigtableResourceManager.java  | 10 +++++-----
 .../it/gcp/bigtable/BigtableResourceManagerUtils.java  |  5 +++--
 .../it/gcp/bigtable/BigtableResourceManagerTest.java   |  3 ++-
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManager.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManager.java
index 713880229281e..311ce9575c2e0 100644
--- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManager.java
+++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManager.java
@@ -96,7 +96,7 @@ public class BigtableResourceManager implements ResourceManager {
   private final Set<String> cdcEnabledTables;
   private boolean hasInstance;
-  private Iterable<BigtableResourceManagerCluster> clusters;
+  private List<BigtableResourceManagerCluster> clusters;
 
   private final boolean usingStaticInstance;
 
@@ -195,12 +195,12 @@ public String getInstanceId() {
   /**
    * Creates a Bigtable instance in which all clusters, nodes and tables will exist.
    *
-   * @param clusters Collection of BigtableResourceManagerCluster objects to associate with the
-   *     given Bigtable instance.
+   * @param clusters List of BigtableResourceManagerCluster objects to associate with the given
+   *     Bigtable instance.
    * @throws BigtableResourceManagerException if there is an error creating the instance in
    *     Bigtable.
   */
-  public synchronized void createInstance(Iterable<BigtableResourceManagerCluster> clusters)
+  public synchronized void createInstance(List<BigtableResourceManagerCluster> clusters)
       throws BigtableResourceManagerException {
 
     // Check to see if instance already exists, and throw error if it does
@@ -559,7 +559,7 @@ public List<String> getClusterNames() {
   }
 
   private Iterable<BigtableResourceManagerCluster> getClusters() {
-    if (usingStaticInstance && this.clusters == null) {
+    if (usingStaticInstance && this.clusters.isEmpty()) {
       try (BigtableInstanceAdminClient instanceAdminClient =
           bigtableResourceManagerClientFactory.bigtableInstanceAdminClient()) {
         List<BigtableResourceManagerCluster> managedClusters = new ArrayList<>();
diff --git a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerUtils.java b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerUtils.java
index a893493d766ea..28f1f5bf60c51 100644
--- a/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerUtils.java
+++ b/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerUtils.java
@@ -21,6 +21,7 @@
 
 import com.google.cloud.bigtable.admin.v2.models.StorageType;
 import java.time.format.DateTimeFormatter;
+import java.util.List;
 import java.util.regex.Pattern;
 import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
 
@@ -51,9 +52,9 @@ private BigtableResourceManagerUtils() {}
    * @param zone the zone/region that the cluster will be deployed to.
    * @param numNodes the number of nodes that the cluster will contain.
    * @param storageType the type of storage to configure the cluster with (SSD or HDD).
-   * @return Collection containing a single BigtableResourceManagerCluster object.
+   * @return List containing a single BigtableResourceManagerCluster object.
   */
-  static Iterable<BigtableResourceManagerCluster> generateDefaultClusters(
+  static List<BigtableResourceManagerCluster> generateDefaultClusters(
       String baseString, String zone, int numNodes, StorageType storageType) {
 
     String clusterId =
diff --git a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerTest.java b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerTest.java
index f8673ed696ccc..74b25e84c691b 100644
--- a/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerTest.java
+++ b/it/google-cloud-platform/src/test/java/org/apache/beam/it/gcp/bigtable/BigtableResourceManagerTest.java
@@ -40,6 +40,7 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
 import org.junit.Before;
@@ -73,7 +74,7 @@ public class BigtableResourceManagerTest {
   private static final StorageType CLUSTER_STORAGE_TYPE = StorageType.SSD;
 
   private BigtableResourceManager testManager;
-  private Iterable<BigtableResourceManagerCluster> cluster;
+  private List<BigtableResourceManagerCluster> cluster;
 
   @Before
   public void setUp() throws IOException {

From 18ce60c2d9a41d01c7716fc85aeb1a554532d99e Mon Sep 17 00:00:00 2001
From: Vitaly Terentyev
Date: Thu, 21 Sep 2023 21:47:42 +0400
Subject: [PATCH 31/34] Add Load Tests GBK Dataflow Batch Go workflow (#28449)

* Add Load Tests GBK Dataflow Batch Go workflow

* Refactoring
---
 .github/workflows/README.md                   |   1 +
 .../beam_LoadTests_Go_GBK_Dataflow_Batch.yml  | 140 ++++++++++++++++++
 .../config_GBK_Go_Batch_100b.txt              |  34 +++++
 .../config_GBK_Go_Batch_100kb.txt             |  34 +++++
 .../config_GBK_Go_Batch_10b.txt               |  34 +++++
 .../config_GBK_Go_Batch_Fanout_4.txt          |  34 +++++
 .../config_GBK_Go_Batch_Fanout_8.txt          |  34 +++++
 .../config_GBK_Go_Batch_Reiteration_10KB.txt  |  34 +++++
 .../config_GBK_Go_Batch_Reiteration_2MB.txt   |  34 +++++
 9 files changed, 379 insertions(+)
 create mode 100644 .github/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml
 create mode 100644 .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100b.txt
 create mode 100644 .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100kb.txt
 create mode 100644 .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_10b.txt
 create mode 100644 .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_4.txt
 create mode 100644 .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_8.txt
 create mode 100644 .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_10KB.txt
 create mode 100644 .github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_2MB.txt

diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index 4b1bf01b5e3dd..0bbf1451304f9 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -182,6 +182,7 @@ Please note that jobs with matrix need to have matrix element in the comment.
Ex |:-------------:|:------:|:--------------:|:-----------:| | [ Load Tests CoGBK Dataflow Streaming Java ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml) | N/A |`Run Load Tests Java CoGBK Dataflow Streaming`| [![.github/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Java_CoGBK_Dataflow_Streaming.yml) | [ Load Tests Combine Dataflow Batch Python ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml) | N/A |`Run Load Tests Python Combine Dataflow Batch`| [![.github/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Python_Combine_Dataflow_Batch.yml) +| [ Load Tests GBK Dataflow Batch Go ](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml) | N/A |`Run Load Tests Go GBK Dataflow Batch`| [![.github/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml) | [ Performance Tests BigQueryIO Batch Java Avro ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml) | N/A |`Run BigQueryIO Batch Performance Test Java Avro`| [![.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Avro.yml) | [ Performance Tests BigQueryIO Batch Java Json ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml) | N/A |`Run BigQueryIO Batch Performance Test Java Json`| [![.github/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Batch_Java_Json.yml) | [ Performance Tests BigQueryIO Streaming Java ](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml) | N/A |`Run BigQueryIO Streaming Performance Test Java`| [![.github/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PerformanceTests_BigQueryIO_Streaming_Java.yml) diff --git a/.github/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml b/.github/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml new file mode 100644 index 0000000000000..bfdb19c1f5d5b --- /dev/null +++ b/.github/workflows/beam_LoadTests_Go_GBK_Dataflow_Batch.yml @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Load Tests GBK Dataflow Batch Go
+
+on:
+  issue_comment:
+    types: [created]
+  schedule:
+    - cron: '40 23 * * *'
+  workflow_dispatch:
+
+#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
+permissions:
+  actions: write
+  pull-requests: read
+  checks: read
+  contents: read
+  deployments: read
+  id-token: none
+  issues: read
+  discussions: read
+  packages: read
+  pages: read
+  repository-projects: read
+  security-events: read
+  statuses: read
+
+# This allows a subsequently queued workflow run to interrupt previous runs
+concurrency:
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login }}'
+  cancel-in-progress: true
+
+env:
+  GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }}
+  GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }}
+  GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }}
+
+jobs:
+  beam_LoadTests_Go_GBK_Dataflow_Batch:
+    if: |
+      github.event_name == 'workflow_dispatch' ||
+      github.event_name == 'schedule' ||
+      github.event.comment.body == 'Run Load Tests Go GBK Dataflow Batch'
+    runs-on: [self-hosted, ubuntu-20.04, main]
+    timeout-minutes: 720
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+    strategy:
+      matrix:
+        job_name: ["beam_LoadTests_Go_GBK_Dataflow_Batch"]
+        job_phrase: ["Run Load Tests Go GBK Dataflow Batch"]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Setup repository
+        uses: ./.github/actions/setup-action
+        with:
+          comment_phrase: ${{ matrix.job_phrase }}
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Prepare configs
+        #Reads config files, excludes comments, appends current date to the job_name parameter
+        id: set_configs
+        shell: bash
+        run: |
+          CURDATE=$(date '+%m%d%H%M%S' --utc)
+          CONFIG_ARR=('config_GBK_Go_Batch_10b.txt' 'config_GBK_Go_Batch_100b.txt' 'config_GBK_Go_Batch_100kb.txt' 'config_GBK_Go_Batch_Fanout_4.txt' 'config_GBK_Go_Batch_Fanout_8.txt' 'config_GBK_Go_Batch_Reiteration_10KB.txt' 'config_GBK_Go_Batch_Reiteration_2MB.txt')
+          for INDEX in ${!CONFIG_ARR[@]}
+          do
+            CURCONFIG=$(grep -v "^#.*" ./.github/workflows/load-tests-job-configs/${CONFIG_ARR[INDEX]} | tr '\n' ' ')
+            CURCONFIG=$(echo "${CURCONFIG/load-tests-go-dataflow-batch-gbk-$((INDEX + 1))-/load-tests-go-dataflow-batch-gbk-$((INDEX + 1))-$CURDATE}")
+            echo "prepared_config_$((INDEX + 1))=$CURCONFIG" >> $GITHUB_OUTPUT
+          done
+      - name: run GBK Dataflow Batch Go Load Test 1 (10 b records)
+        uses: ./.github/actions/gradle-command-self-hosted-action
+        with:
+          gradle-command: :sdks:go:test:load:run
+          arguments: |
+            -PloadTest.mainClass=group_by_key \
+            -Prunner=DataflowRunner \
+            '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_1 }}' \
+      - name: run GBK Dataflow
Batch Go Load Test 2 (100 b records) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:go:test:load:run + arguments: | + -PloadTest.mainClass=group_by_key \ + -Prunner=DataflowRunner \ + '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_2 }}' \ + - name: run GBK Dataflow Batch Go Load Test 3 (100 kb records) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:go:test:load:run + arguments: | + -PloadTest.mainClass=group_by_key \ + -Prunner=DataflowRunner \ + '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_3 }}' \ + - name: run GBK Dataflow Batch Go Load Test 4 (fanout 4) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:go:test:load:run + arguments: | + -PloadTest.mainClass=group_by_key \ + -Prunner=DataflowRunner \ + '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_4 }}' \ + - name: run GBK Dataflow Batch Go Load Test 5 (fanout 8) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:go:test:load:run + arguments: | + -PloadTest.mainClass=group_by_key \ + -Prunner=DataflowRunner \ + '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_5 }}' \ + - name: run GBK Dataflow Batch Go Load Test 6 (reiterate 4 times 10 kb) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:go:test:load:run + arguments: | + -PloadTest.mainClass=group_by_key \ + -Prunner=DataflowRunner \ + '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_6 }}' \ + - name: run GBK Dataflow Batch Go Load Test 7 (reiterate 4 times 2 mb) + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :sdks:go:test:load:run + arguments: | + -PloadTest.mainClass=group_by_key \ + -Prunner=DataflowRunner \ + '-PloadTest.args=${{ steps.set_configs.outputs.prepared_config_7 }}' \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100b.txt b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100b.txt new file mode 100644 index 0000000000000..f3ebed91b6ade --- /dev/null +++ b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100b.txt @@ -0,0 +1,34 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +--job_name=load-tests-go-dataflow-batch-gbk-2- +--project=apache-beam-testing +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--staging_location=gs://temp-storage-for-perf-tests/loadtests +--influx_namespace=dataflow +--influx_measurement=go_batch_gbk_2 +--input_options=''{\"num_records\":20000000,\"key_size\":10,\"value_size\":90}'' +--iterations=1 +--fanout=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest +--influx_db_name=beam_test_metrics +--influx_hostname=http://10.128.0.96:8086 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100kb.txt b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100kb.txt new file mode 100644 index 0000000000000..e5007c7d5b90d --- /dev/null +++ b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_100kb.txt @@ -0,0 +1,34 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +--job_name=load-tests-go-dataflow-batch-gbk-3- +--project=apache-beam-testing +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--staging_location=gs://temp-storage-for-perf-tests/loadtests +--influx_namespace=dataflow +--influx_measurement=go_batch_gbk_3 +--input_options=''{\"num_records\":20000,\"key_size\":10000,\"value_size\":90000}'' +--iterations=1 +--fanout=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest +--influx_db_name=beam_test_metrics +--influx_hostname=http://10.128.0.96:8086 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_10b.txt b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_10b.txt new file mode 100644 index 0000000000000..7683eac5cb934 --- /dev/null +++ b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_10b.txt @@ -0,0 +1,34 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +--job_name=load-tests-go-dataflow-batch-gbk-1- +--project=apache-beam-testing +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--staging_location=gs://temp-storage-for-perf-tests/loadtests +--influx_namespace=dataflow +--influx_measurement=go_batch_gbk_1 +--input_options=''{\"num_records\":200000000,\"key_size\":1,\"value_size\":9}'' +--iterations=1 +--fanout=1 +--num_workers=5 +--autoscaling_algorithm=NONE +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest +--influx_db_name=beam_test_metrics +--influx_hostname=http://10.128.0.96:8086 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_4.txt b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_4.txt new file mode 100644 index 0000000000000..5792b3bf0b95f --- /dev/null +++ b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_4.txt @@ -0,0 +1,34 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +--job_name=load-tests-go-dataflow-batch-gbk-4- +--project=apache-beam-testing +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--staging_location=gs://temp-storage-for-perf-tests/loadtests +--influx_namespace=dataflow +--influx_measurement=go_batch_gbk_4 +--input_options=''{\"num_records\":5000000,\"key_size\":10,\"value_size\":90}'' +--iterations=1 +--fanout=4 +--num_workers=16 +--autoscaling_algorithm=NONE +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest +--influx_db_name=beam_test_metrics +--influx_hostname=http://10.128.0.96:8086 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_8.txt b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_8.txt new file mode 100644 index 0000000000000..369fb25aa0e19 --- /dev/null +++ b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Fanout_8.txt @@ -0,0 +1,34 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +--job_name=load-tests-go-dataflow-batch-gbk-5- +--project=apache-beam-testing +--region=us-central1 +--temp_location=gs://temp-storage-for-perf-tests/loadtests +--staging_location=gs://temp-storage-for-perf-tests/loadtests +--influx_namespace=dataflow +--influx_measurement=go_batch_gbk_5 +--input_options=''{\"num_records\":2500000,\"key_size\":10,\"value_size\":90}'' +--iterations=1 +--fanout=8 +--num_workers=16 +--autoscaling_algorithm=NONE +--environment_type=DOCKER +--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest +--influx_db_name=beam_test_metrics +--influx_hostname=http://10.128.0.96:8086 +--runner=DataflowRunner \ No newline at end of file diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_10KB.txt b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_10KB.txt new file mode 100644 index 0000000000000..9eb878d4e9fb9 --- /dev/null +++ b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_10KB.txt @@ -0,0 +1,34 @@ +############################################################################### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+--job_name=load-tests-go-dataflow-batch-gbk-6-
+--project=apache-beam-testing
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--staging_location=gs://temp-storage-for-perf-tests/loadtests
+--influx_namespace=dataflow
+--influx_measurement=go_batch_gbk_6
+--input_options=''{\"num_records\":20000000,\"key_size\":10,\"value_size\":90,\"num_hot_keys\":200,\"hot_key_fraction\":1}''
+--iterations=4
+--fanout=1
+--num_workers=5
+--autoscaling_algorithm=NONE
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest
+--influx_db_name=beam_test_metrics
+--influx_hostname=http://10.128.0.96:8086
+--runner=DataflowRunner
\ No newline at end of file
diff --git a/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_2MB.txt b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_2MB.txt
new file mode 100644
index 0000000000000..aa26473ca4335
--- /dev/null
+++ b/.github/workflows/load-tests-job-configs/config_GBK_Go_Batch_Reiteration_2MB.txt
@@ -0,0 +1,34 @@
+###############################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+--job_name=load-tests-go-dataflow-batch-gbk-7-
+--project=apache-beam-testing
+--region=us-central1
+--temp_location=gs://temp-storage-for-perf-tests/loadtests
+--staging_location=gs://temp-storage-for-perf-tests/loadtests
+--influx_namespace=dataflow
+--influx_measurement=go_batch_gbk_7
+--input_options=''{\"num_records\":20000000,\"key_size\":10,\"value_size\":90,\"num_hot_keys\":10,\"hot_key_fraction\":1}''
+--iterations=4
+--fanout=1
+--num_workers=5
+--autoscaling_algorithm=NONE
+--environment_type=DOCKER
+--environment_config=gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest
+--influx_db_name=beam_test_metrics
+--influx_hostname=http://10.128.0.96:8086
+--runner=DataflowRunner
\ No newline at end of file

From 0146a8389b4bf94cafd244f70fc0ad13b32066cc Mon Sep 17 00:00:00 2001
From: Vlado Djerek
Date: Thu, 21 Sep 2023 21:02:50 +0200
Subject: [PATCH 32/34] cleanup jobs (#28528)

---
 .github/workflows/README.md                   |  2 +
 .../workflows/beam_CleanUpGCPResources.yml    | 83 +++++++++++++++++++
 .../beam_CleanUpPrebuiltSDKImages.yml         | 83 +++++++++++++++++++
 3 files changed, 168 insertions(+)
 create mode 100644 .github/workflows/beam_CleanUpGCPResources.yml
 create mode 100644 .github/workflows/beam_CleanUpPrebuiltSDKImages.yml

diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index 0bbf1451304f9..4414802c7a344 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -337,3 +337,5 @@ Please note that jobs with matrix need to have matrix element in the comment. Ex
 | [ PreCommit Kotlin Examples ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Kotlin_Examples.yml) | N/A | `Run Kotlin_Examples PreCommit` | [![.github/workflows/beam_PreCommit_Kotlin_Examples.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Kotlin_Examples.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Kotlin_Examples.yml) |
 | [ PreCommit Portable Python ](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Portable_Python.yml) | ['3.8','3.11'] | `Run Portable_Python PreCommit` | [![.github/workflows/beam_PreCommit_Portable_Python.yml](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Portable_Python.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PreCommit_Portable_Python.yml) |
 | [ Cancel Stale Dataflow Jobs ](https://github.com/apache/beam/actions/workflows/beam_CancelStaleDataflowJobs.yml) | N/A | `Run Cancel Stale Dataflow Jobs` | [![.github/workflows/beam_CancelStaleDataflowJobs.yml](https://github.com/apache/beam/actions/workflows/beam_CancelStaleDataflowJobs.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_CancelStaleDataflowJobs.yml) |
+| [ Clean Up GCP Resources ](https://github.com/apache/beam/actions/workflows/beam_CleanUpGCPResources.yml) | N/A | `Run Clean GCP Resources` | [![.github/workflows/beam_CleanUpGCPResources.yml](https://github.com/apache/beam/actions/workflows/beam_CleanUpGCPResources.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_CleanUpGCPResources.yml) |
+| [ Clean Up Prebuilt SDK Images ](https://github.com/apache/beam/actions/workflows/beam_CleanUpPrebuiltSDKImages.yml) | N/A | `Run Clean Prebuilt Images` |
[![.github/workflows/beam_CleanUpPrebuiltSDKImages.yml](https://github.com/apache/beam/actions/workflows/beam_CleanUpPrebuiltSDKImages.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_CleanUpPrebuiltSDKImages.yml) |
\ No newline at end of file
diff --git a/.github/workflows/beam_CleanUpGCPResources.yml b/.github/workflows/beam_CleanUpGCPResources.yml
new file mode 100644
index 0000000000000..7a420822f9dfb
--- /dev/null
+++ b/.github/workflows/beam_CleanUpGCPResources.yml
@@ -0,0 +1,83 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Clean Up GCP Resources
+
+on:
+  issue_comment:
+    types: [created]
+  schedule:
+    - cron: '0 0 * * *'
+  workflow_dispatch:
+
+# This allows a subsequently queued workflow run to interrupt previous runs
+concurrency:
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login}}'
+  cancel-in-progress: true
+
+env:
+  GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }}
+  GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }}
+  GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }}
+
+#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
+permissions:
+  actions: write
+  pull-requests: read
+  checks: read
+  contents: read
+  deployments: read
+  id-token: none
+  issues: read
+  discussions: read
+  packages: read
+  pages: read
+  repository-projects: read
+  security-events: read
+  statuses: read
+
+jobs:
+  beam_CleanUpGCPResources:
+    name: ${{matrix.job_name}} (${{matrix.job_phrase}})
+    runs-on: [self-hosted, ubuntu-20.04, main]
+    timeout-minutes: 100
+    strategy:
+      matrix:
+        job_name: [beam_CleanUpGCPResources]
+        job_phrase: [Run Clean GCP Resources]
+    if: |
+      github.event_name == 'schedule' ||
+      github.event_name == 'workflow_dispatch' ||
+      github.event.comment.body == 'Run Clean GCP Resources'
+    steps:
+      - uses: actions/checkout@v3
+      - name: Setup repository
+        uses: ./.github/actions/setup-action
+        with:
+          comment_phrase: ${{ matrix.job_phrase }}
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Authenticate on GCP
+        id: auth
+        uses: google-github-actions/auth@v1
+        with:
+          credentials_json: ${{ secrets.GCP_SA_KEY }}
+          project_id: ${{ secrets.GCP_PROJECT_ID }}
+      - name: run cleanup GCP resources
+        uses: ./.github/actions/gradle-command-self-hosted-action
+        with:
+          gradle-command: :beam-test-tools:cleanupOtherStaleResources
\ No newline at end of file
diff --git a/.github/workflows/beam_CleanUpPrebuiltSDKImages.yml
b/.github/workflows/beam_CleanUpPrebuiltSDKImages.yml new file mode 100644 index 0000000000000..3ecf192aded97 --- /dev/null +++ b/.github/workflows/beam_CleanUpPrebuiltSDKImages.yml @@ -0,0 +1,83 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Clean Up Prebuilt SDK Images + +on: + issue_comment: + types: [created] + schedule: + - cron: '0 0 * * *' + workflow_dispatch: + +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.body || github.event.sender.login}}' + cancel-in-progress: true + +env: + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} + GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} + +#Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event +permissions: + actions: write + pull-requests: read + checks: read + contents: read + deployments: read + id-token: none + issues: read + discussions: read + packages: read + pages: read + repository-projects: read + security-events: read + statuses: read + +jobs: + beam_CleanUpPrebuiltSDKImages: + name: ${{matrix.job_name}} (${{matrix.job_phrase}}) + runs-on: [self-hosted, ubuntu-20.04, main] + timeout-minutes: 100 + strategy: + matrix: + job_name: [beam_CleanUpPrebuiltSDKImages] + job_phrase: [Run Clean Prebuilt Images] + if: | + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' || + github.event.comment.body == 'Run Clean Prebuilt Images' + steps: + - uses: actions/checkout@v3 + - name: Setup repository + uses: ./.github/actions/setup-action + with: + comment_phrase: ${{ matrix.job_phrase }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }}) + - name: Authenticate on GCP + id: auth + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.GCP_SA_KEY }} + project_id: ${{ secrets.GCP_PROJECT_ID }} + - name: run remove stale sdk container images + uses: ./.github/actions/gradle-command-self-hosted-action + with: + gradle-command: :beam-test-tools:removeStaleSDKContainerImages \ No newline at end of file From 04a26da777ff4c0ed9112f07bf0f41a39bc7260d Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com> Date: Fri, 22 Sep 2023 13:12:58 +0000 Subject: [PATCH 33/34] [Python BQ] Allow setting a fixed number of Storage API streams (#28592) * expose num streams option; fix some streaming tests * clarify docs; address nit --- 
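A rough usage sketch of the new knob (the parameter name
num_storage_api_streams and the table value below are illustrative
assumptions, not taken verbatim from this patch; per the Java-side config,
a fixed stream count only applies to unbounded writes):

    # Hypothetical: pin the Storage Write API sink to a fixed stream count.
    import apache_beam as beam
    from apache_beam.io.gcp.bigquery import StorageWriteToBigQuery

    with beam.Pipeline() as p:
      _ = (
          p
          | beam.Create([beam.Row(id=1), beam.Row(id=2)])
          | StorageWriteToBigQuery(
              table='my-project:my_dataset.my_table',
              # Mutually exclusive with auto-sharding, per the validation
              # added in this change.
              num_storage_api_streams=4))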
 ...t_Python_CrossLanguage_Gcp_Dataflow.groovy |  2 +-
 ...mit_Python_CrossLanguage_Gcp_Direct.groovy |  2 +-
 ...torageWriteApiSchemaTransformProvider.java | 37 ++++++++++++++----
 .../io/external/xlang_bigqueryio_it_test.py   | 38 ++++++++++++-------
 sdks/python/apache_beam/io/gcp/bigquery.py    |  9 +++++
 .../io/built-in/google-bigquery.md            |  2 +-
 6 files changed, 66 insertions(+), 24 deletions(-)

diff --git a/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Dataflow.groovy b/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Dataflow.groovy
index d1ee27088c727..1280fcb4e2339 100644
--- a/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Dataflow.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Dataflow.groovy
@@ -28,7 +28,7 @@ import static PythonTestProperties.CROSS_LANGUAGE_VALIDATES_RUNNER_PYTHON_VERSIO
 // Collects tests with the @pytest.mark.uses_gcp_java_expansion_service decorator
 PostcommitJobBuilder.postCommitJob('beam_PostCommit_Python_Xlang_Gcp_Dataflow',
     'Run Python_Xlang_Gcp_Dataflow PostCommit', 'Python_Xlang_Gcp_Dataflow (\"Run Python_Xlang_Gcp_Dataflow PostCommit\")', this) {
-      description('Runs end-to-end cross language GCP IO tests on the Dataflow runner.')
+      description('Runs end-to-end cross language GCP IO tests on the Dataflow runner. \"Run Python_Xlang_Gcp_Dataflow PostCommit\"')

       // Set common parameters.

diff --git a/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Direct.groovy b/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Direct.groovy
index 438b735fba7ff..e4bf771be1ae9 100644
--- a/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Direct.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Direct.groovy
@@ -28,7 +28,7 @@ import static PythonTestProperties.CROSS_LANGUAGE_VALIDATES_RUNNER_PYTHON_VERSIO
 // Collects tests with the @pytest.mark.uses_gcp_java_expansion_service decorator
 PostcommitJobBuilder.postCommitJob('beam_PostCommit_Python_Xlang_Gcp_Direct',
     'Run Python_Xlang_Gcp_Direct PostCommit', 'Python_Xlang_Gcp_Direct (\"Run Python_Xlang_Gcp_Direct PostCommit\")', this) {
-      description('Runs end-to-end cross language GCP IO tests on the Direct runner.')
+      description('Runs end-to-end cross language GCP IO tests on the Direct runner. \"Run Python_Xlang_Gcp_Direct PostCommit\"')

       // Set common parameters.
       commonJobProperties.setTopLevelMainJobProperties(delegate)
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java
index e44617930119f..c3eed24672360 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java
@@ -176,6 +176,13 @@ public void validate() {
           !Strings.isNullOrEmpty(this.getErrorHandling().getOutput()),
           invalidConfigMessage + "Output must not be empty if error handling specified.");
     }
+
+    if (this.getAutoSharding() != null && this.getAutoSharding()) {
+      checkArgument(
+          this.getNumStreams() == 0,
+          invalidConfigMessage
+              + "Cannot set a fixed number of streams when auto-sharding is enabled. Please pick only one of the two options.");
+    }
   }

   /**
@@ -218,11 +225,17 @@ public static Builder builder() {
     public abstract Boolean getUseAtLeastOnceSemantics();

     @SchemaFieldDescription(
-        "This option enables using a dynamically determined number of shards to write to "
+        "This option enables using a dynamically determined number of Storage Write API streams to write to "
             + "BigQuery. Only applicable to unbounded data.")
     @Nullable
     public abstract Boolean getAutoSharding();

+    @SchemaFieldDescription(
+        "If set, the Storage API sink will default to using this number of write streams. "
+            + "Only applicable to unbounded data.")
+    @Nullable
+    public abstract Integer getNumStreams();
+
     @SchemaFieldDescription("This option specifies whether and where to output unwritable rows.")
     @Nullable
     public abstract ErrorHandling getErrorHandling();
@@ -243,6 +256,8 @@ public abstract static class Builder {

       public abstract Builder setAutoSharding(Boolean autoSharding);

+      public abstract Builder setNumStreams(Integer numStreams);
+
       public abstract Builder setErrorHandling(ErrorHandling errorHandling);

       /** Builds a {@link BigQueryStorageWriteApiSchemaTransformConfiguration} instance. */
@@ -321,13 +336,19 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) {
       if (inputRows.isBounded() == IsBounded.UNBOUNDED) {
         Long triggeringFrequency = configuration.getTriggeringFrequencySeconds();
         Boolean autoSharding = configuration.getAutoSharding();
-        write =
-            write.withTriggeringFrequency(
-                (triggeringFrequency == null || triggeringFrequency <= 0)
-                    ? DEFAULT_TRIGGERING_FREQUENCY
-                    : Duration.standardSeconds(triggeringFrequency));
-        // use default value true for autoSharding if not configured for STORAGE_WRITE_API
-        if (autoSharding == null || autoSharding) {
+        Integer numStreams = configuration.getNumStreams();
+        // Triggering frequency is only applicable for exactly-once
+        if (!configuration.getUseAtLeastOnceSemantics()) {
+          write =
+              write.withTriggeringFrequency(
+                  (triggeringFrequency == null || triggeringFrequency <= 0)
+                      ? DEFAULT_TRIGGERING_FREQUENCY
+                      : Duration.standardSeconds(triggeringFrequency));
+        }
+        // set num streams if specified, otherwise default to autoSharding
+        if (numStreams > 0) {
+          write = write.withNumStorageWriteApiStreams(numStreams);
+        } else if (autoSharding == null || autoSharding) {
           write = write.withAutoSharding();
         }
       }
diff --git a/sdks/python/apache_beam/io/external/xlang_bigqueryio_it_test.py b/sdks/python/apache_beam/io/external/xlang_bigqueryio_it_test.py
index fbfde550ea708..e234aab7314f4 100644
--- a/sdks/python/apache_beam/io/external/xlang_bigqueryio_it_test.py
+++ b/sdks/python/apache_beam/io/external/xlang_bigqueryio_it_test.py
@@ -30,12 +30,13 @@
 from hamcrest.core import assert_that as hamcrest_assert

 import apache_beam as beam
-from apache_beam.io.external.generate_sequence import GenerateSequence
 from apache_beam.io.gcp.bigquery import StorageWriteToBigQuery
 from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper
 from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultMatcher
 from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultStreamingMatcher
+from apache_beam.options.pipeline_options import PipelineOptions
 from apache_beam.testing.test_pipeline import TestPipeline
+from apache_beam.transforms.periodicsequence import PeriodicImpulse
 from apache_beam.utils.timestamp import Timestamp

 # Protect against environments where bigquery library is not available.
@@ -99,11 +100,13 @@ class BigQueryXlangStorageWriteIT(unittest.TestCase):
   ALL_TYPES_SCHEMA = (
       "int:INTEGER,float:FLOAT,numeric:NUMERIC,str:STRING,"
       "bool:BOOLEAN,bytes:BYTES,timestamp:TIMESTAMP")
+  _RUNNER = ""

   def setUp(self):
     self.test_pipeline = TestPipeline(is_integration_test=True)
     self.args = self.test_pipeline.get_full_options_as_args()
     self.project = self.test_pipeline.get_option('project')
+    _RUNNER = PipelineOptions(self.args).get_all_options()['runner']

     self.bigquery_client = BigQueryWrapper()
     self.dataset_id = '%s_%s_%s' % (
@@ -244,8 +247,7 @@ def test_write_with_beam_rows(self):
             table=table_id, expansion_service=self.expansion_service))
     hamcrest_assert(p, bq_matcher)

-  def run_streaming(
-      self, table_name, auto_sharding=False, use_at_least_once=False):
+  def run_streaming(self, table_name, num_streams=0, use_at_least_once=False):
     elements = self.ELEMENTS.copy()
     schema = self.ALL_TYPES_SCHEMA
     table_id = '{}:{}.{}'.format(self.project, self.dataset_id, table_name)
@@ -260,33 +262,43 @@ def run_streaming(
         streaming=True,
         allow_unsafe_triggers=True)

+    auto_sharding = (num_streams == 0)
     with beam.Pipeline(argv=args) as p:
       _ = (
           p
-          | GenerateSequence(
-              start=0, stop=4, expansion_service=self.expansion_service)
-          | beam.Map(lambda x: elements[x])
+          | PeriodicImpulse(0, 4, 1)
+          | beam.Map(lambda t: elements[t])
           | beam.io.WriteToBigQuery(
               table=table_id,
               method=beam.io.WriteToBigQuery.Method.STORAGE_WRITE_API,
               schema=schema,
+              triggering_frequency=1,
               with_auto_sharding=auto_sharding,
+              num_storage_api_streams=num_streams,
               use_at_least_once=use_at_least_once,
               expansion_service=self.expansion_service))
     hamcrest_assert(p, bq_matcher)

-  def test_streaming(self):
-    table = 'streaming'
+  @unittest.skipUnless(
+      "dataflowrunner" in _RUNNER.lower(),
+      "The exactly-once route has the requirement "
+      "`beam:requirement:pardo:on_window_expiration:v1`, "
+      "which is currently only supported by the Dataflow runner.")
+  def test_streaming_with_fixed_num_streams(self):
+    table = 'streaming_fixed_num_streams'
+    self.run_streaming(table_name=table, num_streams=4)
+
+  @unittest.skip(
+      "Streaming to the Storage Write API sink with autosharding is broken "
+      "with Dataflow Runner V2.")
+  def test_streaming_with_auto_sharding(self):
+    table = 'streaming_with_auto_sharding'
     self.run_streaming(table_name=table)

   def test_streaming_with_at_least_once(self):
-    table = 'streaming'
+    table = 'streaming_with_at_least_once'
     self.run_streaming(table_name=table, use_at_least_once=True)

-  def test_streaming_with_auto_sharding(self):
-    table = 'streaming_with_auto_sharding'
-    self.run_streaming(table_name=table, auto_sharding=True)
-

 if __name__ == '__main__':
   logging.getLogger().setLevel(logging.INFO)
diff --git a/sdks/python/apache_beam/io/gcp/bigquery.py b/sdks/python/apache_beam/io/gcp/bigquery.py
index e092ad069ad02..986919fd6b821 100644
--- a/sdks/python/apache_beam/io/gcp/bigquery.py
+++ b/sdks/python/apache_beam/io/gcp/bigquery.py
@@ -1869,6 +1869,7 @@ def __init__(
       # TODO(https://github.com/apache/beam/issues/20712): Switch the default
       # when the feature is mature.
       with_auto_sharding=False,
+      num_storage_api_streams=0,
       ignore_unknown_columns=False,
       load_job_project_id=None,
       max_insert_payload_size=MAX_INSERT_PAYLOAD_SIZE,
@@ -2018,6 +2019,9 @@ def __init__(
         determined number of shards to write to BigQuery. This can be used for
         all of FILE_LOADS, STREAMING_INSERTS, and STORAGE_WRITE_API. Only
         applicable to unbounded input.
+      num_storage_api_streams: Specifies the number of write streams that the
+        Storage API sink will use. This parameter is only applicable when
+        writing unbounded data.
       ignore_unknown_columns: Accept rows that contain values that do not match
         the schema. The unknown values are ignored. Default is False,
         which treats unknown values as errors. This option is only valid for
@@ -2060,6 +2064,7 @@ def __init__(
     self.use_at_least_once = use_at_least_once
     self.expansion_service = expansion_service
     self.with_auto_sharding = with_auto_sharding
+    self._num_storage_api_streams = num_storage_api_streams
     self.insert_retry_strategy = insert_retry_strategy
     self._validate = validate
     self._temp_file_format = temp_file_format or bigquery_tools.FileFormat.JSON
@@ -2259,6 +2264,7 @@ def find_in_nested_dict(schema):
             triggering_frequency=triggering_frequency,
             use_at_least_once=self.use_at_least_once,
             with_auto_sharding=self.with_auto_sharding,
+            num_storage_api_streams=self._num_storage_api_streams,
             expansion_service=self.expansion_service))

     if is_rows:
@@ -2521,6 +2527,7 @@ def __init__(
       triggering_frequency=0,
       use_at_least_once=False,
       with_auto_sharding=False,
+      num_storage_api_streams=0,
       expansion_service=None):
     """Initialize a StorageWriteToBigQuery transform.

@@ -2558,6 +2565,7 @@ def __init__(
     self._triggering_frequency = triggering_frequency
     self._use_at_least_once = use_at_least_once
     self._with_auto_sharding = with_auto_sharding
+    self._num_storage_api_streams = num_storage_api_streams
     self._expansion_service = (
         expansion_service or _default_io_expansion_service())
     self.schematransform_config = SchemaAwareExternalTransform.discover_config(
@@ -2569,6 +2577,7 @@ def expand(self, input):
         expansion_service=self._expansion_service,
         rearrange_based_on_discovery=True,
         autoSharding=self._with_auto_sharding,
+        numStreams=self._num_storage_api_streams,
         createDisposition=self._create_disposition,
         table=self._table,
         triggeringFrequencySeconds=self._triggering_frequency,
diff --git a/website/www/site/content/en/documentation/io/built-in/google-bigquery.md b/website/www/site/content/en/documentation/io/built-in/google-bigquery.md
index eae98b84d2c10..7a31b63a3c96e 100644
--- a/website/www/site/content/en/documentation/io/built-in/google-bigquery.md
+++ b/website/www/site/content/en/documentation/io/built-in/google-bigquery.md
@@ -788,7 +788,7 @@ BigQuery Storage Write API for Python SDK currently has some limitations on supp
 {{< paragraph class="language-py" >}}
 **Note:** If you want to run WriteToBigQuery with Storage Write API from the source code, you need to run `./gradlew :sdks:java:io:google-cloud-platform:expansion-service:build` to build the expansion-service jar. If you are running from a released Beam SDK, the jar will already be included.

-**Note:** Auto sharding is not currently supported for Python's Storage Write API.
+**Note:** Auto sharding is not currently supported for Python's Storage Write API exactly-once mode on DataflowRunner.
 {{< /paragraph >}}

From 426dbd3955e08dbfbad83d27dfd47fb2ec489487 Mon Sep 17 00:00:00 2001
From: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com>
Date: Fri, 22 Sep 2023 13:14:31 +0000
Subject: [PATCH 34/34] Revert "[Python BQ] Allow setting a fixed number of
 Storage API streams (#28592)" (#28613)

This reverts commit 04a26da777ff4c0ed9112f07bf0f41a39bc7260d.
---
 ...t_Python_CrossLanguage_Gcp_Dataflow.groovy |  2 +-
 ...mit_Python_CrossLanguage_Gcp_Direct.groovy |  2 +-
 ...torageWriteApiSchemaTransformProvider.java | 37 ++++--------
 .../io/external/xlang_bigqueryio_it_test.py   | 38 +++++++-------
 sdks/python/apache_beam/io/gcp/bigquery.py    |  9 -----
 .../io/built-in/google-bigquery.md            |  2 +-
 6 files changed, 24 insertions(+), 66 deletions(-)

diff --git a/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Dataflow.groovy b/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Dataflow.groovy
index 1280fcb4e2339..d1ee27088c727 100644
--- a/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Dataflow.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Dataflow.groovy
@@ -28,7 +28,7 @@ import static PythonTestProperties.CROSS_LANGUAGE_VALIDATES_RUNNER_PYTHON_VERSIO
 // Collects tests with the @pytest.mark.uses_gcp_java_expansion_service decorator
 PostcommitJobBuilder.postCommitJob('beam_PostCommit_Python_Xlang_Gcp_Dataflow',
     'Run Python_Xlang_Gcp_Dataflow PostCommit', 'Python_Xlang_Gcp_Dataflow (\"Run Python_Xlang_Gcp_Dataflow PostCommit\")', this) {
-      description('Runs end-to-end cross language GCP IO tests on the Dataflow runner. \"Run Python_Xlang_Gcp_Dataflow PostCommit\"')
+      description('Runs end-to-end cross language GCP IO tests on the Dataflow runner.')

       // Set common parameters.

diff --git a/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Direct.groovy b/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Direct.groovy
index e4bf771be1ae9..438b735fba7ff 100644
--- a/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Direct.groovy
+++ b/.test-infra/jenkins/job_PostCommit_Python_CrossLanguage_Gcp_Direct.groovy
@@ -28,7 +28,7 @@ import static PythonTestProperties.CROSS_LANGUAGE_VALIDATES_RUNNER_PYTHON_VERSIO
 // Collects tests with the @pytest.mark.uses_gcp_java_expansion_service decorator
 PostcommitJobBuilder.postCommitJob('beam_PostCommit_Python_Xlang_Gcp_Direct',
     'Run Python_Xlang_Gcp_Direct PostCommit', 'Python_Xlang_Gcp_Direct (\"Run Python_Xlang_Gcp_Direct PostCommit\")', this) {
-      description('Runs end-to-end cross language GCP IO tests on the Direct runner. \"Run Python_Xlang_Gcp_Direct PostCommit\"')
+      description('Runs end-to-end cross language GCP IO tests on the Direct runner.')

       // Set common parameters.
       commonJobProperties.setTopLevelMainJobProperties(delegate)
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java
index c3eed24672360..e44617930119f 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java
@@ -176,13 +176,6 @@ public void validate() {
           !Strings.isNullOrEmpty(this.getErrorHandling().getOutput()),
           invalidConfigMessage + "Output must not be empty if error handling specified.");
     }
-
-    if (this.getAutoSharding() != null && this.getAutoSharding()) {
-      checkArgument(
-          this.getNumStreams() == 0,
-          invalidConfigMessage
-              + "Cannot set a fixed number of streams when auto-sharding is enabled. Please pick only one of the two options.");
-    }
   }

   /**
@@ -225,17 +218,11 @@ public static Builder builder() {
     public abstract Boolean getUseAtLeastOnceSemantics();

     @SchemaFieldDescription(
-        "This option enables using a dynamically determined number of Storage Write API streams to write to "
+        "This option enables using a dynamically determined number of shards to write to "
             + "BigQuery. Only applicable to unbounded data.")
     @Nullable
     public abstract Boolean getAutoSharding();

-    @SchemaFieldDescription(
-        "If set, the Storage API sink will default to using this number of write streams. "
-            + "Only applicable to unbounded data.")
-    @Nullable
-    public abstract Integer getNumStreams();
-
     @SchemaFieldDescription("This option specifies whether and where to output unwritable rows.")
     @Nullable
     public abstract ErrorHandling getErrorHandling();
@@ -256,8 +243,6 @@ public abstract static class Builder {

       public abstract Builder setAutoSharding(Boolean autoSharding);

-      public abstract Builder setNumStreams(Integer numStreams);
-
       public abstract Builder setErrorHandling(ErrorHandling errorHandling);

       /** Builds a {@link BigQueryStorageWriteApiSchemaTransformConfiguration} instance. */
@@ -336,19 +321,13 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) {
       if (inputRows.isBounded() == IsBounded.UNBOUNDED) {
         Long triggeringFrequency = configuration.getTriggeringFrequencySeconds();
         Boolean autoSharding = configuration.getAutoSharding();
-        Integer numStreams = configuration.getNumStreams();
-        // Triggering frequency is only applicable for exactly-once
-        if (!configuration.getUseAtLeastOnceSemantics()) {
-          write =
-              write.withTriggeringFrequency(
-                  (triggeringFrequency == null || triggeringFrequency <= 0)
-                      ? DEFAULT_TRIGGERING_FREQUENCY
-                      : Duration.standardSeconds(triggeringFrequency));
-        }
-        // set num streams if specified, otherwise default to autoSharding
-        if (numStreams > 0) {
-          write = write.withNumStorageWriteApiStreams(numStreams);
-        } else if (autoSharding == null || autoSharding) {
+        write =
+            write.withTriggeringFrequency(
+                (triggeringFrequency == null || triggeringFrequency <= 0)
+                    ? DEFAULT_TRIGGERING_FREQUENCY
+                    : Duration.standardSeconds(triggeringFrequency));
+        // use default value true for autoSharding if not configured for STORAGE_WRITE_API
+        if (autoSharding == null || autoSharding) {
           write = write.withAutoSharding();
         }
       }
diff --git a/sdks/python/apache_beam/io/external/xlang_bigqueryio_it_test.py b/sdks/python/apache_beam/io/external/xlang_bigqueryio_it_test.py
index e234aab7314f4..fbfde550ea708 100644
--- a/sdks/python/apache_beam/io/external/xlang_bigqueryio_it_test.py
+++ b/sdks/python/apache_beam/io/external/xlang_bigqueryio_it_test.py
@@ -30,13 +30,12 @@
 from hamcrest.core import assert_that as hamcrest_assert

 import apache_beam as beam
+from apache_beam.io.external.generate_sequence import GenerateSequence
 from apache_beam.io.gcp.bigquery import StorageWriteToBigQuery
 from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper
 from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultMatcher
 from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultStreamingMatcher
-from apache_beam.options.pipeline_options import PipelineOptions
 from apache_beam.testing.test_pipeline import TestPipeline
-from apache_beam.transforms.periodicsequence import PeriodicImpulse
 from apache_beam.utils.timestamp import Timestamp

 # Protect against environments where bigquery library is not available.
@@ -100,13 +99,11 @@ class BigQueryXlangStorageWriteIT(unittest.TestCase):
   ALL_TYPES_SCHEMA = (
       "int:INTEGER,float:FLOAT,numeric:NUMERIC,str:STRING,"
       "bool:BOOLEAN,bytes:BYTES,timestamp:TIMESTAMP")
-  _RUNNER = ""

   def setUp(self):
     self.test_pipeline = TestPipeline(is_integration_test=True)
     self.args = self.test_pipeline.get_full_options_as_args()
     self.project = self.test_pipeline.get_option('project')
-    _RUNNER = PipelineOptions(self.args).get_all_options()['runner']

     self.bigquery_client = BigQueryWrapper()
     self.dataset_id = '%s_%s_%s' % (
@@ -247,7 +244,8 @@ def test_write_with_beam_rows(self):
             table=table_id, expansion_service=self.expansion_service))
     hamcrest_assert(p, bq_matcher)

-  def run_streaming(self, table_name, num_streams=0, use_at_least_once=False):
+  def run_streaming(
+      self, table_name, auto_sharding=False, use_at_least_once=False):
     elements = self.ELEMENTS.copy()
     schema = self.ALL_TYPES_SCHEMA
     table_id = '{}:{}.{}'.format(self.project, self.dataset_id, table_name)
@@ -262,43 +260,33 @@ def run_streaming(self, table_name, num_streams=0, use_at_least_once=False):
         streaming=True,
         allow_unsafe_triggers=True)

-    auto_sharding = (num_streams == 0)
     with beam.Pipeline(argv=args) as p:
       _ = (
           p
-          | PeriodicImpulse(0, 4, 1)
-          | beam.Map(lambda t: elements[t])
+          | GenerateSequence(
+              start=0, stop=4, expansion_service=self.expansion_service)
+          | beam.Map(lambda x: elements[x])
           | beam.io.WriteToBigQuery(
               table=table_id,
               method=beam.io.WriteToBigQuery.Method.STORAGE_WRITE_API,
               schema=schema,
-              triggering_frequency=1,
               with_auto_sharding=auto_sharding,
-              num_storage_api_streams=num_streams,
               use_at_least_once=use_at_least_once,
               expansion_service=self.expansion_service))
     hamcrest_assert(p, bq_matcher)

-  @unittest.skipUnless(
-      "dataflowrunner" in _RUNNER.lower(),
-      "The exactly-once route has the requirement "
-      "`beam:requirement:pardo:on_window_expiration:v1`, "
-      "which is currently only supported by the Dataflow runner.")
-  def test_streaming_with_fixed_num_streams(self):
-    table = 'streaming_fixed_num_streams'
-    self.run_streaming(table_name=table, num_streams=4)
-
-  @unittest.skip(
-      "Streaming to the Storage Write API sink with autosharding is broken "
-      "with Dataflow Runner V2.")
-  def test_streaming_with_auto_sharding(self):
-    table = 'streaming_with_auto_sharding'
+  def test_streaming(self):
+    table = 'streaming'
     self.run_streaming(table_name=table)

   def test_streaming_with_at_least_once(self):
-    table = 'streaming_with_at_least_once'
+    table = 'streaming'
     self.run_streaming(table_name=table, use_at_least_once=True)

+  def test_streaming_with_auto_sharding(self):
+    table = 'streaming_with_auto_sharding'
+    self.run_streaming(table_name=table, auto_sharding=True)
+

 if __name__ == '__main__':
   logging.getLogger().setLevel(logging.INFO)
diff --git a/sdks/python/apache_beam/io/gcp/bigquery.py b/sdks/python/apache_beam/io/gcp/bigquery.py
index 986919fd6b821..e092ad069ad02 100644
--- a/sdks/python/apache_beam/io/gcp/bigquery.py
+++ b/sdks/python/apache_beam/io/gcp/bigquery.py
@@ -1869,7 +1869,6 @@ def __init__(
       # TODO(https://github.com/apache/beam/issues/20712): Switch the default
      # when the feature is mature.
       with_auto_sharding=False,
-      num_storage_api_streams=0,
       ignore_unknown_columns=False,
       load_job_project_id=None,
       max_insert_payload_size=MAX_INSERT_PAYLOAD_SIZE,
@@ -2019,9 +2018,6 @@ def __init__(
         determined number of shards to write to BigQuery. This can be used for
         all of FILE_LOADS, STREAMING_INSERTS, and STORAGE_WRITE_API. Only
         applicable to unbounded input.
-      num_storage_api_streams: Specifies the number of write streams that the
-        Storage API sink will use. This parameter is only applicable when
-        writing unbounded data.
       ignore_unknown_columns: Accept rows that contain values that do not match
         the schema. The unknown values are ignored. Default is False,
         which treats unknown values as errors. This option is only valid for
@@ -2064,7 +2060,6 @@ def __init__(
     self.use_at_least_once = use_at_least_once
     self.expansion_service = expansion_service
     self.with_auto_sharding = with_auto_sharding
-    self._num_storage_api_streams = num_storage_api_streams
     self.insert_retry_strategy = insert_retry_strategy
     self._validate = validate
     self._temp_file_format = temp_file_format or bigquery_tools.FileFormat.JSON
@@ -2264,7 +2259,6 @@ def find_in_nested_dict(schema):
             triggering_frequency=triggering_frequency,
             use_at_least_once=self.use_at_least_once,
             with_auto_sharding=self.with_auto_sharding,
-            num_storage_api_streams=self._num_storage_api_streams,
             expansion_service=self.expansion_service))

     if is_rows:
@@ -2527,7 +2521,6 @@ def __init__(
       triggering_frequency=0,
       use_at_least_once=False,
       with_auto_sharding=False,
-      num_storage_api_streams=0,
       expansion_service=None):
     """Initialize a StorageWriteToBigQuery transform.

@@ -2565,7 +2558,6 @@ def __init__(
     self._triggering_frequency = triggering_frequency
     self._use_at_least_once = use_at_least_once
     self._with_auto_sharding = with_auto_sharding
-    self._num_storage_api_streams = num_storage_api_streams
     self._expansion_service = (
         expansion_service or _default_io_expansion_service())
     self.schematransform_config = SchemaAwareExternalTransform.discover_config(
@@ -2577,7 +2569,6 @@ def expand(self, input):
         expansion_service=self._expansion_service,
         rearrange_based_on_discovery=True,
         autoSharding=self._with_auto_sharding,
-        numStreams=self._num_storage_api_streams,
         createDisposition=self._create_disposition,
         table=self._table,
         triggeringFrequencySeconds=self._triggering_frequency,
diff --git a/website/www/site/content/en/documentation/io/built-in/google-bigquery.md b/website/www/site/content/en/documentation/io/built-in/google-bigquery.md
index 7a31b63a3c96e..eae98b84d2c10 100644
--- a/website/www/site/content/en/documentation/io/built-in/google-bigquery.md
+++ b/website/www/site/content/en/documentation/io/built-in/google-bigquery.md
@@ -788,7 +788,7 @@ BigQuery Storage Write API for Python SDK currently has some limitations on supp
 {{< paragraph class="language-py" >}}
 **Note:** If you want to run WriteToBigQuery with Storage Write API from the source code, you need to run `./gradlew :sdks:java:io:google-cloud-platform:expansion-service:build` to build the expansion-service jar. If you are running from a released Beam SDK, the jar will already be included.

-**Note:** Auto sharding is not currently supported for Python's Storage Write API exactly-once mode on DataflowRunner.
+**Note:** Auto sharding is not currently supported for Python's Storage Write API.
 {{< /paragraph >}}
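
For reference, a condensed sketch of the streaming path this revert restores,
drawn from run_streaming in xlang_bigqueryio_it_test.py as reverted above.
The project, dataset, and table identifiers are placeholders (the real test
derives them from pipeline options), and the default cross-language
expansion service is assumed to be available:

    import apache_beam as beam
    from apache_beam.io.external.generate_sequence import GenerateSequence

    # Placeholder identifiers and elements for illustration only.
    TABLE = 'my-project:my_dataset.streaming'
    SCHEMA = 'int:INTEGER,str:STRING'
    elements = [{'int': i, 'str': str(i)} for i in range(4)]

    args = ['--streaming', '--allow_unsafe_triggers']
    with beam.Pipeline(argv=args) as p:
        _ = (
            p
            # Cross-language sequence source used by the restored test.
            | GenerateSequence(start=0, stop=4)
            | beam.Map(lambda x: elements[x])
            | beam.io.WriteToBigQuery(
                table=TABLE,
                method=beam.io.WriteToBigQuery.Method.STORAGE_WRITE_API,
                schema=SCHEMA,
                # The knobs that survive the revert; per the doc note above,
                # auto-sharding is not supported here, so this sketch leaves
                # it off and opts into at-least-once semantics instead.
                with_auto_sharding=False,
                use_at_least_once=True))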