helium · mikev · Jul 21, 2023 · Jul 21, 2023 · Jul 21, 2023 · Jul 22, 2023
diff --git a/manifests/poc-data-cluster/prod/helium/mobile-rewards-share-delta-lake-sink.yaml b/manifests/poc-data-cluster/prod/helium/mobile-rewards-share-delta-lake-sink.yaml
@@ -0,0 +1,66 @@
+apiVersion: batch/v1
+kind: CronJob  # Runs the protobuf-delta-lake-sink image on a schedule for mobile_reward_share
+metadata:
+  name: mobile-rewards-share-delta-lake-sink
+  namespace: helium
+spec:
+  concurrencyPolicy: Forbid  # skip a run while the previous one is still active
+  schedule: "10 */4 * * *"  # minute 10 of every 4th hour
+  jobTemplate:
+    spec:
+      backoffLimit: 10
+      template:
+        spec:
+          serviceAccountName: s3-data-lake-bucket-access
+          tolerations:  # Schedule the job pod on the spot instance group
+            - key: dedicated
+              operator: Equal
+              value: spark
+              effect: NoSchedule
+          nodeSelector:
+            nodegroup-type: spot
+          containers:
+            - name: mobile-rewards-delta-lake-sink
+              image: public.ecr.aws/k0m1p4t7/protobuf-delta-lake-sink:0.0.10
+              imagePullPolicy: IfNotPresent
+              resources:
+                requests:
+                  cpu: 1000m
+                  memory: 6900Mi
+                limits:
+                  memory: 6900Mi
+              env:
+                - name: AWS_S3_ALLOW_UNSAFE_RENAME  # NOTE(review): presumably permits S3 writes without a locking provider; confirm
+                  value: "true"
+              args:
+                - --source-bucket
+                - foundation-poc-data-requester-pays
+                - --source-region
+                - us-west-2
+                - --file-prefix
+                - foundation-iot-verified-rewards/mobile_reward_share  # NOTE(review): "iot" prefix for a mobile table; confirm
+                - --source-proto-name
+                - "mobile_reward_share"
+                - --source-proto-base-url
+                - https://raw.githubusercontent.com/helium/proto/master/src
+                - --source-protos
+                - data_rate.proto
+                - --source-protos
+                - service/packet_verifier.proto
+                - --source-protos
+                - service/poc_mobile.proto
+                - --source-protos
+                - region.proto
+                - --target-bucket
+                - foundation-data-lake-requester-pays
+                - --target-table
+                - bronze/mobile_reward_share
+                - --target-region
+                - us-west-2
+                - --partition-timestamp-column
+                - start_period
+                - --partition-timestamp-date-divisor
+                - "86400"  # 86400 s = 1 day
+                - --batch-size
+                - "500000000"  # Targeting ~500 MB Parquet files, per Databricks recs on large tables
+          restartPolicy: OnFailure
diff --git a/manifests/poc-data-cluster/prod/spark/iot-data-reward-totals-silver.yaml b/manifests/poc-data-cluster/prod/spark/iot-data-reward-totals-silver.yaml
@@ -0,0 +1,83 @@
+apiVersion: v1
+kind: ConfigMap  # Holds query.sql, mounted at /app by the iot-data-reward-totals-silver SparkApplication
+metadata:
+  name: iot-data-reward-totals-silver-query
+  namespace: spark
+data:  # `date` is selected next to an aggregate, so it must also be a grouping column
+  query.sql: |
+    SELECT date, sum(dc_transfer_amount_iot) AS data_iot_total
+    FROM iot_reward_share
+    GROUP BY date
+---
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: SparkApplication  # Runs the SQL from the mounted query.sql via the spark-streaming-sql assembly jar
+metadata:
+  name: iot-data-reward-totals-silver
+  namespace: spark
+spec:
+  type: Scala
+  mode: cluster
+  image: "public.ecr.aws/k0m1p4t7/spark:v3.4.0-aws"
+  imagePullPolicy: Always
+  mainClass: Main
+  mainApplicationFile: "s3a://foundation-data-lake-requester-pays/jars/spark-streaming-sql-assembly-1.0.jar"
+  sparkVersion: "3.4.0"
+  restartPolicy:
+    type: OnFailure
+    onFailureRetries: 3
+    onFailureRetryInterval: 10  # seconds
+    onSubmissionFailureRetries: 3
+    onSubmissionFailureRetryInterval: 10  # seconds
+  sparkConf:
+    spark.databricks.delta.autoCompact.enabled: "true"
+  hadoopConf:
+    fs.s3a.aws.credentials.provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider
+  volumes:
+    - name: "tmp"
+      hostPath:
+        path: "/tmp"
+        type: Directory
+    - name: config-vol  # exposes query.sql from the ConfigMap below
+      configMap:
+        name: iot-data-reward-totals-silver-query
+        items:
+          - key: query.sql
+            path: query.sql
+  driver:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    nodeSelector:
+      node.kubernetes.io/instance-type: m5.large
+    envVars:
+      TABLE_IOT_REWARD_SHARE: s3a://foundation-data-lake-requester-pays/silver/iot-reward-share
+      PARTITION_BY: "date"
+      CHECKPOINT: s3a://foundation-data-lake-requester-pays/checkpoints/iot-data-reward-totals
+      OUTPUT: s3a://foundation-data-lake-requester-pays/silver/iot-data-reward-totals
+      QUERY_PATH: /app/query.sql  # file provided by the config-vol mount at /app
+    labels:
+      version: "3.4.0"
+    volumeMounts:
+      - name: "tmp"  # must match an entry under spec.volumes
+        mountPath: "/tmp"
+      - name: config-vol
+        mountPath: /app
+  executor:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    instances: 3
+    memory: "10G"
+    tolerations:  # Schedule executor pods on spot instance group
+      - key: dedicated
+        operator: Equal
+        value: spark
+        effect: NoSchedule
+    nodeSelector:
+      nodegroup-type: spot
+    labels:
+      version: "3.4.0"
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
diff --git a/manifests/poc-data-cluster/prod/spark/iot-netid-counts-silver.yaml b/manifests/poc-data-cluster/prod/spark/iot-netid-counts-silver.yaml
@@ -0,0 +1,85 @@
+apiVersion: v1
+kind: ConfigMap  # Holds query.sql, mounted at /app by the iot-netid-counts-silver SparkApplication
+metadata:
+  name: iot-netid-counts-silver-query  # must match the configMap name referenced by the SparkApplication's config-vol
+  namespace: spark
+data:  # `date` is selected next to an aggregate, so it must also be a grouping column
+  query.sql: |
+    SELECT
+      date, net_id, count(net_id) AS count
+    FROM iot_packets
+    GROUP BY date, net_id
+    ORDER BY net_id DESC
+---
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: SparkApplication  # Runs the SQL from the mounted query.sql via the spark-streaming-sql assembly jar
+metadata:
+  name: iot-netid-counts-silver
+  namespace: spark
+spec:
+  type: Scala
+  mode: cluster
+  image: "public.ecr.aws/k0m1p4t7/spark:v3.4.0-aws"
+  imagePullPolicy: Always
+  mainClass: Main
+  mainApplicationFile: "s3a://foundation-data-lake-requester-pays/jars/spark-streaming-sql-assembly-1.0.jar"
+  sparkVersion: "3.4.0"
+  restartPolicy:
+    type: OnFailure
+    onFailureRetries: 3
+    onFailureRetryInterval: 10  # seconds
+    onSubmissionFailureRetries: 3
+    onSubmissionFailureRetryInterval: 10  # seconds
+  sparkConf:
+    spark.databricks.delta.autoCompact.enabled: "true"
+  hadoopConf:
+    fs.s3a.aws.credentials.provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider
+  volumes:
+    - name: "tmp"
+      hostPath:
+        path: "/tmp"
+        type: Directory
+    - name: config-vol  # exposes query.sql from the named ConfigMap
+      configMap:
+        name: iot-netid-counts-silver-query
+        items:
+          - key: query.sql
+            path: query.sql
+  driver:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    nodeSelector:
+      node.kubernetes.io/instance-type: m5.large
+    envVars:
+      TABLE_IOT_PACKETS: s3a://foundation-data-lake-requester-pays/silver/iot-packets
+      PARTITION_BY: "date"
+      CHECKPOINT: s3a://foundation-data-lake-requester-pays/checkpoints/iot-netid-counts
+      OUTPUT: s3a://foundation-data-lake-requester-pays/silver/iot-netid-counts
+      QUERY_PATH: /app/query.sql  # file provided by the config-vol mount at /app
+    labels:
+      version: "3.4.0"
+    volumeMounts:
+      - name: "tmp"  # must match an entry under spec.volumes
+        mountPath: "/tmp"
+      - name: config-vol
+        mountPath: /app
+  executor:
+    serviceAccount: spark-data-lake-access
+    cores: 1
+    coreLimit: "1200m"
+    instances: 3
+    memory: "10G"
+    tolerations:  # Schedule executor pods on spot instance group
+      - key: dedicated
+        operator: Equal
+        value: spark
+        effect: NoSchedule
+    nodeSelector:
+      nodegroup-type: spot
+    labels:
+      version: "3.4.0"
+    volumeMounts:
+      - name: "tmp"
+        mountPath: "/tmp"
diff --git a/manifests/poc-data-cluster/prod/spark/iot-oui-counts-silver.yaml b/manifests/poc-data-cluster/prod/spark/iot-oui-counts-silver.yaml
@@ -0,0 +1,91 @@
+apiVersion: v1
+kind: ConfigMap  # Holds query.sql, mounted at /app by the iot-oui-counts-silver ScheduledSparkApplication
+metadata:
+  name: iot-oui-counts-silver-query
+  namespace: spark
+data:  # NOTE(review): WHERE is assumed to mean "yesterday's rows only"; confirm the predicate and that `date` is a DATE column
+  query.sql: |
+    SELECT
+      date, oui, count(oui) AS count
+    FROM iot_packets
+    WHERE date = date_add(current_date(), -1)
+    GROUP BY date, oui
+    ORDER BY oui DESC;
+---
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: ScheduledSparkApplication  # Daily run of the SQL from the mounted query.sql via the spark-streaming-sql assembly jar
+metadata:
+  name: iot-oui-counts-silver
+  namespace: spark
+spec:
+  schedule: "@daily"
+  concurrencyPolicy: Allow  # NOTE(review): overlapping runs would share the same CHECKPOINT/OUTPUT paths; confirm Allow is intended
+  successfulRunHistoryLimit: 1
+  failedRunHistoryLimit: 3
+  template:
+    type: Scala
+    mode: cluster
+    image: "public.ecr.aws/k0m1p4t7/spark:v3.4.0-aws"
+    imagePullPolicy: Always
+    mainClass: Main
+    mainApplicationFile: "s3a://foundation-data-lake-requester-pays/jars/spark-streaming-sql-assembly-1.0.jar"
+    sparkVersion: "3.4.0"
+    restartPolicy:
+      type: OnFailure
+      onFailureRetries: 3
+      onFailureRetryInterval: 10  # seconds
+      onSubmissionFailureRetries: 3
+      onSubmissionFailureRetryInterval: 10  # seconds
+    sparkConf:
+      spark.databricks.delta.autoCompact.enabled: "true"
+    hadoopConf:
+      fs.s3a.aws.credentials.provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider
+    volumes:
+      - name: "tmp"
+        hostPath:
+          path: "/tmp"
+          type: Directory
+      - name: config-vol  # exposes query.sql from the named ConfigMap
+        configMap:
+          name: iot-oui-counts-silver-query
+          items:
+            - key: query.sql
+              path: query.sql
+    driver:
+      serviceAccount: spark-data-lake-access
+      cores: 1
+      coreLimit: "1200m"
+      memory: "512m"
+      nodeSelector:
+        node.kubernetes.io/instance-type: m5.large
+      envVars:
+        TABLE_IOT_PACKETS: s3a://foundation-data-lake-requester-pays/silver/iot-packets
+        PARTITION_BY: "date"
+        CHECKPOINT: s3a://foundation-data-lake-requester-pays/checkpoints/iot-oui-counts
+        OUTPUT: s3a://foundation-data-lake-requester-pays/silver/iot-oui-counts
+        QUERY_PATH: /app/query.sql  # file provided by the config-vol mount at /app
+      labels:
+        version: "3.4.0"
+      volumeMounts:
+        - name: "tmp"  # must match an entry under template.volumes
+          mountPath: "/tmp"
+        - name: config-vol
+          mountPath: /app
+    executor:
+      serviceAccount: spark-data-lake-access
+      cores: 1
+      coreLimit: "1200m"
+      instances: 3
+      memory: "10G"
+      tolerations:  # Schedule executor pods on spot instance group
+        - key: dedicated
+          operator: Equal
+          value: spark
+          effect: NoSchedule
+      nodeSelector:
+        nodegroup-type: spot
+      labels:
+        version: "3.4.0"
+      volumeMounts:
+        - name: "tmp"
+          mountPath: "/tmp"