This repository has been archived by the owner on Sep 14, 2023. It is now read-only.

Switch to S3 Butler configs. #12

Closed
wants to merge 70 commits into from
Commits
450e2b0
chore: refactor for cm and secrets from vault
yee379 Mar 21, 2023
ee2c431
chore: add ns
yee379 Mar 21, 2023
35ea111
fix: use correct cm
yee379 Mar 21, 2023
8c4e6d7
fix: use redis secret
yee379 Mar 21, 2023
547fbc5
fix: use db-auth secret
yee379 Mar 21, 2023
d3aabfa
fix: use s3 secret
yee379 Mar 21, 2023
d36ace2
feat: separate out notification secret
yee379 Mar 21, 2023
8b651dc
chore: make redis deployment more like usdf rsps
yee379 Mar 21, 2023
f3b143d
fix: use compliant secret key names for redis
yee379 Mar 21, 2023
b38c83d
feat: improve redis security
yee379 Mar 21, 2023
2011602
feat: use redis name rather than oga-ingest-redis
yee379 Mar 21, 2023
0850c8c
chore: move bucket and butler repo info back into ENV
yee379 Mar 21, 2023
4502e6b
chore: add blurb
yee379 Mar 21, 2023
33f4097
fix: update to embargo secret path
yee379 May 17, 2023
ce34cf4
refactor: move summit to own namespace
yee379 May 17, 2023
409b636
refactor: move sts to own namespace
yee379 May 17, 2023
805ae34
refactor: move tts to own namespace
yee379 May 17, 2023
ff22840
refactor: move bts to own namespace
yee379 May 17, 2023
9fc796d
chore: add readme
yee379 May 17, 2023
d35633c
refactor: remove top level manifests
yee379 May 17, 2023
f07bab0
refactor: change paths to overlays standard
yee379 May 17, 2023
67e2d1e
refactor: add top level Makefile
yee379 May 17, 2023
0fd6e0b
refactor: change common makefile ref
yee379 May 17, 2023
5eef0ae
fix: enqueue names
yee379 May 17, 2023
972e281
fix: idle names
yee379 May 17, 2023
8595387
Remove unused Redis config.
ktlim May 25, 2023
dc0ad5f
Make vault secrets per-environment.
ktlim May 25, 2023
c07f151
Update container names to embargo.
ktlim May 25, 2023
2287374
Remove unused ingest ConfigMap.
ktlim May 25, 2023
e9e47f6
Set versions for containers.
ktlim May 25, 2023
166f179
Set memory limit for ingest.
ktlim May 25, 2023
fc78549
Set namespace to match env for safety.
ktlim May 25, 2023
75c1047
Fix liveness test to use auth.
ktlim May 25, 2023
7ce5548
Merge pull request #1 from slaclab/u/ktl/update-for-lsstcam
ktlim May 25, 2023
6beb4ed
feat: change redis to sts
yee379 May 25, 2023
0cee66c
feat: change other envs to redis sts
yee379 May 25, 2023
cb86b04
Merge pull request #2 from slaclab/u/ytl/redis-sts
yee379 May 25, 2023
f13ccdc
Rename ingest-deploy file.
ktlim May 25, 2023
1695c76
Synchronize bts with sts.
ktlim May 25, 2023
0b00852
Synchronize summit with sts.
ktlim May 25, 2023
3390b13
Synchronize tts with sts.
ktlim May 25, 2023
eb72c3f
Merge pull request #3 from slaclab/u/ktl/non-sts-envs
ktlim May 25, 2023
33b599e
Add Rucio configuration for STS.
ktlim May 26, 2023
ccfd84e
Bump ingest version.
ktlim May 26, 2023
9cef910
Change from https to davs URL.
ktlim May 26, 2023
bebf8ab
Merge pull request #5 from slaclab/u/ktl/add-rucio-config
ktlim May 26, 2023
ef3e5f6
Hotfix.
ktlim May 26, 2023
3e513ea
Bump ingest version.
ktlim May 26, 2023
0ed1be0
Use proper Rucio host and bump ingest.
ktlim May 26, 2023
140ffdd
Bump ingest version again (Rucio paths).
ktlim May 26, 2023
40af00f
Rename bucket policy files by bucket name.
ktlim Jun 7, 2023
0115fc7
Merge pull request #6 from slaclab/u/ktl/rename-policies
ktlim Jun 7, 2023
910aa49
Add webhook URI.
ktlim Jun 12, 2023
a85bb37
Update container versions.
ktlim Jun 12, 2023
57db243
Bump container versions.
ktlim Jun 12, 2023
ff1b4ae
Merge pull request #7 from slaclab/u/ktl/updates-for-ir2
ktlim Jun 13, 2023
5e858d3
Bump ingest version.
ktlim Jun 14, 2023
31039d1
Bump ingest version.
ktlim Jun 14, 2023
3510765
Merge pull request #8 from slaclab/u/ktl/batch-dataset-attach
ktlim Jul 12, 2023
8d3d5b8
Add playbook info to README.
ktlim Jul 12, 2023
3589022
Merge pull request #9 from slaclab/u/ktl/add-playbook-info
ktlim Jul 12, 2023
010a7bf
Match butler repo configs to USDF.
ktlim Jul 13, 2023
d5c4e07
Add PR reminder to playbook.
ktlim Jul 13, 2023
7f72311
Merge pull request #10 from slaclab/u/ktl/fix-datastores
ktlim Jul 13, 2023
f922ed5
Add notification topic scripts.
ktlim Jul 14, 2023
5e45fdf
Document bucket notification configurations.
ktlim Jul 14, 2023
88dbbd8
Add DB secrets as individual entries.
ktlim Aug 22, 2023
6d681da
Update embargo-butler tags.
ktlim Aug 14, 2023
9f7049a
Change to S3 Butler configs.
ktlim Sep 14, 2023
fb1ccb8
Remove extraneous file.
ktlim Sep 14, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
.secrets
150 changes: 149 additions & 1 deletion README.md
@@ -1,2 +1,150 @@
# usdf_deploy
Deployment configurations and scripts for the USDF.
Deployment configurations and scripts for the data ingest from Rubin to the USDF.


# Background

We expect images from the summit (or a test stand) to arrive at USDF by means of a PUT of an s3 object.
A set of services is notified of that PUT and handles the images, ingesting them into a Butler repository and defining visits for them.

In the short term, the services can also optionally register the images with Rucio for replication.

In order to act upon these images, we expect

- a web notification that a new object has been PUT
- access to the s3 bucket (read only)
- credentials to update the butler database

See [Section 5 of DMTN-143](https://dmtn-143.lsst.io/#implementation) for more details.
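For reference, the webhook notification body follows the AWS/Ceph S3 event record schema; a trimmed, hypothetical example (bucket, key, and secret values here are illustrative only):
```
{
  "Records": [
    {
      "eventName": "ObjectCreated:Put",
      "s3": {
        "bucket": {"name": "rubin-summit"},
        "object": {"key": "some/raw/image/key.fits"}
      },
      "opaqueData": "value of the notification secret"
    }
  ]
}
```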

# Bucket Policies

The buckets are paired, with a raw data bucket and a user data products bucket for each environment.
This lets each environment work the same way as the Summit, enabling more end-to-end testing.

The raw data bucket owner credentials are given to the Camera subsystem to enable writing directly to the bucket.
Eventually the ``transfer_embargo`` service that moves datasets from the embargo repo to the un-embargoed main repo will also have write credentials in order to remove datasets.

The ``rubin-summit-users`` bucket owner credentials are given to all users.
These ``rubin-summit-users`` credentials are then given read access to all buckets and read/write access to all ``-users`` buckets.
The use of a single set of credentials means that users don't need to switch credentials when going from environment to environment.

The bucket policies are applied using the AWS s3api.
```
alias s3api="apptainer exec /sdf/sw/s3/aws-cli_latest.sif aws --endpoint-url https://s3dfrgw.slac.stanford.edu s3api"
s3api --profile=$PROFILE put-bucket-policy --bucket $BUCKET --policy file://$FILE
```
where ``$PROFILE`` specifies the credentials for the bucket owner.
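The policy files in ``bucket-policies/`` are authoritative; as a rough sketch of their shape only (principal and bucket names here are hypothetical), a policy granting a user read-only access to a raw data bucket might look like:
```
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {"AWS": ["arn:aws:iam:::user/rubin-summit-users"]},
      "Action": ["s3:ListBucket", "s3:GetObject"],
      "Resource": ["arn:aws:s3:::rubin-summit", "arn:aws:s3:::rubin-summit/*"]
    }
  ]
}
```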

The bucket policies should only need to be set once, but they may change over time.

# Bucket Notifications

Setting up bucket notifications in S3 (including Ceph) requires two things: creating a notification topic and configuring the bucket to notify on that topic.

Creating a topic uses parameters specified in the [Ceph Bucket Notifications documentation](https://docs.ceph.com/en/latest/radosgw/notifications/#create-a-topic).
For a Kafka or webhook topic, make sure to specify ``push-endpoint`` as a URI.
The auto-ingest system expects ``persistent=true`` in order to allow Ceph to return status as soon as an object is created, without waiting for the notification.
The auto-ingest system also expects webhooks to have an ``OpaqueData`` item which matches the ``notification`` secret in ``vault.slac.stanford.edu``.
Sample command:
```
alias sns='singularity exec /sdf/sw/s3/aws-cli_latest.sif aws --endpoint-url https://s3dfrgw.slac.stanford.edu sns --region=""'
sns create-topic --name=rubin-summit --attributes='{"push-endpoint": "http://172.24.5.174:8080/notify", "OpaqueData": "GETFROMVAULT", "persistent": "true" }'
```
Each topic is assigned an "ARN" by Ceph with its name as the last component for future reference.
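For example, to confirm that a topic exists and inspect its attributes (assuming the ``sns`` alias defined above):
```
sns list-topics
sns get-topic-attributes --topic-arn=arn:aws:sns:default::rubin-summit
```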

Next, one or more notifications need to be configured for the bucket.
Each one links bucket events with topics via the ARN.
Sample JSON:
```
{
    "TopicConfigurations": [
        {
            "Id": "rubin-prompt-processing-prod",
            "TopicArn": "arn:aws:sns:default::rubin-prompt-processing-prod",
            "Events": [
                "s3:ObjectCreated:*"
            ]
        },
        {
            "Id": "rubin-summit-to-http",
            "TopicArn": "arn:aws:sns:default::rubin-summit",
            "Events": [
                "s3:ObjectCreated:*"
            ]
        }
    ]
}
```
Sample command:
```
s3api --profile=wsummit put-bucket-notification-configuration --bucket=rubin-summit --notification-configuration=file:///path/to/my/config.json
```

Note that changing a topic's attributes does not take effect until the bucket notification configurations are rewritten, even if they're updated with the exact same JSON.
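A quick way to force this is to read back the bucket's current configuration and put it again; a minimal sketch, using the bucket and profile from the sample above:
```
s3api --profile=wsummit get-bucket-notification-configuration --bucket=rubin-summit > config.json
s3api --profile=wsummit put-bucket-notification-configuration --bucket=rubin-summit --notification-configuration=file://config.json
```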

# Deployment Structure

The ingest services consist of a Redis pod, a single "enqueue" pod, a single "idle" cleanup pod, and a set of one or more "ingest" pods.

Redis is the inter-pod communications mechanism via persistent queues, and it also acts as the monitoring database.

The "enqueue" pod receives bucket notification webhooks and immediately writes the embedded object store keys to a Redis queue.

The "ingest" pods take the object store keys from the main queue, copy them atomically to a per-pod worker queue, and process them, removing them from the worker queue when done.

The "idle" pod looks for worker queues that have not been modified in a while, indicating that the corresponding "ingest" pod has died.
It pushes the contents of such queues back onto the main queue so that they're available to other "ingest" pods.
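This follows the standard Redis "reliable queue" pattern; a hypothetical sketch in ``redis-cli`` terms (the real queue key names are internal to the embargo-butler code and will differ):
```
LPUSH FILES bucket/path/to/new/object              # enqueue: record a new object key
LMOVE FILES WORKER:ingest-0 RIGHT LEFT             # ingest: atomically claim a key into its worker queue
LREM WORKER:ingest-0 1 bucket/path/to/new/object   # ingest: drop the key once processing succeeds
```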

Each pod type has a deployment YAML.
There is also a ``ns.yaml`` that defines the namespace for each environment.
A ``kustomization.yaml`` file adjusts these per environment.
A common ``Makefile`` retrieves secrets from vault.slac.stanford.edu, dumps or applies the customized YAML, and cleans up the secrets.

# Deployment Process

Log into the k8s cluster.
On USDF the two vclusters are https://k8s.slac.stanford.edu/usdf-embargo-dmz-dev and https://k8s.slac.stanford.edu/usdf-embargo-dmz for dev and prod respectively.

Obtain a token to access the secrets in vault:
```
# obtain token to access secrets
export VAULT_ADDR=https://vault.slac.stanford.edu
vault login -method ldap -username <username>
```
Alternatively, especially for those without Windows LDAP accounts, copy the token from the web interface at vault.slac.stanford.edu and provide it to ``vault login``.
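If you go the token route, a minimal sketch (``vault login`` with no arguments prompts for the token):
```
export VAULT_ADDR=https://vault.slac.stanford.edu
vault login   # paste the token copied from the web interface when prompted
```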

Apply the Kubernetes manifests:
```
cd kubernetes/overlays
make apply
```
The above will authenticate you against our vault instance so that you can obtain the most up-to-date secrets, download the passwords temporarily into your working directory, push the Kubernetes manifests to the cluster, and then remove the secrets.
You can also apply one environment at a time.
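For example, to deploy only the ``sts`` environment:
```
cd kubernetes/overlays/sts
make apply
```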

The external (but SLAC-internal) IP address of the ``-butler-enqueue`` service needs to be used in the endpoint address for the webhook notification topic for the corresponding raw data bucket (e.g. ``http://172.24.5.180:8080/notify``).
The OpaqueData value for that notification topic should match the notification secret in vault.
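To find that IP address, query the service in the environment's namespace; shown here for ``sts`` (adjust the namespace and service name per environment):
```
kubectl -n sts get service sts-butler-enqueue
```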

# Monitoring the Services

The Loki log explorer at grafana.slac.stanford.edu is the best way to monitor the services for now.
Select the ``vcluster--usdf-embargo-dmz`` namespace and (usually) the ``ingest`` container.
Searching for "ERROR" may be helpful.
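A LogQL query along these lines can be pasted into the explorer; treat it as a sketch, since the exact label names depend on how the Loki labels are set up:
```
{namespace="vcluster--usdf-embargo-dmz", container="ingest"} |= "ERROR"
```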

# Scaling the Deployment

If the latency between delivery of the image file and ingest seems high, increasing the number of ingest pods should help.

``kubectl scale --replicas=N deployment/{env}-butler-ingest`` can be used to dynamically scale the number of ingest pods up or down.
Editing the deployment to change the number of replicas has the same effect.
Note that these changes only persist until the next ``make apply``.
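For example, to run four ingest pods in the ``sts`` environment and confirm that they come up (the ``app=ingest`` label is assumed here):
```
kubectl -n sts scale --replicas=4 deployment/sts-butler-ingest
kubectl -n sts get pods -l app=ingest
```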

# Software Update Process

1. After updating the service code in https://github.com/lsst-dm/embargo-butler, tag main with a ``vX.Y.Z`` semantic version.
This will automatically build and publish containers with that tag.
1. Update the ``kustomization.yaml`` to select that tag for the environments and pods where it is needed.
1. Apply and ensure that the deployment is correct in the dev cluster.
``/sdf/home/k/ktl/ingest_trigger/trigger_ingest.py`` may be of use in testing.
1. Commit and merge the deployment update. Note that PRs need to be manually set to go against the ``slaclab`` repo ``main`` branch, since it is a fork.
1. Apply and ensure that the deployment is correct in the prod cluster.
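Selecting a tag means editing the ``newTag`` field for the relevant image in the environment's ``kustomization.yaml``, e.g. (the version shown is hypothetical):
```
images:
  - name: ghcr.io/lsst-dm/embargo-butler-ingest
    newTag: "1.2.3"
```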
3 changes: 3 additions & 0 deletions bucket-policies/rubin-sts-topic.sh
@@ -0,0 +1,3 @@
alias sns='singularity exec /sdf/sw/s3/aws-cli_latest.sif aws --endpoint-url https://s3dfrgw.slac.stanford.edu sns --region=""'
sns delete-topic --topic-arn=arn:aws:sns:default::rubin-sts
sns create-topic --name=rubin-sts --attributes='{"push-endpoint": "http://172.24.5.180:8080/notify", "OpaqueData": "GET FROM VAULT", "persistent": "true" }'
3 changes: 3 additions & 0 deletions bucket-policies/rubin-summit-topic.sh
@@ -0,0 +1,3 @@
alias sns='singularity exec /sdf/sw/s3/aws-cli_latest.sif aws --endpoint-url https://s3dfrgw.slac.stanford.edu sns --region=""'
sns delete-topic --topic-arn=arn:aws:sns:default::rubin-summit
sns create-topic --name=rubin-summit --attributes='{"push-endpoint": "http://172.24.5.174:8080/notify", "OpaqueData": "GET FROM VAULT", "persistent": "true" }'
6 changes: 6 additions & 0 deletions kubernetes/overlays/Makefile
@@ -0,0 +1,6 @@

apply:
	cd summit && make apply
	cd bts && make apply
	cd tts && make apply
	cd sts && make apply
30 changes: 30 additions & 0 deletions kubernetes/overlays/Makefile.per_environment
@@ -0,0 +1,30 @@
SECRET_PATH ?= secret/rubin/usdf-embargo-dmz
ENV = $(notdir ${CURDIR})

get-secrets-from-vault:
	mkdir -p etc/.secrets/
	# internal redis password and Ceph notification secret
	set -e; for i in redis notification; do vault kv get --field=$$i ${SECRET_PATH}/${ENV} > etc/.secrets/$$i ; done
	# butler dbauth.yaml file with creds
	set -e; for i in db-auth.yaml; do vault kv get --field=$$i secret/rubin/usdf-butler/client-config > etc/.secrets/$$i ; done
	# butler Postgres creds
	set -e; for i in password user; do vault kv get --field=$$i secret/rubin/usdf-butler/postgres > etc/.secrets/pg_$$i ; done
	# s3 creds for data from the summit (s3df ceph bucket)
	set -e; for i in access_key secret_key; do vault kv get --field=$$i secret/rubin/embargo/read/rubin-summit-users > etc/.secrets/$$i ; done
	# optional rucio service account ssh key
	for i in register_svc_rsa; do vault kv get --field=$$i ${SECRET_PATH}/${ENV} > etc/.secrets/$$i || echo ignoring missing $$i ; done

clean-secrets:
	rm -rf etc/.secrets/

run-dump:
	kubectl kustomize .

dump: get-secrets-from-vault run-dump clean-secrets

run-apply:
	kubectl apply -k .

apply: get-secrets-from-vault run-apply clean-secrets


6 changes: 6 additions & 0 deletions kubernetes/overlays/README.md
@@ -0,0 +1,6 @@
Obtain your k8s token via https://k8s.slac.stanford.edu/{usdf-embargo-dmz,usdf-embargo-dmz-dev}, and then deploy each separate environment by cd'ing into its directory and running

    vault login -method=ldap
    make apply


1 change: 1 addition & 0 deletions kubernetes/overlays/bts/Makefile
kubernetes/overlays/bts/enqueue-deploy.yaml
@@ -2,7 +2,7 @@
apiVersion: v1
kind: Service
metadata:
name: oga-butler-enqueue
name: bts-butler-enqueue
labels:
app: enqueue
annotations:
@@ -20,7 +20,7 @@ spec:
apiVersion: apps/v1
kind: Deployment
metadata:
name: oga-butler-enqueue
name: bts-butler-enqueue
labels:
app: enqueue
spec:
@@ -35,23 +35,20 @@
spec:
containers:
- name: enqueue
image: "ghcr.io/lsst-dm/oga-butler-enqueue:latest"
image: "ghcr.io/lsst-dm/embargo-butler-enqueue:prod"
env:
- name: REDIS_HOST
valueFrom:
configMapKeyRef:
name: oga-butler-redis-common
key: redis_host
value: redis
- name: REDIS_PASSWORD
valueFrom:
secretKeyRef:
name: oga-butler-enqueue-secret
key: redis_password
name: redis
key: redis-password
- name: NOTIFICATION_SECRET
valueFrom:
secretKeyRef:
name: oga-butler-enqueue-secret
key: notification_secret
name: notification
key: secret
resources:
limits:
cpu: "1"
kubernetes/overlays/bts/idle-deploy.yaml
@@ -2,7 +2,7 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: oga-butler-idle
name: bts-butler-idle
labels:
app: idle
spec:
@@ -17,18 +17,15 @@
spec:
containers:
- name: idle
image: "ghcr.io/lsst-dm/oga-butler-idle:latest"
image: "ghcr.io/lsst-dm/embargo-butler-idle:prod"
env:
- name: REDIS_HOST
valueFrom:
configMapKeyRef:
name: oga-butler-redis-common
key: redis_host
value: redis
- name: REDIS_PASSWORD
valueFrom:
secretKeyRef:
name: oga-butler-ingest-secret
key: redis_password
name: redis
key: redis-password
resources:
limits:
cpu: "0.1"
kubernetes/overlays/bts/ingest-deploy.yaml
@@ -20,7 +20,7 @@ spec:
spec:
initContainers:
- name: "fix-secret-permissions"
image: "ghcr.io/lsst-dm/oga-butler-ingest:latest"
image: "ghcr.io/lsst-dm/embargo-butler-ingest:prod"
command:
- "/bin/bash"
- "-c"
@@ -35,46 +35,35 @@
name: secrets
containers:
- name: ingest
image: "ghcr.io/lsst-dm/oga-butler-ingest:latest"
image: "ghcr.io/lsst-dm/embargo-butler-ingest:prod"
env:
- name: S3_ENDPOINT_URL
valueFrom:
configMapKeyRef:
name: oga-butler-ingest-config
key: s3_url
value: https://s3dfrgw.slac.stanford.edu
- name: REDIS_HOST
valueFrom:
configMapKeyRef:
name: oga-butler-redis-common
key: redis_host
value: redis
- name: BUCKET
valueFrom:
configMapKeyRef:
name: oga-butler-ingest-config
key: bts_bucket
value: rubin-bts
- name: BUTLER_REPO
valueFrom:
configMapKeyRef:
name: oga-butler-ingest-config
key: bts_butler_repo
value: s3://rubin-bts-users/butler.yaml
- name: REDIS_PASSWORD
valueFrom:
secretKeyRef:
name: oga-butler-ingest-secret
key: redis_password
name: redis
key: redis-password
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: oga-butler-ingest-secret
name: s3
key: s3_access
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: oga-butler-ingest-secret
name: s3
key: s3_key
resources:
limits:
cpu: "1"
memory: "2Gi"
volumeMounts:
- mountPath: /home/lsst/.lsst
name: secrets
@@ -83,4 +72,4 @@ spec:
emptyDir: {}
- name: db-auth
secret:
secretName: oga-butler-ingest-dbauth
secretName: db-auth
37 changes: 37 additions & 0 deletions kubernetes/overlays/bts/kustomization.yaml
@@ -0,0 +1,37 @@
images:
  - name: ghcr.io/lsst-dm/embargo-butler-enqueue
    newTag: "0.99"
  - name: ghcr.io/lsst-dm/embargo-butler-ingest
    newTag: "0.99"
  - name: ghcr.io/lsst-dm/embargo-butler-idle
    newTag: "0.99"
  - name: docker.io/redis
    newTag: "7.0.8"

namespace: bts

resources:
  - ns.yaml
  - enqueue-deploy.yaml
  - idle-deploy.yaml
  - ingest-deploy.yaml
  - redis-deploy.yaml

secretGenerator:
  - name: s3
    files:
      - s3_access=etc/.secrets/access_key
      - s3_key=etc/.secrets/secret_key
  - name: redis
    files:
      - redis-password=etc/.secrets/redis
  - name: db-auth
    files:
      - db-auth.yaml=etc/.secrets/db-auth.yaml
  - name: db-env
    files:
      - pg_password=etc/.secrets/pg_password
      - pg_user=etc/.secrets/pg_user
  - name: notification
    files:
      - secret=etc/.secrets/notification