From a141f4c0b277a2aa22f572d24b393e39879436cf Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:25:46 +0100 Subject: [PATCH 01/12] chore: update yoyo migrations; add VS Code helper --- .vscode/launch.json | 31 ++++++++++++++++++++++++++++++- pyproject.toml | 7 ++++--- requirements-dev.txt | 34 +++++++++++++++++++--------------- requirements.txt | 18 +++++++++--------- 4 files changed, 62 insertions(+), 28 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 4a6ec4f..a658f52 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -12,10 +12,39 @@ "args": [ "--operation", "checker", + "--single-run", + "--run-for-n-datasets", + "75" + ], + "console": "integratedTerminal", + "envFile": "${workspaceFolder}/.env" + }, + { + "name": "Python Debugger: Bulk Data Service - Zipper - Single Run", + "type": "debugpy", + "request": "launch", + "program": "src/iati_bulk_data_service.py", + "args": [ + "--operation", + "zipper", "--single-run" ], "console": "integratedTerminal", "envFile": "${workspaceFolder}/.env" - } + }, + { + "name": "Python Debugger: Bulk Data Service - Checker & Zipper Loop", + "type": "debugpy", + "request": "launch", + "program": "src/iati_bulk_data_service.py", + "args": [ + "--operation", + "checker", + "--run-for-n-datasets", + "75" + ], + "console": "integratedTerminal", + "envFile": "${workspaceFolder}/.env" + }, ] } \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index ac7e687..f982c49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,8 +7,8 @@ dependencies = [ "azure-storage-blob==12.20.0", "psycopg[binary,pool]==3.1.18", "requests==2.31.0", - "yoyo-migrations==8.2.0", - "prometheus-client==0.20.0" + "yoyo-migrations==9.0.0", + "prometheus-client==0.20.0", ] @@ -22,7 +22,8 @@ dev = [ "flake8", "flake8-pyproject", "types-requests", - "python-dotenv" + "python-dotenv", + "pytest-watcher" ] diff --git a/requirements-dev.txt b/requirements-dev.txt index 9169a7e..d7ec9d2 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,13 +8,13 @@ azure-core==1.30.2 # via azure-storage-blob azure-storage-blob==12.20.0 # via bulk-data-service (pyproject.toml) -black==24.4.2 +black==24.8.0 # via bulk-data-service (pyproject.toml) build==1.2.1 # via pip-tools -certifi==2024.7.4 +certifi==2024.8.30 # via requests -cffi==1.16.0 +cffi==1.17.1 # via cryptography charset-normalizer==3.3.2 # via requests @@ -22,17 +22,17 @@ click==8.1.7 # via # black # pip-tools -cryptography==42.0.8 +cryptography==43.0.1 # via azure-storage-blob -flake8==7.1.0 +flake8==7.1.1 # via # bulk-data-service (pyproject.toml) # flake8-pyproject flake8-pyproject==1.2.3 # via bulk-data-service (pyproject.toml) -idna==3.7 +idna==3.8 # via requests -importlib-metadata==8.0.0 +importlib-metadata==8.4.0 # via yoyo-migrations iniconfig==2.0.0 # via pytest @@ -42,7 +42,7 @@ isort==5.13.2 # via bulk-data-service (pyproject.toml) mccabe==0.7.0 # via flake8 -mypy==1.10.1 +mypy==1.11.2 # via bulk-data-service (pyproject.toml) mypy-extensions==1.0.0 # via @@ -69,7 +69,7 @@ psycopg-binary==3.1.18 # via psycopg psycopg-pool==3.2.2 # via psycopg -pycodestyle==2.12.0 +pycodestyle==2.12.1 # via flake8 pycparser==2.22 # via cffi @@ -79,7 +79,9 @@ pyproject-hooks==1.1.0 # via # build # pip-tools -pytest==8.2.2 +pytest==8.3.2 + # via bulk-data-service (pyproject.toml) +pytest-watcher==0.4.3 # via bulk-data-service (pyproject.toml) python-dotenv==1.0.1 # via bulk-data-service (pyproject.toml) @@ -91,11 +93,11 @@ 
six==1.16.0 # via # azure-core # isodate -sqlparse==0.5.0 +sqlparse==0.5.1 # via yoyo-migrations tabulate==0.9.0 # via yoyo-migrations -types-requests==2.32.0.20240622 +types-requests==2.32.0.20240905 # via bulk-data-service (pyproject.toml) typing-extensions==4.12.2 # via @@ -108,11 +110,13 @@ urllib3==2.2.2 # via # requests # types-requests -wheel==0.43.0 +watchdog==5.0.2 + # via pytest-watcher +wheel==0.44.0 # via pip-tools -yoyo-migrations==8.2.0 +yoyo-migrations==9.0.0 # via bulk-data-service (pyproject.toml) -zipp==3.19.2 +zipp==3.20.1 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements.txt b/requirements.txt index a37dbd9..2e62df3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,23 +2,23 @@ # This file is autogenerated by pip-compile with Python 3.12 # by the following command: # -# pip-compile --strip-extras pyproject.toml +# pip-compile --output-file=requirements.txt --strip-extras pyproject.toml # azure-core==1.30.2 # via azure-storage-blob azure-storage-blob==12.20.0 # via bulk-data-service (pyproject.toml) -certifi==2024.7.4 +certifi==2024.8.30 # via requests -cffi==1.16.0 +cffi==1.17.1 # via cryptography charset-normalizer==3.3.2 # via requests -cryptography==42.0.8 +cryptography==43.0.1 # via azure-storage-blob -idna==3.7 +idna==3.8 # via requests -importlib-metadata==8.0.0 +importlib-metadata==8.4.0 # via yoyo-migrations isodate==0.6.1 # via azure-storage-blob @@ -40,7 +40,7 @@ six==1.16.0 # via # azure-core # isodate -sqlparse==0.5.0 +sqlparse==0.5.1 # via yoyo-migrations tabulate==0.9.0 # via yoyo-migrations @@ -52,7 +52,7 @@ typing-extensions==4.12.2 # psycopg-pool urllib3==2.2.2 # via requests -yoyo-migrations==8.2.0 +yoyo-migrations==9.0.0 # via bulk-data-service (pyproject.toml) -zipp==3.19.2 +zipp==3.20.1 # via importlib-metadata From 465f853bb14f42d7fad841874fd7f2774abc15b9 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:32:10 +0100 Subject: [PATCH 02/12] chore: add and tidy config vars --- .env-example | 7 ++++--- .github/workflows/build-and-deploy-job.yml | 3 +++ .../azure-resource-manager-deployment-template.yml | 8 ++++++-- azure-deployment/generate-manifest-from-template.sh | 3 +++ .../manual-azure-deploy-variables-example.env | 7 ++++--- .../add-default-config-to-github-variables.sh | 1 + azure-provision/default-github-config-template.env | 11 +++++++---- 7 files changed, 28 insertions(+), 12 deletions(-) diff --git a/.env-example b/.env-example index e2dd984..4529794 100644 --- a/.env-example +++ b/.env-example @@ -1,5 +1,7 @@ DATA_REGISTRATION=ckan-registry DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search +DATA_REGISTRY_PUBLISHER_METADATA_URL="https://iatiregistry.org/api/action/organization_list?all_fields=true&include_extras=true&include_tags=true" +DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24 WEB_BASE_URL=http://127.0.0.1:10000/devstoreaccount1 @@ -9,9 +11,6 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24 REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 -# Log file -LOGFILE= - # Sample local setup - values read by docker compose (for simple Postgres DB # creation), and used by the app DB_NAME=bulk_data_service_db @@ -27,3 +26,5 @@ AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM0 AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip + +CHECKER_LOOP_WAIT_MINS=20 diff --git 
a/.github/workflows/build-and-deploy-job.yml b/.github/workflows/build-and-deploy-job.yml index bda631a..cec52bd 100644 --- a/.github/workflows/build-and-deploy-job.yml +++ b/.github/workflows/build-and-deploy-job.yml @@ -103,12 +103,15 @@ jobs: # Variables which configure the app DATA_REGISTRATION: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRATION')] }} DATA_REGISTRY_BASE_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_BASE_URL')] }} + DATA_REGISTRY_PUBLISHER_METADATA_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_METADATA_URL')] }} + DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS')] }} NUMBER_DOWNLOADER_THREADS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'NUMBER_DOWNLOADER_THREADS')] }} FORCE_REDOWNLOAD_AFTER_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'FORCE_REDOWNLOAD_AFTER_HOURS')] }} REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS')] }} ZIP_WORKING_DIR: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'ZIP_WORKING_DIR')] }} AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML')] }} AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP')] }} + CHECKER_LOOP_WAIT_MINS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'CHECKER_LOOP_WAIT_MINS')] }} run: | ./azure-deployment/generate-manifest-from-template.sh diff --git a/azure-deployment/azure-resource-manager-deployment-template.yml b/azure-deployment/azure-resource-manager-deployment-template.yml index c650182..6d0c651 100644 --- a/azure-deployment/azure-resource-manager-deployment-template.yml +++ b/azure-deployment/azure-resource-manager-deployment-template.yml @@ -32,16 +32,20 @@ properties: # Properties of container group value: "#DATA_REGISTRATION#" - name: DATA_REGISTRY_BASE_URL value: "#DATA_REGISTRY_BASE_URL#" + - name: DATA_REGISTRY_PUBLISHER_METADATA_URL + value: "#DATA_REGISTRY_PUBLISHER_METADATA_URL#" + - name: DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS + value: "#DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS#" - name: WEB_BASE_URL value: "#WEB_BASE_URL#" + - name: CHECKER_LOOP_WAIT_MINS + value: "#CHECKER_LOOP_WAIT_MINS#" - name: NUMBER_DOWNLOADER_THREADS value: "#NUMBER_DOWNLOADER_THREADS#" - name: FORCE_REDOWNLOAD_AFTER_HOURS value: "#FORCE_REDOWNLOAD_AFTER_HOURS#" - name: REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS value: "#REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS#" - - name: LOGFILE - value: "" - name: ZIP_WORKING_DIR value: "#ZIP_WORKING_DIR#" - name: AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML diff --git a/azure-deployment/generate-manifest-from-template.sh b/azure-deployment/generate-manifest-from-template.sh index f6b65bb..19abc6a 100755 --- a/azure-deployment/generate-manifest-from-template.sh +++ b/azure-deployment/generate-manifest-from-template.sh @@ -43,6 +43,8 @@ sed -i ''s^#DB_CONNECTION_TIMEOUT#^$DB_CONNECTION_TIMEOUT^g'' ./azure-deployment sed -i ''s^#DATA_REGISTRATION#^$DATA_REGISTRATION^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#DATA_REGISTRY_BASE_URL#^$DATA_REGISTRY_BASE_URL^g'' 
./azure-deployment/azure-resource-manager-deployment-manifest.yml +sed -i ''s^#DATA_REGISTRY_PUBLISHER_METADATA_URL#^$DATA_REGISTRY_PUBLISHER_METADATA_URL^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml +sed -i ''s^#DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS#^$DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#NUMBER_DOWNLOADER_THREADS#^$NUMBER_DOWNLOADER_THREADS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#FORCE_REDOWNLOAD_AFTER_HOURS#^$FORCE_REDOWNLOAD_AFTER_HOURS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS#^$REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml @@ -50,3 +52,4 @@ sed -i ''s^#ZIP_WORKING_DIR#^$ZIP_WORKING_DIR^g'' ./azure-deployment/azure-resou sed -i ''s^#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML#^$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP#^$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#WEB_BASE_URL#^$WEB_BASE_URL^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml +sed -i ''s^#CHECKER_LOOP_WAIT_MINS#^$CHECKER_LOOP_WAIT_MINS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml diff --git a/azure-deployment/manual-azure-deploy-variables-example.env b/azure-deployment/manual-azure-deploy-variables-example.env index fccf05e..198f3ac 100644 --- a/azure-deployment/manual-azure-deploy-variables-example.env +++ b/azure-deployment/manual-azure-deploy-variables-example.env @@ -8,14 +8,15 @@ WEB_BASE_URL= DATA_REGISTRATION=ckan-registry DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search +DATA_REGISTRY_PUBLISHER_METADATA_URL="https://iatiregistry.org/api/action/organization_list?all_fields=true&include_extras=true&include_tags=true" +DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24 FORCE_REDOWNLOAD_AFTER_HOURS=24 -# Log file -LOGFILE= - NUMBER_DOWNLOADER_THREADS=25 REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 ZIP_WORKING_DIR=/tmp/bulk-data-service-zip + +CHECKER_LOOP_WAIT_MINS=20 diff --git a/azure-provision/add-default-config-to-github-variables.sh b/azure-provision/add-default-config-to-github-variables.sh index ce17d50..404c11a 100755 --- a/azure-provision/add-default-config-to-github-variables.sh +++ b/azure-provision/add-default-config-to-github-variables.sh @@ -32,5 +32,6 @@ TARGET_ENVIRONMENT="$1" cp -f azure-provision/default-github-config-template.env azure-provision/default-github-config.env sed -i "s/^/${TARGET_ENVIRONMENT^^}/g" azure-provision/default-github-config.env +sed -i "s/{{TARGET_ENVIRONMENT}}/${TARGET_ENVIRONMENT}/g" azure-provision/default-github-config.env gh variable set --env-file ./azure-provision/default-github-config.env diff --git a/azure-provision/default-github-config-template.env b/azure-provision/default-github-config-template.env index da2fafc..481da52 100644 --- a/azure-provision/default-github-config-template.env +++ b/azure-provision/default-github-config-template.env @@ -1,9 +1,12 @@ -_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=\$web -_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=\$web +_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML="$web" 
+_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP="$web" _DATA_REGISTRATION=ckan-registry _DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search +_DATA_REGISTRY_PUBLISHER_METADATA_URL=https://iatiregistry.org/api/action/organization_list?all_fields=true&include_extras=true&include_tags=true +_DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24 _FORCE_REDOWNLOAD_AFTER_HOURS=24 -_LOGFILE=log _NUMBER_DOWNLOADER_THREADS=25 _REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 -_ZIP_WORKING_DIR=/tmp/bulk-data-service-zip \ No newline at end of file +_ZIP_WORKING_DIR=/tmp/bulk-data-service-zip +_WEB_BASE_URL="https://{{TARGET_ENVIRONMENT}}-bulk-data.iatistandard.org" +_CHECKER_LOOP_WAIT_MINS=20 From 10f865b383fcd9d701fe71f7ab2c8696f89bfb89 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:32:44 +0100 Subject: [PATCH 03/12] feat: turn caching on CDN off --- azure-provision/azure-create-resources.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/azure-provision/azure-create-resources.sh b/azure-provision/azure-create-resources.sh index ebc0652..585bb1b 100755 --- a/azure-provision/azure-create-resources.sh +++ b/azure-provision/azure-create-resources.sh @@ -206,6 +206,11 @@ az cdn endpoint create --resource-group "$RESOURCE_GROUP_NAME" \ --origin-host-header "$AZURE_BASE_HOSTNAME" \ --location global +az cdn endpoint rule add --resource-group "$RESOURCE_GROUP_NAME" \ + --profile-name "$CDN_PROFILE_NAME" \ + --name "$CDN_ENDPOINT_NAME" \ + --action-name "CacheExpiration" \ + --cache-behavior BypassCache --rule-name global --order 0 read -p "Press any key when the CNAME has been created on Cloudflare " -n 1 -r From 398cd8e224ce02052b5083ba268428828c625772 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:37:14 +0100 Subject: [PATCH 04/12] chore: env var for wait time; urllib3 warnings off --- Dockerfile | 2 +- src/bulk_data_service/checker.py | 3 ++- src/config/config.py | 4 +++- src/config/initialisation.py | 9 +++++++++ src/iati_bulk_data_service.py | 3 +++ 5 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 src/config/initialisation.py diff --git a/Dockerfile b/Dockerfile index 3b702c2..b84c3fc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.12-slim-bookworm +FROM python:3.12.5-slim-bookworm RUN apt-get update -y diff --git a/src/bulk_data_service/checker.py b/src/bulk_data_service/checker.py index 5d6d417..f0e1240 100644 --- a/src/bulk_data_service/checker.py +++ b/src/bulk_data_service/checker.py @@ -33,7 +33,8 @@ def checker_service_loop(context: dict): zipper_run(context, datasets_in_zip, datasets_in_bds) - time.sleep(60 * 30) + context["logger"].info("Pausing for {} mins".format(context["CHECKER_LOOP_WAIT_MINS"])) + time.sleep(60 * int(context["CHECKER_LOOP_WAIT_MINS"])) except Exception as e: context["logger"].error( diff --git a/src/config/config.py b/src/config/config.py index 5d1d1ea..28908e4 100644 --- a/src/config/config.py +++ b/src/config/config.py @@ -3,11 +3,12 @@ _config_variables = [ "DATA_REGISTRATION", "DATA_REGISTRY_BASE_URL", + "DATA_REGISTRY_PUBLISHER_METADATA_URL", + "DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS", "WEB_BASE_URL", "NUMBER_DOWNLOADER_THREADS", "FORCE_REDOWNLOAD_AFTER_HOURS", "REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS", - "LOGFILE", "ZIP_WORKING_DIR", "DB_NAME", "DB_USER", @@ -19,6 +20,7 @@ "AZURE_STORAGE_CONNECTION_STRING",
"AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML", "AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP", + "CHECKER_LOOP_WAIT_MINS" ] diff --git a/src/config/initialisation.py b/src/config/initialisation.py new file mode 100644 index 0000000..b8dddf9 --- /dev/null +++ b/src/config/initialisation.py @@ -0,0 +1,9 @@ +import urllib3 + + +def misc_global_initialisation(context: dict): + # these warnings are turned off because many IATI XML files are served + # from web servers with misconfigured certificates, so we can't verify + # certificates, and without this the logs fill up with warnings saying we + # should turn verification on + urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning) diff --git a/src/iati_bulk_data_service.py b/src/iati_bulk_data_service.py index fc9d51e..ff4e7ed 100644 --- a/src/iati_bulk_data_service.py +++ b/src/iati_bulk_data_service.py @@ -3,6 +3,7 @@ from bulk_data_service.checker import checker from bulk_data_service.zipper import zipper from config.config import get_config +from config.initialisation import misc_global_initialisation from utilities.azure import create_azure_blob_containers from utilities.db import apply_db_migrations from utilities.logging import initialise_logging @@ -20,6 +21,8 @@ def main(args: argparse.Namespace): create_azure_blob_containers(context) + misc_global_initialisation(context) + if args.operation == "checker": checker(context) elif args.operation == "zipper": From 7e4a898ffe8f4ca49b0a744e3f9d8aaaf71f2b17 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:20:28 +0100 Subject: [PATCH 05/12] test: update test env --- tests-local-environment/.env | 11 ++++--- ...n-registration-and-data-server-config.json | 33 +++++++++++++++++++ tests/artifacts/config-files/env-file-1 | 9 +++-- tests/artifacts/config-files/env-file-2 | 9 +++-- 4 files changed, 47 insertions(+), 15 deletions(-) diff --git a/tests-local-environment/.env b/tests-local-environment/.env index 717369a..d3658f2 100644 --- a/tests-local-environment/.env +++ b/tests-local-environment/.env @@ -1,5 +1,7 @@ DATA_REGISTRATION=ckan-registry DATA_REGISTRY_BASE_URL=http://localhost:3000/registration/datasets-03 +DATA_REGISTRY_PUBLISHER_METADATA_URL=http://localhost:3000/registration/ckan-publishers +DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24 WEB_BASE_URL=http://127.0.0.1:10000/devstoreaccount1 @@ -9,9 +11,6 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24 REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 -# Logs directory -LOGFILE= - ZIP_WORKING_DIR=/tmp/bulk-data-service-zip DB_NAME=bulk_data_service_db @@ -27,5 +26,7 @@ DB_CONNECTION_TIMEOUT=30 # Azurite Emulator (run from docker compose) AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:11000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:11001/devstoreaccount1;TableEndpoint=http://127.0.0.1:11002/devstoreaccount1; -AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml -AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip +AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=test-data +AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=test-data + +CHECKER_LOOP_WAIT_MINS=1 diff --git a/tests-local-environment/mockoon-registration-and-data-server-config.json b/tests-local-environment/mockoon-registration-and-data-server-config.json index f3f2c39..774d281 100644 --- 
a/tests-local-environment/mockoon-registration-and-data-server-config.json +++ b/tests-local-environment/mockoon-registration-and-data-server-config.json @@ -402,6 +402,35 @@ } ], "responseMode": null + }, + { + "uuid": "52cc8a4f-d934-4056-a1dd-1fd970079d14", + "type": "http", + "documentation": "", + "method": "get", + "endpoint": "registration/ckan-publishers", + "responses": [ + { + "uuid": "dc002f3b-6e85-446b-95cb-223ba39082f6", + "body": "{\n \"help\": \"https://iatiregistry.org/api/3/action/help_show?name=organization_list\",\n \"success\": true,\n \"result\": [\n {\n \"description\": \"\",\n \"id\": \"4f0f8498-20d2-4ca5-a20f-f441eedb1d4f\",\n \"image_display_url\": \"\",\n \"image_url\": \"\",\n \"is_organization\": true,\n \"license_id\": \"notspecified\",\n \"name\": \"test_org_a\",\n \"num_followers\": 0,\n \"package_count\": 0,\n \"publisher_agencies\": \"\",\n \"publisher_constraints\": \"\",\n \"publisher_contact\": \"s'Gravenhekje 1a\\r\\n1011TG Amsterdam\\r\\nThe Netherlands\",\n \"publisher_contact_email\": \"please@update.email\",\n \"publisher_country\": \"NL\",\n \"publisher_data_quality\": \"\",\n \"publisher_description\": \"\",\n \"publisher_field_exclusions\": \"\",\n \"publisher_frequency\": \"\",\n \"publisher_frequency_select\": \"not_specified\",\n \"publisher_iati_id\": \"TEST_ORG_A\",\n \"publisher_implementation_schedule\": \"\",\n \"publisher_organization_type\": \"60\",\n \"publisher_record_exclusions\": \"\",\n \"publisher_refs\": \"\",\n \"publisher_segmentation\": \"\",\n \"publisher_source_type\": \"primary_source\",\n \"publisher_thresholds\": \"\",\n \"publisher_timeliness\": \"\",\n \"publisher_ui\": \"\",\n \"publisher_units\": \"\",\n \"publisher_url\": \"http://www.onepercentclub.com\",\n \"state\": \"active\",\n \"title\": \"Test Organisation A\",\n \"type\": \"organization\",\n \"users\": [\n {\n \"capacity\": \"admin\",\n \"name\": \"onepercentclub\"\n }\n ],\n \"tags\": [\n \n ],\n \"groups\": [\n \n ]\n }, \n {\n \"description\": \"\",\n \"id\": \"31ffc713-cba9-4af9-a71e-9306d00c11e8\",\n \"image_display_url\": \"\",\n \"image_url\": \"\",\n \"is_organization\": true,\n \"license_id\": \"notspecified\",\n \"name\": \"onepercentclub\",\n \"num_followers\": 0,\n \"package_count\": 0,\n \"publisher_agencies\": \"\",\n \"publisher_constraints\": \"\",\n \"publisher_contact\": \"s'Gravenhekje 1a\\r\\n1011TG Amsterdam\\r\\nThe Netherlands\",\n \"publisher_contact_email\": \"please@update.email\",\n \"publisher_country\": \"NL\",\n \"publisher_data_quality\": \"\",\n \"publisher_description\": \"\",\n \"publisher_field_exclusions\": \"\",\n \"publisher_frequency\": \"\",\n \"publisher_frequency_select\": \"not_specified\",\n \"publisher_iati_id\": \"NL-KVK-3427895\",\n \"publisher_implementation_schedule\": \"\",\n \"publisher_organization_type\": \"60\",\n \"publisher_record_exclusions\": \"\",\n \"publisher_refs\": \"\",\n \"publisher_segmentation\": \"\",\n \"publisher_source_type\": \"primary_source\",\n \"publisher_thresholds\": \"\",\n \"publisher_timeliness\": \"\",\n \"publisher_ui\": \"\",\n \"publisher_units\": \"\",\n \"publisher_url\": \"http://www.onepercentclub.com\",\n \"state\": \"active\",\n \"title\": \"1%Club\",\n \"type\": \"organization\",\n \"users\": [\n {\n \"capacity\": \"admin\",\n \"name\": \"onepercentclub\"\n }\n ],\n \"tags\": [\n \n ],\n \"groups\": [\n \n ]\n },\n {\n \"description\": \"\",\n \"id\": \"1a3e3f42-6704-4adf-897a-9bdf5b854a00\",\n \"image_display_url\": \"\",\n \"image_url\": \"\",\n \"is_organization\": 
true,\n \"license_id\": \"notspecified\",\n \"name\": \"3fi\",\n \"num_followers\": 0,\n \"package_count\": 1,\n \"publisher_agencies\": \"\",\n \"publisher_constraints\": \"\",\n \"publisher_contact\": \"3F\\r\\nKampmannsgade 4\\r\\nDK-1790 Copenhagen V\\r\\nDenmark \",\n \"publisher_contact_email\": \"jesper.nielsen@3f.dk\",\n \"publisher_country\": \"DK\",\n \"publisher_data_quality\": \"\",\n \"publisher_description\": \"\",\n \"publisher_field_exclusions\": \"\",\n \"publisher_first_publish_date\": \"\",\n \"publisher_frequency\": \"\",\n \"publisher_frequency_select\": \"not_specified\",\n \"publisher_iati_id\": \"DK-CVR-31378028\",\n \"publisher_implementation_schedule\": \"\",\n \"publisher_organization_type\": \"22\",\n \"publisher_record_exclusions\": \"\",\n \"publisher_refs\": \"\",\n \"publisher_segmentation\": \"\",\n \"publisher_source_type\": \"primary_source\",\n \"publisher_thresholds\": \"\",\n \"publisher_timeliness\": \"\",\n \"publisher_ui\": \"\",\n \"publisher_units\": \"\",\n \"publisher_url\": \"https://tema.3f.dk/international \",\n \"state\": \"active\",\n \"title\": \"3F International\",\n \"type\": \"organization\",\n \"users\": [\n {\n \"capacity\": \"admin\",\n \"name\": \"3-f_international\"\n }\n ],\n \"tags\": [\n \n ],\n \"groups\": [\n \n ]\n },\n {\n \"description\": \"\",\n \"id\": \"8c858655-cb84-4527-befa-b77d4de07e22\",\n \"image_display_url\": \"\",\n \"image_url\": \"\",\n \"is_organization\": true,\n \"license_id\": \"odc-pddl\",\n \"name\": \"fifty_eight\",\n \"num_followers\": 0,\n \"package_count\": 1,\n \"publisher_agencies\": \"\",\n \"publisher_constraints\": \"\",\n \"publisher_contact\": \"2030hub\\r\\n23 Argyle St\\r\\nLiverpool\\r\\nL1 5BL\",\n \"publisher_contact_email\": \"keithshortley@50eight.com\",\n \"publisher_country\": \"GB\",\n \"publisher_data_quality\": \"\",\n \"publisher_description\": \"Fifty Eight partners with companies, NGOs, public sector and others ​to address the challenges of modern slavery and improve working conditions in global supply chains. 
\",\n \"publisher_field_exclusions\": \"\",\n \"publisher_first_publish_date\": \"2020-02-17T00:00:00.000000\",\n \"publisher_frequency\": \"\",\n \"publisher_frequency_select\": \"quarterly\",\n \"publisher_iati_id\": \"GB-COH-07291220\",\n \"publisher_implementation_schedule\": \"\",\n \"publisher_organization_type\": \"70\",\n \"publisher_record_exclusions\": \"\",\n \"publisher_refs\": \"\",\n \"publisher_segmentation\": \"IATI data will be published at country level where possible and appropriate.\",\n \"publisher_source_type\": \"primary_source\",\n \"publisher_thresholds\": \"\",\n \"publisher_timeliness\": \"Within 14 days of period end\",\n \"publisher_ui\": \"\",\n \"publisher_units\": \"UK Aid Connect EAPEC \",\n \"publisher_url\": \"https://www.50eight.com/\",\n \"state\": \"active\",\n \"title\": \"50 Eight Limited\",\n \"type\": \"organization\",\n \"users\": [\n {\n \"capacity\": \"admin\",\n \"name\": \"keithshortley\"\n }\n ],\n \"tags\": [\n \n ],\n \"groups\": [\n \n ]\n }\n ]\n}\n", + "latency": 0, + "statusCode": 200, + "label": "", + "headers": [], + "bodyType": "INLINE", + "filePath": "", + "databucketID": "", + "sendFileAsBody": false, + "rules": [], + "rulesOperator": "OR", + "disableTemplating": false, + "fallbackTo404": false, + "default": true, + "crudKey": "id", + "callbacks": [] + } + ], + "responseMode": null } ], "rootChildren": [ @@ -429,6 +458,10 @@ "type": "route", "uuid": "b81b26b1-62f6-473f-9815-bbb219c7667b" }, + { + "type": "route", + "uuid": "52cc8a4f-d934-4056-a1dd-1fd970079d14" + }, { "type": "route", "uuid": "fe5038e8-f3a5-43d2-ab12-3fc2c72b2c11" diff --git a/tests/artifacts/config-files/env-file-1 b/tests/artifacts/config-files/env-file-1 index 49311fe..9fe7c7f 100644 --- a/tests/artifacts/config-files/env-file-1 +++ b/tests/artifacts/config-files/env-file-1 @@ -1,5 +1,7 @@ DATA_REGISTRATION=ckan-registry DATA_REGISTRY_BASE_URL=http://localhost:3000/registration-service/ +DATA_REGISTRY_PUBLISHER_METADATA_URL=http://localhost:3000/registration/ckan-publishers +DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24 WEB_BASE_URL=http://127.0.0.1:10000/devstoreaccount1 @@ -9,9 +11,6 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24 REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 -# Log file -LOGFILE= - ZIP_WORKING_DIR=/tmp/bulk-data-service-zip DB_NAME=bulk_data_service_db @@ -29,5 +28,5 @@ DB_CONNECTION_TIMEOUT=30 AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1; -AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml -AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip +AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML="test-data" +AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP="test-data" diff --git a/tests/artifacts/config-files/env-file-2 b/tests/artifacts/config-files/env-file-2 index 6577004..131472d 100644 --- a/tests/artifacts/config-files/env-file-2 +++ b/tests/artifacts/config-files/env-file-2 @@ -1,5 +1,7 @@ DATA_REGISTRATION=ckan-registry DATA_REGISTRY_BASE_URL=http://localhost:3000/registration-service/ +DATA_REGISTRY_PUBLISHER_METADATA_URL=http://localhost:3000/registration/ckan-publishers +DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24 WEB_BASE_URL=http://127.0.0.1:10000/devstoreaccount1/ @@ -9,9 +11,6 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24 
REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 -# Log file -LOGFILE= - ZIP_WORKING_DIR=/tmp/bulk-data-service-zip DB_NAME=bulk_data_service_db @@ -29,5 +28,5 @@ DB_CONNECTION_TIMEOUT=30 AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1; -AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml -AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip +AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML="test-data" +AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP="test-data" From 2d472990bcba0d0c225a79dee7d0b64de728d886 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:21:11 +0100 Subject: [PATCH 06/12] test: tests for utility functions --- tests/unit/test_utilities_http.py | 15 +++++ tests/unit/test_utilities_misc.py | 106 +++++++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_utilities_http.py diff --git a/tests/unit/test_utilities_http.py b/tests/unit/test_utilities_http.py new file mode 100644 index 0000000..ef7a883 --- /dev/null +++ b/tests/unit/test_utilities_http.py @@ -0,0 +1,15 @@ +import datetime + +import pytest + +from utilities.http import parse_last_modified_header + + +@pytest.mark.parametrize("input,expected", [ + ("Fri, 06 Sep 2024 13:08:28 GMT", datetime.datetime(2024, 9, 6, 13, 8, 28, 0, datetime.timezone.utc)), + ("Wed, 21 Oct 2015 07:28:00 GMT", datetime.datetime(2015, 10, 21, 7, 28, 0, 0, datetime.timezone.utc)), + ("Wed, 21 Oct 2015 07:28:00", None), + ("Wed, 21 October 2015 07:28:00 +00:00", None), +]) +def test_parse_http_last_modified_header(input, expected): + assert parse_last_modified_header(input) == expected diff --git a/tests/unit/test_utilities_misc.py b/tests/unit/test_utilities_misc.py index e6ea5fa..46e4f18 100644 --- a/tests/unit/test_utilities_misc.py +++ b/tests/unit/test_utilities_misc.py @@ -1,4 +1,6 @@ -from utilities.misc import get_hash, get_hash_excluding_generated_timestamp +import pytest + +from utilities.misc import filter_dict_by_structure, get_hash, get_hash_excluding_generated_timestamp def test_get_hash(): @@ -51,3 +53,105 @@ def test_get_hash_excluding_generated_timestamp(): hash = get_hash_excluding_generated_timestamp(yiplActivitiesXmlFile) assert(hash == "759eaa39276381f3fc146232cefd2111a2abc199") + + +@pytest.mark.parametrize("input,structure,expected", [ + ({"a": 10}, {"a" : None}, {"a": 10}), + ({"a": None}, {"a" : None}, {"a": None}), + ({"a": 10, "b": "should be filtered"}, {"a" : None}, {"a": 10}), + ({"a": 10, "b": "should be filtered", "c" : "also filtered"}, {"a" : None}, {"a": 10}), + ({"a": 10, "b": "included"}, {"a" : None, "b": None}, {"a": 10, "b": "included"}), + ({"b": "included"}, {"a" : None, "b": None}, {"b": "included"}), + ({"c": 10}, {"a" : None, "b": None}, {}), + ({}, {"a" : None, "b": None}, {}), + ({}, {}, {}), +]) +def test_filter_dict_atomic_value(input, structure, expected): + + assert filter_dict_by_structure(input, structure) == expected + + +@pytest.mark.parametrize("input,structure,expected", [ + ({}, + {"a": {"include": None}}, + {}), + + ({"a": {}}, + {"a": {"include": None}}, + {"a": {}}), + + ({"a": {"include": 10, "filter out": 20}}, + {"a": {"include": None}}, + {"a": {"include": 10}}), + + ({"a": 
{"filter out": 20}}, + {"a": {"include": None}}, + {"a": {}}), + + ({"a": {"filter out": 20}}, + {"a": {}}, + {"a": {}}), + + ({"a": None}, + {"a": {}}, + {"a": None}), + + ({"a": None}, + {"a": {"b": None}}, + {"a": None}), + + ({"a": 10}, + {"a": {"b": {}}}, + {"a": 10}), + + ({"a": ["test"]}, + {"a": {"b": {}}}, + {"a": ["test"]}), + + +]) +def test_filter_dict_nested_dict(input, structure, expected): + + assert filter_dict_by_structure(input, structure) == expected + + +@pytest.mark.parametrize("input,structure,expected", [ + ({}, + {"a": []}, + {}), + + ({"a": []}, + {"a": None}, + {"a": []}), + + ({"a": ["one", "two"]}, + {"a": None}, + {"a": ["one", "two"]}), + +]) +def test_filter_dict_with_list_no_dict_items(input, structure, expected): + + assert filter_dict_by_structure(input, structure) == expected + + +@pytest.mark.parametrize("input,structure,expected", [ + ({}, + {"a": [{}]}, + {}), + + ({"a": []}, + {"a": [{}]}, + {"a": []}), + + ({"a": ["one", "two"]}, + {"a": [{}]}, + {"a": [{}, {}]}), + + ({"a": [{"include": 10, "filter": 20}]}, + {"a": [{"include": None}]}, + {"a": [{"include": 10}]}), + +]) +def test_filter_dict_with_list_with_dict_items(input, structure, expected): + + assert filter_dict_by_structure(input, structure) == expected From 3377c9d93f652be33b39daa569ad12fa19e46fdb Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:21:36 +0100 Subject: [PATCH 07/12] test: tests for fetching, storing CKAN metadata --- tests/integration/test_dataset_add.py | 206 +++++++++++++++++- .../integration/test_dataset_registration.py | 82 ++++++- tests/integration/test_dataset_update.py | 202 ++++++++++++++++- 3 files changed, 486 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_dataset_add.py b/tests/integration/test_dataset_add.py index c9ef98c..e5c5c1a 100644 --- a/tests/integration/test_dataset_add.py +++ b/tests/integration/test_dataset_add.py @@ -1,3 +1,4 @@ +import json import uuid import pytest @@ -11,7 +12,107 @@ ("publisher_id", uuid.UUID("ea055d99-f7e9-456f-9f99-963e95493c1b")), ("publisher_name", "test_foundation_a"), ("source_url", "http://localhost:3000/data/test_foundation_a-dataset-404.xml"), - ("type", "activity") + ("type", "activity"), + ("registration_service_name", "ckan-registry"), + ("registration_service_dataset_metadata", json.dumps( + { + "author": None, + "author_email": "publisher@email-here.com", + "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb", + "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159", + "isopen": True, + "license_id": "other-at", + "license_title": "Other (Attribution)", + "maintainer": None, + "maintainer_email": None, + "metadata_created": "2024-03-04T10:24:11.373108", + "metadata_modified": "2024-05-07T15:38:58.740018", + "name": "test_foundation_a-dataset-001", + "notes": "", + "num_resources": 1, + "num_tags": 0, + "organization": { + "id": "ea055d99-f7e9-456f-9f99-963e95493c1b", + "name": "test_foundation_a", + "title": "Test Foundation A", + "type": "organization", + "description": "", + "image_url": "", + "created": "2020-02-24T20:56:01.763851", + "is_organization": True, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "5d04f169-c702-45fe-8162-da7834859d86", + "private": False, + "state": "active", + "title": "040324", + "type": "dataset", + "url": None, + "version": None, + "extras": [ + { + "key": "activity_count", + "value": "10" + }, + { + "key": "country", + "value": "GB" + }, + { + "key": "data_updated", + "value": "2024-03-01 
14:24:09" + }, + { + "key": "filetype", + "value": "activity" + }, + { + "key": "iati_version", + "value": "2.03" + }, + { + "key": "language", + "value": "" + }, + { + "key": "secondary_publisher", + "value": "" + }, + { + "key": "validation_status", + "value": "Not Found" + } + ], + "resources": [ + { + "cache_last_updated": None, + "cache_url": None, + "created": "2024-05-07T15:38:57.312249", + "description": None, + "format": "IATI-XML", + "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac", + "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe", + "last_modified": None, + "metadata_modified": "2024-05-07T15:38:58.757860", + "mimetype": "", + "mimetype_inner": None, + "name": None, + "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9", + "position": 0, + "resource_type": None, + "size": 399382, + "state": "active", + "url": "http://localhost:3000/data/test_foundation_a-dataset-404.xml", + "url_type": None + } + ], + "tags": [], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } + )) ]) def test_add_new_undownloadable_dataset(get_and_clear_up_context, field, expected): # noqa: F811 @@ -21,6 +122,7 @@ def test_add_new_undownloadable_dataset(get_and_clear_up_context, field, expecte # dataset c8a40aa5-9f31-... with 404 context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03" + datasets_in_bds = {} checker_run(context, datasets_in_bds) @@ -37,7 +139,107 @@ def test_add_new_undownloadable_dataset(get_and_clear_up_context, field, expecte ("source_url", "http://localhost:3000/data/test_foundation_a-dataset-001.xml"), ("type", "activity"), ("hash", "7703103493edafde0dbce66e507abb642fc7bd52"), - ("hash_excluding_generated_timestamp", "ef0eead52dfc3bd86311c84327282f607402dfff") + ("hash_excluding_generated_timestamp", "ef0eead52dfc3bd86311c84327282f607402dfff"), + ("registration_service_name", "ckan-registry"), + ("registration_service_dataset_metadata", json.dumps( + { + "author": None, + "author_email": "publisher@email-here.com", + "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb", + "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159", + "isopen": True, + "license_id": "other-at", + "license_title": "Other (Attribution)", + "maintainer": None, + "maintainer_email": None, + "metadata_created": "2024-03-04T10:24:11.373108", + "metadata_modified": "2024-05-07T15:38:58.740018", + "name": "test_foundation_a-dataset-001", + "notes": "", + "num_resources": 1, + "num_tags": 0, + "organization": { + "id": "ea055d99-f7e9-456f-9f99-963e95493c1b", + "name": "test_foundation_a", + "title": "Test Foundation A", + "type": "organization", + "description": "", + "image_url": "", + "created": "2020-02-24T20:56:01.763851", + "is_organization": True, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "5d04f169-c702-45fe-8162-da7834859d86", + "private": False, + "state": "active", + "title": "040324", + "type": "dataset", + "url": None, + "version": None, + "extras": [ + { + "key": "activity_count", + "value": "10" + }, + { + "key": "country", + "value": "GB" + }, + { + "key": "data_updated", + "value": "2024-03-01 14:24:09" + }, + { + "key": "filetype", + "value": "activity" + }, + { + "key": "iati_version", + "value": "2.03" + }, + { + "key": "language", + "value": "" + }, + { + "key": "secondary_publisher", + "value": "" + }, + { + "key": "validation_status", + "value": "Not Found" + } + ], + "resources": [ + { + "cache_last_updated": None, + "cache_url": None, + "created": "2024-05-07T15:38:57.312249", + "description": None, + 
"format": "IATI-XML", + "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac", + "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe", + "last_modified": None, + "metadata_modified": "2024-05-07T15:38:58.757860", + "mimetype": "", + "mimetype_inner": None, + "name": None, + "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9", + "position": 0, + "resource_type": None, + "size": 399382, + "state": "active", + "url": "http://localhost:3000/data/test_foundation_a-dataset-001.xml", + "url_type": None + } + ], + "tags": [], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } + )) ]) def test_add_new_downloadable_dataset(get_and_clear_up_context, field, expected): # noqa: F811 diff --git a/tests/integration/test_dataset_registration.py b/tests/integration/test_dataset_registration.py index 5c2bc1c..2028927 100644 --- a/tests/integration/test_dataset_registration.py +++ b/tests/integration/test_dataset_registration.py @@ -1,11 +1,15 @@ +import json + import pytest from bulk_data_service.checker import checker_run +from dataset_registration.iati_registry_ckan import get_publisher_metadata_as_str from helpers.helpers import get_and_clear_up_context # noqa: F401 +from utilities.http import get_requests_session @pytest.mark.parametrize("http_status_code", ["400", "404", "500"]) -def test_ckan_reg_url_400(get_and_clear_up_context, http_status_code): # noqa: F811 +def test_ckan_registry_url_400(get_and_clear_up_context, http_status_code): # noqa: F811 context = get_and_clear_up_context context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/error-response/" + http_status_code @@ -14,3 +18,79 @@ def test_ckan_reg_url_400(get_and_clear_up_context, http_status_code): # noqa: checker_run(context, datasets_in_bds) assert len(datasets_in_bds) == 0 + + +def test_ckan_registry_get_metadata_known_publisher(get_and_clear_up_context): # noqa: F811 + + expected = json.dumps({ + "description": "", + "id": "1a3e3f42-6704-4adf-897a-9bdf5b854a00", + "image_display_url": "", + "image_url": "", + "is_organization": True, + "license_id": "notspecified", + "name": "3fi", + "num_followers": 0, + "package_count": 1, + "publisher_agencies": "", + "publisher_constraints": "", + "publisher_contact": "3F\r\nKampmannsgade 4\r\nDK-1790 Copenhagen V\r\nDenmark ", + "publisher_contact_email": "jesper.nielsen@3f.dk", + "publisher_country": "DK", + "publisher_data_quality": "", + "publisher_description": "", + "publisher_field_exclusions": "", + "publisher_first_publish_date": "", + "publisher_frequency": "", + "publisher_frequency_select": "not_specified", + "publisher_iati_id": "DK-CVR-31378028", + "publisher_implementation_schedule": "", + "publisher_organization_type": "22", + "publisher_record_exclusions": "", + "publisher_refs": "", + "publisher_segmentation": "", + "publisher_source_type": "primary_source", + "publisher_thresholds": "", + "publisher_timeliness": "", + "publisher_ui": "", + "publisher_units": "", + "publisher_url": "https://tema.3f.dk/international ", + "state": "active", + "title": "3F International", + "type": "organization", + "users": [ + { + "capacity": "admin", + "name": "3-f_international" + } + ], + "tags": [ + + ], + "groups": [ + + ] + }) + + context = get_and_clear_up_context + + context["DATA_REGISTRY_PUBLISHER_METADATA_URL"] = "http://localhost:3000/registration/ckan-publishers" + + session = get_requests_session() + + publisher_metadata_str = get_publisher_metadata_as_str(context, session, "3fi") + + assert publisher_metadata_str == expected + + +def 
test_ckan_registry_get_metadata_unknown_publisher(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + context["DATA_REGISTRY_PUBLISHER_METADATA_URL"] = "http://localhost:3000/registration/ckan-publishers" + + session = get_requests_session() + + publisher_metadata_str = get_publisher_metadata_as_str(context, session, "unknown_publisher") + + assert publisher_metadata_str == "{}" diff --git a/tests/integration/test_dataset_update.py b/tests/integration/test_dataset_update.py index ea6f3e2..53c55ec 100644 --- a/tests/integration/test_dataset_update.py +++ b/tests/integration/test_dataset_update.py @@ -1,3 +1,4 @@ +import json import uuid import pytest @@ -12,7 +13,206 @@ ("publisher_name", "test_foundation_a", "test_org_a"), ("source_url", "http://localhost:3000/data/test_foundation_a-dataset-001.xml", "http://localhost:3000/not_found"), - ("type", "activity", "organisation") + ("type", "activity", "organisation"), + ("registration_service_dataset_metadata", json.dumps( + { + "author": None, + "author_email": "publisher@email-here.com", + "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb", + "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159", + "isopen": True, + "license_id": "other-at", + "license_title": "Other (Attribution)", + "maintainer": None, + "maintainer_email": None, + "metadata_created": "2024-03-04T10:24:11.373108", + "metadata_modified": "2024-05-07T15:38:58.740018", + "name": "test_foundation_a-dataset-001", + "notes": "", + "num_resources": 1, + "num_tags": 0, + "organization": { + "id": "ea055d99-f7e9-456f-9f99-963e95493c1b", + "name": "test_foundation_a", + "title": "Test Foundation A", + "type": "organization", + "description": "", + "image_url": "", + "created": "2020-02-24T20:56:01.763851", + "is_organization": True, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "5d04f169-c702-45fe-8162-da7834859d86", + "private": False, + "state": "active", + "title": "040324", + "type": "dataset", + "url": None, + "version": None, + "extras": [ + { + "key": "activity_count", + "value": "10" + }, + { + "key": "country", + "value": "GB" + }, + { + "key": "data_updated", + "value": "2024-03-01 14:24:09" + }, + { + "key": "filetype", + "value": "activity" + }, + { + "key": "iati_version", + "value": "2.03" + }, + { + "key": "language", + "value": "" + }, + { + "key": "secondary_publisher", + "value": "" + }, + { + "key": "validation_status", + "value": "Not Found" + } + ], + "resources": [ + { + "cache_last_updated": None, + "cache_url": None, + "created": "2024-05-07T15:38:57.312249", + "description": None, + "format": "IATI-XML", + "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac", + "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe", + "last_modified": None, + "metadata_modified": "2024-05-07T15:38:58.757860", + "mimetype": "", + "mimetype_inner": None, + "name": None, + "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9", + "position": 0, + "resource_type": None, + "size": 399382, + "state": "active", + "url": "http://localhost:3000/data/test_foundation_a-dataset-001.xml", + "url_type": None + } + ], + "tags": [], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } + ), + json.dumps( + { + "author": None, + "author_email": "publisher@email-here.com", + "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb", + "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159", + "isopen": True, + "license_id": "other-at", + "license_title": "Other (Attribution)", + "maintainer": None, + "maintainer_email": None, + 
"metadata_created": "2024-03-04T10:24:11.373108", + "metadata_modified": "2024-05-07T15:38:58.740018", + "name": "test_foundation_a-dataset-001", + "notes": "", + "num_resources": 1, + "num_tags": 0, + "organization": { + "id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f", + "name": "test_org_a", + "title": "Test Foundation A", + "type": "organization", + "description": "", + "image_url": "", + "created": "2020-02-24T20:56:01.763851", + "is_organization": True, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "5d04f169-c702-45fe-8162-da7834859d86", + "private": False, + "state": "active", + "title": "040324", + "type": "dataset", + "url": None, + "version": None, + "extras": [ + { + "key": "activity_count", + "value": "10" + }, + { + "key": "country", + "value": "GB" + }, + { + "key": "data_updated", + "value": "2024-03-01 14:24:09" + }, + { + "key": "filetype", + "value": "organisation" + }, + { + "key": "iati_version", + "value": "2.03" + }, + { + "key": "language", + "value": "" + }, + { + "key": "secondary_publisher", + "value": "" + }, + { + "key": "validation_status", + "value": "Not Found" + } + ], + "resources": [ + { + "cache_last_updated": None, + "cache_url": None, + "created": "2024-05-07T15:38:57.312249", + "description": None, + "format": "IATI-XML", + "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac", + "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe", + "last_modified": None, + "metadata_modified": "2024-05-07T15:38:58.757860", + "mimetype": "", + "mimetype_inner": None, + "name": None, + "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9", + "position": 0, + "resource_type": None, + "size": 399382, + "state": "active", + "url": "http://localhost:3000/not_found", + "url_type": None + } + ], + "tags": [], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } + ) + ) ]) def test_update_dataset_publisher_details(get_and_clear_up_context, # noqa: F811 field, From 63040d754d722ed35f5845537836bacf661c4d22 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:22:06 +0100 Subject: [PATCH 08/12] test: tests for new ZIP file creation --- tests/helpers/helpers.py | 8 + tests/integration/test_dataset_indexing.py | 6 +- tests/integration/test_zip_creation.py | 228 ++++++++++++++++++++- 3 files changed, 231 insertions(+), 11 deletions(-) diff --git a/tests/helpers/helpers.py b/tests/helpers/helpers.py index 9133b18..4d8d55a 100644 --- a/tests/helpers/helpers.py +++ b/tests/helpers/helpers.py @@ -1,4 +1,6 @@ import glob +import os +import shutil from unittest import mock import pytest @@ -41,3 +43,9 @@ def get_and_clear_up_context(): yield context truncate_db_table(context) delete_azure_blob_containers(context) + # this is a sanity check to ensure we don't remove important files on a misconfiguration + if context["ZIP_WORKING_DIR"].startswith("/tmp"): + zip_dirs = [context["ZIP_WORKING_DIR"], f"{context['ZIP_WORKING_DIR']}-1", f"{context['ZIP_WORKING_DIR']}-2"] + for zip_dir in zip_dirs: + if os.path.exists(zip_dir): + shutil.rmtree(zip_dir) diff --git a/tests/integration/test_dataset_indexing.py b/tests/integration/test_dataset_indexing.py index 8a157d5..25a5dcf 100644 --- a/tests/integration/test_dataset_indexing.py +++ b/tests/integration/test_dataset_indexing.py @@ -4,7 +4,7 @@ from azure.storage.blob import BlobServiceClient from bulk_data_service.checker import checker_run -from bulk_data_service.dataset_indexing import get_index_name, get_minimal_index_fields +from 
bulk_data_service.dataset_indexing import get_full_index_fields, get_index_name, get_minimal_index_fields from helpers.helpers import get_and_clear_up_context # noqa: F401 from utilities.azure import get_azure_container_name @@ -87,13 +87,13 @@ def test_full_index_creation(get_and_clear_up_context): # noqa: F811 full_index_item = full_index["datasets"][dataset["name"]] - for field in dataset.keys(): + for field in get_full_index_fields(context): if isinstance(dataset[field], uuid.UUID): assert uuid.UUID(full_index_item[field]) == dataset[field] elif isinstance(dataset[field], str): assert full_index_item[field] == dataset[field] # -2 because of the two autogenerated fields - assert len(full_index_item.keys()) - 2 == len(dataset.keys()) + assert len(full_index_item.keys()) - 2 == len(get_full_index_fields(context)) blob_service_client.close() diff --git a/tests/integration/test_zip_creation.py b/tests/integration/test_zip_creation.py index 4a2a2bb..0683d3f 100644 --- a/tests/integration/test_zip_creation.py +++ b/tests/integration/test_zip_creation.py @@ -1,40 +1,252 @@ +import json +import os +import zipfile + from bulk_data_service.checker import checker_run from bulk_data_service.zipper import zipper_run from helpers.helpers import get_and_clear_up_context # noqa: F401 from helpers.helpers import get_number_xml_files_in_working_dir -def test_zip_creation_when_download_always_failed(get_and_clear_up_context): # noqa: F811 +def test_dataset_saved_for_download_success(get_and_clear_up_context): # noqa: F811 context = get_and_clear_up_context - context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03" + run_checker_then_zipper_download_ok(context) + + assert get_number_xml_files_in_working_dir(context) == 1 + assert os.path.exists("{}{}".format( + context["ZIP_WORKING_DIR"], + "/iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml")) is True + + +def test_dataset_not_saved_for_download_fail_and_no_cache(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_fail(context) + + assert get_number_xml_files_in_working_dir(context) == 0 + + +def test_dataset_saved_for_download_fail_but_cached(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_fail_but_cached(context) + + assert get_number_xml_files_in_working_dir(context) == 1 + assert os.path.exists("{}{}".format( + context["ZIP_WORKING_DIR"], + "/iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml")) is True + + +def test_publisher_metadata_saved_for_failed_metadata_dl(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_ok(context) + + assert os.path.exists(context["ZIP_WORKING_DIR"] + "-2/iati-data-main/metadata/test_foundation_a.json") is True + + +def test_publisher_metadata_saved_for_successful_metadata_dl(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-05" datasets_in_bds = {} checker_run(context, datasets_in_bds) datasets_in_zip = {} zipper_run(context, datasets_in_zip, datasets_in_bds) - assert len(datasets_in_zip) == 0 + assert os.path.exists(context["ZIP_WORKING_DIR"] + "-2/iati-data-main/metadata/test_org_a.json") is True - assert get_number_xml_files_in_working_dir(context) == 0 +def test_publisher_metadata_content_for_failed_metadata_dl(get_and_clear_up_context): # noqa: 
F811 -def test_zip_creation_when_download_recently_failed(get_and_clear_up_context): # noqa: F811 + context = get_and_clear_up_context + + run_checker_then_zipper_download_ok(context) + + with open(context["ZIP_WORKING_DIR"] + "-2/iati-data-main/metadata/test_foundation_a.json", "r") as f: + assert f.read() == "{}" + + +def test_publisher_metadata_content_for_successful_metadata_dl(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-05" + datasets_in_bds = {} + checker_run(context, datasets_in_bds) + + datasets_in_zip = {} + zipper_run(context, datasets_in_zip, datasets_in_bds) + + with open(context["ZIP_WORKING_DIR"] + "-2/iati-data-main/metadata/test_org_a.json", "r") as f: + assert f.read() == json.dumps( + { + "id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f", + "name": "test_org_a", + } + ) + + +def test_bds_zip_content_for_download_success(get_and_clear_up_context): # noqa: F811 context = get_and_clear_up_context + run_checker_then_zipper_download_ok(context) + + bds_zip_filename = context["ZIP_WORKING_DIR"] + "-1/iati-data.zip" + + assert os.path.exists(bds_zip_filename) is True + + bds_zip_file = zipfile.ZipFile(bds_zip_filename) + + filelist = bds_zip_file.namelist() + + assert ("iati-data/dataset-index-minimal.json" in filelist) is True + assert ("iati-data/dataset-index-full.json" in filelist) is True + assert ("iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" in filelist) is True + + +def test_bds_zip_content_for_download_fail_but_cached(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_fail_but_cached(context) + + bds_zip_filename = context["ZIP_WORKING_DIR"] + "-1/iati-data.zip" + + assert os.path.exists(bds_zip_filename) is True + + bds_zip_file = zipfile.ZipFile(bds_zip_filename) + + filelist = bds_zip_file.namelist() + + assert ("iati-data/dataset-index-minimal.json" in filelist) is True + assert ("iati-data/dataset-index-full.json" in filelist) is True + assert ("iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" in filelist) is True + + +def test_bds_zip_content_for_download_fail_no_cached(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_fail(context) + + bds_zip_filename = context["ZIP_WORKING_DIR"] + "-1/iati-data.zip" + + assert os.path.exists(bds_zip_filename) is True + + bds_zip_file = zipfile.ZipFile(bds_zip_filename) + + filelist = bds_zip_file.namelist() + + assert ("iati-data/dataset-index-minimal.json" in filelist) is True + assert ("iati-data/dataset-index-full.json" in filelist) is True + assert ("iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" in filelist) is False + + +def test_codeforiati_zip_content_for_download_success(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_ok(context) + + bds_zip_filename = context["ZIP_WORKING_DIR"] + "-2/code-for-iati-data-download.zip" + + assert os.path.exists(bds_zip_filename) is True + + bds_zip_file = zipfile.ZipFile(bds_zip_filename) + + filelist = bds_zip_file.namelist() + + assert ("iati-data/dataset-index-minimal.json" in filelist) is False + assert ("iati-data/dataset-index-full.json" in filelist) is False + assert ("iati-data-main/metadata.json" in filelist) is True + assert 
("iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml" in filelist) is True + assert ("iati-data-main/metadata/test_foundation_a.json" in filelist) is True + assert ("iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001.json" in filelist) is True + + +def test_codeforiati_zip_content_for_download_fail_but_cached(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_fail_but_cached(context) + + bds_zip_filename = context["ZIP_WORKING_DIR"] + "-2/code-for-iati-data-download.zip" + + assert os.path.exists(bds_zip_filename) is True + + bds_zip_file = zipfile.ZipFile(bds_zip_filename) + + filelist = bds_zip_file.namelist() + + assert ("iati-data/dataset-index-minimal.json" in filelist) is False + assert ("iati-data/dataset-index-full.json" in filelist) is False + assert ("iati-data-main/metadata.json" in filelist) is True + assert ("iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml" in filelist) is True + assert ("iati-data-main/metadata/test_foundation_a.json" in filelist) is True + assert ("iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001.json" in filelist) is True + + +def test_codeforiati_zip_content_for_download_fail_no_cached(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_fail(context) + + bds_zip_filename = context["ZIP_WORKING_DIR"] + "-2/code-for-iati-data-download.zip" + + assert os.path.exists(bds_zip_filename) is True + + bds_zip_file = zipfile.ZipFile(bds_zip_filename) + + fileinfolist = bds_zip_file.infolist() + filelist = [fileinfo.filename for fileinfo in fileinfolist] + + assert ("iati-data/dataset-index-minimal.json" in filelist) is False + assert ("iati-data/dataset-index-full.json" in filelist) is False + assert ("iati-data-main/metadata.json" in filelist) is True + assert ("iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml" in filelist) is True + assert ("iati-data-main/metadata/test_foundation_a.json" in filelist) is True + assert ("iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001.json" in filelist) is True + + fileinfoxml = list(filter(lambda f: f.filename == "iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml", fileinfolist))[0] + assert fileinfoxml.file_size == 0 + + +def run_checker_then_zipper_download_ok(context): context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01" datasets_in_bds = {} checker_run(context, datasets_in_bds) - # this is same dataset as above, only with a 404 URL + datasets_in_zip = {} + zipper_run(context, datasets_in_zip, datasets_in_bds) + + +def run_checker_then_zipper_download_fail(context): context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03" + datasets_in_bds = {} checker_run(context, datasets_in_bds) datasets_in_zip = {} zipper_run(context, datasets_in_zip, datasets_in_bds) - assert len(datasets_in_zip) == 1 - assert get_number_xml_files_in_working_dir(context) == 1 +def run_checker_then_zipper_download_fail_but_cached(context): + context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01" + datasets_in_bds = {} + checker_run(context, datasets_in_bds) + + # this is same dataset as above, only with a 404 URL + context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03" + checker_run(context, datasets_in_bds) + + datasets_in_zip = {} + zipper_run(context, 
datasets_in_zip, datasets_in_bds) From dc39f9c33c266b4f231194b808a01acc20715415 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:22:53 +0100 Subject: [PATCH 09/12] chore: utility functions for new features --- src/utilities/azure.py | 40 ++++++++++++++++++++++++++++++----- src/utilities/http.py | 18 +++++++++++++--- src/utilities/logging.py | 5 ----- src/utilities/misc.py | 45 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 95 insertions(+), 13 deletions(-) diff --git a/src/utilities/azure.py b/src/utilities/azure.py index 4320614..0302dfd 100644 --- a/src/utilities/azure.py +++ b/src/utilities/azure.py @@ -4,6 +4,11 @@ from azure.storage.blob import BlobServiceClient, ContentSettings +def azure_blob_exists(az_blob_service: BlobServiceClient, container_name: str, blob_name: str) -> bool: + blob_client = az_blob_service.get_blob_client(container_name, blob_name) + return blob_client.exists() + + def azure_download_blob(az_blob_service: BlobServiceClient, container_name: str, blob_name: str, filename: str): blob_client = az_blob_service.get_blob_client(container_name, blob_name) @@ -17,7 +22,7 @@ def azure_download_blob(az_blob_service: BlobServiceClient, container_name: str, def azure_upload_to_blob( az_blob_service: BlobServiceClient, container_name: str, blob_name: str, content: Any, content_type: str -): +) -> dict[str, Any]: blob_client = az_blob_service.get_blob_client(container_name, blob_name) @@ -26,7 +31,7 @@ def azure_upload_to_blob( if content_type == "application/xml": content_settings.content_encoding = "UTF-8" - blob_client.upload_blob(content, overwrite=True, content_settings=content_settings) + return blob_client.upload_blob(content, overwrite=True, content_settings=content_settings) def create_azure_blob_containers(context: dict): @@ -42,7 +47,16 @@ def create_azure_blob_containers(context: dict): if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"] not in container_names: blob_service.create_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"]) except Exception as e: - context["logger"].info("Could not create Azure blob storage containers: {}".format(e)) + context["logger"].error( + "Could not create Azure blob storage containers." + "XML container name: {}. " + "ZIP container name: {}. 
" + "Error details: {}".format( + context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"], + context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"], + e, + ) + ) raise e finally: blob_service.close() @@ -57,10 +71,11 @@ def delete_azure_blob_containers(context: dict): try: if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"] in container_names: blob_service.delete_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"]) + container_names.remove(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"]) if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"] in container_names: blob_service.delete_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"]) except Exception as e: - context["logger"].info("Could not delete Azure blob storage container: {}".format(e)) + context["logger"].error("Could not delete Azure blob storage container: {}".format(e)) raise e finally: blob_service.close() @@ -77,7 +92,7 @@ def delete_azure_iati_blob(context: dict, blob_service_client: BlobServiceClient blob_client.delete_blob() except azure.core.exceptions.ResourceNotFoundError as e: - context["logger"].warning( + context["logger"].error( "dataset id: {} - Problem deleting blob that was " "expected to exist: {}".format(dataset["id"], e).replace("\n", "") ) @@ -102,3 +117,18 @@ def get_azure_blob_public_url(context: dict, dataset: dict, iati_blob_type: str) blob_name_for_url, get_azure_blob_name(dataset, iati_blob_type), ) + + +def upload_zip_to_azure(context: dict, zip_local_pathname: str, zip_azure_filename: str): + az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"]) + + blob_client = az_blob_service.get_blob_client( + context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"], zip_azure_filename + ) + + content_settings = ContentSettings(content_type="zip") + + with open(zip_local_pathname, "rb") as data: + blob_client.upload_blob(data, overwrite=True, content_settings=content_settings) + + az_blob_service.close() diff --git a/src/utilities/http.py b/src/utilities/http.py index 2cbf9b5..d6afadf 100644 --- a/src/utilities/http.py +++ b/src/utilities/http.py @@ -1,10 +1,22 @@ -from typing import Any +import datetime +from typing import Any, Optional import requests from requests.adapters import HTTPAdapter from urllib3.util import Retry +def parse_last_modified_header(last_modified_header: str) -> Optional[datetime.datetime]: + last_modified_header_parsed = None + try: + last_modified_header_parsed = datetime.datetime.strptime( + last_modified_header, "%a, %d %b %Y %H:%M:%S %Z" + ).replace(tzinfo=datetime.timezone.utc) + except ValueError: + pass + return last_modified_header_parsed + + def get_requests_session() -> requests.Session: session = requests.Session() session.headers.update({"User-Agent": "IATI Bulk Data Service 0.1"}) @@ -28,7 +40,7 @@ def http_get_json(session: requests.Session, url: str, timeout: int = 30, except def http_head_dataset(session: requests.Session, url: str, timeout: int = 10, retries: int = 2) -> requests.Response: - response = session.head(url=url, timeout=timeout, allow_redirects=True) + response = session.head(url=url, timeout=timeout, allow_redirects=True, verify=False) if response.status_code != 200: raise RuntimeError( @@ -49,7 +61,7 @@ def http_download_dataset( session: requests.Session, url: str, timeout: int = 25, retries: int = 2 ) -> requests.Response: - response = session.get(url=url, timeout=timeout, allow_redirects=True) + response = session.get(url=url, timeout=timeout, allow_redirects=True, verify=False) 
if response.status_code != 200: raise RuntimeError( diff --git a/src/utilities/logging.py b/src/utilities/logging.py index 095af4f..b63adb8 100644 --- a/src/utilities/logging.py +++ b/src/utilities/logging.py @@ -20,9 +20,4 @@ def initialise_logging(config: dict) -> logging.Logger: bds_logger.addHandler(handler_stdout) - if config["LOGFILE"] != "": - handler_file = logging.FileHandler(config["LOGFILE"]) - handler_file.setFormatter(formatter) - bds_logger.addHandler(handler_file) - return bds_logger diff --git a/src/utilities/misc.py b/src/utilities/misc.py index b9a7642..d1d9275 100644 --- a/src/utilities/misc.py +++ b/src/utilities/misc.py @@ -1,4 +1,5 @@ import datetime +import glob import hashlib import io import re @@ -35,6 +36,16 @@ def get_timestamp_as_str(isodate: str = "") -> str: return datetime.datetime.now(tz=datetime.timezone.utc).isoformat() +def get_timestamp_as_str_z(isodate: str = "") -> str: + dt = ( + datetime.datetime.fromisoformat(isodate).astimezone() + if isodate != "" + else datetime.datetime.now(tz=datetime.timezone.utc) + ) + dt = dt.replace(microsecond=0) + return dt.isoformat().replace("+00:00", "Z") + + def set_timestamp_tz_utc(date: datetime.datetime) -> datetime.datetime: return date.replace(tzinfo=datetime.timezone.utc) @@ -47,3 +58,37 @@ def zip_data_as_single_file(filename: str, data: str) -> bytes: xml_zipped.writestr(filename, data) return zip_buffer.getvalue() + + +def get_number_xml_files_in_dir(dir_name): + return len(glob.glob("**/*.xml", root_dir=dir_name, recursive=True)) + + +def filter_dict_by_structure(source: dict, structure_to_retain: dict) -> dict: + filtered_dict = {} + + for key, value in structure_to_retain.items(): + if key not in source: + continue + + if structure_to_retain[key] is None: + filtered_dict[key] = source[key] + + elif isinstance(structure_to_retain[key], dict) and not isinstance(source[key], dict): + filtered_dict[key] = source[key] + + elif isinstance(structure_to_retain[key], dict) and isinstance(source[key], dict): + filtered_dict[key] = filter_dict_by_structure(source[key], structure_to_retain[key]) + + elif isinstance(structure_to_retain[key], list) and not isinstance(source[key], list): + filtered_dict[key] = source[key] + + elif isinstance(structure_to_retain[key], list) and isinstance(source[key], list): + filtered_dict[key] = [ + filter_dict_by_structure( + item, structure_to_retain[key][0] if len(structure_to_retain[key]) == 1 else {} + ) + for item in source[key] + ] + + return filtered_dict From bbdabd6a6b86be2146a7fa3f327a954cfdbd7cb7 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:24:46 +0100 Subject: [PATCH 10/12] feat: store CKAN metadata * Database migrations * Download CKAN publisher, dataset metadata * Save to DB * Redone indexing code to avoid including legacy metadata in JSON index --- db-migrations/20240827_01_pVOLG.rollback.sql | 10 +++ db-migrations/20240827_01_pVOLG.sql | 23 ++++++ src/bulk_data_service/dataset_indexing.py | 72 ++++++++++++++--- src/bulk_data_service/dataset_updater.py | 48 +++++++++-- .../iati_registry_ckan.py | 79 +++++++++++++++++-- src/utilities/db.py | 5 +- 6 files changed, 209 insertions(+), 28 deletions(-) create mode 100644 db-migrations/20240827_01_pVOLG.rollback.sql create mode 100644 db-migrations/20240827_01_pVOLG.sql diff --git a/db-migrations/20240827_01_pVOLG.rollback.sql b/db-migrations/20240827_01_pVOLG.rollback.sql new file mode 100644 index 0000000..1cabf1b --- /dev/null +++ 
b/db-migrations/20240827_01_pVOLG.rollback.sql @@ -0,0 +1,10 @@ +-- +-- depends: 20240531_01_iY5Qa +ALTER TABLE + iati_datasets DROP COLUMN IF EXISTS registration_service_dataset_metadata; + +ALTER TABLE + iati_datasets DROP COLUMN IF EXISTS registration_service_publisher_metadata; + +ALTER TABLE + iati_datasets DROP COLUMN IF EXISTS registration_service_name; \ No newline at end of file diff --git a/db-migrations/20240827_01_pVOLG.sql b/db-migrations/20240827_01_pVOLG.sql new file mode 100644 index 0000000..3d4c0b1 --- /dev/null +++ b/db-migrations/20240827_01_pVOLG.sql @@ -0,0 +1,23 @@ +-- +-- depends: 20240531_01_iY5Qa +-- +ALTER TABLE + iati_datasets +ADD + registration_service_dataset_metadata VARCHAR; + +COMMENT ON COLUMN iati_datasets.registration_service_dataset_metadata IS 'the original dataset metadata record from the data registration service'; + +ALTER TABLE + iati_datasets +ADD + registration_service_publisher_metadata VARCHAR; + +COMMENT ON COLUMN iati_datasets.registration_service_publisher_metadata IS 'the original publisher metadata record from the data registration service'; + +ALTER TABLE + iati_datasets +ADD + registration_service_name VARCHAR; + +COMMENT ON COLUMN iati_datasets.registration_service_name IS 'the name of the data registration service'; \ No newline at end of file diff --git a/src/bulk_data_service/dataset_indexing.py b/src/bulk_data_service/dataset_indexing.py index 06f20a5..6894570 100644 --- a/src/bulk_data_service/dataset_indexing.py +++ b/src/bulk_data_service/dataset_indexing.py @@ -10,6 +10,8 @@ def create_and_upload_indices(context: dict, datasets_in_bds: dict[uuid.UUID, dict]): + context["logger"].info("Creating indices") + minimal_index = create_index_json(context, datasets_in_bds, "minimal") full_index = create_index_json(context, datasets_in_bds, "full") @@ -18,6 +20,8 @@ def create_and_upload_indices(context: dict, datasets_in_bds: dict[uuid.UUID, di upload_index_json_to_azure(context, get_index_name(context, "full"), full_index) + context["logger"].info("Creation of indices finished") + def upload_index_json_to_azure(context: dict, index_name: str, index_json: str): @@ -44,24 +48,24 @@ def create_index_json(context: dict, datasets_in_bds: dict[uuid.UUID, dict], ind def get_dataset_index(context: dict, datasets_in_bds: dict[uuid.UUID, dict], index_type: str) -> dict: - return {v["name"]: get_index_item(context, v, index_type) for _, v in datasets_in_bds.items()} + return {v["name"]: get_index_entry(context, v, index_type) for _, v in datasets_in_bds.items()} -def get_index_item(context: dict, dataset: dict, index_type: str) -> dict[str, Any]: +def get_index_entry(context: dict, dataset: dict, index_type: str) -> dict[str, Any]: if index_type == "minimal": - dataset_index = {k: v for k, v in dataset.items() if k in get_minimal_index_fields(context)} + dataset_index_entry = get_minimal_index_entry_from_dataset(context, dataset) else: - dataset_index = {k: v for k, v in dataset.items()} + dataset_index_entry = get_full_index_entry_from_dataset(context, dataset) - dataset_index["url_xml"] = "" - dataset_index["url_zip"] = "" + dataset_index_entry["url_xml"] = "" + dataset_index_entry["url_zip"] = "" - if dataset_index["last_successful_download"] is not None: - dataset_index["url_xml"] = get_azure_blob_public_url(context, dataset, "xml") - dataset_index["url_zip"] = get_azure_blob_public_url(context, dataset, "zip") + if dataset_index_entry["last_successful_download"] is not None: + dataset_index_entry["url_xml"] = 
get_azure_blob_public_url(context, dataset, "xml") + dataset_index_entry["url_zip"] = get_azure_blob_public_url(context, dataset, "zip") - return dataset_index + return dataset_index_entry def get_index_name(context: dict, index_type: str) -> str: @@ -71,7 +75,53 @@ def get_index_name(context: dict, index_type: str) -> str: return "dataset-index-{}.json".format(index_type) -def get_minimal_index_fields(context: dict) -> list: +def get_minimal_index_entry_from_dataset(context: dict, dataset: dict) -> dict: + return {k: v for k, v in dataset.items() if k in get_minimal_index_fields(context)} + + +def get_full_index_entry_from_dataset(context: dict, dataset: dict) -> dict: + full_index_entry = {k: v for k, v in dataset.items() if k in get_full_index_fields(context)} + + field_from_json_str_to_object(full_index_entry, "download_error_message", "download_error_details") + + field_from_json_str_to_object(full_index_entry, "head_error_message", "head_error_details") + + return full_index_entry + + +def field_from_json_str_to_object(entry: dict, source_field: str, dest_field: str): + entry[dest_field] = json.loads( + entry[source_field] if entry[source_field] is not None and entry[source_field] != "" else "{}" + ) + del entry[source_field] + + +def get_full_index_fields(context: dict) -> list[str]: + return [ + "id", + "name", + "publisher_name", + "type", + "source_url", + "hash", + "hash_excluding_generated_timestamp", + "last_update_check", + "last_head_attempt", + "last_head_http_status", + "head_error_message", + "last_download_attempt", + "last_download_http_status", + "last_successful_download", + "last_verified_on_server", + "download_error_message", + "content_modified", + "content_modified_excluding_generated_timestamp", + "server_header_last_modified", + "server_header_etag", + ] + + +def get_minimal_index_fields(context: dict) -> list[str]: return [ "id", "name", diff --git a/src/bulk_data_service/dataset_updater.py b/src/bulk_data_service/dataset_updater.py index d953299..56dd26d 100644 --- a/src/bulk_data_service/dataset_updater.py +++ b/src/bulk_data_service/dataset_updater.py @@ -1,6 +1,7 @@ import concurrent.futures -import uuid import json +import traceback +import uuid from datetime import datetime, timedelta from itertools import batched from random import random @@ -9,9 +10,9 @@ import requests from azure.storage.blob import BlobServiceClient -from utilities.azure import azure_upload_to_blob +from utilities.azure import azure_blob_exists, azure_upload_to_blob from utilities.db import get_db_connection, insert_or_update_dataset -from utilities.http import get_requests_session, http_download_dataset, http_head_dataset +from utilities.http import get_requests_session, http_download_dataset, http_head_dataset, parse_last_modified_header from utilities.misc import ( get_hash, get_hash_excluding_generated_timestamp, @@ -227,8 +228,10 @@ def check_dataset_etag_last_mod_header( except Exception as e: context["logger"].warning( - "dataset id: {} - EXCEPTION with HEAD request, details: " "{}".format(bds_dataset["id"], e) + "dataset id: {} - EXCEPTION with HEAD request, details: {}".format(bds_dataset["id"], e) ) + if "{}".format(e) == "str.replace() takes no keyword arguments": + context["logger"].error("Full traceback: " "{}".format(traceback.format_exc())) return attempt_download @@ -252,7 +255,7 @@ def download_and_save_dataset( else: iati_xml_zipped = zip_data_as_single_file(bds_dataset["name"] + ".xml", download_response.text) - azure_upload_to_blob( + response_xml = 
azure_upload_to_blob( az_blob_service, context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"], "{}/{}.xml".format(bds_dataset["publisher_name"], bds_dataset["name"]), @@ -260,7 +263,11 @@ "application/xml", ) - azure_upload_to_blob( + context["logger"].debug( + "dataset id: {} - Azure XML upload response: {}".format(bds_dataset["id"], response_xml) + ) + + response_zip = azure_upload_to_blob( az_blob_service, context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"], "{}/{}.zip".format(bds_dataset["publisher_name"], bds_dataset["name"]), @@ -268,6 +275,20 @@ "application/zip", ) + if not azure_blob_exists( + az_blob_service, + context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"], + "{}/{}.xml".format(bds_dataset["publisher_name"], bds_dataset["name"]), + ): + context["logger"].error("dataset id: {} - Azure XML upload failed".format(bds_dataset["id"])) + context["logger"].debug( + "dataset id: {} - Azure ZIP upload response: {}".format(bds_dataset["id"], response_zip) + ) + + last_modified_header = None + if download_response.headers.get("Last-Modified", None) is not None: + last_modified_header = parse_last_modified_header(download_response.headers.get("Last-Modified", "")) + bds_dataset.update( { "hash": hash, @@ -280,7 +301,7 @@ "download_error_message": None, "content_modified": None, "content_modified_excluding_generated_timestamp": None, - "server_header_last_modified": download_response.headers.get("Last-Modified", None), + "server_header_last_modified": last_modified_header, "server_header_etag": download_response.headers.get("ETag", None), } ) @@ -315,9 +336,20 @@ "content_modified_excluding_generated_timestamp": None, "server_header_last_modified": None, "server_header_etag": None, + "registration_service_dataset_metadata": registered_dataset["registration_service_dataset_metadata"], + "registration_service_publisher_metadata": registered_dataset["registration_service_publisher_metadata"], + "registration_service_name": registered_dataset["registration_service_name"], } def update_bds_dataset_registration_info(bds_dataset: dict, registered_dataset: dict): - for field in ["publisher_id", "publisher_name", "type", "source_url"]: + for field in [ + "publisher_id", + "publisher_name", + "type", + "source_url", + "registration_service_dataset_metadata", + "registration_service_publisher_metadata", + "registration_service_name", + ]: bds_dataset[field] = registered_dataset[field] diff --git a/src/dataset_registration/iati_registry_ckan.py index ea5e103..5ebb28f 100644 --- a/src/dataset_registration/iati_registry_ckan.py +++ b/src/dataset_registration/iati_registry_ckan.py @@ -1,11 +1,17 @@ +import json import random import uuid +from datetime import datetime, timedelta from logging import Logger from typing import Any import requests from utilities.http import http_get_json +from utilities.misc import get_timestamp + +PUBLISHER_METADATA: dict[str, Any] = {} +PUBLISHER_METADATA_LAST_UPDATE: datetime = get_timestamp() - timedelta(hours=25) def fetch_datasets_metadata(context: dict, session: requests.Session) -> dict[uuid.UUID, dict]: @@ -16,6 +22,8 @@ def fetch_datasets_metadata(context: dict, session: requests.Session) -> dict[uu cleaned_datasets_metadata = clean_datasets_metadata(context["logger"], datasets_list_from_registry) + add_publisher_metadata(context, session, cleaned_datasets_metadata) + datasets_metadata =
convert_datasets_metadata(cleaned_datasets_metadata) return datasets_metadata @@ -47,6 +55,52 @@ def fetch_datasets_metadata_from_iati_registry(context: dict, session: requests. return datasets_metadata +def add_publisher_metadata(context: dict, session: requests.Session, datasets_from_registry: list[dict[str, Any]]): + for registry_dataset in datasets_from_registry: + publisher_metadata = "" + if "organization" in registry_dataset and "name" in registry_dataset["organization"]: + publisher_metadata = get_publisher_metadata_as_str( + context, session, registry_dataset["organization"]["name"] + ) + registry_dataset["registration_service_publisher_metadata"] = publisher_metadata + + +def get_publisher_metadata_as_str(context: dict, session: requests.Session, publisher_name: str) -> str: + global PUBLISHER_METADATA, PUBLISHER_METADATA_LAST_UPDATE + + if PUBLISHER_METADATA_LAST_UPDATE < get_timestamp() - timedelta( + hours=int(context["DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS"]) + ): + context["logger"].info( + "Refreshing publisher metadata from IATI Registry (CKAN) " + "after {} hours...".format(context["DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS"]) + ) + update_publisher_metadata_cache(context, session) + PUBLISHER_METADATA_LAST_UPDATE = get_timestamp() + + return PUBLISHER_METADATA[publisher_name] if publisher_name in PUBLISHER_METADATA else "{}" + + +def update_publisher_metadata_cache(context: dict, session: requests.Session): + global PUBLISHER_METADATA, PUBLISHER_METADATA_LAST_UPDATE + + url = context["DATA_REGISTRY_PUBLISHER_METADATA_URL"] + + PUBLISHER_METADATA = {} + + try: + publishers_metadata = http_get_json(session, url, 60, True) + + PUBLISHER_METADATA = {publisher["name"]: json.dumps(publisher) for publisher in publishers_metadata["result"]} + + except RuntimeError as e: + context["logger"].error("HTTP error when fetching publisher metadata from IATI Registry (CKAN): {}".format(e)) + except Exception as e: + context["logger"].error( + "Unexpected error when fetching publisher metadata from IATI Registry (CKAN): {}".format(e) + ) + + def clean_datasets_metadata(logger: Logger, datasets_from_registry: list[dict[str, Any]]) -> list[dict[str, Any]]: """Cleans the list of datasets from the IATI Registry @@ -113,15 +167,24 @@ def ckan_field_is_valid(value: Any) -> bool: def convert_datasets_metadata(datasets_from_registry: list[dict]) -> dict[uuid.UUID, dict]: registered_datasets = { - uuid.UUID(k["id"]): { - "id": uuid.UUID(k["id"]), - "name": k["name"], - "publisher_id": uuid.UUID(k["organization"]["id"]), - "publisher_name": k["organization"]["name"], - "source_url": get_source_url(k), - "type": list(filter(lambda x: x["key"] == "filetype", k["extras"]))[0]["value"], + uuid.UUID(dataset["id"]): { + "id": uuid.UUID(dataset["id"]), + "name": dataset["name"], + "publisher_id": uuid.UUID(dataset["organization"]["id"]), + "publisher_name": dataset["organization"]["name"], + "source_url": get_source_url(dataset), + "type": list(filter(lambda x: x["key"] == "filetype", dataset["extras"]))[0]["value"], + "registration_service_dataset_metadata": json.dumps( + {k: dataset[k] for k in dataset if k != "registration_service_publisher_metadata"} + ), + "registration_service_publisher_metadata": ( + dataset["registration_service_publisher_metadata"] + if "registration_service_publisher_metadata" in dataset + else "" + ), + "registration_service_name": "ckan-registry", } - for k in datasets_from_registry + for dataset in datasets_from_registry } return registered_datasets 
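[Editor's note: the sketch below is illustrative only and is not part of the patch.] The checker now keeps three extra fields per dataset: the raw CKAN package record, the raw CKAN publisher record, and the name of the registration service. A minimal sketch of the record shape built by convert_datasets_metadata() for one hypothetical CKAN package; the value shown for source_url is an assumption, since get_source_url() is defined elsewhere in this module:

    import json
    import uuid

    # Hypothetical CKAN package, trimmed to the fields the conversion reads.
    ckan_package = {
        "id": "0f045d6a-36a4-46e0-8d8f-27b4b00b5c06",
        "name": "test_org_a-dataset-001",
        "organization": {"id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f", "name": "test_org_a"},
        "extras": [{"key": "filetype", "value": "activity"}],
        "registration_service_publisher_metadata": '{"id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f", "name": "test_org_a"}',
    }

    # Shape of one value in the dict (keyed by dataset UUID) that
    # convert_datasets_metadata() returns.
    bds_record = {
        "id": uuid.UUID(ckan_package["id"]),
        "name": ckan_package["name"],
        "publisher_id": uuid.UUID(ckan_package["organization"]["id"]),
        "publisher_name": ckan_package["organization"]["name"],
        "source_url": "https://example.org/activities.xml",  # assumed result of get_source_url()
        "type": "activity",  # the "filetype" entry from "extras"
        # raw CKAN package, minus the publisher metadata attached by add_publisher_metadata()
        "registration_service_dataset_metadata": json.dumps(
            {k: v for k, v in ckan_package.items() if k != "registration_service_publisher_metadata"}
        ),
        # raw CKAN organization record, served from the module-level publisher metadata cache
        "registration_service_publisher_metadata": ckan_package["registration_service_publisher_metadata"],
        "registration_service_name": "ckan-registry",
    }

    print(json.dumps({k: str(v) for k, v in bds_record.items()}, indent=2))

This record then flows into insert_or_update_dataset() via the extra columns added to the UPDATE statement in src/utilities/db.py below.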
diff --git a/src/utilities/db.py b/src/utilities/db.py index 6e2d55a..fc3f75d 100644 --- a/src/utilities/db.py +++ b/src/utilities/db.py @@ -74,7 +74,10 @@ def insert_or_update_dataset(connection: psycopg.Connection, data): content_modified_excluding_generated_timestamp = %(content_modified_excluding_generated_timestamp)s, server_header_last_modified = %(server_header_last_modified)s, - server_header_etag = %(server_header_etag)s + server_header_etag = %(server_header_etag)s, + registration_service_dataset_metadata = %(registration_service_dataset_metadata)s, + registration_service_publisher_metadata = %(registration_service_publisher_metadata)s, + registration_service_name = %(registration_service_name)s WHERE iati_datasets.id = %(id)s """.format( From aa188d583218813b05bdd4a72ff6691a5edaa81a Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:25:57 +0100 Subject: [PATCH 11/12] feat: second ZIP file format * Added code to make outputting of new ZIP file formats easier * Implemented a second ZIP format, which is stripped down version of legacy Data Dump --- src/bulk_data_service/zipper.py | 108 +++++++-------- src/bulk_data_service/zippers.py | 230 +++++++++++++++++++++++++++++++ 2 files changed, 278 insertions(+), 60 deletions(-) create mode 100644 src/bulk_data_service/zippers.py diff --git a/src/bulk_data_service/zipper.py b/src/bulk_data_service/zipper.py index 7b2b2ea..ed45d0e 100644 --- a/src/bulk_data_service/zipper.py +++ b/src/bulk_data_service/zipper.py @@ -4,10 +4,11 @@ import time import uuid -from azure.storage.blob import BlobServiceClient, ContentSettings from azure.core.exceptions import ResourceNotFoundError +from azure.storage.blob import BlobServiceClient from bulk_data_service.dataset_indexing import get_index_name +from bulk_data_service.zippers import CodeforIATILegacyZipper, IATIBulkDataServiceZipper from utilities.azure import azure_download_blob, get_azure_blob_name, get_azure_container_name from utilities.db import get_datasets_in_bds @@ -22,59 +23,75 @@ def zipper(context: dict): zipper_service_loop(context, {}, datasets) -def zipper_service_loop(context: dict, datasets_in_zip: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict]): +def zipper_service_loop( + context: dict, datasets_in_working_dir: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict] +): while True: - zipper_run(context, datasets_in_zip, datasets_in_bds) + zipper_run(context, datasets_in_working_dir, datasets_in_bds) time.sleep(60 * 30) -def zipper_run(context: dict, datasets_in_zip: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict]): +def zipper_run(context: dict, datasets_in_working_dir: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict]): run_start = datetime.datetime.now(datetime.UTC) - context["logger"].info("Zipper starting run") + context["logger"].info("Zipper run starting") + + setup_working_dir_with_downloaded_datasets(context, datasets_in_working_dir, datasets_in_bds) + + zip_creators = [ + IATIBulkDataServiceZipper( + context, "{}-1".format(context["ZIP_WORKING_DIR"]), datasets_in_working_dir, datasets_in_bds + ), + CodeforIATILegacyZipper( + context, "{}-2".format(context["ZIP_WORKING_DIR"]), datasets_in_working_dir, datasets_in_bds + ), + ] + + for zip_creator in zip_creators: + + if os.path.exists(zip_creator.zip_working_dir): + shutil.rmtree(zip_creator.zip_working_dir) + shutil.copytree(context["ZIP_WORKING_DIR"], zip_creator.zip_working_dir) + + zip_creator.prepare() + + 
zip_creator.zip() + + zip_creator.upload() + + run_end = datetime.datetime.now(datetime.UTC) + context["logger"].info("Zipper run finished in {}.".format(run_end - run_start)) - clean_working_dir(context, datasets_in_zip) + +def setup_working_dir_with_downloaded_datasets( + context: dict, datasets_in_working_dir: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict] +): + + clean_working_dir(context, datasets_in_working_dir) datasets_with_downloads = {k: v for k, v in datasets_in_bds.items() if v["last_successful_download"] is not None} - remove_datasets_without_dls_from_working_dir(context, datasets_in_zip, datasets_with_downloads) + remove_datasets_without_dls_from_working_dir(context, datasets_in_working_dir, datasets_with_downloads) new_or_updated_datasets = { k: v for k, v in datasets_with_downloads.items() - if k not in datasets_in_zip or datasets_in_zip[k]["hash"] != datasets_with_downloads[k]["hash"] + if k not in datasets_in_working_dir or datasets_in_working_dir[k]["hash"] != datasets_with_downloads[k]["hash"] } context["logger"].info( - "Found {} datasets to ZIP. {} are new or updated and will be (re-)downloaded.".format( + "Found {} datasets with downloads. " + "{} are new or updated and will be (re-)downloaded.".format( len(datasets_with_downloads), len(new_or_updated_datasets) ) ) download_new_or_updated_to_working_dir(context, new_or_updated_datasets) - download_indices_to_working_dir(context) - - context["logger"].info("Zipping {} datasets.".format(len(datasets_with_downloads))) - shutil.make_archive( - get_big_zip_local_pathname_no_extension(context), - "zip", - root_dir=context["ZIP_WORKING_DIR"], - base_dir="iati-data", - ) - - context["logger"].info("Uploading zipped datasets.") - upload_zip_to_azure(context) - - run_end = datetime.datetime.now(datetime.UTC) - context["logger"].info( - "Zipper finished in {}. 
Datasets zipped: {}.".format(run_end - run_start, len(datasets_with_downloads)) - ) - - datasets_in_zip.clear() - datasets_in_zip.update(datasets_with_downloads) + datasets_in_working_dir.clear() + datasets_in_working_dir.update(datasets_with_downloads) def clean_working_dir(context: dict, datasets_in_zip: dict[uuid.UUID, dict]): @@ -92,21 +109,6 @@ def remove_datasets_without_dls_from_working_dir( delete_local_xml_from_zip_working_dir(context, dataset) -def upload_zip_to_azure(context: dict): - az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"]) - - blob_client = az_blob_service.get_blob_client( - context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"], get_big_zip_full_filename(context) - ) - - content_settings = ContentSettings(content_type="zip") - - with open(get_big_zip_local_pathname(context), "rb") as data: - blob_client.upload_blob(data, overwrite=True, content_settings=content_settings) - - az_blob_service.close() - - def download_indices_to_working_dir(context: dict): az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"]) @@ -134,6 +136,8 @@ def download_new_or_updated_to_working_dir(context: dict, updated_datasets: dict xml_container_name = get_azure_container_name(context, "xml") + os.makedirs("{}/iati-data/datasets".format(context["ZIP_WORKING_DIR"]), exist_ok=True) + for dataset in updated_datasets.values(): filename = get_local_pathname_dataset_xml(context, dataset) @@ -151,22 +155,6 @@ def download_new_or_updated_to_working_dir(context: dict, updated_datasets: dict az_blob_service.close() -def get_big_zip_local_pathname_no_extension(context: dict) -> str: - return "{}/{}".format(context["ZIP_WORKING_DIR"], get_big_zip_base_filename(context)) - - -def get_big_zip_local_pathname(context: dict) -> str: - return "{}/{}".format(context["ZIP_WORKING_DIR"], get_big_zip_full_filename(context)) - - -def get_big_zip_base_filename(context: dict) -> str: - return "iati-data" - - -def get_big_zip_full_filename(context: dict) -> str: - return "{}.zip".format(get_big_zip_base_filename(context)) - - def get_local_pathname_dataset_xml(context: dict, dataset: dict) -> str: return "{}/iati-data/datasets/{}".format(context["ZIP_WORKING_DIR"], get_azure_blob_name(dataset, "xml")) diff --git a/src/bulk_data_service/zippers.py b/src/bulk_data_service/zippers.py new file mode 100644 index 0000000..95d9a8a --- /dev/null +++ b/src/bulk_data_service/zippers.py @@ -0,0 +1,230 @@ +import json +import os +import shutil +import uuid +from abc import ABC, abstractmethod + +from azure.storage.blob import BlobServiceClient + +from bulk_data_service.dataset_indexing import get_index_name +from utilities.azure import azure_download_blob, get_azure_container_name, upload_zip_to_azure +from utilities.misc import filter_dict_by_structure, get_number_xml_files_in_dir, get_timestamp_as_str_z + + +class IATIDataZipper(ABC): + + def __init__( + self, + context, + zip_working_dir: str, + datasets_in_working_dir: dict[uuid.UUID, dict], + datasets_in_bds: dict[uuid.UUID, dict], + ): + self.context = context + self.datasets_in_working_dir = datasets_in_working_dir + self.datasets_in_bds = datasets_in_bds + self.zip_working_dir = zip_working_dir + + @abstractmethod + def prepare(self): + pass + + @property + @abstractmethod + def zip_type(self) -> str: + pass + + @property + def zip_internal_directory_name(self) -> str: + return "iati-data" + + @property + def zip_local_filename_no_extension(self) -> str: + return 
"iati-data" + + def zip(self): + self.context["logger"].info("Zipping {} datasets.".format(get_number_xml_files_in_dir(self.zip_working_dir))) + shutil.make_archive( + self.get_zip_local_pathname_no_extension(), + "zip", + root_dir=self.zip_working_dir, + base_dir=self.zip_internal_directory_name, + ) + + def upload(self): + self.context["logger"].info( + "Uploading {} ZIP to Azure with filename: {}.".format(self.zip_type, self.get_zip_local_filename()) + ) + upload_zip_to_azure(self.context, self.get_zip_local_pathname(), self.get_zip_local_filename()) + + def get_zip_local_filename(self) -> str: + return "{}.zip".format(self.zip_local_filename_no_extension) + + def get_zip_local_pathname(self) -> str: + return "{}/{}".format(self.zip_working_dir, self.get_zip_local_filename()) + + def get_zip_local_pathname_no_extension(self) -> str: + return "{}/{}".format(self.zip_working_dir, self.zip_local_filename_no_extension) + + +class IATIBulkDataServiceZipper(IATIDataZipper): + + def prepare(self): + az_blob_service = BlobServiceClient.from_connection_string(self.context["AZURE_STORAGE_CONNECTION_STRING"]) + + self.download_index_to_working_dir(az_blob_service, "minimal") + + self.download_index_to_working_dir(az_blob_service, "full") + + az_blob_service.close() + + @property + def zip_type(self) -> str: + return "Bulk Data Service" + + def download_index_to_working_dir(self, az_blob_service: BlobServiceClient, index_type: str): + + index_filename = get_index_name(self.context, index_type) + + index_full_pathname = "{}/{}/{}".format(self.zip_working_dir, self.zip_internal_directory_name, index_filename) + + os.makedirs(os.path.dirname(index_full_pathname), exist_ok=True) + + azure_download_blob( + az_blob_service, get_azure_container_name(self.context, "xml"), index_filename, index_full_pathname + ) + + +class CodeforIATILegacyZipper(IATIDataZipper): + + def prepare(self): + # rename 'iati-data' directory to 'iati-data-main' + if os.path.exists(os.path.join(self.zip_working_dir, super().zip_internal_directory_name)): + os.rename( + os.path.join(self.zip_working_dir, super().zip_internal_directory_name), + os.path.join(self.zip_working_dir, self.zip_internal_directory_name), + ) + + # rename 'datasets' directory to 'data' + if os.path.exists(os.path.join(self.zip_working_dir, self.zip_internal_directory_name, "datasets")): + os.rename( + os.path.join(self.zip_working_dir, self.zip_internal_directory_name, "datasets"), + os.path.join(self.zip_working_dir, self.zip_internal_directory_name, "data"), + ) + + self.create_empty_files_for_non_downloadable_datasets() + + if not os.path.exists(os.path.join(self.zip_working_dir, self.zip_internal_directory_name, "metadata")): + os.makedirs(os.path.join(self.zip_working_dir, self.zip_internal_directory_name, "metadata")) + + self.write_publisher_metadata_files() + + self.write_transformed_dataset_metadata_files() + + # create root 'metadata.json' file with created timestsamp + self.create_zip_file_metadata() + + def create_empty_files_for_non_downloadable_datasets(self): + for dataset_in_bds_db in self.datasets_in_bds: + dataset = self.datasets_in_bds[dataset_in_bds_db] + dataset_pathname = self.get_dataset_data_pathname(self.datasets_in_bds[dataset_in_bds_db]) + dataset_filename = self.get_dataset_data_filename(self.datasets_in_bds[dataset_in_bds_db]) + if dataset["last_successful_download"] is None: + if not os.path.exists(dataset_pathname): + os.makedirs(dataset_pathname, exist_ok=True) + if not os.path.exists(dataset_filename): + open(dataset_filename, 
"w").close() + + def write_publisher_metadata_files(self): + for dataset_in_bds_db in self.datasets_in_bds: + publisher_metadata_filename = self.get_publisher_metadata_filename( + self.datasets_in_bds[dataset_in_bds_db]["publisher_name"] + ) + if not os.path.exists(publisher_metadata_filename): + with open(publisher_metadata_filename, "w") as pub_file: + pub_file.write( + self.filter_publisher_metadata( + self.datasets_in_bds[dataset_in_bds_db]["registration_service_publisher_metadata"] + ) + ) + + def get_publisher_metadata_filename(self, publisher_name): + return os.path.join( + self.zip_working_dir, self.zip_internal_directory_name, "metadata", f"{publisher_name}.json" + ) + + def filter_publisher_metadata(self, ckan_publisher_metadata: str) -> str: + desired_structure = { + "id": None, + "name": None, + } + + filtered_dict = filter_dict_by_structure(json.loads(ckan_publisher_metadata), desired_structure) + + return json.dumps(filtered_dict) + + def write_transformed_dataset_metadata_files(self): + for dataset_in_bds_db in self.datasets_in_bds: + dataset_metadata_pathname = self.get_dataset_metadata_pathname(self.datasets_in_bds[dataset_in_bds_db]) + dataset_metadata_filename = self.get_dataset_metadata_filename(self.datasets_in_bds[dataset_in_bds_db]) + if not os.path.exists(dataset_metadata_pathname): + os.makedirs(dataset_metadata_pathname, exist_ok=True) + if not os.path.exists(dataset_metadata_filename): + with open(dataset_metadata_filename, "w") as pub_file: + pub_file.write( + self.filter_dataset_metadata_file( + self.datasets_in_bds[dataset_in_bds_db]["registration_service_dataset_metadata"] + ) + ) + + def filter_dataset_metadata_file(self, ckan_dataset_metadata: str) -> str: + desired_structure = { + "id": None, + "license_id": None, + "license_title": None, + "name": None, + "organization": {"id": None, "name": None}, + "resources": [{"url": None}], + } + empty_array_entries = ["extras", "tags", "groups", "users"] + + filtered_dict = filter_dict_by_structure(json.loads(ckan_dataset_metadata), desired_structure) + + for empty_array_entry in empty_array_entries: + filtered_dict[empty_array_entry] = [] + + return json.dumps(filtered_dict) + + def get_dataset_data_pathname(self, dataset_in_bds): + return os.path.join( + self.zip_working_dir, self.zip_internal_directory_name, "data", f"{dataset_in_bds['publisher_name']}" + ) + + def get_dataset_data_filename(self, dataset_in_bds): + return os.path.join(self.get_dataset_data_pathname(dataset_in_bds), f"{dataset_in_bds['name']}.xml") + + def get_dataset_metadata_pathname(self, dataset_in_bds): + return os.path.join( + self.zip_working_dir, self.zip_internal_directory_name, "metadata", f"{dataset_in_bds['publisher_name']}" + ) + + def get_dataset_metadata_filename(self, dataset_in_bds): + return os.path.join(self.get_dataset_metadata_pathname(dataset_in_bds), f"{dataset_in_bds['name']}.json") + + @property + def zip_type(self) -> str: + return "Code for IATI" + + @property + def zip_internal_directory_name(self) -> str: + return "iati-data-main" + + @property + def zip_local_filename_no_extension(self) -> str: + return "code-for-iati-data-download" + + def create_zip_file_metadata(self): + with open( + "{}/{}/{}".format(self.zip_working_dir, self.zip_internal_directory_name, "metadata.json"), "w" + ) as metadata_file: + json.dump({"created_at": get_timestamp_as_str_z(), "updated_at": get_timestamp_as_str_z()}, metadata_file) From d8dd0c69c5bbbbb69340ea0160befeed67988b41 Mon Sep 17 00:00:00 2001 From: Simon K 
<6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:28:45 +0100 Subject: [PATCH 12/12] chore: update release targets ready for release --- .github/workflows/deploy-to-dev.yml | 2 +- .github/workflows/deploy-to-prod.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy-to-dev.yml b/.github/workflows/deploy-to-dev.yml index 9e1d962..9faaa26 100644 --- a/.github/workflows/deploy-to-dev.yml +++ b/.github/workflows/deploy-to-dev.yml @@ -19,4 +19,4 @@ jobs: secrets: inherit with: APP_NAME: "bulk-data-service" - TARGET_ENVIRONMENT: "test" + TARGET_ENVIRONMENT: "dev" diff --git a/.github/workflows/deploy-to-prod.yml b/.github/workflows/deploy-to-prod.yml index aee507a..5718a19 100644 --- a/.github/workflows/deploy-to-prod.yml +++ b/.github/workflows/deploy-to-prod.yml @@ -13,4 +13,4 @@ jobs: secrets: inherit with: APP_NAME: "bulk-data-service" - TARGET_ENVIRONMENT: "test" + TARGET_ENVIRONMENT: "prod"
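[Editor's note: the sketch below is illustrative only and is not part of the patch series.] The Code for IATI ZIP added in patch 11/12 (src/bulk_data_service/zippers.py) uses the new filter_dict_by_structure() helper from src/utilities/misc.py to prune the stored CKAN records down to the few fields the legacy Data Dump exposed. A minimal sketch of that pruning, assuming src/ is on PYTHONPATH and using a hypothetical dataset record:

    import json

    from utilities.misc import filter_dict_by_structure  # assumes src/ is on PYTHONPATH

    # Hypothetical CKAN dataset record, as stored in registration_service_dataset_metadata.
    ckan_dataset = {
        "id": "0f045d6a-36a4-46e0-8d8f-27b4b00b5c06",
        "license_id": "other-open",
        "license_title": "Other (Open)",
        "name": "test_foundation_a-dataset-001",
        "num_resources": 1,  # dropped: not named in the structure below
        "organization": {
            "id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f",
            "name": "test_foundation_a",
            "type": "organization",  # dropped by the nested structure
        },
        "resources": [{"url": "https://example.org/dataset-001.xml", "format": "IATI-XML", "size": 1024}],
    }

    # The same structure that CodeforIATILegacyZipper.filter_dataset_metadata_file() retains.
    desired_structure = {
        "id": None,
        "license_id": None,
        "license_title": None,
        "name": None,
        "organization": {"id": None, "name": None},
        "resources": [{"url": None}],
    }

    filtered = filter_dict_by_structure(ckan_dataset, desired_structure)

    # Only the named keys survive; nested dicts and lists are pruned recursively.
    assert filtered["organization"] == {"id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f", "name": "test_foundation_a"}
    assert filtered["resources"] == [{"url": "https://example.org/dataset-001.xml"}]
    assert "num_resources" not in filtered

    print(json.dumps(filtered, indent=2))

The zipper then appends empty "extras", "tags", "groups" and "users" arrays before writing the JSON file to the metadata/ directory of the Code for IATI ZIP.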