From a141f4c0b277a2aa22f572d24b393e39879436cf Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:25:46 +0100 Subject: [PATCH 01/12] chore: update yoyo migrations; add VS Code helper --- .vscode/launch.json | 31 ++++++++++++++++++++++++++++++- pyproject.toml | 7 ++++--- requirements-dev.txt | 34 +++++++++++++++++++--------------- requirements.txt | 18 +++++++++--------- 4 files changed, 62 insertions(+), 28 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 4a6ec4f..a658f52 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -12,10 +12,39 @@ "args": [ "--operation", "checker", + "--single-run", + "--run-for-n-datasets", + "75" + ], + "console": "integratedTerminal", + "envFile": "${workspaceFolder}/.env" + }, + { + "name": "Python Debugger: Bulk Data Service - Zipper - Single Run", + "type": "debugpy", + "request": "launch", + "program": "src/iati_bulk_data_service.py", + "args": [ + "--operation", + "zipper", "--single-run" ], "console": "integratedTerminal", "envFile": "${workspaceFolder}/.env" - } + }, + { + "name": "Python Debugger: Bulk Data Service - Checker & Zipper Loop", + "type": "debugpy", + "request": "launch", + "program": "src/iati_bulk_data_service.py", + "args": [ + "--operation", + "checker", + "--run-for-n-datasets", + "75" + ], + "console": "integratedTerminal", + "envFile": "${workspaceFolder}/.env" + }, ] } \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index ac7e687..f982c49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,8 +7,8 @@ dependencies = [ "azure-storage-blob==12.20.0", "psycopg[binary,pool]==3.1.18", "requests==2.31.0", - "yoyo-migrations==8.2.0", - "prometheus-client==0.20.0" + "yoyo-migrations==9.0.0", + "prometheus-client==0.20.0", ] @@ -22,7 +22,8 @@ dev = [ "flake8", "flake8-pyproject", "types-requests", - "python-dotenv" + "python-dotenv", + "pytest-watcher" ] diff --git a/requirements-dev.txt b/requirements-dev.txt index 9169a7e..d7ec9d2 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,13 +8,13 @@ azure-core==1.30.2 # via azure-storage-blob azure-storage-blob==12.20.0 # via bulk-data-service (pyproject.toml) -black==24.4.2 +black==24.8.0 # via bulk-data-service (pyproject.toml) build==1.2.1 # via pip-tools -certifi==2024.7.4 +certifi==2024.8.30 # via requests -cffi==1.16.0 +cffi==1.17.1 # via cryptography charset-normalizer==3.3.2 # via requests @@ -22,17 +22,17 @@ click==8.1.7 # via # black # pip-tools -cryptography==42.0.8 +cryptography==43.0.1 # via azure-storage-blob -flake8==7.1.0 +flake8==7.1.1 # via # bulk-data-service (pyproject.toml) # flake8-pyproject flake8-pyproject==1.2.3 # via bulk-data-service (pyproject.toml) -idna==3.7 +idna==3.8 # via requests -importlib-metadata==8.0.0 +importlib-metadata==8.4.0 # via yoyo-migrations iniconfig==2.0.0 # via pytest @@ -42,7 +42,7 @@ isort==5.13.2 # via bulk-data-service (pyproject.toml) mccabe==0.7.0 # via flake8 -mypy==1.10.1 +mypy==1.11.2 # via bulk-data-service (pyproject.toml) mypy-extensions==1.0.0 # via @@ -69,7 +69,7 @@ psycopg-binary==3.1.18 # via psycopg psycopg-pool==3.2.2 # via psycopg -pycodestyle==2.12.0 +pycodestyle==2.12.1 # via flake8 pycparser==2.22 # via cffi @@ -79,7 +79,9 @@ pyproject-hooks==1.1.0 # via # build # pip-tools -pytest==8.2.2 +pytest==8.3.2 + # via bulk-data-service (pyproject.toml) +pytest-watcher==0.4.3 # via bulk-data-service (pyproject.toml) python-dotenv==1.0.1 # via bulk-data-service (pyproject.toml) @@ -91,11 +93,11 @@ 
six==1.16.0 # via # azure-core # isodate -sqlparse==0.5.0 +sqlparse==0.5.1 # via yoyo-migrations tabulate==0.9.0 # via yoyo-migrations -types-requests==2.32.0.20240622 +types-requests==2.32.0.20240905 # via bulk-data-service (pyproject.toml) typing-extensions==4.12.2 # via @@ -108,11 +110,13 @@ urllib3==2.2.2 # via # requests # types-requests -wheel==0.43.0 +watchdog==5.0.2 + # via pytest-watcher +wheel==0.44.0 # via pip-tools -yoyo-migrations==8.2.0 +yoyo-migrations==9.0.0 # via bulk-data-service (pyproject.toml) -zipp==3.19.2 +zipp==3.20.1 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements.txt b/requirements.txt index a37dbd9..2e62df3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,23 +2,23 @@ # This file is autogenerated by pip-compile with Python 3.12 # by the following command: # -# pip-compile --strip-extras pyproject.toml +# pip-compile --output-file=requirements.txt --strip-extras pyproject.toml # azure-core==1.30.2 # via azure-storage-blob azure-storage-blob==12.20.0 # via bulk-data-service (pyproject.toml) -certifi==2024.7.4 +certifi==2024.8.30 # via requests -cffi==1.16.0 +cffi==1.17.1 # via cryptography charset-normalizer==3.3.2 # via requests -cryptography==42.0.8 +cryptography==43.0.1 # via azure-storage-blob -idna==3.7 +idna==3.8 # via requests -importlib-metadata==8.0.0 +importlib-metadata==8.4.0 # via yoyo-migrations isodate==0.6.1 # via azure-storage-blob @@ -40,7 +40,7 @@ six==1.16.0 # via # azure-core # isodate -sqlparse==0.5.0 +sqlparse==0.5.1 # via yoyo-migrations tabulate==0.9.0 # via yoyo-migrations @@ -52,7 +52,7 @@ typing-extensions==4.12.2 # psycopg-pool urllib3==2.2.2 # via requests -yoyo-migrations==8.2.0 +yoyo-migrations==9.0.0 # via bulk-data-service (pyproject.toml) -zipp==3.19.2 +zipp==3.20.1 # via importlib-metadata From 465f853bb14f42d7fad841874fd7f2774abc15b9 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:32:10 +0100 Subject: [PATCH 02/12] chore: add and tidy config vars --- .env-example | 7 ++++--- .github/workflows/build-and-deploy-job.yml | 3 +++ .../azure-resource-manager-deployment-template.yml | 8 ++++++-- azure-deployment/generate-manifest-from-template.sh | 3 +++ .../manual-azure-deploy-variables-example.env | 7 ++++--- .../add-default-config-to-github-variables.sh | 1 + azure-provision/default-github-config-template.env | 11 +++++++---- 7 files changed, 28 insertions(+), 12 deletions(-) diff --git a/.env-example b/.env-example index e2dd984..4529794 100644 --- a/.env-example +++ b/.env-example @@ -1,5 +1,7 @@ DATA_REGISTRATION=ckan-registry DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search +DATA_REGISTRY_PUBLISHER_METADATA_URL="https://iatiregistry.org/api/action/organization_list?all_fields=true&include_extras=true&include_tags=true" +DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24 WEB_BASE_URL=http://127.0.0.1:10000/devstoreaccount1 @@ -9,9 +11,6 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24 REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 -# Log file -LOGFILE= - # Sample local setup - values read by docker compose (for simple Postgres DB # creation), and used by the app DB_NAME=bulk_data_service_db @@ -27,3 +26,5 @@ AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM0 AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip + +CHECKER_LOOP_WAIT_MINS=20 diff --git 
a/.github/workflows/build-and-deploy-job.yml b/.github/workflows/build-and-deploy-job.yml index bda631a..cec52bd 100644 --- a/.github/workflows/build-and-deploy-job.yml +++ b/.github/workflows/build-and-deploy-job.yml @@ -103,12 +103,15 @@ jobs: # Variables which configure the app DATA_REGISTRATION: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRATION')] }} DATA_REGISTRY_BASE_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_BASE_URL')] }} + DATA_REGISTRY_PUBLISHER_METADATA_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_METADATA_URL')] }} + DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS')] }} NUMBER_DOWNLOADER_THREADS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'NUMBER_DOWNLOADER_THREADS')] }} FORCE_REDOWNLOAD_AFTER_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'FORCE_REDOWNLOAD_AFTER_HOURS')] }} REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS')] }} ZIP_WORKING_DIR: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'ZIP_WORKING_DIR')] }} AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML')] }} AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP')] }} + CHECKER_LOOP_WAIT_MINS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'CHECKER_LOOP_WAIT_MINS')] }} run: | ./azure-deployment/generate-manifest-from-template.sh diff --git a/azure-deployment/azure-resource-manager-deployment-template.yml b/azure-deployment/azure-resource-manager-deployment-template.yml index c650182..6d0c651 100644 --- a/azure-deployment/azure-resource-manager-deployment-template.yml +++ b/azure-deployment/azure-resource-manager-deployment-template.yml @@ -32,16 +32,20 @@ properties: # Properties of container group value: "#DATA_REGISTRATION#" - name: DATA_REGISTRY_BASE_URL value: "#DATA_REGISTRY_BASE_URL#" + - name: DATA_REGISTRY_PUBLISHER_METADATA_URL + value: "#DATA_REGISTRY_PUBLISHER_METADATA_URL#" + - name: DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS + value: "#DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS#" - name: WEB_BASE_URL value: "#WEB_BASE_URL#" + - name: CHECKER_LOOP_WAIT_MINS + value: "#CHECKER_LOOP_WAIT_MINS#" - name: NUMBER_DOWNLOADER_THREADS value: "#NUMBER_DOWNLOADER_THREADS#" - name: FORCE_REDOWNLOAD_AFTER_HOURS value: "#FORCE_REDOWNLOAD_AFTER_HOURS#" - name: REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS value: "#REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS#" - - name: LOGFILE - value: "" - name: ZIP_WORKING_DIR value: "#ZIP_WORKING_DIR#" - name: AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML diff --git a/azure-deployment/generate-manifest-from-template.sh b/azure-deployment/generate-manifest-from-template.sh index f6b65bb..19abc6a 100755 --- a/azure-deployment/generate-manifest-from-template.sh +++ b/azure-deployment/generate-manifest-from-template.sh @@ -43,6 +43,8 @@ sed -i ''s^#DB_CONNECTION_TIMEOUT#^$DB_CONNECTION_TIMEOUT^g'' ./azure-deployment sed -i ''s^#DATA_REGISTRATION#^$DATA_REGISTRATION^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#DATA_REGISTRY_BASE_URL#^$DATA_REGISTRY_BASE_URL^g'' 
./azure-deployment/azure-resource-manager-deployment-manifest.yml +sed -i ''s^#DATA_REGISTRY_PUBLISHER_METADATA_URL#^$DATA_REGISTRY_PUBLISHER_METADATA_URL^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml +sed -i ''s^#DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS#^$DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#NUMBER_DOWNLOADER_THREADS#^$NUMBER_DOWNLOADER_THREADS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#FORCE_REDOWNLOAD_AFTER_HOURS#^$FORCE_REDOWNLOAD_AFTER_HOURS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS#^$REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml @@ -50,3 +52,4 @@ sed -i ''s^#ZIP_WORKING_DIR#^$ZIP_WORKING_DIR^g'' ./azure-deployment/azure-resou sed -i ''s^#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML#^$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP#^$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml sed -i ''s^#WEB_BASE_URL#^$WEB_BASE_URL^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml +sed -i ''s^#CHECKER_LOOP_WAIT_MINS#^$CHECKER_LOOP_WAIT_MINS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml diff --git a/azure-deployment/manual-azure-deploy-variables-example.env b/azure-deployment/manual-azure-deploy-variables-example.env index fccf05e..198f3ac 100644 --- a/azure-deployment/manual-azure-deploy-variables-example.env +++ b/azure-deployment/manual-azure-deploy-variables-example.env @@ -8,14 +8,15 @@ WEB_BASE_URL= DATA_REGISTRATION=ckan-registry DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search +DATA_REGISTRY_PUBLISHER_METADATA_URL="https://iatiregistry.org/api/action/organization_list?all_fields=true&include_extras=true&include_tags=true" +DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24 FORCE_REDOWNLOAD_AFTER_HOURS=24 -# Log file -LOGFILE= - NUMBER_DOWNLOADER_THREADS=25 REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 ZIP_WORKING_DIR=/tmp/bulk-data-service-zip + +CHECKER_LOOP_WAIT_MINS=20 diff --git a/azure-provision/add-default-config-to-github-variables.sh b/azure-provision/add-default-config-to-github-variables.sh index ce17d50..404c11a 100755 --- a/azure-provision/add-default-config-to-github-variables.sh +++ b/azure-provision/add-default-config-to-github-variables.sh @@ -32,5 +32,6 @@ TARGET_ENVIRONMENT="$1" cp -f azure-provision/default-github-config-template.env azure-provision/default-github-config.env sed -i "s/^/${TARGET_ENVIRONMENT^^}/g" azure-provision/default-github-config.env +sed -i "s/{{TARGET_ENVIRONMENT}}/${TARGET_ENVIRONMENT}/g" azure-provision/default-github-config.env gh variable set --env-file ./azure-provision/default-github-config.env diff --git a/azure-provision/default-github-config-template.env b/azure-provision/default-github-config-template.env index da2fafc..481da52 100644 --- a/azure-provision/default-github-config-template.env +++ b/azure-provision/default-github-config-template.env @@ -1,9 +1,12 @@ -_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=\$web -_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=\$web +_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML="$web" 
+_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP="$web" _DATA_REGISTRATION=ckan-registry _DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search +_DATA_REGISTRY_PUBLISHER_METADATA_URL=https://iatiregistry.org/api/action/organization_list?all_fields=true&include_extras=true&include_tags=true +_DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24 _FORCE_REDOWNLOAD_AFTER_HOURS=24 -_LOGFILE=log _NUMBER_DOWNLOADER_THREADS=25 _REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 -_ZIP_WORKING_DIR=/tmp/bulk-data-service-zip \ No newline at end of file +_ZIP_WORKING_DIR=/tmp/bulk-data-service-zip +_WEB_BASE_URL="https://{{TARGET_ENVIRONMENT}}-bulk-data.iatistandard.org" +_CHECKER_LOOP_WAIT_MINS=20 From 10f865b383fcd9d701fe71f7ab2c8696f89bfb89 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:32:44 +0100 Subject: [PATCH 03/12] feat: turn caching on CDN off --- azure-provision/azure-create-resources.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/azure-provision/azure-create-resources.sh b/azure-provision/azure-create-resources.sh index ebc0652..585bb1b 100755 --- a/azure-provision/azure-create-resources.sh +++ b/azure-provision/azure-create-resources.sh @@ -206,6 +206,11 @@ az cdn endpoint create --resource-group "$RESOURCE_GROUP_NAME" \ --origin-host-header "$AZURE_BASE_HOSTNAME" \ --location global +az cdn endpoint rule add --resource-group "$RESOURCE_GROUP_NAME" \ + --profile-name "$CDN_PROFILE_NAME" \ + --name "$CDN_ENDPOINT_NAME" \ + --action-name "CacheExpiration" \ + --cache-behavior BypassCache --rule-name global --order 0 read -p "Press any key when the CNAME has been created on Cloudflare " -n 1 -r From 398cd8e224ce02052b5083ba268428828c625772 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:37:14 +0100 Subject: [PATCH 04/12] chore: env var for wait time; urllib3 warnings off --- Dockerfile | 2 +- src/bulk_data_service/checker.py | 3 ++- src/config/config.py | 4 +++- src/config/initialisation.py | 9 +++++++++ src/iati_bulk_data_service.py | 3 +++ 5 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 src/config/initialisation.py diff --git a/Dockerfile b/Dockerfile index 3b702c2..b84c3fc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.12-slim-bookworm +FROM python:3.12.5-slim-bookworm RUN apt-get update -y diff --git a/src/bulk_data_service/checker.py b/src/bulk_data_service/checker.py index 5d6d417..f0e1240 100644 --- a/src/bulk_data_service/checker.py +++ b/src/bulk_data_service/checker.py @@ -33,7 +33,8 @@ def checker_service_loop(context: dict): zipper_run(context, datasets_in_zip, datasets_in_bds) - time.sleep(60 * 30) + context["logger"].info("Pausing for {} mins".format(context["CHECKER_LOOP_WAIT_MINS"])) + time.sleep(60 * int(context["CHECKER_LOOP_WAIT_MINS"])) except Exception as e: context["logger"].error( diff --git a/src/config/config.py b/src/config/config.py index 5d1d1ea..28908e4 100644 --- a/src/config/config.py +++ b/src/config/config.py @@ -3,11 +3,12 @@ _config_variables = [ "DATA_REGISTRATION", "DATA_REGISTRY_BASE_URL", + "DATA_REGISTRY_PUBLISHER_METADATA_URL", + "DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS", "WEB_BASE_URL", "NUMBER_DOWNLOADER_THREADS", "FORCE_REDOWNLOAD_AFTER_HOURS", "REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS", - "LOGFILE", "ZIP_WORKING_DIR", "DB_NAME", "DB_USER", @@ -19,6 +20,7 @@ "AZURE_STORAGE_CONNECTION_STRING",
"AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML", "AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP", + "CHECKER_LOOP_WAIT_MINS" ] diff --git a/src/config/initialisation.py b/src/config/initialisation.py new file mode 100644 index 0000000..b8dddf9 --- /dev/null +++ b/src/config/initialisation.py @@ -0,0 +1,9 @@ +import urllib3 + + +def misc_global_initialisation(context: dict): + # these warnings are turned off because many IATI XML files are served + # from web servers with misconfigured certificates, so we can't verify + # certificates, and without this the logs fill up with warnings saying we + # should turn verification on + urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning) diff --git a/src/iati_bulk_data_service.py b/src/iati_bulk_data_service.py index fc9d51e..ff4e7ed 100644 --- a/src/iati_bulk_data_service.py +++ b/src/iati_bulk_data_service.py @@ -3,6 +3,7 @@ from bulk_data_service.checker import checker from bulk_data_service.zipper import zipper from config.config import get_config +from config.initialisation import misc_global_initialisation from utilities.azure import create_azure_blob_containers from utilities.db import apply_db_migrations from utilities.logging import initialise_logging @@ -20,6 +21,8 @@ def main(args: argparse.Namespace): create_azure_blob_containers(context) + misc_global_initialisation(context) + if args.operation == "checker": checker(context) elif args.operation == "zipper": From 7e4a898ffe8f4ca49b0a744e3f9d8aaaf71f2b17 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:20:28 +0100 Subject: [PATCH 05/12] test: update test env --- tests-local-environment/.env | 11 ++++--- ...n-registration-and-data-server-config.json | 33 +++++++++++++++++++ tests/artifacts/config-files/env-file-1 | 9 +++-- tests/artifacts/config-files/env-file-2 | 9 +++-- 4 files changed, 47 insertions(+), 15 deletions(-) diff --git a/tests-local-environment/.env b/tests-local-environment/.env index 717369a..d3658f2 100644 --- a/tests-local-environment/.env +++ b/tests-local-environment/.env @@ -1,5 +1,7 @@ DATA_REGISTRATION=ckan-registry DATA_REGISTRY_BASE_URL=http://localhost:3000/registration/datasets-03 +DATA_REGISTRY_PUBLISHER_METADATA_URL=http://localhost:3000/registration/ckan-publishers +DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24 WEB_BASE_URL=http://127.0.0.1:10000/devstoreaccount1 @@ -9,9 +11,6 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24 REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 -# Logs directory -LOGFILE= - ZIP_WORKING_DIR=/tmp/bulk-data-service-zip DB_NAME=bulk_data_service_db @@ -27,5 +26,7 @@ DB_CONNECTION_TIMEOUT=30 # Azurite Emulator (run from docker compose) AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:11000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:11001/devstoreaccount1;TableEndpoint=http://127.0.0.1:11002/devstoreaccount1; -AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml -AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip +AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=test-data +AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=test-data + +CHECKER_LOOP_WAIT_MINS=1 diff --git a/tests-local-environment/mockoon-registration-and-data-server-config.json b/tests-local-environment/mockoon-registration-and-data-server-config.json index f3f2c39..774d281 100644 --- 
a/tests-local-environment/mockoon-registration-and-data-server-config.json +++ b/tests-local-environment/mockoon-registration-and-data-server-config.json @@ -402,6 +402,35 @@ } ], "responseMode": null + }, + { + "uuid": "52cc8a4f-d934-4056-a1dd-1fd970079d14", + "type": "http", + "documentation": "", + "method": "get", + "endpoint": "registration/ckan-publishers", + "responses": [ + { + "uuid": "dc002f3b-6e85-446b-95cb-223ba39082f6", + "body": "{\n \"help\": \"https://iatiregistry.org/api/3/action/help_show?name=organization_list\",\n \"success\": true,\n \"result\": [\n {\n \"description\": \"\",\n \"id\": \"4f0f8498-20d2-4ca5-a20f-f441eedb1d4f\",\n \"image_display_url\": \"\",\n \"image_url\": \"\",\n \"is_organization\": true,\n \"license_id\": \"notspecified\",\n \"name\": \"test_org_a\",\n \"num_followers\": 0,\n \"package_count\": 0,\n \"publisher_agencies\": \"\",\n \"publisher_constraints\": \"\",\n \"publisher_contact\": \"s'Gravenhekje 1a\\r\\n1011TG Amsterdam\\r\\nThe Netherlands\",\n \"publisher_contact_email\": \"please@update.email\",\n \"publisher_country\": \"NL\",\n \"publisher_data_quality\": \"\",\n \"publisher_description\": \"\",\n \"publisher_field_exclusions\": \"\",\n \"publisher_frequency\": \"\",\n \"publisher_frequency_select\": \"not_specified\",\n \"publisher_iati_id\": \"TEST_ORG_A\",\n \"publisher_implementation_schedule\": \"\",\n \"publisher_organization_type\": \"60\",\n \"publisher_record_exclusions\": \"\",\n \"publisher_refs\": \"\",\n \"publisher_segmentation\": \"\",\n \"publisher_source_type\": \"primary_source\",\n \"publisher_thresholds\": \"\",\n \"publisher_timeliness\": \"\",\n \"publisher_ui\": \"\",\n \"publisher_units\": \"\",\n \"publisher_url\": \"http://www.onepercentclub.com\",\n \"state\": \"active\",\n \"title\": \"Test Organisation A\",\n \"type\": \"organization\",\n \"users\": [\n {\n \"capacity\": \"admin\",\n \"name\": \"onepercentclub\"\n }\n ],\n \"tags\": [\n \n ],\n \"groups\": [\n \n ]\n }, \n {\n \"description\": \"\",\n \"id\": \"31ffc713-cba9-4af9-a71e-9306d00c11e8\",\n \"image_display_url\": \"\",\n \"image_url\": \"\",\n \"is_organization\": true,\n \"license_id\": \"notspecified\",\n \"name\": \"onepercentclub\",\n \"num_followers\": 0,\n \"package_count\": 0,\n \"publisher_agencies\": \"\",\n \"publisher_constraints\": \"\",\n \"publisher_contact\": \"s'Gravenhekje 1a\\r\\n1011TG Amsterdam\\r\\nThe Netherlands\",\n \"publisher_contact_email\": \"please@update.email\",\n \"publisher_country\": \"NL\",\n \"publisher_data_quality\": \"\",\n \"publisher_description\": \"\",\n \"publisher_field_exclusions\": \"\",\n \"publisher_frequency\": \"\",\n \"publisher_frequency_select\": \"not_specified\",\n \"publisher_iati_id\": \"NL-KVK-3427895\",\n \"publisher_implementation_schedule\": \"\",\n \"publisher_organization_type\": \"60\",\n \"publisher_record_exclusions\": \"\",\n \"publisher_refs\": \"\",\n \"publisher_segmentation\": \"\",\n \"publisher_source_type\": \"primary_source\",\n \"publisher_thresholds\": \"\",\n \"publisher_timeliness\": \"\",\n \"publisher_ui\": \"\",\n \"publisher_units\": \"\",\n \"publisher_url\": \"http://www.onepercentclub.com\",\n \"state\": \"active\",\n \"title\": \"1%Club\",\n \"type\": \"organization\",\n \"users\": [\n {\n \"capacity\": \"admin\",\n \"name\": \"onepercentclub\"\n }\n ],\n \"tags\": [\n \n ],\n \"groups\": [\n \n ]\n },\n {\n \"description\": \"\",\n \"id\": \"1a3e3f42-6704-4adf-897a-9bdf5b854a00\",\n \"image_display_url\": \"\",\n \"image_url\": \"\",\n \"is_organization\": 
true,\n \"license_id\": \"notspecified\",\n \"name\": \"3fi\",\n \"num_followers\": 0,\n \"package_count\": 1,\n \"publisher_agencies\": \"\",\n \"publisher_constraints\": \"\",\n \"publisher_contact\": \"3F\\r\\nKampmannsgade 4\\r\\nDK-1790 Copenhagen V\\r\\nDenmark \",\n \"publisher_contact_email\": \"jesper.nielsen@3f.dk\",\n \"publisher_country\": \"DK\",\n \"publisher_data_quality\": \"\",\n \"publisher_description\": \"\",\n \"publisher_field_exclusions\": \"\",\n \"publisher_first_publish_date\": \"\",\n \"publisher_frequency\": \"\",\n \"publisher_frequency_select\": \"not_specified\",\n \"publisher_iati_id\": \"DK-CVR-31378028\",\n \"publisher_implementation_schedule\": \"\",\n \"publisher_organization_type\": \"22\",\n \"publisher_record_exclusions\": \"\",\n \"publisher_refs\": \"\",\n \"publisher_segmentation\": \"\",\n \"publisher_source_type\": \"primary_source\",\n \"publisher_thresholds\": \"\",\n \"publisher_timeliness\": \"\",\n \"publisher_ui\": \"\",\n \"publisher_units\": \"\",\n \"publisher_url\": \"https://tema.3f.dk/international \",\n \"state\": \"active\",\n \"title\": \"3F International\",\n \"type\": \"organization\",\n \"users\": [\n {\n \"capacity\": \"admin\",\n \"name\": \"3-f_international\"\n }\n ],\n \"tags\": [\n \n ],\n \"groups\": [\n \n ]\n },\n {\n \"description\": \"\",\n \"id\": \"8c858655-cb84-4527-befa-b77d4de07e22\",\n \"image_display_url\": \"\",\n \"image_url\": \"\",\n \"is_organization\": true,\n \"license_id\": \"odc-pddl\",\n \"name\": \"fifty_eight\",\n \"num_followers\": 0,\n \"package_count\": 1,\n \"publisher_agencies\": \"\",\n \"publisher_constraints\": \"\",\n \"publisher_contact\": \"2030hub\\r\\n23 Argyle St\\r\\nLiverpool\\r\\nL1 5BL\",\n \"publisher_contact_email\": \"keithshortley@50eight.com\",\n \"publisher_country\": \"GB\",\n \"publisher_data_quality\": \"\",\n \"publisher_description\": \"Fifty Eight partners with companies, NGOs, public sector and others ​to address the challenges of modern slavery and improve working conditions in global supply chains. 
\",\n \"publisher_field_exclusions\": \"\",\n \"publisher_first_publish_date\": \"2020-02-17T00:00:00.000000\",\n \"publisher_frequency\": \"\",\n \"publisher_frequency_select\": \"quarterly\",\n \"publisher_iati_id\": \"GB-COH-07291220\",\n \"publisher_implementation_schedule\": \"\",\n \"publisher_organization_type\": \"70\",\n \"publisher_record_exclusions\": \"\",\n \"publisher_refs\": \"\",\n \"publisher_segmentation\": \"IATI data will be published at country level where possible and appropriate.\",\n \"publisher_source_type\": \"primary_source\",\n \"publisher_thresholds\": \"\",\n \"publisher_timeliness\": \"Within 14 days of period end\",\n \"publisher_ui\": \"\",\n \"publisher_units\": \"UK Aid Connect EAPEC \",\n \"publisher_url\": \"https://www.50eight.com/\",\n \"state\": \"active\",\n \"title\": \"50 Eight Limited\",\n \"type\": \"organization\",\n \"users\": [\n {\n \"capacity\": \"admin\",\n \"name\": \"keithshortley\"\n }\n ],\n \"tags\": [\n \n ],\n \"groups\": [\n \n ]\n }\n ]\n}\n", + "latency": 0, + "statusCode": 200, + "label": "", + "headers": [], + "bodyType": "INLINE", + "filePath": "", + "databucketID": "", + "sendFileAsBody": false, + "rules": [], + "rulesOperator": "OR", + "disableTemplating": false, + "fallbackTo404": false, + "default": true, + "crudKey": "id", + "callbacks": [] + } + ], + "responseMode": null } ], "rootChildren": [ @@ -429,6 +458,10 @@ "type": "route", "uuid": "b81b26b1-62f6-473f-9815-bbb219c7667b" }, + { + "type": "route", + "uuid": "52cc8a4f-d934-4056-a1dd-1fd970079d14" + }, { "type": "route", "uuid": "fe5038e8-f3a5-43d2-ab12-3fc2c72b2c11" diff --git a/tests/artifacts/config-files/env-file-1 b/tests/artifacts/config-files/env-file-1 index 49311fe..9fe7c7f 100644 --- a/tests/artifacts/config-files/env-file-1 +++ b/tests/artifacts/config-files/env-file-1 @@ -1,5 +1,7 @@ DATA_REGISTRATION=ckan-registry DATA_REGISTRY_BASE_URL=http://localhost:3000/registration-service/ +DATA_REGISTRY_PUBLISHER_METADATA_URL=http://localhost:3000/registration/ckan-publishers +DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24 WEB_BASE_URL=http://127.0.0.1:10000/devstoreaccount1 @@ -9,9 +11,6 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24 REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 -# Log file -LOGFILE= - ZIP_WORKING_DIR=/tmp/bulk-data-service-zip DB_NAME=bulk_data_service_db @@ -29,5 +28,5 @@ DB_CONNECTION_TIMEOUT=30 AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1; -AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml -AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip +AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML="test-data" +AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP="test-data" diff --git a/tests/artifacts/config-files/env-file-2 b/tests/artifacts/config-files/env-file-2 index 6577004..131472d 100644 --- a/tests/artifacts/config-files/env-file-2 +++ b/tests/artifacts/config-files/env-file-2 @@ -1,5 +1,7 @@ DATA_REGISTRATION=ckan-registry DATA_REGISTRY_BASE_URL=http://localhost:3000/registration-service/ +DATA_REGISTRY_PUBLISHER_METADATA_URL=http://localhost:3000/registration/ckan-publishers +DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24 WEB_BASE_URL=http://127.0.0.1:10000/devstoreaccount1/ @@ -9,9 +11,6 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24 
REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72 -# Log file -LOGFILE= - ZIP_WORKING_DIR=/tmp/bulk-data-service-zip DB_NAME=bulk_data_service_db @@ -29,5 +28,5 @@ DB_CONNECTION_TIMEOUT=30 AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1; -AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml -AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip +AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML="test-data" +AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP="test-data" From 2d472990bcba0d0c225a79dee7d0b64de728d886 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:21:11 +0100 Subject: [PATCH 06/12] test: tests for utility functions --- tests/unit/test_utilities_http.py | 15 +++++ tests/unit/test_utilities_misc.py | 106 +++++++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_utilities_http.py diff --git a/tests/unit/test_utilities_http.py b/tests/unit/test_utilities_http.py new file mode 100644 index 0000000..ef7a883 --- /dev/null +++ b/tests/unit/test_utilities_http.py @@ -0,0 +1,15 @@ +import datetime + +import pytest + +from utilities.http import parse_last_modified_header + + +@pytest.mark.parametrize("input,expected", [ + ("Fri, 06 Sep 2024 13:08:28 GMT", datetime.datetime(2024, 9, 6, 13, 8, 28, 0, datetime.timezone.utc)), + ("Wed, 21 Oct 2015 07:28:00 GMT", datetime.datetime(2015, 10, 21, 7, 28, 0, 0, datetime.timezone.utc)), + ("Wed, 21 Oct 2015 07:28:00", None), + ("Wed, 21 October 2015 07:28:00 +00:00", None), +]) +def test_parse_http_last_modified_header(input, expected): + assert parse_last_modified_header(input) == expected diff --git a/tests/unit/test_utilities_misc.py b/tests/unit/test_utilities_misc.py index e6ea5fa..46e4f18 100644 --- a/tests/unit/test_utilities_misc.py +++ b/tests/unit/test_utilities_misc.py @@ -1,4 +1,6 @@ -from utilities.misc import get_hash, get_hash_excluding_generated_timestamp +import pytest + +from utilities.misc import filter_dict_by_structure, get_hash, get_hash_excluding_generated_timestamp def test_get_hash(): @@ -51,3 +53,105 @@ def test_get_hash_excluding_generated_timestamp(): hash = get_hash_excluding_generated_timestamp(yiplActivitiesXmlFile) assert(hash == "759eaa39276381f3fc146232cefd2111a2abc199") + + +@pytest.mark.parametrize("input,structure,expected", [ + ({"a": 10}, {"a" : None}, {"a": 10}), + ({"a": None}, {"a" : None}, {"a": None}), + ({"a": 10, "b": "should be filtered"}, {"a" : None}, {"a": 10}), + ({"a": 10, "b": "should be filtered", "c" : "also filtered"}, {"a" : None}, {"a": 10}), + ({"a": 10, "b": "included"}, {"a" : None, "b": None}, {"a": 10, "b": "included"}), + ({"b": "included"}, {"a" : None, "b": None}, {"b": "included"}), + ({"c": 10}, {"a" : None, "b": None}, {}), + ({}, {"a" : None, "b": None}, {}), + ({}, {}, {}), +]) +def test_filter_dict_atomic_value(input, structure, expected): + + assert filter_dict_by_structure(input, structure) == expected + + +@pytest.mark.parametrize("input,structure,expected", [ + ({}, + {"a": {"include": None}}, + {}), + + ({"a": {}}, + {"a": {"include": None}}, + {"a": {}}), + + ({"a": {"include": 10, "filter out": 20}}, + {"a": {"include": None}}, + {"a": {"include": 10}}), + + ({"a": 
{"filter out": 20}}, + {"a": {"include": None}}, + {"a": {}}), + + ({"a": {"filter out": 20}}, + {"a": {}}, + {"a": {}}), + + ({"a": None}, + {"a": {}}, + {"a": None}), + + ({"a": None}, + {"a": {"b": None}}, + {"a": None}), + + ({"a": 10}, + {"a": {"b": {}}}, + {"a": 10}), + + ({"a": ["test"]}, + {"a": {"b": {}}}, + {"a": ["test"]}), + + +]) +def test_filter_dict_nested_dict(input, structure, expected): + + assert filter_dict_by_structure(input, structure) == expected + + +@pytest.mark.parametrize("input,structure,expected", [ + ({}, + {"a": []}, + {}), + + ({"a": []}, + {"a": None}, + {"a": []}), + + ({"a": ["one", "two"]}, + {"a": None}, + {"a": ["one", "two"]}), + +]) +def test_filter_dict_with_list_no_dict_items(input, structure, expected): + + assert filter_dict_by_structure(input, structure) == expected + + +@pytest.mark.parametrize("input,structure,expected", [ + ({}, + {"a": [{}]}, + {}), + + ({"a": []}, + {"a": [{}]}, + {"a": []}), + + ({"a": ["one", "two"]}, + {"a": [{}]}, + {"a": [{}, {}]}), + + ({"a": [{"include": 10, "filter": 20}]}, + {"a": [{"include": None}]}, + {"a": [{"include": 10}]}), + +]) +def test_filter_dict_with_list_with_dict_items(input, structure, expected): + + assert filter_dict_by_structure(input, structure) == expected From 3377c9d93f652be33b39daa569ad12fa19e46fdb Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:21:36 +0100 Subject: [PATCH 07/12] test: tests for fetching, storing CKAN metadata --- tests/integration/test_dataset_add.py | 206 +++++++++++++++++- .../integration/test_dataset_registration.py | 82 ++++++- tests/integration/test_dataset_update.py | 202 ++++++++++++++++- 3 files changed, 486 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_dataset_add.py b/tests/integration/test_dataset_add.py index c9ef98c..e5c5c1a 100644 --- a/tests/integration/test_dataset_add.py +++ b/tests/integration/test_dataset_add.py @@ -1,3 +1,4 @@ +import json import uuid import pytest @@ -11,7 +12,107 @@ ("publisher_id", uuid.UUID("ea055d99-f7e9-456f-9f99-963e95493c1b")), ("publisher_name", "test_foundation_a"), ("source_url", "http://localhost:3000/data/test_foundation_a-dataset-404.xml"), - ("type", "activity") + ("type", "activity"), + ("registration_service_name", "ckan-registry"), + ("registration_service_dataset_metadata", json.dumps( + { + "author": None, + "author_email": "publisher@email-here.com", + "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb", + "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159", + "isopen": True, + "license_id": "other-at", + "license_title": "Other (Attribution)", + "maintainer": None, + "maintainer_email": None, + "metadata_created": "2024-03-04T10:24:11.373108", + "metadata_modified": "2024-05-07T15:38:58.740018", + "name": "test_foundation_a-dataset-001", + "notes": "", + "num_resources": 1, + "num_tags": 0, + "organization": { + "id": "ea055d99-f7e9-456f-9f99-963e95493c1b", + "name": "test_foundation_a", + "title": "Test Foundation A", + "type": "organization", + "description": "", + "image_url": "", + "created": "2020-02-24T20:56:01.763851", + "is_organization": True, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "5d04f169-c702-45fe-8162-da7834859d86", + "private": False, + "state": "active", + "title": "040324", + "type": "dataset", + "url": None, + "version": None, + "extras": [ + { + "key": "activity_count", + "value": "10" + }, + { + "key": "country", + "value": "GB" + }, + { + "key": "data_updated", + "value": "2024-03-01 
14:24:09" + }, + { + "key": "filetype", + "value": "activity" + }, + { + "key": "iati_version", + "value": "2.03" + }, + { + "key": "language", + "value": "" + }, + { + "key": "secondary_publisher", + "value": "" + }, + { + "key": "validation_status", + "value": "Not Found" + } + ], + "resources": [ + { + "cache_last_updated": None, + "cache_url": None, + "created": "2024-05-07T15:38:57.312249", + "description": None, + "format": "IATI-XML", + "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac", + "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe", + "last_modified": None, + "metadata_modified": "2024-05-07T15:38:58.757860", + "mimetype": "", + "mimetype_inner": None, + "name": None, + "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9", + "position": 0, + "resource_type": None, + "size": 399382, + "state": "active", + "url": "http://localhost:3000/data/test_foundation_a-dataset-404.xml", + "url_type": None + } + ], + "tags": [], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } + )) ]) def test_add_new_undownloadable_dataset(get_and_clear_up_context, field, expected): # noqa: F811 @@ -21,6 +122,7 @@ def test_add_new_undownloadable_dataset(get_and_clear_up_context, field, expecte # dataset c8a40aa5-9f31-... with 404 context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03" + datasets_in_bds = {} checker_run(context, datasets_in_bds) @@ -37,7 +139,107 @@ def test_add_new_undownloadable_dataset(get_and_clear_up_context, field, expecte ("source_url", "http://localhost:3000/data/test_foundation_a-dataset-001.xml"), ("type", "activity"), ("hash", "7703103493edafde0dbce66e507abb642fc7bd52"), - ("hash_excluding_generated_timestamp", "ef0eead52dfc3bd86311c84327282f607402dfff") + ("hash_excluding_generated_timestamp", "ef0eead52dfc3bd86311c84327282f607402dfff"), + ("registration_service_name", "ckan-registry"), + ("registration_service_dataset_metadata", json.dumps( + { + "author": None, + "author_email": "publisher@email-here.com", + "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb", + "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159", + "isopen": True, + "license_id": "other-at", + "license_title": "Other (Attribution)", + "maintainer": None, + "maintainer_email": None, + "metadata_created": "2024-03-04T10:24:11.373108", + "metadata_modified": "2024-05-07T15:38:58.740018", + "name": "test_foundation_a-dataset-001", + "notes": "", + "num_resources": 1, + "num_tags": 0, + "organization": { + "id": "ea055d99-f7e9-456f-9f99-963e95493c1b", + "name": "test_foundation_a", + "title": "Test Foundation A", + "type": "organization", + "description": "", + "image_url": "", + "created": "2020-02-24T20:56:01.763851", + "is_organization": True, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "5d04f169-c702-45fe-8162-da7834859d86", + "private": False, + "state": "active", + "title": "040324", + "type": "dataset", + "url": None, + "version": None, + "extras": [ + { + "key": "activity_count", + "value": "10" + }, + { + "key": "country", + "value": "GB" + }, + { + "key": "data_updated", + "value": "2024-03-01 14:24:09" + }, + { + "key": "filetype", + "value": "activity" + }, + { + "key": "iati_version", + "value": "2.03" + }, + { + "key": "language", + "value": "" + }, + { + "key": "secondary_publisher", + "value": "" + }, + { + "key": "validation_status", + "value": "Not Found" + } + ], + "resources": [ + { + "cache_last_updated": None, + "cache_url": None, + "created": "2024-05-07T15:38:57.312249", + "description": None, + 
"format": "IATI-XML", + "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac", + "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe", + "last_modified": None, + "metadata_modified": "2024-05-07T15:38:58.757860", + "mimetype": "", + "mimetype_inner": None, + "name": None, + "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9", + "position": 0, + "resource_type": None, + "size": 399382, + "state": "active", + "url": "http://localhost:3000/data/test_foundation_a-dataset-001.xml", + "url_type": None + } + ], + "tags": [], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } + )) ]) def test_add_new_downloadable_dataset(get_and_clear_up_context, field, expected): # noqa: F811 diff --git a/tests/integration/test_dataset_registration.py b/tests/integration/test_dataset_registration.py index 5c2bc1c..2028927 100644 --- a/tests/integration/test_dataset_registration.py +++ b/tests/integration/test_dataset_registration.py @@ -1,11 +1,15 @@ +import json + import pytest from bulk_data_service.checker import checker_run +from dataset_registration.iati_registry_ckan import get_publisher_metadata_as_str from helpers.helpers import get_and_clear_up_context # noqa: F401 +from utilities.http import get_requests_session @pytest.mark.parametrize("http_status_code", ["400", "404", "500"]) -def test_ckan_reg_url_400(get_and_clear_up_context, http_status_code): # noqa: F811 +def test_ckan_registry_url_400(get_and_clear_up_context, http_status_code): # noqa: F811 context = get_and_clear_up_context context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/error-response/" + http_status_code @@ -14,3 +18,79 @@ def test_ckan_reg_url_400(get_and_clear_up_context, http_status_code): # noqa: checker_run(context, datasets_in_bds) assert len(datasets_in_bds) == 0 + + +def test_ckan_registry_get_metadata_known_publisher(get_and_clear_up_context): # noqa: F811 + + expected = json.dumps({ + "description": "", + "id": "1a3e3f42-6704-4adf-897a-9bdf5b854a00", + "image_display_url": "", + "image_url": "", + "is_organization": True, + "license_id": "notspecified", + "name": "3fi", + "num_followers": 0, + "package_count": 1, + "publisher_agencies": "", + "publisher_constraints": "", + "publisher_contact": "3F\r\nKampmannsgade 4\r\nDK-1790 Copenhagen V\r\nDenmark ", + "publisher_contact_email": "jesper.nielsen@3f.dk", + "publisher_country": "DK", + "publisher_data_quality": "", + "publisher_description": "", + "publisher_field_exclusions": "", + "publisher_first_publish_date": "", + "publisher_frequency": "", + "publisher_frequency_select": "not_specified", + "publisher_iati_id": "DK-CVR-31378028", + "publisher_implementation_schedule": "", + "publisher_organization_type": "22", + "publisher_record_exclusions": "", + "publisher_refs": "", + "publisher_segmentation": "", + "publisher_source_type": "primary_source", + "publisher_thresholds": "", + "publisher_timeliness": "", + "publisher_ui": "", + "publisher_units": "", + "publisher_url": "https://tema.3f.dk/international ", + "state": "active", + "title": "3F International", + "type": "organization", + "users": [ + { + "capacity": "admin", + "name": "3-f_international" + } + ], + "tags": [ + + ], + "groups": [ + + ] + }) + + context = get_and_clear_up_context + + context["DATA_REGISTRY_PUBLISHER_METADATA_URL"] = "http://localhost:3000/registration/ckan-publishers" + + session = get_requests_session() + + publisher_metadata_str = get_publisher_metadata_as_str(context, session, "3fi") + + assert publisher_metadata_str == expected + + +def 
test_ckan_registry_get_metadata_unknown_publisher(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + context["DATA_REGISTRY_PUBLISHER_METADATA_URL"] = "http://localhost:3000/registration/ckan-publishers" + + session = get_requests_session() + + publisher_metadata_str = get_publisher_metadata_as_str(context, session, "unknown_publisher") + + assert publisher_metadata_str == "{}" diff --git a/tests/integration/test_dataset_update.py b/tests/integration/test_dataset_update.py index ea6f3e2..53c55ec 100644 --- a/tests/integration/test_dataset_update.py +++ b/tests/integration/test_dataset_update.py @@ -1,3 +1,4 @@ +import json import uuid import pytest @@ -12,7 +13,206 @@ ("publisher_name", "test_foundation_a", "test_org_a"), ("source_url", "http://localhost:3000/data/test_foundation_a-dataset-001.xml", "http://localhost:3000/not_found"), - ("type", "activity", "organisation") + ("type", "activity", "organisation"), + ("registration_service_dataset_metadata", json.dumps( + { + "author": None, + "author_email": "publisher@email-here.com", + "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb", + "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159", + "isopen": True, + "license_id": "other-at", + "license_title": "Other (Attribution)", + "maintainer": None, + "maintainer_email": None, + "metadata_created": "2024-03-04T10:24:11.373108", + "metadata_modified": "2024-05-07T15:38:58.740018", + "name": "test_foundation_a-dataset-001", + "notes": "", + "num_resources": 1, + "num_tags": 0, + "organization": { + "id": "ea055d99-f7e9-456f-9f99-963e95493c1b", + "name": "test_foundation_a", + "title": "Test Foundation A", + "type": "organization", + "description": "", + "image_url": "", + "created": "2020-02-24T20:56:01.763851", + "is_organization": True, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "5d04f169-c702-45fe-8162-da7834859d86", + "private": False, + "state": "active", + "title": "040324", + "type": "dataset", + "url": None, + "version": None, + "extras": [ + { + "key": "activity_count", + "value": "10" + }, + { + "key": "country", + "value": "GB" + }, + { + "key": "data_updated", + "value": "2024-03-01 14:24:09" + }, + { + "key": "filetype", + "value": "activity" + }, + { + "key": "iati_version", + "value": "2.03" + }, + { + "key": "language", + "value": "" + }, + { + "key": "secondary_publisher", + "value": "" + }, + { + "key": "validation_status", + "value": "Not Found" + } + ], + "resources": [ + { + "cache_last_updated": None, + "cache_url": None, + "created": "2024-05-07T15:38:57.312249", + "description": None, + "format": "IATI-XML", + "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac", + "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe", + "last_modified": None, + "metadata_modified": "2024-05-07T15:38:58.757860", + "mimetype": "", + "mimetype_inner": None, + "name": None, + "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9", + "position": 0, + "resource_type": None, + "size": 399382, + "state": "active", + "url": "http://localhost:3000/data/test_foundation_a-dataset-001.xml", + "url_type": None + } + ], + "tags": [], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } + ), + json.dumps( + { + "author": None, + "author_email": "publisher@email-here.com", + "creator_user_id": "4abc4897-94b7-4b0e-84c2-c8778f435ccb", + "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159", + "isopen": True, + "license_id": "other-at", + "license_title": "Other (Attribution)", + "maintainer": None, + "maintainer_email": None, + 
"metadata_created": "2024-03-04T10:24:11.373108", + "metadata_modified": "2024-05-07T15:38:58.740018", + "name": "test_foundation_a-dataset-001", + "notes": "", + "num_resources": 1, + "num_tags": 0, + "organization": { + "id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f", + "name": "test_org_a", + "title": "Test Foundation A", + "type": "organization", + "description": "", + "image_url": "", + "created": "2020-02-24T20:56:01.763851", + "is_organization": True, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "5d04f169-c702-45fe-8162-da7834859d86", + "private": False, + "state": "active", + "title": "040324", + "type": "dataset", + "url": None, + "version": None, + "extras": [ + { + "key": "activity_count", + "value": "10" + }, + { + "key": "country", + "value": "GB" + }, + { + "key": "data_updated", + "value": "2024-03-01 14:24:09" + }, + { + "key": "filetype", + "value": "organisation" + }, + { + "key": "iati_version", + "value": "2.03" + }, + { + "key": "language", + "value": "" + }, + { + "key": "secondary_publisher", + "value": "" + }, + { + "key": "validation_status", + "value": "Not Found" + } + ], + "resources": [ + { + "cache_last_updated": None, + "cache_url": None, + "created": "2024-05-07T15:38:57.312249", + "description": None, + "format": "IATI-XML", + "hash": "f6bb14d61bb2652f1014d6ebfee3c4b873241bac", + "id": "d1b3d323-c8ba-48c5-89ce-6e745241d7fe", + "last_modified": None, + "metadata_modified": "2024-05-07T15:38:58.757860", + "mimetype": "", + "mimetype_inner": None, + "name": None, + "package_id": "b83ebe89-d522-4d3b-87e9-53aa9ac8eee9", + "position": 0, + "resource_type": None, + "size": 399382, + "state": "active", + "url": "http://localhost:3000/not_found", + "url_type": None + } + ], + "tags": [], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } + ) + ) ]) def test_update_dataset_publisher_details(get_and_clear_up_context, # noqa: F811 field, From 63040d754d722ed35f5845537836bacf661c4d22 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:22:06 +0100 Subject: [PATCH 08/12] test: tests for new ZIP file creation --- tests/helpers/helpers.py | 8 + tests/integration/test_dataset_indexing.py | 6 +- tests/integration/test_zip_creation.py | 228 ++++++++++++++++++++- 3 files changed, 231 insertions(+), 11 deletions(-) diff --git a/tests/helpers/helpers.py b/tests/helpers/helpers.py index 9133b18..4d8d55a 100644 --- a/tests/helpers/helpers.py +++ b/tests/helpers/helpers.py @@ -1,4 +1,6 @@ import glob +import os +import shutil from unittest import mock import pytest @@ -41,3 +43,9 @@ def get_and_clear_up_context(): yield context truncate_db_table(context) delete_azure_blob_containers(context) + # this is a sanity check to ensure we don't remove important files on a misconfiguration + if context["ZIP_WORKING_DIR"].startswith("/tmp"): + zip_dirs = [context["ZIP_WORKING_DIR"], f"{context['ZIP_WORKING_DIR']}-1", f"{context['ZIP_WORKING_DIR']}-2"] + for zip_dir in zip_dirs: + if os.path.exists(zip_dir): + shutil.rmtree(zip_dir) diff --git a/tests/integration/test_dataset_indexing.py b/tests/integration/test_dataset_indexing.py index 8a157d5..25a5dcf 100644 --- a/tests/integration/test_dataset_indexing.py +++ b/tests/integration/test_dataset_indexing.py @@ -4,7 +4,7 @@ from azure.storage.blob import BlobServiceClient from bulk_data_service.checker import checker_run -from bulk_data_service.dataset_indexing import get_index_name, get_minimal_index_fields +from 
bulk_data_service.dataset_indexing import get_full_index_fields, get_index_name, get_minimal_index_fields from helpers.helpers import get_and_clear_up_context # noqa: F401 from utilities.azure import get_azure_container_name @@ -87,13 +87,13 @@ def test_full_index_creation(get_and_clear_up_context): # noqa: F811 full_index_item = full_index["datasets"][dataset["name"]] - for field in dataset.keys(): + for field in get_full_index_fields(context): if isinstance(dataset[field], uuid.UUID): assert uuid.UUID(full_index_item[field]) == dataset[field] elif isinstance(dataset[field], str): assert full_index_item[field] == dataset[field] # -2 because of the two autogenerated fields - assert len(full_index_item.keys()) - 2 == len(dataset.keys()) + assert len(full_index_item.keys()) - 2 == len(get_full_index_fields(context)) blob_service_client.close() diff --git a/tests/integration/test_zip_creation.py b/tests/integration/test_zip_creation.py index 4a2a2bb..0683d3f 100644 --- a/tests/integration/test_zip_creation.py +++ b/tests/integration/test_zip_creation.py @@ -1,40 +1,252 @@ +import json +import os +import zipfile + from bulk_data_service.checker import checker_run from bulk_data_service.zipper import zipper_run from helpers.helpers import get_and_clear_up_context # noqa: F401 from helpers.helpers import get_number_xml_files_in_working_dir -def test_zip_creation_when_download_always_failed(get_and_clear_up_context): # noqa: F811 +def test_dataset_saved_for_download_success(get_and_clear_up_context): # noqa: F811 context = get_and_clear_up_context - context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03" + run_checker_then_zipper_download_ok(context) + + assert get_number_xml_files_in_working_dir(context) == 1 + assert os.path.exists("{}{}".format( + context["ZIP_WORKING_DIR"], + "/iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml")) is True + + +def test_dataset_not_saved_for_download_fail_and_no_cache(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_fail(context) + + assert get_number_xml_files_in_working_dir(context) == 0 + + +def test_dataset_saved_for_download_fail_but_cached(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_fail_but_cached(context) + + assert get_number_xml_files_in_working_dir(context) == 1 + assert os.path.exists("{}{}".format( + context["ZIP_WORKING_DIR"], + "/iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml")) is True + + +def test_publisher_metadata_saved_for_failed_metadata_dl(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_ok(context) + + assert os.path.exists(context["ZIP_WORKING_DIR"] + "-2/iati-data-main/metadata/test_foundation_a.json") is True + + +def test_publisher_metadata_saved_for_successful_metadata_dl(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-05" datasets_in_bds = {} checker_run(context, datasets_in_bds) datasets_in_zip = {} zipper_run(context, datasets_in_zip, datasets_in_bds) - assert len(datasets_in_zip) == 0 + assert os.path.exists(context["ZIP_WORKING_DIR"] + "-2/iati-data-main/metadata/test_org_a.json") is True - assert get_number_xml_files_in_working_dir(context) == 0 +def test_publisher_metadata_content_for_failed_metadata_dl(get_and_clear_up_context): # noqa: 
F811 -def test_zip_creation_when_download_recently_failed(get_and_clear_up_context): # noqa: F811 + context = get_and_clear_up_context + + run_checker_then_zipper_download_ok(context) + + with open(context["ZIP_WORKING_DIR"] + "-2/iati-data-main/metadata/test_foundation_a.json", "r") as f: + assert f.read() == "{}" + + +def test_publisher_metadata_content_for_successful_metadata_dl(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-05" + datasets_in_bds = {} + checker_run(context, datasets_in_bds) + + datasets_in_zip = {} + zipper_run(context, datasets_in_zip, datasets_in_bds) + + with open(context["ZIP_WORKING_DIR"] + "-2/iati-data-main/metadata/test_org_a.json", "r") as f: + assert f.read() == json.dumps( + { + "id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f", + "name": "test_org_a", + } + ) + + +def test_bds_zip_content_for_download_success(get_and_clear_up_context): # noqa: F811 context = get_and_clear_up_context + run_checker_then_zipper_download_ok(context) + + bds_zip_filename = context["ZIP_WORKING_DIR"] + "-1/iati-data.zip" + + assert os.path.exists(bds_zip_filename) is True + + bds_zip_file = zipfile.ZipFile(bds_zip_filename) + + filelist = bds_zip_file.namelist() + + assert ("iati-data/dataset-index-minimal.json" in filelist) is True + assert ("iati-data/dataset-index-full.json" in filelist) is True + assert ("iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" in filelist) is True + + +def test_bds_zip_content_for_download_fail_but_cached(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_fail_but_cached(context) + + bds_zip_filename = context["ZIP_WORKING_DIR"] + "-1/iati-data.zip" + + assert os.path.exists(bds_zip_filename) is True + + bds_zip_file = zipfile.ZipFile(bds_zip_filename) + + filelist = bds_zip_file.namelist() + + assert ("iati-data/dataset-index-minimal.json" in filelist) is True + assert ("iati-data/dataset-index-full.json" in filelist) is True + assert ("iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" in filelist) is True + + +def test_bds_zip_content_for_download_fail_no_cached(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_fail(context) + + bds_zip_filename = context["ZIP_WORKING_DIR"] + "-1/iati-data.zip" + + assert os.path.exists(bds_zip_filename) is True + + bds_zip_file = zipfile.ZipFile(bds_zip_filename) + + filelist = bds_zip_file.namelist() + + assert ("iati-data/dataset-index-minimal.json" in filelist) is True + assert ("iati-data/dataset-index-full.json" in filelist) is True + assert ("iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" in filelist) is False + + +def test_codeforiati_zip_content_for_download_success(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_ok(context) + + bds_zip_filename = context["ZIP_WORKING_DIR"] + "-2/code-for-iati-data-download.zip" + + assert os.path.exists(bds_zip_filename) is True + + bds_zip_file = zipfile.ZipFile(bds_zip_filename) + + filelist = bds_zip_file.namelist() + + assert ("iati-data/dataset-index-minimal.json" in filelist) is False + assert ("iati-data/dataset-index-full.json" in filelist) is False + assert ("iati-data-main/metadata.json" in filelist) is True + assert 
("iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml" in filelist) is True + assert ("iati-data-main/metadata/test_foundation_a.json" in filelist) is True + assert ("iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001.json" in filelist) is True + + +def test_codeforiati_zip_content_for_download_fail_but_cached(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_fail_but_cached(context) + + bds_zip_filename = context["ZIP_WORKING_DIR"] + "-2/code-for-iati-data-download.zip" + + assert os.path.exists(bds_zip_filename) is True + + bds_zip_file = zipfile.ZipFile(bds_zip_filename) + + filelist = bds_zip_file.namelist() + + assert ("iati-data/dataset-index-minimal.json" in filelist) is False + assert ("iati-data/dataset-index-full.json" in filelist) is False + assert ("iati-data-main/metadata.json" in filelist) is True + assert ("iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml" in filelist) is True + assert ("iati-data-main/metadata/test_foundation_a.json" in filelist) is True + assert ("iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001.json" in filelist) is True + + +def test_codeforiati_zip_content_for_download_fail_no_cached(get_and_clear_up_context): # noqa: F811 + + context = get_and_clear_up_context + + run_checker_then_zipper_download_fail(context) + + bds_zip_filename = context["ZIP_WORKING_DIR"] + "-2/code-for-iati-data-download.zip" + + assert os.path.exists(bds_zip_filename) is True + + bds_zip_file = zipfile.ZipFile(bds_zip_filename) + + fileinfolist = bds_zip_file.infolist() + filelist = [fileinfo.filename for fileinfo in fileinfolist] + + assert ("iati-data/dataset-index-minimal.json" in filelist) is False + assert ("iati-data/dataset-index-full.json" in filelist) is False + assert ("iati-data-main/metadata.json" in filelist) is True + assert ("iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml" in filelist) is True + assert ("iati-data-main/metadata/test_foundation_a.json" in filelist) is True + assert ("iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001.json" in filelist) is True + + fileinfoxml = list(filter(lambda f: f.filename == "iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml", fileinfolist))[0] + assert fileinfoxml.file_size == 0 + + +def run_checker_then_zipper_download_ok(context): context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01" datasets_in_bds = {} checker_run(context, datasets_in_bds) - # this is same dataset as above, only with a 404 URL + datasets_in_zip = {} + zipper_run(context, datasets_in_zip, datasets_in_bds) + + +def run_checker_then_zipper_download_fail(context): context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03" + datasets_in_bds = {} checker_run(context, datasets_in_bds) datasets_in_zip = {} zipper_run(context, datasets_in_zip, datasets_in_bds) - assert len(datasets_in_zip) == 1 - assert get_number_xml_files_in_working_dir(context) == 1 +def run_checker_then_zipper_download_fail_but_cached(context): + context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01" + datasets_in_bds = {} + checker_run(context, datasets_in_bds) + + # this is same dataset as above, only with a 404 URL + context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03" + checker_run(context, datasets_in_bds) + + datasets_in_zip = {} + zipper_run(context, 
datasets_in_zip, datasets_in_bds) From dc39f9c33c266b4f231194b808a01acc20715415 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:22:53 +0100 Subject: [PATCH 09/12] chore: utility functions for new features --- src/utilities/azure.py | 40 ++++++++++++++++++++++++++++++----- src/utilities/http.py | 18 +++++++++++++--- src/utilities/logging.py | 5 ----- src/utilities/misc.py | 45 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 95 insertions(+), 13 deletions(-) diff --git a/src/utilities/azure.py b/src/utilities/azure.py index 4320614..0302dfd 100644 --- a/src/utilities/azure.py +++ b/src/utilities/azure.py @@ -4,6 +4,11 @@ from azure.storage.blob import BlobServiceClient, ContentSettings +def azure_blob_exists(az_blob_service: BlobServiceClient, container_name: str, blob_name: str) -> bool: + blob_client = az_blob_service.get_blob_client(container_name, blob_name) + return blob_client.exists() + + def azure_download_blob(az_blob_service: BlobServiceClient, container_name: str, blob_name: str, filename: str): blob_client = az_blob_service.get_blob_client(container_name, blob_name) @@ -17,7 +22,7 @@ def azure_download_blob(az_blob_service: BlobServiceClient, container_name: str, def azure_upload_to_blob( az_blob_service: BlobServiceClient, container_name: str, blob_name: str, content: Any, content_type: str -): +) -> dict[str, Any]: blob_client = az_blob_service.get_blob_client(container_name, blob_name) @@ -26,7 +31,7 @@ def azure_upload_to_blob( if content_type == "application/xml": content_settings.content_encoding = "UTF-8" - blob_client.upload_blob(content, overwrite=True, content_settings=content_settings) + return blob_client.upload_blob(content, overwrite=True, content_settings=content_settings) def create_azure_blob_containers(context: dict): @@ -42,7 +47,16 @@ def create_azure_blob_containers(context: dict): if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"] not in container_names: blob_service.create_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"]) except Exception as e: - context["logger"].info("Could not create Azure blob storage containers: {}".format(e)) + context["logger"].error( + "Could not create Azure blob storage containers." + "XML container name: {}. " + "ZIP container name: {}. 
" + "Error details: {}".format( + context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"], + context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"], + e, + ) + ) raise e finally: blob_service.close() @@ -57,10 +71,11 @@ def delete_azure_blob_containers(context: dict): try: if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"] in container_names: blob_service.delete_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"]) + container_names.remove(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"]) if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"] in container_names: blob_service.delete_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"]) except Exception as e: - context["logger"].info("Could not delete Azure blob storage container: {}".format(e)) + context["logger"].error("Could not delete Azure blob storage container: {}".format(e)) raise e finally: blob_service.close() @@ -77,7 +92,7 @@ def delete_azure_iati_blob(context: dict, blob_service_client: BlobServiceClient blob_client.delete_blob() except azure.core.exceptions.ResourceNotFoundError as e: - context["logger"].warning( + context["logger"].error( "dataset id: {} - Problem deleting blob that was " "expected to exist: {}".format(dataset["id"], e).replace("\n", "") ) @@ -102,3 +117,18 @@ def get_azure_blob_public_url(context: dict, dataset: dict, iati_blob_type: str) blob_name_for_url, get_azure_blob_name(dataset, iati_blob_type), ) + + +def upload_zip_to_azure(context: dict, zip_local_pathname: str, zip_azure_filename: str): + az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"]) + + blob_client = az_blob_service.get_blob_client( + context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"], zip_azure_filename + ) + + content_settings = ContentSettings(content_type="zip") + + with open(zip_local_pathname, "rb") as data: + blob_client.upload_blob(data, overwrite=True, content_settings=content_settings) + + az_blob_service.close() diff --git a/src/utilities/http.py b/src/utilities/http.py index 2cbf9b5..d6afadf 100644 --- a/src/utilities/http.py +++ b/src/utilities/http.py @@ -1,10 +1,22 @@ -from typing import Any +import datetime +from typing import Any, Optional import requests from requests.adapters import HTTPAdapter from urllib3.util import Retry +def parse_last_modified_header(last_modified_header: str) -> Optional[datetime.datetime]: + last_modified_header_parsed = None + try: + last_modified_header_parsed = datetime.datetime.strptime( + last_modified_header, "%a, %d %b %Y %H:%M:%S %Z" + ).replace(tzinfo=datetime.timezone.utc) + except ValueError: + pass + return last_modified_header_parsed + + def get_requests_session() -> requests.Session: session = requests.Session() session.headers.update({"User-Agent": "IATI Bulk Data Service 0.1"}) @@ -28,7 +40,7 @@ def http_get_json(session: requests.Session, url: str, timeout: int = 30, except def http_head_dataset(session: requests.Session, url: str, timeout: int = 10, retries: int = 2) -> requests.Response: - response = session.head(url=url, timeout=timeout, allow_redirects=True) + response = session.head(url=url, timeout=timeout, allow_redirects=True, verify=False) if response.status_code != 200: raise RuntimeError( @@ -49,7 +61,7 @@ def http_download_dataset( session: requests.Session, url: str, timeout: int = 25, retries: int = 2 ) -> requests.Response: - response = session.get(url=url, timeout=timeout, allow_redirects=True) + response = session.get(url=url, timeout=timeout, allow_redirects=True, verify=False) 
if response.status_code != 200: raise RuntimeError( diff --git a/src/utilities/logging.py b/src/utilities/logging.py index 095af4f..b63adb8 100644 --- a/src/utilities/logging.py +++ b/src/utilities/logging.py @@ -20,9 +20,4 @@ def initialise_logging(config: dict) -> logging.Logger: bds_logger.addHandler(handler_stdout) - if config["LOGFILE"] != "": - handler_file = logging.FileHandler(config["LOGFILE"]) - handler_file.setFormatter(formatter) - bds_logger.addHandler(handler_file) - return bds_logger diff --git a/src/utilities/misc.py b/src/utilities/misc.py index b9a7642..d1d9275 100644 --- a/src/utilities/misc.py +++ b/src/utilities/misc.py @@ -1,4 +1,5 @@ import datetime +import glob import hashlib import io import re @@ -35,6 +36,16 @@ def get_timestamp_as_str(isodate: str = "") -> str: return datetime.datetime.now(tz=datetime.timezone.utc).isoformat() +def get_timestamp_as_str_z(isodate: str = "") -> str: + dt = ( + datetime.datetime.fromisoformat(isodate).astimezone() + if isodate != "" + else datetime.datetime.now(tz=datetime.timezone.utc) + ) + dt = dt.replace(microsecond=0) + return dt.isoformat().replace("+00:00", "Z") + + def set_timestamp_tz_utc(date: datetime.datetime) -> datetime.datetime: return date.replace(tzinfo=datetime.timezone.utc) @@ -47,3 +58,37 @@ def zip_data_as_single_file(filename: str, data: str) -> bytes: xml_zipped.writestr(filename, data) return zip_buffer.getvalue() + + +def get_number_xml_files_in_dir(dir_name): + return len(glob.glob("**/*.xml", root_dir=dir_name, recursive=True)) + + +def filter_dict_by_structure(source: dict, structure_to_retain: dict) -> dict: + filtered_dict = {} + + for key, value in structure_to_retain.items(): + if key not in source: + continue + + if structure_to_retain[key] is None: + filtered_dict[key] = source[key] + + elif isinstance(structure_to_retain[key], dict) and not isinstance(source[key], dict): + filtered_dict[key] = source[key] + + elif isinstance(structure_to_retain[key], dict) and isinstance(source[key], dict): + filtered_dict[key] = filter_dict_by_structure(source[key], structure_to_retain[key]) + + elif isinstance(structure_to_retain[key], list) and not isinstance(source[key], list): + filtered_dict[key] = source[key] + + elif isinstance(structure_to_retain[key], list) and isinstance(source[key], list): + filtered_dict[key] = [ + filter_dict_by_structure( + item, structure_to_retain[key][0] if len(structure_to_retain[key]) == 1 else {} + ) + for item in source[key] + ] + + return filtered_dict From bbdabd6a6b86be2146a7fa3f327a954cfdbd7cb7 Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:24:46 +0100 Subject: [PATCH 10/12] feat: store CKAN metadata * Database migrations * Download CKAN publisher, dataset metadata * Save to DB * Redone indexing code to avoid including legacy metadata in JSON index --- db-migrations/20240827_01_pVOLG.rollback.sql | 10 +++ db-migrations/20240827_01_pVOLG.sql | 23 ++++++ src/bulk_data_service/dataset_indexing.py | 72 ++++++++++++++--- src/bulk_data_service/dataset_updater.py | 48 +++++++++-- .../iati_registry_ckan.py | 79 +++++++++++++++++-- src/utilities/db.py | 5 +- 6 files changed, 209 insertions(+), 28 deletions(-) create mode 100644 db-migrations/20240827_01_pVOLG.rollback.sql create mode 100644 db-migrations/20240827_01_pVOLG.sql diff --git a/db-migrations/20240827_01_pVOLG.rollback.sql b/db-migrations/20240827_01_pVOLG.rollback.sql new file mode 100644 index 0000000..1cabf1b --- /dev/null +++ 
b/db-migrations/20240827_01_pVOLG.rollback.sql @@ -0,0 +1,10 @@ +-- +-- depends: 20240531_01_iY5Qa +ALTER TABLE + iati_datasets DROP COLUMN IF EXISTS registration_service_dataset_metadata; + +ALTER TABLE + iati_datasets DROP COLUMN IF EXISTS registration_service_publisher_metadata; + +ALTER TABLE + iati_datasets DROP COLUMN IF EXISTS registration_service_name; \ No newline at end of file diff --git a/db-migrations/20240827_01_pVOLG.sql b/db-migrations/20240827_01_pVOLG.sql new file mode 100644 index 0000000..3d4c0b1 --- /dev/null +++ b/db-migrations/20240827_01_pVOLG.sql @@ -0,0 +1,23 @@ +-- +-- depends: 20240531_01_iY5Qa +-- +ALTER TABLE + iati_datasets +ADD + registration_service_dataset_metadata VARCHAR; + +COMMENT ON COLUMN iati_datasets.registration_service_dataset_metadata IS 'the original dataset metadata record from the data registration service'; + +ALTER TABLE + iati_datasets +ADD + registration_service_publisher_metadata VARCHAR; + +COMMENT ON COLUMN iati_datasets.registration_service_publisher_metadata IS 'the original publisher metadata record from the data registration service'; + +ALTER TABLE + iati_datasets +ADD + registration_service_name VARCHAR; + +COMMENT ON COLUMN iati_datasets.registration_service_name IS 'the name of the data registration service'; \ No newline at end of file diff --git a/src/bulk_data_service/dataset_indexing.py b/src/bulk_data_service/dataset_indexing.py index 06f20a5..6894570 100644 --- a/src/bulk_data_service/dataset_indexing.py +++ b/src/bulk_data_service/dataset_indexing.py @@ -10,6 +10,8 @@ def create_and_upload_indices(context: dict, datasets_in_bds: dict[uuid.UUID, dict]): + context["logger"].info("Creating indices") + minimal_index = create_index_json(context, datasets_in_bds, "minimal") full_index = create_index_json(context, datasets_in_bds, "full") @@ -18,6 +20,8 @@ def create_and_upload_indices(context: dict, datasets_in_bds: dict[uuid.UUID, di upload_index_json_to_azure(context, get_index_name(context, "full"), full_index) + context["logger"].info("Creation of indices finished") + def upload_index_json_to_azure(context: dict, index_name: str, index_json: str): @@ -44,24 +48,24 @@ def create_index_json(context: dict, datasets_in_bds: dict[uuid.UUID, dict], ind def get_dataset_index(context: dict, datasets_in_bds: dict[uuid.UUID, dict], index_type: str) -> dict: - return {v["name"]: get_index_item(context, v, index_type) for _, v in datasets_in_bds.items()} + return {v["name"]: get_index_entry(context, v, index_type) for _, v in datasets_in_bds.items()} -def get_index_item(context: dict, dataset: dict, index_type: str) -> dict[str, Any]: +def get_index_entry(context: dict, dataset: dict, index_type: str) -> dict[str, Any]: if index_type == "minimal": - dataset_index = {k: v for k, v in dataset.items() if k in get_minimal_index_fields(context)} + dataset_index_entry = get_minimal_index_entry_from_dataset(context, dataset) else: - dataset_index = {k: v for k, v in dataset.items()} + dataset_index_entry = get_full_index_entry_from_dataset(context, dataset) - dataset_index["url_xml"] = "" - dataset_index["url_zip"] = "" + dataset_index_entry["url_xml"] = "" + dataset_index_entry["url_zip"] = "" - if dataset_index["last_successful_download"] is not None: - dataset_index["url_xml"] = get_azure_blob_public_url(context, dataset, "xml") - dataset_index["url_zip"] = get_azure_blob_public_url(context, dataset, "zip") + if dataset_index_entry["last_successful_download"] is not None: + dataset_index_entry["url_xml"] = 
get_azure_blob_public_url(context, dataset, "xml") + dataset_index_entry["url_zip"] = get_azure_blob_public_url(context, dataset, "zip") - return dataset_index + return dataset_index_entry def get_index_name(context: dict, index_type: str) -> str: @@ -71,7 +75,53 @@ def get_index_name(context: dict, index_type: str) -> str: return "dataset-index-{}.json".format(index_type) -def get_minimal_index_fields(context: dict) -> list: +def get_minimal_index_entry_from_dataset(context: dict, dataset: dict) -> dict: + return {k: v for k, v in dataset.items() if k in get_minimal_index_fields(context)} + + +def get_full_index_entry_from_dataset(context: dict, dataset: dict) -> dict: + full_index_entry = {k: v for k, v in dataset.items() if k in get_full_index_fields(context)} + + field_from_json_str_to_object(full_index_entry, "download_error_message", "download_error_details") + + field_from_json_str_to_object(full_index_entry, "head_error_message", "head_error_details") + + return full_index_entry + + +def field_from_json_str_to_object(entry: dict, source_field: str, dest_field: str): + entry[dest_field] = json.loads( + entry[source_field] if entry[source_field] is not None and entry[source_field] != "" else "{}" + ) + del entry[source_field] + + +def get_full_index_fields(context: dict) -> list[str]: + return [ + "id", + "name", + "publisher_name", + "type", + "source_url", + "hash", + "hash_excluding_generated_timestamp", + "last_update_check", + "last_head_attempt", + "last_head_http_status", + "head_error_message", + "last_download_attempt", + "last_download_http_status", + "last_successful_download", + "last_verified_on_server", + "download_error_message", + "content_modified", + "content_modified_excluding_generated_timestamp", + "server_header_last_modified", + "server_header_etag", + ] + + +def get_minimal_index_fields(context: dict) -> list[str]: return [ "id", "name", diff --git a/src/bulk_data_service/dataset_updater.py b/src/bulk_data_service/dataset_updater.py index d953299..56dd26d 100644 --- a/src/bulk_data_service/dataset_updater.py +++ b/src/bulk_data_service/dataset_updater.py @@ -1,6 +1,7 @@ import concurrent.futures -import uuid import json +import traceback +import uuid from datetime import datetime, timedelta from itertools import batched from random import random @@ -9,9 +10,9 @@ import requests from azure.storage.blob import BlobServiceClient -from utilities.azure import azure_upload_to_blob +from utilities.azure import azure_blob_exists, azure_upload_to_blob from utilities.db import get_db_connection, insert_or_update_dataset -from utilities.http import get_requests_session, http_download_dataset, http_head_dataset +from utilities.http import get_requests_session, http_download_dataset, http_head_dataset, parse_last_modified_header from utilities.misc import ( get_hash, get_hash_excluding_generated_timestamp, @@ -227,8 +228,10 @@ def check_dataset_etag_last_mod_header( except Exception as e: context["logger"].warning( - "dataset id: {} - EXCEPTION with HEAD request, details: " "{}".format(bds_dataset["id"], e) + "dataset id: {} - EXCEPTION with HEAD request, details: {}".format(bds_dataset["id"], e) ) + if "{}".format(e) == "str.replace() takes no keyword arguments": + context["logger"].error("Full traceback: " "{}".format(traceback.format_exc())) return attempt_download @@ -252,7 +255,7 @@ def download_and_save_dataset( else: iati_xml_zipped = zip_data_as_single_file(bds_dataset["name"] + ".xml", download_response.text) - azure_upload_to_blob( + response_xml = 
azure_upload_to_blob( az_blob_service, context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"], "{}/{}.xml".format(bds_dataset["publisher_name"], bds_dataset["name"]), @@ -260,7 +263,11 @@ "application/xml", ) - azure_upload_to_blob( + context["logger"].debug( + "dataset id: {} - Azure XML upload response: {}".format(bds_dataset["id"], response_xml) + ) + + response_zip = azure_upload_to_blob( az_blob_service, context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"], "{}/{}.zip".format(bds_dataset["publisher_name"], bds_dataset["name"]), @@ -268,6 +275,20 @@ "application/zip", ) + if not azure_blob_exists( + az_blob_service, + context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"], + "{}/{}.xml".format(bds_dataset["publisher_name"], bds_dataset["name"]), + ): + context["logger"].error("dataset id: {} - Azure XML upload failed".format(bds_dataset["id"])) + context["logger"].debug( + "dataset id: {} - Azure ZIP upload response: {}".format(bds_dataset["id"], response_zip) + ) + + last_modified_header = None + if download_response.headers.get("Last-Modified", None) is not None: + last_modified_header = parse_last_modified_header(download_response.headers.get("Last-Modified", "")) + bds_dataset.update( { "hash": hash, @@ -280,7 +301,7 @@ "download_error_message": None, "content_modified": None, "content_modified_excluding_generated_timestamp": None, - "server_header_last_modified": download_response.headers.get("Last-Modified", None), + "server_header_last_modified": last_modified_header, "server_header_etag": download_response.headers.get("ETag", None), } ) @@ -315,9 +336,20 @@ "content_modified_excluding_generated_timestamp": None, "server_header_last_modified": None, "server_header_etag": None, + "registration_service_dataset_metadata": registered_dataset["registration_service_dataset_metadata"], + "registration_service_publisher_metadata": registered_dataset["registration_service_publisher_metadata"], + "registration_service_name": registered_dataset["registration_service_name"], } def update_bds_dataset_registration_info(bds_dataset: dict, registered_dataset: dict): - for field in ["publisher_id", "publisher_name", "type", "source_url"]: + for field in [ + "publisher_id", + "publisher_name", + "type", + "source_url", + "registration_service_dataset_metadata", + "registration_service_publisher_metadata", + "registration_service_name", + ]: bds_dataset[field] = registered_dataset[field] diff --git a/src/dataset_registration/iati_registry_ckan.py index ea5e103..5ebb28f 100644 --- a/src/dataset_registration/iati_registry_ckan.py +++ b/src/dataset_registration/iati_registry_ckan.py @@ -1,11 +1,17 @@ +import json import random import uuid +from datetime import datetime, timedelta from logging import Logger from typing import Any import requests from utilities.http import http_get_json +from utilities.misc import get_timestamp + +PUBLISHER_METADATA: dict[str, Any] = {} +PUBLISHER_METADATA_LAST_UPDATE: datetime = get_timestamp() - timedelta(hours=25) def fetch_datasets_metadata(context: dict, session: requests.Session) -> dict[uuid.UUID, dict]: @@ -16,6 +22,8 @@ def fetch_datasets_metadata(context: dict, session: requests.Session) -> dict[uu cleaned_datasets_metadata = clean_datasets_metadata(context["logger"], datasets_list_from_registry) + add_publisher_metadata(context, session, cleaned_datasets_metadata) + datasets_metadata =
convert_datasets_metadata(cleaned_datasets_metadata) return datasets_metadata @@ -47,6 +55,52 @@ def fetch_datasets_metadata_from_iati_registry(context: dict, session: requests. return datasets_metadata +def add_publisher_metadata(context: dict, session: requests.Session, datasets_from_registry: list[dict[str, Any]]): + for registry_dataset in datasets_from_registry: + publisher_metadata = "" + if "organization" in registry_dataset and "name" in registry_dataset["organization"]: + publisher_metadata = get_publisher_metadata_as_str( + context, session, registry_dataset["organization"]["name"] + ) + registry_dataset["registration_service_publisher_metadata"] = publisher_metadata + + +def get_publisher_metadata_as_str(context: dict, session: requests.Session, publisher_name: str) -> str: + global PUBLISHER_METADATA, PUBLISHER_METADATA_LAST_UPDATE + + if PUBLISHER_METADATA_LAST_UPDATE < get_timestamp() - timedelta( + hours=int(context["DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS"]) + ): + context["logger"].info( + "Refreshing publisher metadata from IATI Registry (CKAN) " + "after {} hours...".format(context["DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS"]) + ) + update_publisher_metadata_cache(context, session) + PUBLISHER_METADATA_LAST_UPDATE = get_timestamp() + + return PUBLISHER_METADATA[publisher_name] if publisher_name in PUBLISHER_METADATA else "{}" + + +def update_publisher_metadata_cache(context: dict, session: requests.Session): + global PUBLISHER_METADATA, PUBLISHER_METADATA_LAST_UPDATE + + url = context["DATA_REGISTRY_PUBLISHER_METADATA_URL"] + + PUBLISHER_METADATA = {} + + try: + publishers_metadata = http_get_json(session, url, 60, True) + + PUBLISHER_METADATA = {publisher["name"]: json.dumps(publisher) for publisher in publishers_metadata["result"]} + + except RuntimeError as e: + context["logger"].error("HTTP error when fetching publisher metadata from IATI Registry (CKAN): {}".format(e)) + except Exception as e: + context["logger"].error( + "Unexpected error when fetching publisher metadata from IATI Registry (CKAN): {}".format(e) + ) + + def clean_datasets_metadata(logger: Logger, datasets_from_registry: list[dict[str, Any]]) -> list[dict[str, Any]]: """Cleans the list of datasets from the IATI Registry @@ -113,15 +167,24 @@ def ckan_field_is_valid(value: Any) -> bool: def convert_datasets_metadata(datasets_from_registry: list[dict]) -> dict[uuid.UUID, dict]: registered_datasets = { - uuid.UUID(k["id"]): { - "id": uuid.UUID(k["id"]), - "name": k["name"], - "publisher_id": uuid.UUID(k["organization"]["id"]), - "publisher_name": k["organization"]["name"], - "source_url": get_source_url(k), - "type": list(filter(lambda x: x["key"] == "filetype", k["extras"]))[0]["value"], + uuid.UUID(dataset["id"]): { + "id": uuid.UUID(dataset["id"]), + "name": dataset["name"], + "publisher_id": uuid.UUID(dataset["organization"]["id"]), + "publisher_name": dataset["organization"]["name"], + "source_url": get_source_url(dataset), + "type": list(filter(lambda x: x["key"] == "filetype", dataset["extras"]))[0]["value"], + "registration_service_dataset_metadata": json.dumps( + {k: dataset[k] for k in dataset if k != "registration_service_publisher_metadata"} + ), + "registration_service_publisher_metadata": ( + dataset["registration_service_publisher_metadata"] + if "registration_service_publisher_metadata" in dataset + else "" + ), + "registration_service_name": "ckan-registry", } - for k in datasets_from_registry + for dataset in datasets_from_registry } return registered_datasets 
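[Editor's note: the sketch below is illustrative only and is not part of the patch.] The checker now keeps three extra fields per dataset: the raw CKAN package record, the raw CKAN publisher record, and the name of the registration service. A minimal sketch of the record shape built by convert_datasets_metadata() for one hypothetical CKAN package; the value shown for source_url is an assumption, since get_source_url() is defined elsewhere in this module:

    import json
    import uuid

    # Hypothetical CKAN package, trimmed to the fields the conversion reads.
    ckan_package = {
        "id": "0f045d6a-36a4-46e0-8d8f-27b4b00b5c06",
        "name": "test_org_a-dataset-001",
        "organization": {"id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f", "name": "test_org_a"},
        "extras": [{"key": "filetype", "value": "activity"}],
        "registration_service_publisher_metadata": '{"id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f", "name": "test_org_a"}',
    }

    # Shape of one value in the dict (keyed by dataset UUID) that
    # convert_datasets_metadata() returns.
    bds_record = {
        "id": uuid.UUID(ckan_package["id"]),
        "name": ckan_package["name"],
        "publisher_id": uuid.UUID(ckan_package["organization"]["id"]),
        "publisher_name": ckan_package["organization"]["name"],
        "source_url": "https://example.org/activities.xml",  # assumed result of get_source_url()
        "type": "activity",  # the "filetype" entry from "extras"
        # raw CKAN package, minus the publisher metadata attached by add_publisher_metadata()
        "registration_service_dataset_metadata": json.dumps(
            {k: v for k, v in ckan_package.items() if k != "registration_service_publisher_metadata"}
        ),
        # raw CKAN organization record, served from the module-level publisher metadata cache
        "registration_service_publisher_metadata": ckan_package["registration_service_publisher_metadata"],
        "registration_service_name": "ckan-registry",
    }

    print(json.dumps({k: str(v) for k, v in bds_record.items()}, indent=2))

This record then flows into insert_or_update_dataset() via the extra columns added to the UPDATE statement in src/utilities/db.py below.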
diff --git a/src/utilities/db.py b/src/utilities/db.py index 6e2d55a..fc3f75d 100644 --- a/src/utilities/db.py +++ b/src/utilities/db.py @@ -74,7 +74,10 @@ def insert_or_update_dataset(connection: psycopg.Connection, data): content_modified_excluding_generated_timestamp = %(content_modified_excluding_generated_timestamp)s, server_header_last_modified = %(server_header_last_modified)s, - server_header_etag = %(server_header_etag)s + server_header_etag = %(server_header_etag)s, + registration_service_dataset_metadata = %(registration_service_dataset_metadata)s, + registration_service_publisher_metadata = %(registration_service_publisher_metadata)s, + registration_service_name = %(registration_service_name)s WHERE iati_datasets.id = %(id)s """.format( From aa188d583218813b05bdd4a72ff6691a5edaa81a Mon Sep 17 00:00:00 2001 From: Simon K <6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:25:57 +0100 Subject: [PATCH 11/12] feat: second ZIP file format * Added code to make outputting of new ZIP file formats easier * Implemented a second ZIP format, which is stripped down version of legacy Data Dump --- src/bulk_data_service/zipper.py | 108 +++++++-------- src/bulk_data_service/zippers.py | 230 +++++++++++++++++++++++++++++++ 2 files changed, 278 insertions(+), 60 deletions(-) create mode 100644 src/bulk_data_service/zippers.py diff --git a/src/bulk_data_service/zipper.py b/src/bulk_data_service/zipper.py index 7b2b2ea..ed45d0e 100644 --- a/src/bulk_data_service/zipper.py +++ b/src/bulk_data_service/zipper.py @@ -4,10 +4,11 @@ import time import uuid -from azure.storage.blob import BlobServiceClient, ContentSettings from azure.core.exceptions import ResourceNotFoundError +from azure.storage.blob import BlobServiceClient from bulk_data_service.dataset_indexing import get_index_name +from bulk_data_service.zippers import CodeforIATILegacyZipper, IATIBulkDataServiceZipper from utilities.azure import azure_download_blob, get_azure_blob_name, get_azure_container_name from utilities.db import get_datasets_in_bds @@ -22,59 +23,75 @@ def zipper(context: dict): zipper_service_loop(context, {}, datasets) -def zipper_service_loop(context: dict, datasets_in_zip: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict]): +def zipper_service_loop( + context: dict, datasets_in_working_dir: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict] +): while True: - zipper_run(context, datasets_in_zip, datasets_in_bds) + zipper_run(context, datasets_in_working_dir, datasets_in_bds) time.sleep(60 * 30) -def zipper_run(context: dict, datasets_in_zip: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict]): +def zipper_run(context: dict, datasets_in_working_dir: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict]): run_start = datetime.datetime.now(datetime.UTC) - context["logger"].info("Zipper starting run") + context["logger"].info("Zipper run starting") + + setup_working_dir_with_downloaded_datasets(context, datasets_in_working_dir, datasets_in_bds) + + zip_creators = [ + IATIBulkDataServiceZipper( + context, "{}-1".format(context["ZIP_WORKING_DIR"]), datasets_in_working_dir, datasets_in_bds + ), + CodeforIATILegacyZipper( + context, "{}-2".format(context["ZIP_WORKING_DIR"]), datasets_in_working_dir, datasets_in_bds + ), + ] + + for zip_creator in zip_creators: + + if os.path.exists(zip_creator.zip_working_dir): + shutil.rmtree(zip_creator.zip_working_dir) + shutil.copytree(context["ZIP_WORKING_DIR"], zip_creator.zip_working_dir) + + zip_creator.prepare() + + 
zip_creator.zip() + + zip_creator.upload() + + run_end = datetime.datetime.now(datetime.UTC) + context["logger"].info("Zipper run finished in {}.".format(run_end - run_start)) - clean_working_dir(context, datasets_in_zip) + +def setup_working_dir_with_downloaded_datasets( + context: dict, datasets_in_working_dir: dict[uuid.UUID, dict], datasets_in_bds: dict[uuid.UUID, dict] +): + + clean_working_dir(context, datasets_in_working_dir) datasets_with_downloads = {k: v for k, v in datasets_in_bds.items() if v["last_successful_download"] is not None} - remove_datasets_without_dls_from_working_dir(context, datasets_in_zip, datasets_with_downloads) + remove_datasets_without_dls_from_working_dir(context, datasets_in_working_dir, datasets_with_downloads) new_or_updated_datasets = { k: v for k, v in datasets_with_downloads.items() - if k not in datasets_in_zip or datasets_in_zip[k]["hash"] != datasets_with_downloads[k]["hash"] + if k not in datasets_in_working_dir or datasets_in_working_dir[k]["hash"] != datasets_with_downloads[k]["hash"] } context["logger"].info( - "Found {} datasets to ZIP. {} are new or updated and will be (re-)downloaded.".format( + "Found {} datasets with downloads. " + "{} are new or updated and will be (re-)downloaded.".format( len(datasets_with_downloads), len(new_or_updated_datasets) ) ) download_new_or_updated_to_working_dir(context, new_or_updated_datasets) - download_indices_to_working_dir(context) - - context["logger"].info("Zipping {} datasets.".format(len(datasets_with_downloads))) - shutil.make_archive( - get_big_zip_local_pathname_no_extension(context), - "zip", - root_dir=context["ZIP_WORKING_DIR"], - base_dir="iati-data", - ) - - context["logger"].info("Uploading zipped datasets.") - upload_zip_to_azure(context) - - run_end = datetime.datetime.now(datetime.UTC) - context["logger"].info( - "Zipper finished in {}. 
Datasets zipped: {}.".format(run_end - run_start, len(datasets_with_downloads)) - ) - - datasets_in_zip.clear() - datasets_in_zip.update(datasets_with_downloads) + datasets_in_working_dir.clear() + datasets_in_working_dir.update(datasets_with_downloads) def clean_working_dir(context: dict, datasets_in_zip: dict[uuid.UUID, dict]): @@ -92,21 +109,6 @@ def remove_datasets_without_dls_from_working_dir( delete_local_xml_from_zip_working_dir(context, dataset) -def upload_zip_to_azure(context: dict): - az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"]) - - blob_client = az_blob_service.get_blob_client( - context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"], get_big_zip_full_filename(context) - ) - - content_settings = ContentSettings(content_type="zip") - - with open(get_big_zip_local_pathname(context), "rb") as data: - blob_client.upload_blob(data, overwrite=True, content_settings=content_settings) - - az_blob_service.close() - - def download_indices_to_working_dir(context: dict): az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"]) @@ -134,6 +136,8 @@ def download_new_or_updated_to_working_dir(context: dict, updated_datasets: dict xml_container_name = get_azure_container_name(context, "xml") + os.makedirs("{}/iati-data/datasets".format(context["ZIP_WORKING_DIR"]), exist_ok=True) + for dataset in updated_datasets.values(): filename = get_local_pathname_dataset_xml(context, dataset) @@ -151,22 +155,6 @@ def download_new_or_updated_to_working_dir(context: dict, updated_datasets: dict az_blob_service.close() -def get_big_zip_local_pathname_no_extension(context: dict) -> str: - return "{}/{}".format(context["ZIP_WORKING_DIR"], get_big_zip_base_filename(context)) - - -def get_big_zip_local_pathname(context: dict) -> str: - return "{}/{}".format(context["ZIP_WORKING_DIR"], get_big_zip_full_filename(context)) - - -def get_big_zip_base_filename(context: dict) -> str: - return "iati-data" - - -def get_big_zip_full_filename(context: dict) -> str: - return "{}.zip".format(get_big_zip_base_filename(context)) - - def get_local_pathname_dataset_xml(context: dict, dataset: dict) -> str: return "{}/iati-data/datasets/{}".format(context["ZIP_WORKING_DIR"], get_azure_blob_name(dataset, "xml")) diff --git a/src/bulk_data_service/zippers.py b/src/bulk_data_service/zippers.py new file mode 100644 index 0000000..95d9a8a --- /dev/null +++ b/src/bulk_data_service/zippers.py @@ -0,0 +1,230 @@ +import json +import os +import shutil +import uuid +from abc import ABC, abstractmethod + +from azure.storage.blob import BlobServiceClient + +from bulk_data_service.dataset_indexing import get_index_name +from utilities.azure import azure_download_blob, get_azure_container_name, upload_zip_to_azure +from utilities.misc import filter_dict_by_structure, get_number_xml_files_in_dir, get_timestamp_as_str_z + + +class IATIDataZipper(ABC): + + def __init__( + self, + context, + zip_working_dir: str, + datasets_in_working_dir: dict[uuid.UUID, dict], + datasets_in_bds: dict[uuid.UUID, dict], + ): + self.context = context + self.datasets_in_working_dir = datasets_in_working_dir + self.datasets_in_bds = datasets_in_bds + self.zip_working_dir = zip_working_dir + + @abstractmethod + def prepare(self): + pass + + @property + @abstractmethod + def zip_type(self) -> str: + pass + + @property + def zip_internal_directory_name(self) -> str: + return "iati-data" + + @property + def zip_local_filename_no_extension(self) -> str: + return 
"iati-data" + + def zip(self): + self.context["logger"].info("Zipping {} datasets.".format(get_number_xml_files_in_dir(self.zip_working_dir))) + shutil.make_archive( + self.get_zip_local_pathname_no_extension(), + "zip", + root_dir=self.zip_working_dir, + base_dir=self.zip_internal_directory_name, + ) + + def upload(self): + self.context["logger"].info( + "Uploading {} ZIP to Azure with filename: {}.".format(self.zip_type, self.get_zip_local_filename()) + ) + upload_zip_to_azure(self.context, self.get_zip_local_pathname(), self.get_zip_local_filename()) + + def get_zip_local_filename(self) -> str: + return "{}.zip".format(self.zip_local_filename_no_extension) + + def get_zip_local_pathname(self) -> str: + return "{}/{}".format(self.zip_working_dir, self.get_zip_local_filename()) + + def get_zip_local_pathname_no_extension(self) -> str: + return "{}/{}".format(self.zip_working_dir, self.zip_local_filename_no_extension) + + +class IATIBulkDataServiceZipper(IATIDataZipper): + + def prepare(self): + az_blob_service = BlobServiceClient.from_connection_string(self.context["AZURE_STORAGE_CONNECTION_STRING"]) + + self.download_index_to_working_dir(az_blob_service, "minimal") + + self.download_index_to_working_dir(az_blob_service, "full") + + az_blob_service.close() + + @property + def zip_type(self) -> str: + return "Bulk Data Service" + + def download_index_to_working_dir(self, az_blob_service: BlobServiceClient, index_type: str): + + index_filename = get_index_name(self.context, index_type) + + index_full_pathname = "{}/{}/{}".format(self.zip_working_dir, self.zip_internal_directory_name, index_filename) + + os.makedirs(os.path.dirname(index_full_pathname), exist_ok=True) + + azure_download_blob( + az_blob_service, get_azure_container_name(self.context, "xml"), index_filename, index_full_pathname + ) + + +class CodeforIATILegacyZipper(IATIDataZipper): + + def prepare(self): + # rename 'iati-data' directory to 'iati-data-main' + if os.path.exists(os.path.join(self.zip_working_dir, super().zip_internal_directory_name)): + os.rename( + os.path.join(self.zip_working_dir, super().zip_internal_directory_name), + os.path.join(self.zip_working_dir, self.zip_internal_directory_name), + ) + + # rename 'datasets' directory to 'data' + if os.path.exists(os.path.join(self.zip_working_dir, self.zip_internal_directory_name, "datasets")): + os.rename( + os.path.join(self.zip_working_dir, self.zip_internal_directory_name, "datasets"), + os.path.join(self.zip_working_dir, self.zip_internal_directory_name, "data"), + ) + + self.create_empty_files_for_non_downloadable_datasets() + + if not os.path.exists(os.path.join(self.zip_working_dir, self.zip_internal_directory_name, "metadata")): + os.makedirs(os.path.join(self.zip_working_dir, self.zip_internal_directory_name, "metadata")) + + self.write_publisher_metadata_files() + + self.write_transformed_dataset_metadata_files() + + # create root 'metadata.json' file with created timestsamp + self.create_zip_file_metadata() + + def create_empty_files_for_non_downloadable_datasets(self): + for dataset_in_bds_db in self.datasets_in_bds: + dataset = self.datasets_in_bds[dataset_in_bds_db] + dataset_pathname = self.get_dataset_data_pathname(self.datasets_in_bds[dataset_in_bds_db]) + dataset_filename = self.get_dataset_data_filename(self.datasets_in_bds[dataset_in_bds_db]) + if dataset["last_successful_download"] is None: + if not os.path.exists(dataset_pathname): + os.makedirs(dataset_pathname, exist_ok=True) + if not os.path.exists(dataset_filename): + open(dataset_filename, 
"w").close() + + def write_publisher_metadata_files(self): + for dataset_in_bds_db in self.datasets_in_bds: + publisher_metadata_filename = self.get_publisher_metadata_filename( + self.datasets_in_bds[dataset_in_bds_db]["publisher_name"] + ) + if not os.path.exists(publisher_metadata_filename): + with open(publisher_metadata_filename, "w") as pub_file: + pub_file.write( + self.filter_publisher_metadata( + self.datasets_in_bds[dataset_in_bds_db]["registration_service_publisher_metadata"] + ) + ) + + def get_publisher_metadata_filename(self, publisher_name): + return os.path.join( + self.zip_working_dir, self.zip_internal_directory_name, "metadata", f"{publisher_name}.json" + ) + + def filter_publisher_metadata(self, ckan_publisher_metadata: str) -> str: + desired_structure = { + "id": None, + "name": None, + } + + filtered_dict = filter_dict_by_structure(json.loads(ckan_publisher_metadata), desired_structure) + + return json.dumps(filtered_dict) + + def write_transformed_dataset_metadata_files(self): + for dataset_in_bds_db in self.datasets_in_bds: + dataset_metadata_pathname = self.get_dataset_metadata_pathname(self.datasets_in_bds[dataset_in_bds_db]) + dataset_metadata_filename = self.get_dataset_metadata_filename(self.datasets_in_bds[dataset_in_bds_db]) + if not os.path.exists(dataset_metadata_pathname): + os.makedirs(dataset_metadata_pathname, exist_ok=True) + if not os.path.exists(dataset_metadata_filename): + with open(dataset_metadata_filename, "w") as pub_file: + pub_file.write( + self.filter_dataset_metadata_file( + self.datasets_in_bds[dataset_in_bds_db]["registration_service_dataset_metadata"] + ) + ) + + def filter_dataset_metadata_file(self, ckan_dataset_metadata: str) -> str: + desired_structure = { + "id": None, + "license_id": None, + "license_title": None, + "name": None, + "organization": {"id": None, "name": None}, + "resources": [{"url": None}], + } + empty_array_entries = ["extras", "tags", "groups", "users"] + + filtered_dict = filter_dict_by_structure(json.loads(ckan_dataset_metadata), desired_structure) + + for empty_array_entry in empty_array_entries: + filtered_dict[empty_array_entry] = [] + + return json.dumps(filtered_dict) + + def get_dataset_data_pathname(self, dataset_in_bds): + return os.path.join( + self.zip_working_dir, self.zip_internal_directory_name, "data", f"{dataset_in_bds['publisher_name']}" + ) + + def get_dataset_data_filename(self, dataset_in_bds): + return os.path.join(self.get_dataset_data_pathname(dataset_in_bds), f"{dataset_in_bds['name']}.xml") + + def get_dataset_metadata_pathname(self, dataset_in_bds): + return os.path.join( + self.zip_working_dir, self.zip_internal_directory_name, "metadata", f"{dataset_in_bds['publisher_name']}" + ) + + def get_dataset_metadata_filename(self, dataset_in_bds): + return os.path.join(self.get_dataset_metadata_pathname(dataset_in_bds), f"{dataset_in_bds['name']}.json") + + @property + def zip_type(self) -> str: + return "Code for IATI" + + @property + def zip_internal_directory_name(self) -> str: + return "iati-data-main" + + @property + def zip_local_filename_no_extension(self) -> str: + return "code-for-iati-data-download" + + def create_zip_file_metadata(self): + with open( + "{}/{}/{}".format(self.zip_working_dir, self.zip_internal_directory_name, "metadata.json"), "w" + ) as metadata_file: + json.dump({"created_at": get_timestamp_as_str_z(), "updated_at": get_timestamp_as_str_z()}, metadata_file) From d8dd0c69c5bbbbb69340ea0160befeed67988b41 Mon Sep 17 00:00:00 2001 From: Simon K 
<6615834+simon-20@users.noreply.github.com> Date: Tue, 10 Sep 2024 11:28:45 +0100 Subject: [PATCH 12/12] chore: update release targets ready for release --- .github/workflows/deploy-to-dev.yml | 2 +- .github/workflows/deploy-to-prod.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy-to-dev.yml b/.github/workflows/deploy-to-dev.yml index 9e1d962..9faaa26 100644 --- a/.github/workflows/deploy-to-dev.yml +++ b/.github/workflows/deploy-to-dev.yml @@ -19,4 +19,4 @@ jobs: secrets: inherit with: APP_NAME: "bulk-data-service" - TARGET_ENVIRONMENT: "test" + TARGET_ENVIRONMENT: "dev" diff --git a/.github/workflows/deploy-to-prod.yml b/.github/workflows/deploy-to-prod.yml index aee507a..5718a19 100644 --- a/.github/workflows/deploy-to-prod.yml +++ b/.github/workflows/deploy-to-prod.yml @@ -13,4 +13,4 @@ jobs: secrets: inherit with: APP_NAME: "bulk-data-service" - TARGET_ENVIRONMENT: "test" + TARGET_ENVIRONMENT: "prod"
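[Editor's note: the sketch below is illustrative only and is not part of the patch series.] The Code for IATI ZIP added in patch 11/12 (src/bulk_data_service/zippers.py) uses the new filter_dict_by_structure() helper from src/utilities/misc.py to prune the stored CKAN records down to the few fields the legacy Data Dump exposed. A minimal sketch of that pruning, assuming src/ is on PYTHONPATH and using a hypothetical dataset record:

    import json

    from utilities.misc import filter_dict_by_structure  # assumes src/ is on PYTHONPATH

    # Hypothetical CKAN dataset record, as stored in registration_service_dataset_metadata.
    ckan_dataset = {
        "id": "0f045d6a-36a4-46e0-8d8f-27b4b00b5c06",
        "license_id": "other-open",
        "license_title": "Other (Open)",
        "name": "test_foundation_a-dataset-001",
        "num_resources": 1,  # dropped: not named in the structure below
        "organization": {
            "id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f",
            "name": "test_foundation_a",
            "type": "organization",  # dropped by the nested structure
        },
        "resources": [{"url": "https://example.org/dataset-001.xml", "format": "IATI-XML", "size": 1024}],
    }

    # The same structure that CodeforIATILegacyZipper.filter_dataset_metadata_file() retains.
    desired_structure = {
        "id": None,
        "license_id": None,
        "license_title": None,
        "name": None,
        "organization": {"id": None, "name": None},
        "resources": [{"url": None}],
    }

    filtered = filter_dict_by_structure(ckan_dataset, desired_structure)

    # Only the named keys survive; nested dicts and lists are pruned recursively.
    assert filtered["organization"] == {"id": "4f0f8498-20d2-4ca5-a20f-f441eedb1d4f", "name": "test_foundation_a"}
    assert filtered["resources"] == [{"url": "https://example.org/dataset-001.xml"}]
    assert "num_resources" not in filtered

    print(json.dumps(filtered, indent=2))

The zipper then appends empty "extras", "tags", "groups" and "users" arrays before writing the JSON file to the metadata/ directory of the Code for IATI ZIP.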