Skip to content

Commit

Permalink
Merge pull request #10 from IATI/multi_zip__cdn_change
Browse files Browse the repository at this point in the history
feat: save CKAN metadata; output second ZIP
  • Loading branch information
simon-20 authored Sep 11, 2024
2 parents 8fcca55 + d8dd0c6 commit 1aac6df
Show file tree
Hide file tree
Showing 43 changed files with 1,581 additions and 177 deletions.
7 changes: 4 additions & 3 deletions .env-example
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
DATA_REGISTRATION=ckan-registry
DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search
DATA_REGISTRY_PUBLISHER_METADATA_URL="https://iatiregistry.org/api/action/organization_list?all_fields=true&include_extras=true&include_tags=true"
DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24

WEB_BASE_URL=http://127.0.0.1:10000/devstoreaccount1

Expand All @@ -9,9 +11,6 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24

REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72

# Log file
LOGFILE=

# Sample local setup - values read by docker compose (for simple Postgres DB
# creation), and used by the app
DB_NAME=bulk_data_service_db
Expand All @@ -27,3 +26,5 @@ AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM0

AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml
AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip

CHECKER_LOOP_WAIT_MINS=20
3 changes: 3 additions & 0 deletions .github/workflows/build-and-deploy-job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,15 @@ jobs:
# Variables which configure the app
DATA_REGISTRATION: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRATION')] }}
DATA_REGISTRY_BASE_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_BASE_URL')] }}
DATA_REGISTRY_PUBLISHER_METADATA_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_METADATA_URL')] }}
DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS')] }}
NUMBER_DOWNLOADER_THREADS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'NUMBER_DOWNLOADER_THREADS')] }}
FORCE_REDOWNLOAD_AFTER_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'FORCE_REDOWNLOAD_AFTER_HOURS')] }}
REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS')] }}
ZIP_WORKING_DIR: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'ZIP_WORKING_DIR')] }}
AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML')] }}
AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP')] }}
CHECKER_LOOP_WAIT_MINS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'CHECKER_LOOP_WAIT_MINS')] }}

run: |
./azure-deployment/generate-manifest-from-template.sh
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/deploy-to-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ jobs:
secrets: inherit
with:
APP_NAME: "bulk-data-service"
TARGET_ENVIRONMENT: "test"
TARGET_ENVIRONMENT: "dev"
2 changes: 1 addition & 1 deletion .github/workflows/deploy-to-prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ jobs:
secrets: inherit
with:
APP_NAME: "bulk-data-service"
TARGET_ENVIRONMENT: "test"
TARGET_ENVIRONMENT: "prod"
31 changes: 30 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,39 @@
"args": [
"--operation",
"checker",
"--single-run",
"--run-for-n-datasets",
"75"
],
"console": "integratedTerminal",
"envFile": "${workspaceFolder}/.env"
},
{
"name": "Python Debugger: Bulk Data Service - Zipper - Single Run",
"type": "debugpy",
"request": "launch",
"program": "src/iati_bulk_data_service.py",
"args": [
"--operation",
"zipper",
"--single-run"
],
"console": "integratedTerminal",
"envFile": "${workspaceFolder}/.env"
}
},
{
"name": "Python Debugger: Bulk Data Service - Checker & Zipper Loop",
"type": "debugpy",
"request": "launch",
"program": "src/iati_bulk_data_service.py",
"args": [
"--operation",
"checker",
"--run-for-n-datasets",
"75"
],
"console": "integratedTerminal",
"envFile": "${workspaceFolder}/.env"
},
]
}
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.12-slim-bookworm
FROM python:3.12.5-slim-bookworm

RUN apt-get update -y

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,20 @@ properties: # Properties of container group
value: "#DATA_REGISTRATION#"
- name: DATA_REGISTRY_BASE_URL
value: "#DATA_REGISTRY_BASE_URL#"
- name: DATA_REGISTRY_PUBLISHER_METADATA_URL
value: "#DATA_REGISTRY_PUBLISHER_METADATA_URL#"
- name: DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS
value: "#DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS#"
- name: WEB_BASE_URL
value: "#WEB_BASE_URL#"
- name: CHECKER_LOOP_WAIT_MINS
value: "#CHECKER_LOOP_WAIT_MINS#"
- name: NUMBER_DOWNLOADER_THREADS
value: "#NUMBER_DOWNLOADER_THREADS#"
- name: FORCE_REDOWNLOAD_AFTER_HOURS
value: "#FORCE_REDOWNLOAD_AFTER_HOURS#"
- name: REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS
value: "#REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS#"
- name: LOGFILE
value: ""
- name: ZIP_WORKING_DIR
value: "#ZIP_WORKING_DIR#"
- name: AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML
Expand Down
3 changes: 3 additions & 0 deletions azure-deployment/generate-manifest-from-template.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,13 @@ sed -i ''s^#DB_CONNECTION_TIMEOUT#^$DB_CONNECTION_TIMEOUT^g'' ./azure-deployment

sed -i ''s^#DATA_REGISTRATION#^$DATA_REGISTRATION^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#DATA_REGISTRY_BASE_URL#^$DATA_REGISTRY_BASE_URL^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
# The publisher metadata URL carries a query string containing '&'. In a sed
# replacement an unescaped '&' means "the text that was matched", so the
# placeholder itself would be spliced into the URL. Escape '&', '\' and the
# '^' delimiter so the value is inserted literally, and quote the expression
# so the shell does not glob-expand the '?' in the URL.
DATA_REGISTRY_PUBLISHER_METADATA_URL_ESCAPED=$(printf '%s' "$DATA_REGISTRY_PUBLISHER_METADATA_URL" | sed 's/[&\\^]/\\&/g')
sed -i "s^#DATA_REGISTRY_PUBLISHER_METADATA_URL#^${DATA_REGISTRY_PUBLISHER_METADATA_URL_ESCAPED}^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS#^$DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#NUMBER_DOWNLOADER_THREADS#^$NUMBER_DOWNLOADER_THREADS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#FORCE_REDOWNLOAD_AFTER_HOURS#^$FORCE_REDOWNLOAD_AFTER_HOURS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS#^$REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#ZIP_WORKING_DIR#^$ZIP_WORKING_DIR^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML#^$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP#^$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#WEB_BASE_URL#^$WEB_BASE_URL^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#CHECKER_LOOP_WAIT_MINS#^$CHECKER_LOOP_WAIT_MINS^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
7 changes: 4 additions & 3 deletions azure-deployment/manual-azure-deploy-variables-example.env
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@ WEB_BASE_URL=

DATA_REGISTRATION=ckan-registry
DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search
DATA_REGISTRY_PUBLISHER_METADATA_URL="https://iatiregistry.org/api/action/organization_list?all_fields=true&include_extras=true&include_tags=true"
DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24

FORCE_REDOWNLOAD_AFTER_HOURS=24

# Log file
LOGFILE=

NUMBER_DOWNLOADER_THREADS=25

REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72

ZIP_WORKING_DIR=/tmp/bulk-data-service-zip

CHECKER_LOOP_WAIT_MINS=20
1 change: 1 addition & 0 deletions azure-provision/add-default-config-to-github-variables.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,6 @@ TARGET_ENVIRONMENT="$1"
cp -f azure-provision/default-github-config-template.env azure-provision/default-github-config.env

sed -i "s/^/${TARGET_ENVIRONMENT^^}/g" azure-provision/default-github-config.env
sed -i "s/{{TARGET_ENVIRONMENT}}/${TARGET_ENVIRONMENT}/g" azure-provision/default-github-config.env

gh variable set --env-file ./azure-provision/default-github-config.env
5 changes: 5 additions & 0 deletions azure-provision/azure-create-resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,11 @@ az cdn endpoint create --resource-group "$RESOURCE_GROUP_NAME" \
--origin-host-header "$AZURE_BASE_HOSTNAME" \
--location global

az cdn endpoint rule add --resource-group "$RESOURCE_GROUP_NAME" \
--profile-name "$CDN_PROFILE_NAME" \
--name "$CDN_ENDPOINT_NAME" \
--action-name "CacheExpiration" \
--cache-behavior BypassCache --rule-name global --order 0

read -p "Press any key when the CNAME has been created on Cloudflare " -n 1 -r

Expand Down
11 changes: 7 additions & 4 deletions azure-provision/default-github-config-template.env
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=\$web
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=\$web
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML="$web"
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP="$web"
_DATA_REGISTRATION=ckan-registry
_DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search
_DATA_REGISTRY_PUBLISHER_METADATA_URL=https://iatiregistry.org/api/action/organization_list?all_fields=true&include_extras=true&include_tags=true
_DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24
_FORCE_REDOWNLOAD_AFTER_HOURS=24
_LOGFILE=log
_NUMBER_DOWNLOADER_THREADS=25
_REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72
_ZIP_WORKING_DIR=/tmp/bulk-data-service-zip
_ZIP_WORKING_DIR=/tmp/bulk-data-service-zip
_WEB_BASE_URL="https://{{TARGET_ENVIRONMENT}}-bulk-data.iatistandard.org"
_CHECKER_LOOP_WAIT_MINS=20
10 changes: 10 additions & 0 deletions db-migrations/20240827_01_pVOLG.rollback.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
--
-- depends: 20240531_01_iY5Qa
--
-- Rollback for migration 20240827_01_pVOLG: removes the three
-- registration-service columns from iati_datasets.
-- Each DROP uses IF EXISTS, so a column that is already absent does not
-- raise an error.
ALTER TABLE
iati_datasets DROP COLUMN IF EXISTS registration_service_dataset_metadata;

ALTER TABLE
iati_datasets DROP COLUMN IF EXISTS registration_service_publisher_metadata;

ALTER TABLE
iati_datasets DROP COLUMN IF EXISTS registration_service_name;
23 changes: 23 additions & 0 deletions db-migrations/20240827_01_pVOLG.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
--
-- depends: 20240531_01_iY5Qa
--
-- Adds three VARCHAR columns to iati_datasets for data taken from the data
-- registration service: the dataset metadata record, the publisher metadata
-- record, and the name of the registration service. Each column gets a
-- COMMENT describing its contents.
-- NOTE(review): the ADD statements have no IF NOT EXISTS guard, so this
-- migration is not re-runnable against a table that already has the columns.
ALTER TABLE
iati_datasets
ADD
registration_service_dataset_metadata VARCHAR;

COMMENT ON COLUMN iati_datasets.registration_service_dataset_metadata IS 'the original dataset metadata record from the data registration service';

ALTER TABLE
iati_datasets
ADD
registration_service_publisher_metadata VARCHAR;

COMMENT ON COLUMN iati_datasets.registration_service_publisher_metadata IS 'the original publisher metadata record from the data registration service';

ALTER TABLE
iati_datasets
ADD
registration_service_name VARCHAR;

COMMENT ON COLUMN iati_datasets.registration_service_name IS 'the name of the data registration service';
7 changes: 4 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ dependencies = [
"azure-storage-blob==12.20.0",
"psycopg[binary,pool]==3.1.18",
"requests==2.31.0",
"yoyo-migrations==8.2.0",
"prometheus-client==0.20.0"
"yoyo-migrations==9.0.0",
"prometheus-client==0.20.0",
]


Expand All @@ -22,7 +22,8 @@ dev = [
"flake8",
"flake8-pyproject",
"types-requests",
"python-dotenv"
"python-dotenv",
"pytest-watcher"
]


Expand Down
34 changes: 19 additions & 15 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,31 @@ azure-core==1.30.2
# via azure-storage-blob
azure-storage-blob==12.20.0
# via bulk-data-service (pyproject.toml)
black==24.4.2
black==24.8.0
# via bulk-data-service (pyproject.toml)
build==1.2.1
# via pip-tools
certifi==2024.7.4
certifi==2024.8.30
# via requests
cffi==1.16.0
cffi==1.17.1
# via cryptography
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via
# black
# pip-tools
cryptography==42.0.8
cryptography==43.0.1
# via azure-storage-blob
flake8==7.1.0
flake8==7.1.1
# via
# bulk-data-service (pyproject.toml)
# flake8-pyproject
flake8-pyproject==1.2.3
# via bulk-data-service (pyproject.toml)
idna==3.7
idna==3.8
# via requests
importlib-metadata==8.0.0
importlib-metadata==8.4.0
# via yoyo-migrations
iniconfig==2.0.0
# via pytest
Expand All @@ -42,7 +42,7 @@ isort==5.13.2
# via bulk-data-service (pyproject.toml)
mccabe==0.7.0
# via flake8
mypy==1.10.1
mypy==1.11.2
# via bulk-data-service (pyproject.toml)
mypy-extensions==1.0.0
# via
Expand All @@ -69,7 +69,7 @@ psycopg-binary==3.1.18
# via psycopg
psycopg-pool==3.2.2
# via psycopg
pycodestyle==2.12.0
pycodestyle==2.12.1
# via flake8
pycparser==2.22
# via cffi
Expand All @@ -79,7 +79,9 @@ pyproject-hooks==1.1.0
# via
# build
# pip-tools
pytest==8.2.2
pytest==8.3.2
# via bulk-data-service (pyproject.toml)
pytest-watcher==0.4.3
# via bulk-data-service (pyproject.toml)
python-dotenv==1.0.1
# via bulk-data-service (pyproject.toml)
Expand All @@ -91,11 +93,11 @@ six==1.16.0
# via
# azure-core
# isodate
sqlparse==0.5.0
sqlparse==0.5.1
# via yoyo-migrations
tabulate==0.9.0
# via yoyo-migrations
types-requests==2.32.0.20240622
types-requests==2.32.0.20240905
# via bulk-data-service (pyproject.toml)
typing-extensions==4.12.2
# via
Expand All @@ -108,11 +110,13 @@ urllib3==2.2.2
# via
# requests
# types-requests
wheel==0.43.0
watchdog==5.0.2
# via pytest-watcher
wheel==0.44.0
# via pip-tools
yoyo-migrations==8.2.0
yoyo-migrations==9.0.0
# via bulk-data-service (pyproject.toml)
zipp==3.19.2
zipp==3.20.1
# via importlib-metadata

# The following packages are considered to be unsafe in a requirements file:
Expand Down
18 changes: 9 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,23 @@
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile --strip-extras pyproject.toml
# pip-compile --output-file=requirements.txt --strip-extras pyproject.toml
#
azure-core==1.30.2
# via azure-storage-blob
azure-storage-blob==12.20.0
# via bulk-data-service (pyproject.toml)
certifi==2024.7.4
certifi==2024.8.30
# via requests
cffi==1.16.0
cffi==1.17.1
# via cryptography
charset-normalizer==3.3.2
# via requests
cryptography==42.0.8
cryptography==43.0.1
# via azure-storage-blob
idna==3.7
idna==3.8
# via requests
importlib-metadata==8.0.0
importlib-metadata==8.4.0
# via yoyo-migrations
isodate==0.6.1
# via azure-storage-blob
Expand All @@ -40,7 +40,7 @@ six==1.16.0
# via
# azure-core
# isodate
sqlparse==0.5.0
sqlparse==0.5.1
# via yoyo-migrations
tabulate==0.9.0
# via yoyo-migrations
Expand All @@ -52,7 +52,7 @@ typing-extensions==4.12.2
# psycopg-pool
urllib3==2.2.2
# via requests
yoyo-migrations==8.2.0
yoyo-migrations==9.0.0
# via bulk-data-service (pyproject.toml)
zipp==3.19.2
zipp==3.20.1
# via importlib-metadata
Loading

0 comments on commit 1aac6df

Please sign in to comment.