Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: save CKAN metadata; output second ZIP #10

Merged
merged 12 commits into from
Sep 11, 2024
7 changes: 4 additions & 3 deletions .env-example
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
DATA_REGISTRATION=ckan-registry
DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search
DATA_REGISTRY_PUBLISHER_METADATA_URL="https://iatiregistry.org/api/action/organization_list?all_fields=true&include_extras=true&include_tags=true"
DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24

WEB_BASE_URL=http://127.0.0.1:10000/devstoreaccount1

Expand All @@ -9,9 +11,6 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24

REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72

# Log file
LOGFILE=

# Sample local setup - values read by docker compose (for simple Postgres DB
# creation), and used by the app
DB_NAME=bulk_data_service_db
Expand All @@ -27,3 +26,5 @@ AZURE_STORAGE_CONNECTION_STRING=AccountName=devstoreaccount1;AccountKey=Eby8vdM0

AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml
AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip

CHECKER_LOOP_WAIT_MINS=20
3 changes: 3 additions & 0 deletions .github/workflows/build-and-deploy-job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,15 @@ jobs:
# Variables which configure the app
DATA_REGISTRATION: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRATION')] }}
DATA_REGISTRY_BASE_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_BASE_URL')] }}
DATA_REGISTRY_PUBLISHER_METADATA_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_METADATA_URL')] }}
DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS')] }}
NUMBER_DOWNLOADER_THREADS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'NUMBER_DOWNLOADER_THREADS')] }}
FORCE_REDOWNLOAD_AFTER_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'FORCE_REDOWNLOAD_AFTER_HOURS')] }}
REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS')] }}
ZIP_WORKING_DIR: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'ZIP_WORKING_DIR')] }}
AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML')] }}
AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP')] }}
CHECKER_LOOP_WAIT_MINS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'CHECKER_LOOP_WAIT_MINS')] }}

run: |
./azure-deployment/generate-manifest-from-template.sh
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/deploy-to-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ jobs:
secrets: inherit
with:
APP_NAME: "bulk-data-service"
TARGET_ENVIRONMENT: "test"
TARGET_ENVIRONMENT: "dev"
2 changes: 1 addition & 1 deletion .github/workflows/deploy-to-prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ jobs:
secrets: inherit
with:
APP_NAME: "bulk-data-service"
TARGET_ENVIRONMENT: "test"
TARGET_ENVIRONMENT: "prod"
31 changes: 30 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,39 @@
"args": [
"--operation",
"checker",
"--single-run",
"--run-for-n-datasets",
"75"
],
"console": "integratedTerminal",
"envFile": "${workspaceFolder}/.env"
},
{
"name": "Python Debugger: Bulk Data Service - Zipper - Single Run",
"type": "debugpy",
"request": "launch",
"program": "src/iati_bulk_data_service.py",
"args": [
"--operation",
"zipper",
"--single-run"
],
"console": "integratedTerminal",
"envFile": "${workspaceFolder}/.env"
}
},
{
"name": "Python Debugger: Bulk Data Service - Checker & Zipper Loop",
"type": "debugpy",
"request": "launch",
"program": "src/iati_bulk_data_service.py",
"args": [
"--operation",
"checker",
"--run-for-n-datasets",
"75"
],
"console": "integratedTerminal",
"envFile": "${workspaceFolder}/.env"
},
]
}
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.12-slim-bookworm
FROM python:3.12.5-slim-bookworm

RUN apt-get update -y

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,20 @@ properties: # Properties of container group
value: "#DATA_REGISTRATION#"
- name: DATA_REGISTRY_BASE_URL
value: "#DATA_REGISTRY_BASE_URL#"
- name: DATA_REGISTRY_PUBLISHER_METADATA_URL
value: "#DATA_REGISTRY_PUBLISHER_METADATA_URL#"
- name: DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS
value: "#DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS#"
- name: WEB_BASE_URL
value: "#WEB_BASE_URL#"
- name: CHECKER_LOOP_WAIT_MINS
value: "#CHECKER_LOOP_WAIT_MINS#"
- name: NUMBER_DOWNLOADER_THREADS
value: "#NUMBER_DOWNLOADER_THREADS#"
- name: FORCE_REDOWNLOAD_AFTER_HOURS
value: "#FORCE_REDOWNLOAD_AFTER_HOURS#"
- name: REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS
value: "#REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS#"
- name: LOGFILE
value: ""
- name: ZIP_WORKING_DIR
value: "#ZIP_WORKING_DIR#"
- name: AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML
Expand Down
3 changes: 3 additions & 0 deletions azure-deployment/generate-manifest-from-template.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,13 @@ sed -i ''s^#DB_CONNECTION_TIMEOUT#^$DB_CONNECTION_TIMEOUT^g'' ./azure-deployment

# Substitute a "#NAME#" placeholder in the deployment manifest with a literal value.
#   $1 - placeholder name, without the surrounding '#'
#   $2 - replacement value
# In a sed replacement '&' expands to the matched text and '\' starts an escape,
# so both (plus the '^' delimiter used below) are backslash-escaped first.
# Without this, a value containing '&' (e.g. a URL query string) is written to
# the manifest with each '&' replaced by the placeholder text itself.
# The sed expression is double-quoted so the value is not word-split or globbed.
substitute_placeholder() {
    escaped_value=$(printf '%s' "$2" | sed 's/[\\&^]/\\&/g')
    sed -i "s^#$1#^${escaped_value}^g" ./azure-deployment/azure-resource-manager-deployment-manifest.yml
}

substitute_placeholder DATA_REGISTRATION "$DATA_REGISTRATION"
substitute_placeholder DATA_REGISTRY_BASE_URL "$DATA_REGISTRY_BASE_URL"
substitute_placeholder DATA_REGISTRY_PUBLISHER_METADATA_URL "$DATA_REGISTRY_PUBLISHER_METADATA_URL"
substitute_placeholder DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS "$DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS"
substitute_placeholder NUMBER_DOWNLOADER_THREADS "$NUMBER_DOWNLOADER_THREADS"
substitute_placeholder FORCE_REDOWNLOAD_AFTER_HOURS "$FORCE_REDOWNLOAD_AFTER_HOURS"
substitute_placeholder REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS "$REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS"
substitute_placeholder ZIP_WORKING_DIR "$ZIP_WORKING_DIR"
substitute_placeholder AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML "$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"
substitute_placeholder AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP "$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"
substitute_placeholder WEB_BASE_URL "$WEB_BASE_URL"
substitute_placeholder CHECKER_LOOP_WAIT_MINS "$CHECKER_LOOP_WAIT_MINS"
7 changes: 4 additions & 3 deletions azure-deployment/manual-azure-deploy-variables-example.env
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@ WEB_BASE_URL=

DATA_REGISTRATION=ckan-registry
DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search
DATA_REGISTRY_PUBLISHER_METADATA_URL="https://iatiregistry.org/api/action/organization_list?all_fields=true&include_extras=true&include_tags=true"
DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24

FORCE_REDOWNLOAD_AFTER_HOURS=24

# Log file
LOGFILE=

NUMBER_DOWNLOADER_THREADS=25

REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72

ZIP_WORKING_DIR=/tmp/bulk-data-service-zip

CHECKER_LOOP_WAIT_MINS=20
1 change: 1 addition & 0 deletions azure-provision/add-default-config-to-github-variables.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,6 @@ TARGET_ENVIRONMENT="$1"
# Start from a fresh copy of the template so the template itself is never modified.
cp -f azure-provision/default-github-config-template.env azure-provision/default-github-config.env

# Prefix every line with the upper-cased target environment name
# (the template's variable names begin with '_', giving e.g. <ENV>_DATA_REGISTRATION).
sed -i "s/^/${TARGET_ENVIRONMENT^^}/g" azure-provision/default-github-config.env
# Fill in the {{TARGET_ENVIRONMENT}} token inside values with the environment name as given (case unchanged).
sed -i "s/{{TARGET_ENVIRONMENT}}/${TARGET_ENVIRONMENT}/g" azure-provision/default-github-config.env

# Upload every entry of the generated env file as a GitHub variable.
gh variable set --env-file ./azure-provision/default-github-config.env
5 changes: 5 additions & 0 deletions azure-provision/azure-create-resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,11 @@ az cdn endpoint create --resource-group "$RESOURCE_GROUP_NAME" \
--origin-host-header "$AZURE_BASE_HOSTNAME" \
--location global

# Add a delivery rule named "global" (order 0) to the CDN endpoint, with a
# CacheExpiration action whose cache behaviour is BypassCache.
# NOTE(review): presumably so the CDN does not serve stale copies of the
# published files; confirm this is the intended caching policy.
az cdn endpoint rule add --resource-group "$RESOURCE_GROUP_NAME" \
--profile-name "$CDN_PROFILE_NAME" \
--name "$CDN_ENDPOINT_NAME" \
--action-name "CacheExpiration" \
--cache-behavior BypassCache --rule-name global --order 0

read -p "Press any key when the CNAME has been created on Cloudflare " -n 1 -r

Expand Down
11 changes: 7 additions & 4 deletions azure-provision/default-github-config-template.env
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=\$web
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=\$web
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML="$web"
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP="$web"
_DATA_REGISTRATION=ckan-registry
_DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search
_DATA_REGISTRY_PUBLISHER_METADATA_URL=https://iatiregistry.org/api/action/organization_list?all_fields=true&include_extras=true&include_tags=true
_DATA_REGISTRY_PUBLISHER_METADATA_REFRESH_AFTER_HOURS=24
_FORCE_REDOWNLOAD_AFTER_HOURS=24
_LOGFILE=log
_NUMBER_DOWNLOADER_THREADS=25
_REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72
_ZIP_WORKING_DIR=/tmp/bulk-data-service-zip
_ZIP_WORKING_DIR=/tmp/bulk-data-service-zip
_WEB_BASE_URL="https://{{TARGET_ENVIRONMENT}}-bulk-data.iatistandard.org"
_CHECKER_LOOP_WAIT_MINS=20
10 changes: 10 additions & 0 deletions db-migrations/20240827_01_pVOLG.rollback.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
--
-- depends: 20240531_01_iY5Qa
--
-- Rollback for 20240827_01_pVOLG. Removes the registration-service columns
-- from iati_datasets. IF EXISTS makes each drop a no-op when its column is
-- already absent.
ALTER TABLE iati_datasets
    DROP COLUMN IF EXISTS registration_service_name,
    DROP COLUMN IF EXISTS registration_service_publisher_metadata,
    DROP COLUMN IF EXISTS registration_service_dataset_metadata;
23 changes: 23 additions & 0 deletions db-migrations/20240827_01_pVOLG.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
--
-- depends: 20240531_01_iY5Qa
--
-- Adds three nullable VARCHAR columns to iati_datasets. They hold the name of
-- the data registration service and the original dataset and publisher
-- metadata records obtained from it.
--
-- ADD COLUMN IF NOT EXISTS matches the IF EXISTS guards used by the rollback
-- script, so re-applying the migration does not fail on a column that is
-- already present.

ALTER TABLE
    iati_datasets
ADD
    COLUMN IF NOT EXISTS registration_service_dataset_metadata VARCHAR;

COMMENT ON COLUMN iati_datasets.registration_service_dataset_metadata IS 'the original dataset metadata record from the data registration service';

ALTER TABLE
    iati_datasets
ADD
    COLUMN IF NOT EXISTS registration_service_publisher_metadata VARCHAR;

COMMENT ON COLUMN iati_datasets.registration_service_publisher_metadata IS 'the original publisher metadata record from the data registration service';

ALTER TABLE
    iati_datasets
ADD
    COLUMN IF NOT EXISTS registration_service_name VARCHAR;

COMMENT ON COLUMN iati_datasets.registration_service_name IS 'the name of the data registration service';
7 changes: 4 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ dependencies = [
"azure-storage-blob==12.20.0",
"psycopg[binary,pool]==3.1.18",
"requests==2.31.0",
"yoyo-migrations==8.2.0",
"prometheus-client==0.20.0"
"yoyo-migrations==9.0.0",
"prometheus-client==0.20.0",
]


Expand All @@ -22,7 +22,8 @@ dev = [
"flake8",
"flake8-pyproject",
"types-requests",
"python-dotenv"
"python-dotenv",
"pytest-watcher"
]


Expand Down
34 changes: 19 additions & 15 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,31 @@ azure-core==1.30.2
# via azure-storage-blob
azure-storage-blob==12.20.0
# via bulk-data-service (pyproject.toml)
black==24.4.2
black==24.8.0
# via bulk-data-service (pyproject.toml)
build==1.2.1
# via pip-tools
certifi==2024.7.4
certifi==2024.8.30
# via requests
cffi==1.16.0
cffi==1.17.1
# via cryptography
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via
# black
# pip-tools
cryptography==42.0.8
cryptography==43.0.1
# via azure-storage-blob
flake8==7.1.0
flake8==7.1.1
# via
# bulk-data-service (pyproject.toml)
# flake8-pyproject
flake8-pyproject==1.2.3
# via bulk-data-service (pyproject.toml)
idna==3.7
idna==3.8
# via requests
importlib-metadata==8.0.0
importlib-metadata==8.4.0
# via yoyo-migrations
iniconfig==2.0.0
# via pytest
Expand All @@ -42,7 +42,7 @@ isort==5.13.2
# via bulk-data-service (pyproject.toml)
mccabe==0.7.0
# via flake8
mypy==1.10.1
mypy==1.11.2
# via bulk-data-service (pyproject.toml)
mypy-extensions==1.0.0
# via
Expand All @@ -69,7 +69,7 @@ psycopg-binary==3.1.18
# via psycopg
psycopg-pool==3.2.2
# via psycopg
pycodestyle==2.12.0
pycodestyle==2.12.1
# via flake8
pycparser==2.22
# via cffi
Expand All @@ -79,7 +79,9 @@ pyproject-hooks==1.1.0
# via
# build
# pip-tools
pytest==8.2.2
pytest==8.3.2
# via bulk-data-service (pyproject.toml)
pytest-watcher==0.4.3
# via bulk-data-service (pyproject.toml)
python-dotenv==1.0.1
# via bulk-data-service (pyproject.toml)
Expand All @@ -91,11 +93,11 @@ six==1.16.0
# via
# azure-core
# isodate
sqlparse==0.5.0
sqlparse==0.5.1
# via yoyo-migrations
tabulate==0.9.0
# via yoyo-migrations
types-requests==2.32.0.20240622
types-requests==2.32.0.20240905
# via bulk-data-service (pyproject.toml)
typing-extensions==4.12.2
# via
Expand All @@ -108,11 +110,13 @@ urllib3==2.2.2
# via
# requests
# types-requests
wheel==0.43.0
watchdog==5.0.2
# via pytest-watcher
wheel==0.44.0
# via pip-tools
yoyo-migrations==8.2.0
yoyo-migrations==9.0.0
# via bulk-data-service (pyproject.toml)
zipp==3.19.2
zipp==3.20.1
# via importlib-metadata

# The following packages are considered to be unsafe in a requirements file:
Expand Down
18 changes: 9 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,23 @@
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile --strip-extras pyproject.toml
# pip-compile --output-file=requirements.txt --strip-extras pyproject.toml
#
azure-core==1.30.2
# via azure-storage-blob
azure-storage-blob==12.20.0
# via bulk-data-service (pyproject.toml)
certifi==2024.7.4
certifi==2024.8.30
# via requests
cffi==1.16.0
cffi==1.17.1
# via cryptography
charset-normalizer==3.3.2
# via requests
cryptography==42.0.8
cryptography==43.0.1
# via azure-storage-blob
idna==3.7
idna==3.8
# via requests
importlib-metadata==8.0.0
importlib-metadata==8.4.0
# via yoyo-migrations
isodate==0.6.1
# via azure-storage-blob
Expand All @@ -40,7 +40,7 @@ six==1.16.0
# via
# azure-core
# isodate
sqlparse==0.5.0
sqlparse==0.5.1
# via yoyo-migrations
tabulate==0.9.0
# via yoyo-migrations
Expand All @@ -52,7 +52,7 @@ typing-extensions==4.12.2
# psycopg-pool
urllib3==2.2.2
# via requests
yoyo-migrations==8.2.0
yoyo-migrations==9.0.0
# via bulk-data-service (pyproject.toml)
zipp==3.19.2
zipp==3.20.1
# via importlib-metadata
Loading