feat: serve all from $web, and using Azure CDN #7

Merged
merged 7 commits into from Sep 4, 2024
2 changes: 1 addition & 1 deletion .env-example
@@ -1,7 +1,7 @@
DATA_REGISTRATION=ckan-registry
DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search

BLOB_STORAGE_BASE_PUBLIC_URL=http://127.0.0.1:10000/devstoreaccount1
WEB_BASE_URL=http://127.0.0.1:10000/devstoreaccount1

NUMBER_DOWNLOADER_THREADS=1 # makes for easier testing locally

2 changes: 2 additions & 0 deletions README.md
@@ -181,6 +181,8 @@ This automated test environment is configured via the following files:

You can use the Mockoon GUI application to edit the mockoon server configuration file (`mockoon-registration-and-data-server-config.json`).

The automated tests are safe to run alongside the `docker compose` setup for development.

## Provisioning and Deployment

### Initial Provisioning
@@ -32,8 +32,8 @@ properties: # Properties of container group
value: "#DATA_REGISTRATION#"
- name: DATA_REGISTRY_BASE_URL
value: "#DATA_REGISTRY_BASE_URL#"
- name: BLOB_STORAGE_BASE_PUBLIC_URL
value: "https://sabulkdataservice#TARGET_ENVIRONMENT#.blob.core.windows.net"
- name: WEB_BASE_URL
value: "#WEB_BASE_URL#"
- name: NUMBER_DOWNLOADER_THREADS
value: "#NUMBER_DOWNLOADER_THREADS#"
- name: FORCE_REDOWNLOAD_AFTER_HOURS
1 change: 1 addition & 0 deletions azure-deployment/generate-manifest-from-template.sh
@@ -49,3 +49,4 @@ sed -i ''s^#REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS#^$REMOVE_LAST_GOOD_DOW
sed -i ''s^#ZIP_WORKING_DIR#^$ZIP_WORKING_DIR^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML#^$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP#^$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#WEB_BASE_URL#^$WEB_BASE_URL^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
13 changes: 13 additions & 0 deletions azure-deployment/manual-azure-deploy-from-local.sh
@@ -13,6 +13,19 @@ if [ ! -d ".git" ]; then
exit 1
fi

if [ ! -f "./azure-deployment/manual-azure-deploy-secrets.env" ]; then
echo "$0: there must be a file 'manual-azure-deploy-secrets.env' in"
echo "'azure-deployment' containing the secrets. See the examples in manual-azure-deploy-secrets-example.env'"
exit 1
fi

if [ ! -f "./azure-deployment/manual-azure-deploy-variables.env" ]; then
echo "$0: there must be a file 'manual-azure-deploy-variables.env' in"
echo "'azure-deployment' containing the config variables. See example: manual-azure-deploy-variables-example.env'"
exit 1
fi


(git remote -v 2> /dev/null | grep "IATI/bulk-data-service.git" > /dev/null) || (echo "$0: script must be run from the root of the bulk-data-service repository"; exit 1)

. ./azure-deployment/manual-azure-deploy-secrets.env
3 changes: 1 addition & 2 deletions azure-deployment/manual-azure-deploy-variables-example.env
@@ -4,8 +4,7 @@
AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml
AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip

# Value of BLOB_STORAGE_BASE_PUBLIC_URL generated automatically by deploy scripts
# BLOB_STORAGE_BASE_PUBLIC_URL=
WEB_BASE_URL=

DATA_REGISTRATION=ckan-registry
DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search
107 changes: 87 additions & 20 deletions azure-provision/azure-create-resources.sh
@@ -56,11 +56,23 @@ POSTGRES_SERVER_NAME="${APP_NAME}-db-$TARGET_ENVIRONMENT"

SERVICE_PRINCIPAL_NAME="sp-${APP_NAME}-$TARGET_ENVIRONMENT"

CDN_PROFILE_NAME="cdn-profile-${APP_NAME}-$TARGET_ENVIRONMENT"

CDN_ENDPOINT_NAME="cdn-endpoint-${APP_NAME}-$TARGET_ENVIRONMENT"

CDN_CUSTOM_DOMAIN_NAME="cdn-custom-domain-${APP_NAME}-$TARGET_ENVIRONMENT"

HOST_FOR_CLOUDFLARE="$CDN_ENDPOINT_NAME.azureedge.net"

LOCATION="uksouth"

WEB_BASE_URL_PREFIX=$([[ "$TARGET_ENVIRONMENT" == "prod" ]] && echo "" || echo "${TARGET_ENVIRONMENT}-")

WEB_BASE_URL="https://${WEB_BASE_URL_PREFIX}bulk-data.iatistandard.org"
SUBDOMAIN="${WEB_BASE_URL_PREFIX}bulk-data"

CUSTOM_DOMAIN="${SUBDOMAIN}.iatistandard.org"

WEB_BASE_URL="https://${CUSTOM_DOMAIN}"

echo
echo "Proceeding will create Azure services with the following names:"
@@ -71,12 +83,20 @@ echo "Log analytics workspace name : $LOG_ANALYTICS_NAME"
echo "Storage account name : $STORAGE_ACCOUNT_NAME"
echo "Postgres server name : $POSTGRES_SERVER_NAME"
echo "Service principal name : $SERVICE_PRINCIPAL_NAME"
echo "CDN profile name : $CDN_PROFILE_NAME"
echo "CDN endpoint name : $CDN_ENDPOINT_NAME"
echo "CDN custom domain id/name : $CDN_CUSTOM_DOMAIN_NAME"
echo "Custom domain : $CUSTOM_DOMAIN"
echo "Public-facing access URL : $WEB_BASE_URL"
echo
echo
echo "(Using subscription: $SUBSCRIPTION_ID)"
echo
echo
echo "**NOTE:** Before continuing you should ensure that there is a CNAME record created in Cloudflare"
echo " for subdomain $SUBDOMAIN on iatistandard.org pointing to "
echo " $HOST_FOR_CLOUDFLARE"
echo
echo

read -p "Do you want to continue? ([y]es or [n]o) " -n 1 -r
echo ""
@@ -118,13 +138,13 @@ echo az storage account create --resource-group "$RESOURCE_GROUP_NAME" \
--name $STORAGE_ACCOUNT_NAME \
--location $LOCATION \
--sku Standard_LRS \
--enable-hierarchical-namespace true \
--enable-hierarchical-namespace false \
--kind StorageV2
az storage account create --resource-group "$RESOURCE_GROUP_NAME" \
--name $STORAGE_ACCOUNT_NAME \
--location $LOCATION \
--sku Standard_LRS \
--enable-hierarchical-namespace true \
--enable-hierarchical-namespace false \
--kind StorageV2
echo

@@ -133,33 +153,74 @@ STORAGE_ACCOUNT_ID=$(az storage account list | jq -r ".[] | select(.name==\"$STO
echo az resource update --ids="$STORAGE_ACCOUNT_ID" --set properties.allowBlobPublicAccess=true
az resource update --ids="$STORAGE_ACCOUNT_ID" --set properties.allowBlobPublicAccess=true

echo "Waiting for 30 seconds before creating containers on the new storage account"
sleep 30

echo az storage container create --name iati-xml --account-name $STORAGE_ACCOUNT_NAME --public-access container
az storage container create --name iati-xml --account-name $STORAGE_ACCOUNT_NAME --public-access container | jq
echo "Waiting for 10 seconds before updating properties on the new storage account"
sleep 10

echo az storage container create --name iati-zip --account-name $STORAGE_ACCOUNT_NAME --public-access container
az storage container create --name iati-zip --account-name $STORAGE_ACCOUNT_NAME --public-access container | jq
echo "Creating a static website on the storage account..."
echo az storage blob service-properties update --account-name "$STORAGE_ACCOUNT_NAME" \
--static-website --404-document 404.html \
--index-document index.html

az storage blob service-properties update --account-name $STORAGE_ACCOUNT_NAME \
az storage blob service-properties update --account-name "$STORAGE_ACCOUNT_NAME" \
--static-website --404-document 404.html \
--index-document index.html

echo az storage account show-connection-string --name $STORAGE_ACCOUNT_NAME \
--resource-group "$RESOURCE_GROUP_NAME" \
\| jq -r '.connectionString'
echo "Fetching connection string for storage account..."

STORAGE_ACCOUNT_CONNECTION_STRING=$(az storage account show-connection-string --name $STORAGE_ACCOUNT_NAME --resource-group "$RESOURCE_GROUP_NAME" | jq -r '.connectionString')
STORAGE_ACCOUNT_CONNECTION_STRING=$(az storage account show-connection-string --name "$STORAGE_ACCOUNT_NAME" --resource-group "$RESOURCE_GROUP_NAME" | jq -r '.connectionString')

# Shown to user, as may be needed for Cloudflare setup on very first run
AZURE_BASE_URL=$(az storage account show -n "$STORAGE_ACCOUNT_NAME" -g "$RESOURCE_GROUP_NAME" --query "primaryEndpoints.web" --output tsv)

# Hostname of the static website endpoint (AZURE_BASE_URL with the scheme and trailing slash stripped)
AZURE_BASE_HOSTNAME="$(sed "s#https://\(.*\)/#\1#" <<< $AZURE_BASE_URL)"

# WEB_BASE_URL is calculated above from TARGET_ENVIRONMENT, bearing in mind 'prod' doesn't have a prefix
sed -e "s#{{WEB_BASE_URL}}#$WEB_BASE_URL#" web/index-template.html > web/index.html

echo "Uploading index and 404 pages to storage account..."
az storage blob upload-batch -s web -d '$web' --account-name $STORAGE_ACCOUNT_NAME --overwrite


# Create a CDN profile and endpoint, so we can use SSL with a custom domain
echo "Creating CDN profile and endpoint for https with a custom domain..."

echo az cdn profile create --resource-group "$RESOURCE_GROUP_NAME" \
--name "$CDN_PROFILE_NAME" \
--sku Standard_Microsoft

az cdn profile create --resource-group "$RESOURCE_GROUP_NAME" \
--name "$CDN_PROFILE_NAME" \
--sku Standard_Microsoft

echo az cdn endpoint create --resource-group "$RESOURCE_GROUP_NAME" \
--name "$CDN_ENDPOINT_NAME" \
--profile-name "$CDN_PROFILE_NAME" \
--origin "$AZURE_BASE_HOSTNAME" \
--origin-host-header "$AZURE_BASE_HOSTNAME" \
--location global

az cdn endpoint create --resource-group "$RESOURCE_GROUP_NAME" \
--name "$CDN_ENDPOINT_NAME" \
--profile-name "$CDN_PROFILE_NAME" \
--origin "$AZURE_BASE_HOSTNAME" \
--origin-host-header "$AZURE_BASE_HOSTNAME" \
--location global


read -p "Press any key when the CNAME has been created on Cloudflare " -n 1 -r


az cdn custom-domain create --resource-group "$RESOURCE_GROUP_NAME" \
--endpoint-name "$CDN_ENDPOINT_NAME" \
--profile-name "$CDN_PROFILE_NAME" \
--name "$CDN_CUSTOM_DOMAIN_NAME" \
--hostname "$CUSTOM_DOMAIN"

az cdn custom-domain enable-https --resource-group "$RESOURCE_GROUP_NAME" \
--endpoint-name "$CDN_ENDPOINT_NAME" \
--profile-name "$CDN_PROFILE_NAME" \
--name "$CDN_CUSTOM_DOMAIN_NAME"

# Provision Postgres Server
echo az postgres flexible-server create -y -g "$RESOURCE_GROUP_NAME" \
-n "$POSTGRES_SERVER_NAME" --location "$LOCATION" \
@@ -219,12 +280,10 @@ echo "--------------------------------------------------"
echo "Configuration settings you will need:"

echo

echo "Base URL for Azure Storage Account: ${AZURE_BASE_URL}"
echo "(You may need to put this into the Cloudflare DNS setup if recreating dev/production)"

echo
echo

echo "--------------------------------------------------"
echo "Credentials to put into the Github repo's secrets:"
echo
@@ -273,6 +332,14 @@ echo "Log analytics workspace key: (Secret name: ${TARGET_ENVIRONMENT_UPPER}_LOG

echo "$LOG_ANALYTICS_WORKSPACE_KEY"

echo "--------------------------------------------------"
echo "Values to put into the Github repo's variables:"
echo

echo "Public-facing base URL: (Variable name: ${TARGET_ENVIRONMENT_UPPER}_WEB_BASE_URL)"

echo $WEB_BASE_URL

echo

echo "You also need to ensure the repository has the following secrets setup, which are not specific to the target environment:"
4 changes: 2 additions & 2 deletions azure-provision/default-github-config-template.env
@@ -1,5 +1,5 @@
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=\$web
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=\$web
_DATA_REGISTRATION=ckan-registry
_DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search
_FORCE_REDOWNLOAD_AFTER_HOURS=24
11 changes: 6 additions & 5 deletions src/bulk_data_service/dataset_indexing.py
@@ -23,11 +23,12 @@ def upload_index_json_to_azure(context: dict, index_name: str, index_json: str):

az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])

for container in [
context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"],
context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"],
]:

for container in set(
[
context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"],
context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"],
]
):
azure_upload_to_blob(az_blob_service, container, index_name, index_json, "application/json")

az_blob_service.close()
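Wrapping the two container names in `set()` matters now that both settings can point at the same `$web` container: deduplicating first means the index JSON is uploaded once rather than twice to the same blob. A minimal illustration (not the project's code):

```python
xml_container = "$web"
zip_container = "$web"

# Mirrors the change above: dedupe so the index is uploaded only once when
# both settings point at the same container.
for container in set([xml_container, zip_container]):
    print(f"uploading index to container: {container}")  # prints once
```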
35 changes: 20 additions & 15 deletions src/bulk_data_service/dataset_updater.py
@@ -1,5 +1,6 @@
import concurrent.futures
import uuid
import json
from datetime import datetime, timedelta
from itertools import batched
from random import random
@@ -111,8 +112,8 @@ context["logger"].info("dataset id: {} - Added/updated dataset".format(bds_dataset["id"]))
context["logger"].info("dataset id: {} - Added/updated dataset".format(bds_dataset["id"]))

except RuntimeError as e:
bds_dataset["download_error_message"] = "Download of IATI XML failed with non-200 HTTP status: {}".format(
e
bds_dataset["download_error_message"] = json.dumps(
{"bds_message": "Download of IATI XML failed with non-200 HTTP status"} | e.args[0]
)
context["logger"].warning(
"dataset id: {} - {}".format(registered_dataset_id, bds_dataset["download_error_message"])
@@ -122,8 +123,11 @@
insert_or_update_dataset(db_conn, bds_dataset)
except Exception as e:
bds_dataset["last_download_attempt"] = get_timestamp()
bds_dataset["download_error_message"] = (
"Download of IATI XML produced EXCEPTION with GET request: {}".format(e)
bds_dataset["download_error_message"] = json.dumps(
{
"bds_message": "Download of IATI XML produced EXCEPTION with GET request",
"message": "{}".format(e),
}
)
context["logger"].warning(
"dataset id: {} - {}".format(registered_dataset_id, bds_dataset["download_error_message"])
@@ -195,21 +199,22 @@ def check_dataset_etag_last_mod_header(
except RuntimeError as e:

if dataset_downloaded_within(bds_dataset, 6):
extra_err_message = (
"Dataset downloaded within the last 6 hours so not " "forcing full re-download attempt."
)
extra_err_message = "Dataset downloaded within the last 6 hours so not forcing full re-download attempt."
attempt_download = False
else:
extra_err_message = (
"Dataset not downloaded within the last 6 hours so " "forcing full re-download attempt."
)
extra_err_message = "Dataset not downloaded within the last 6 hours so forcing full re-download attempt."
attempt_download = True

bds_dataset["head_error_message"] = (
"Last successful download within {} hours, "
"but HEAD request to check ETag/Last-Modified "
"return non-200 status. {} "
"HEAD request exception details: {}".format(download_within_hours, extra_err_message, e)
bds_dataset["head_error_message"] = json.dumps(
{
"bds_message": (
"Last successful download within {} hours, "
"but HEAD request to check ETag/Last-Modified "
"return non-200 status. {} "
"HEAD request exception details: {}".format(download_within_hours, extra_err_message, e)
),
"message": "{}".format(e),
}
)

context["logger"].warning("dataset id: {} - {}".format(bds_dataset["id"], bds_dataset["head_error_message"]))
8 changes: 7 additions & 1 deletion src/bulk_data_service/zipper.py
@@ -5,6 +5,7 @@
import uuid

from azure.storage.blob import BlobServiceClient, ContentSettings
from azure.core.exceptions import ResourceNotFoundError

from bulk_data_service.dataset_indexing import get_index_name
from utilities.azure import azure_download_blob, get_azure_blob_name, get_azure_container_name
@@ -140,7 +141,12 @@ def download_new_or_updated_to_working_dir(context: dict, updated_datasets: dict

context["logger"].info("dataset id: {} - Downloading".format(dataset["id"]))

azure_download_blob(az_blob_service, xml_container_name, get_azure_blob_name(dataset, "xml"), filename)
try:
azure_download_blob(az_blob_service, xml_container_name, get_azure_blob_name(dataset, "xml"), filename)
except ResourceNotFoundError as e:
context["logger"].error(
"dataset id: {} - Failed to download from Azure: {}".format(dataset["id"], e).replace("\n", " ")
)

az_blob_service.close()
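With the `try`/`except` above, a dataset whose XML blob is missing from storage no longer aborts the whole zip run; the failure is logged and the loop moves on to the next dataset. A minimal illustration of the pattern (not the project's code; the helper and blob names are hypothetical):

```python
from azure.core.exceptions import ResourceNotFoundError

def download_blob(name: str) -> None:
    # Stand-in for azure_download_blob(); raises if the blob does not exist.
    raise ResourceNotFoundError(f"blob not found: {name}")

def download_all(blob_names: list[str]) -> None:
    for name in blob_names:
        try:
            download_blob(name)
        except ResourceNotFoundError as e:
            # Log and continue: one missing blob no longer stops the batch.
            print(f"failed to download {name}: {e}".replace("\n", " "))

download_all(["example-org-1.xml", "example-org-2.xml"])
```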

4 changes: 2 additions & 2 deletions src/config/config.py
@@ -3,7 +3,7 @@
_config_variables = [
"DATA_REGISTRATION",
"DATA_REGISTRY_BASE_URL",
"BLOB_STORAGE_BASE_PUBLIC_URL",
"WEB_BASE_URL",
"NUMBER_DOWNLOADER_THREADS",
"FORCE_REDOWNLOAD_AFTER_HOURS",
"REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS",
@@ -25,6 +25,6 @@
def get_config() -> dict[str, str]:
config = {env_var: os.getenv(env_var, "") for env_var in _config_variables}

config["BLOB_STORAGE_BASE_PUBLIC_URL"] = config["BLOB_STORAGE_BASE_PUBLIC_URL"].strip("/")
config["WEB_BASE_URL"] = config["WEB_BASE_URL"].strip("/")

return config
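The `strip("/")` means a trailing slash in the configured `WEB_BASE_URL` cannot produce double slashes when blob paths are appended later. A usage sketch (assumptions: `src/` is on the import path and the other config variables may be unset):

```python
import os

# A trailing slash in the environment value is normalised away by get_config().
os.environ["WEB_BASE_URL"] = "https://dev-bulk-data.iatistandard.org/"

from config.config import get_config  # assumes src/ is on PYTHONPATH

assert get_config()["WEB_BASE_URL"] == "https://dev-bulk-data.iatistandard.org"
```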
10 changes: 7 additions & 3 deletions src/utilities/azure.py
@@ -38,6 +38,7 @@ def create_azure_blob_containers(context: dict):
try:
if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"] not in container_names:
blob_service.create_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"])
container_names.append(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"])
if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"] not in container_names:
blob_service.create_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"])
except Exception as e:
@@ -93,8 +94,11 @@ def get_azure_blob_name(dataset: dict, iati_blob_type: str) -> str:


def get_azure_blob_public_url(context: dict, dataset: dict, iati_blob_type: str) -> str:
return "{}/{}/{}".format(
context["BLOB_STORAGE_BASE_PUBLIC_URL"],
context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_" + iati_blob_type.upper()],
blob_name = get_azure_container_name(context, iati_blob_type)
blob_name_for_url = "{}/".format(blob_name) if blob_name != "$web" else ""

return "{}/{}{}".format(
context["WEB_BASE_URL"],
blob_name_for_url,
get_azure_blob_name(dataset, iati_blob_type),
)
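The effect of the new URL construction is that blobs in the static-website container `$web` are addressed directly under `WEB_BASE_URL`, while any other container still contributes a path segment. A sketch with illustrative values (the base URL and blob names shown are hypothetical):

```python
# Sketch only: how the public URL changes depending on the container name.
WEB_BASE_URL = "https://dev-bulk-data.iatistandard.org"

def public_url(container_name: str, blob_name: str) -> str:
    # "$web" is the root of the static website, so it is dropped from the path.
    prefix = f"{container_name}/" if container_name != "$web" else ""
    return f"{WEB_BASE_URL}/{prefix}{blob_name}"

print(public_url("$web", "iati-xml/example-org.xml"))
# -> https://dev-bulk-data.iatistandard.org/iati-xml/example-org.xml
print(public_url("iati-zip", "example-org.zip"))
# -> https://dev-bulk-data.iatistandard.org/iati-zip/example-org.zip
```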