Merge pull request #7 from IATI/ssl__move_to_static_website
feat: serve all from $web, and using Azure CDN
simon-20 authored Sep 4, 2024
2 parents 125bba3 + 99fa1bc commit 4210d15
Showing 20 changed files with 243 additions and 74 deletions.
2 changes: 1 addition & 1 deletion .env-example
@@ -1,7 +1,7 @@
DATA_REGISTRATION=ckan-registry
DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search

BLOB_STORAGE_BASE_PUBLIC_URL=http://127.0.0.1:10000/devstoreaccount1
WEB_BASE_URL=http://127.0.0.1:10000/devstoreaccount1

NUMBER_DOWNLOADER_THREADS=1 # makes for easier testing locally

2 changes: 2 additions & 0 deletions README.md
@@ -181,6 +181,8 @@ This automated test environment is configured via the following files:

You can use the Mockoon GUI application to edit the mockoon server configuration file (`mockoon-registration-and-data-server-config.json`).

The automated tests are safe to run alongside the `docker compose` setup for development.

## Provisioning and Deployment

### Initial Provisioning
@@ -32,8 +32,8 @@ properties: # Properties of container group
value: "#DATA_REGISTRATION#"
- name: DATA_REGISTRY_BASE_URL
value: "#DATA_REGISTRY_BASE_URL#"
- name: BLOB_STORAGE_BASE_PUBLIC_URL
value: "https://sabulkdataservice#TARGET_ENVIRONMENT#.blob.core.windows.net"
- name: WEB_BASE_URL
value: "#WEB_BASE_URL#"
- name: NUMBER_DOWNLOADER_THREADS
value: "#NUMBER_DOWNLOADER_THREADS#"
- name: FORCE_REDOWNLOAD_AFTER_HOURS
1 change: 1 addition & 0 deletions azure-deployment/generate-manifest-from-template.sh
@@ -49,3 +49,4 @@ sed -i ''s^#REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS#^$REMOVE_LAST_GOOD_DOW
sed -i ''s^#ZIP_WORKING_DIR#^$ZIP_WORKING_DIR^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML#^$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP#^$AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
sed -i ''s^#WEB_BASE_URL#^$WEB_BASE_URL^g'' ./azure-deployment/azure-resource-manager-deployment-manifest.yml
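For orientation, each `#PLACEHOLDER#` token in the deployment manifest is replaced with the value of the matching environment variable; the added line does this for `WEB_BASE_URL`. A rough Python rendering of what these sed calls do (the file path and variable names are taken from the script; the helper itself is only illustrative):

```python
# Illustrative sketch of the placeholder substitution performed by the sed commands.
import os
from pathlib import Path

MANIFEST = Path("azure-deployment/azure-resource-manager-deployment-manifest.yml")

def fill_placeholders(text: str, names: list[str]) -> str:
    # Replace each "#NAME#" token with the value of the NAME environment variable.
    for name in names:
        text = text.replace(f"#{name}#", os.environ.get(name, ""))
    return text

manifest_vars = ["WEB_BASE_URL", "NUMBER_DOWNLOADER_THREADS", "DATA_REGISTRATION"]
MANIFEST.write_text(fill_placeholders(MANIFEST.read_text(), manifest_vars))
```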
13 changes: 13 additions & 0 deletions azure-deployment/manual-azure-deploy-from-local.sh
@@ -13,6 +13,19 @@ if [ ! -d ".git" ]; then
exit 1
fi

if [ ! -f "./azure-deployment/manual-azure-deploy-secrets.env" ]; then
echo "$0: there must be a file 'manual-azure-deploy-secrets.env' in"
echo "'azure-deployment' containing the secrets. See the example in 'manual-azure-deploy-secrets-example.env'"
exit 1
fi

if [ ! -f "./azure-deployment/manual-azure-deploy-variables.env" ]; then
echo "$0: there must be a file 'manual-azure-deploy-variables.env' in"
echo "'azure-deployment' containing the config variables. See the example in 'manual-azure-deploy-variables-example.env'"
exit 1
fi


(git remote -v 2> /dev/null | grep "IATI/bulk-data-service.git" > /dev/null) || (echo "$0: script must be run from the root of the bulk-data-service repository"; exit 1)

. ./azure-deployment/manual-azure-deploy-secrets.env
3 changes: 1 addition & 2 deletions azure-deployment/manual-azure-deploy-variables-example.env
@@ -4,8 +4,7 @@
AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml
AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip

# Value of BLOB_STORAGE_BASE_PUBLIC_URL generated automatically by deploy scripts
# BLOB_STORAGE_BASE_PUBLIC_URL=
WEB_BASE_URL=

DATA_REGISTRATION=ckan-registry
DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search
107 changes: 87 additions & 20 deletions azure-provision/azure-create-resources.sh
@@ -56,11 +56,23 @@ POSTGRES_SERVER_NAME="${APP_NAME}-db-$TARGET_ENVIRONMENT"

SERVICE_PRINCIPAL_NAME="sp-${APP_NAME}-$TARGET_ENVIRONMENT"

CDN_PROFILE_NAME="cdn-profile-${APP_NAME}-$TARGET_ENVIRONMENT"

CDN_ENDPOINT_NAME="cdn-endpoint-${APP_NAME}-$TARGET_ENVIRONMENT"

CDN_CUSTOM_DOMAIN_NAME="cdn-custom-domain-${APP_NAME}-$TARGET_ENVIRONMENT"

HOST_FOR_CLOUDFLARE="$CDN_ENDPOINT_NAME.azureedge.net"

LOCATION="uksouth"

WEB_BASE_URL_PREFIX=$([[ "$TARGET_ENVIRONMENT" == "prod" ]] && echo "" || echo "${TARGET_ENVIRONMENT}-")

WEB_BASE_URL="https://${WEB_BASE_URL_PREFIX}bulk-data.iatistandard.org"
SUBDOMAIN="${WEB_BASE_URL_PREFIX}bulk-data"

CUSTOM_DOMAIN="${SUBDOMAIN}.iatistandard.org"

WEB_BASE_URL="https://${CUSTOM_DOMAIN}"

echo
echo "Proceeding will create Azure services with the following names:"
@@ -71,12 +83,20 @@ echo "Log analytics workspace name : $LOG_ANALYTICS_NAME"
echo "Storage account name : $STORAGE_ACCOUNT_NAME"
echo "Postgres server name : $POSTGRES_SERVER_NAME"
echo "Service principal name : $SERVICE_PRINCIPAL_NAME"
echo "CDN profile name : $CDN_PROFILE_NAME"
echo "CDN endpoint name : $CDN_ENDPOINT_NAME"
echo "CDN custom domain id/name : $CDN_CUSTOM_DOMAIN_NAME"
echo "Custom domain : $CUSTOM_DOMAIN"
echo "Public-facing access URL : $WEB_BASE_URL"
echo
echo
echo "(Using subscription: $SUBSCRIPTION_ID)"
echo
echo
echo "**NOTE:** Before continuing you should ensure that there is a CNAME record created in Cloudflare"
echo " for subdomain $SUBDOMAIN on iatistandard.org pointing to "
echo " $HOST_FOR_CLOUDFLARE"
echo
echo

read -p "Do you want to continue? ([y]es or [n]o) " -n 1 -r
echo ""
@@ -118,13 +138,13 @@ echo az storage account create --resource-group "$RESOURCE_GROUP_NAME" \
--name $STORAGE_ACCOUNT_NAME \
--location $LOCATION \
--sku Standard_LRS \
--enable-hierarchical-namespace true \
--enable-hierarchical-namespace false \
--kind StorageV2
az storage account create --resource-group "$RESOURCE_GROUP_NAME" \
--name $STORAGE_ACCOUNT_NAME \
--location $LOCATION \
--sku Standard_LRS \
--enable-hierarchical-namespace true \
--enable-hierarchical-namespace false \
--kind StorageV2
echo

@@ -133,33 +153,74 @@ STORAGE_ACCOUNT_ID=$(az storage account list | jq -r ".[] | select(.name==\"$STO
echo az resource update --ids="$STORAGE_ACCOUNT_ID" --set properties.allowBlobPublicAccess=true
az resource update --ids="$STORAGE_ACCOUNT_ID" --set properties.allowBlobPublicAccess=true

echo "Waiting for 30 seconds before creating containers on the new storage account"
sleep 30

echo az storage container create --name iati-xml --account-name $STORAGE_ACCOUNT_NAME --public-access container
az storage container create --name iati-xml --account-name $STORAGE_ACCOUNT_NAME --public-access container | jq
echo "Waiting for 10 seconds before updating properties on the new storage account"
sleep 10

echo az storage container create --name iati-zip --account-name $STORAGE_ACCOUNT_NAME --public-access container
az storage container create --name iati-zip --account-name $STORAGE_ACCOUNT_NAME --public-access container | jq
echo "Creating a static website on the storage account..."
echo az storage blob service-properties update --account-name "$STORAGE_ACCOUNT_NAME" \
--static-website --404-document 404.html \
--index-document index.html

az storage blob service-properties update --account-name $STORAGE_ACCOUNT_NAME \
az storage blob service-properties update --account-name "$STORAGE_ACCOUNT_NAME" \
--static-website --404-document 404.html \
--index-document index.html

echo az storage account show-connection-string --name $STORAGE_ACCOUNT_NAME \
--resource-group "$RESOURCE_GROUP_NAME" \
\| jq -r '.connectionString'
echo "Fetching connection string for storage account..."

STORAGE_ACCOUNT_CONNECTION_STRING=$(az storage account show-connection-string --name $STORAGE_ACCOUNT_NAME --resource-group "$RESOURCE_GROUP_NAME" | jq -r '.connectionString')
STORAGE_ACCOUNT_CONNECTION_STRING=$(az storage account show-connection-string --name "$STORAGE_ACCOUNT_NAME" --resource-group "$RESOURCE_GROUP_NAME" | jq -r '.connectionString')

# Shown to user, as may be needed for Cloudflare setup on very first run
AZURE_BASE_URL=$(az storage account show -n "$STORAGE_ACCOUNT_NAME" -g "$RESOURCE_GROUP_NAME" --query "primaryEndpoints.web" --output tsv)

# Calculated above from TARGET_ENVIRONMENT, bearing in mind 'prod' doesn't have a prefix
AZURE_BASE_HOSTNAME="$(sed "s#https://\(.*\)/#\1#" <<< $AZURE_BASE_URL)"

# WEB_BASE_URL is calculated above from TARGET_ENVIRONMENT, bearing in mind 'prod' doesn't have a prefix
sed -e "s#{{WEB_BASE_URL}}#$WEB_BASE_URL#" web/index-template.html > web/index.html

echo "Uploading index and 404 pages to storage account..."
az storage blob upload-batch -s web -d '$web' --account-name $STORAGE_ACCOUNT_NAME --overwrite


# Create a CDN profile and endpoint, so we can use SSL with a custom domain
echo "Creating CDN profile and endpoint for https with a custom domain..."

echo az cdn profile create --resource-group "$RESOURCE_GROUP_NAME" \
--name "$CDN_PROFILE_NAME" \
--sku Standard_Microsoft

az cdn profile create --resource-group "$RESOURCE_GROUP_NAME" \
--name "$CDN_PROFILE_NAME" \
--sku Standard_Microsoft

echo az cdn endpoint create --resource-group "$RESOURCE_GROUP_NAME" \
--name "$CDN_ENDPOINT_NAME" \
--profile-name "$CDN_PROFILE_NAME" \
--origin "$AZURE_BASE_HOSTNAME" \
--origin-host-header "$AZURE_BASE_HOSTNAME" \
--location global

az cdn endpoint create --resource-group "$RESOURCE_GROUP_NAME" \
--name "$CDN_ENDPOINT_NAME" \
--profile-name "$CDN_PROFILE_NAME" \
--origin "$AZURE_BASE_HOSTNAME" \
--origin-host-header "$AZURE_BASE_HOSTNAME" \
--location global


read -p "Press any key when the CNAME has been created on Cloudflare " -n 1 -r


az cdn custom-domain create --resource-group "$RESOURCE_GROUP_NAME" \
--endpoint-name "$CDN_ENDPOINT_NAME" \
--profile-name "$CDN_PROFILE_NAME" \
--name "$CDN_CUSTOM_DOMAIN_NAME" \
--hostname "$CUSTOM_DOMAIN"

az cdn custom-domain enable-https --resource-group "$RESOURCE_GROUP_NAME" \
--endpoint-name "$CDN_ENDPOINT_NAME" \
--profile-name "$CDN_PROFILE_NAME" \
--name "$CDN_CUSTOM_DOMAIN_NAME"

# Provision Postgres Server
echo az postgres flexible-server create -y -g "$RESOURCE_GROUP_NAME" \
-n "$POSTGRES_SERVER_NAME" --location "$LOCATION" \
@@ -219,12 +280,10 @@ echo "--------------------------------------------------"
echo "Configuration settings you will need:"

echo

echo "Base URL for Azure Storage Account: ${AZURE_BASE_URL}"
echo "(You may need to put this into the Cloudflare DNS setup if recreating dev/production)"

echo
echo

echo "--------------------------------------------------"
echo "Credentials to put into the Github repo's secrets:"
echo
@@ -273,6 +332,14 @@ echo "Log analytics workspace key: (Secret name: ${TARGET_ENVIRONMENT_UPPER}_LOG

echo "$LOG_ANALYTICS_WORKSPACE_KEY"

echo "--------------------------------------------------"
echo "Values to put into the Github repo's variables:"
echo

echo "Public-facing base URL: (Variable name: ${TARGET_ENVIRONMENT_UPPER}_WEB_BASE_URL)"

echo $WEB_BASE_URL

echo

echo "You also need to ensure the repository has the following secrets setup, which are not specific to the target environment:"
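As a concrete example of the new repository variable this script prints at the end: assuming `TARGET_ENVIRONMENT=dev`, the variable to create would be `DEV_WEB_BASE_URL` with the value `https://dev-bulk-data.iatistandard.org` (it is assumed the deploy workflow passes this through as `WEB_BASE_URL`):

```python
# Illustrative only: the variable name/value pair to add to the GitHub repo,
# assuming TARGET_ENVIRONMENT=dev.
target_environment = "dev"
variable_name = f"{target_environment.upper()}_WEB_BASE_URL"
variable_value = f"https://{target_environment}-bulk-data.iatistandard.org"
assert (variable_name, variable_value) == (
    "DEV_WEB_BASE_URL",
    "https://dev-bulk-data.iatistandard.org",
)
```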
4 changes: 2 additions & 2 deletions azure-provision/default-github-config-template.env
@@ -1,5 +1,5 @@
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=iati-xml
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=iati-zip
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML=\$web
_AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP=\$web
_DATA_REGISTRATION=ckan-registry
_DATA_REGISTRY_BASE_URL=https://iatiregistry.org/api/3/action/package_search
_FORCE_REDOWNLOAD_AFTER_HOURS=24
11 changes: 6 additions & 5 deletions src/bulk_data_service/dataset_indexing.py
@@ -23,11 +23,12 @@ def upload_index_json_to_azure(context: dict, index_name: str, index_json: str):

az_blob_service = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])

for container in [
context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"],
context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"],
]:

for container in set(
[
context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"],
context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"],
]
):
azure_upload_to_blob(az_blob_service, container, index_name, index_json, "application/json")

az_blob_service.close()
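With the move to the static website, both container settings can point at the same `$web` container (see `default-github-config-template.env` earlier in this diff), so the `set()` wrapper prevents the index from being uploaded to the same container twice; distinct container names behave as before. Roughly:

```python
# With both settings pointing at "$web", set() collapses them to one upload target;
# with distinct containers the behaviour is unchanged.
assert set(["$web", "$web"]) == {"$web"}
assert set(["iati-xml", "iati-zip"]) == {"iati-xml", "iati-zip"}
```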
35 changes: 20 additions & 15 deletions src/bulk_data_service/dataset_updater.py
@@ -1,5 +1,6 @@
import concurrent.futures
import uuid
import json
from datetime import datetime, timedelta
from itertools import batched
from random import random
@@ -111,8 +112,8 @@ def add_or_update_registered_dataset(
context["logger"].info("dataset id: {} - Added/updated dataset".format(bds_dataset["id"]))

except RuntimeError as e:
bds_dataset["download_error_message"] = "Download of IATI XML failed with non-200 HTTP status: {}".format(
e
bds_dataset["download_error_message"] = json.dumps(
{"bds_message": "Download of IATI XML failed with non-200 HTTP status"} | e.args[0]
)
context["logger"].warning(
"dataset id: {} - {}".format(registered_dataset_id, bds_dataset["download_error_message"])
@@ -122,8 +123,11 @@
insert_or_update_dataset(db_conn, bds_dataset)
except Exception as e:
bds_dataset["last_download_attempt"] = get_timestamp()
bds_dataset["download_error_message"] = (
"Download of IATI XML produced EXCEPTION with GET request: {}".format(e)
bds_dataset["download_error_message"] = json.dumps(
{
"bds_message": "Download of IATI XML produced EXCEPTION with GET request",
"message": "{}".format(e),
}
)
context["logger"].warning(
"dataset id: {} - {}".format(registered_dataset_id, bds_dataset["download_error_message"])
@@ -195,21 +199,22 @@ def check_dataset_etag_last_mod_header(
except RuntimeError as e:

if dataset_downloaded_within(bds_dataset, 6):
extra_err_message = (
"Dataset downloaded within the last 6 hours so not " "forcing full re-download attempt."
)
extra_err_message = "Dataset downloaded within the last 6 hours so not forcing full re-download attempt."
attempt_download = False
else:
extra_err_message = (
"Dataset not downloaded within the last 6 hours so " "forcing full re-download attempt."
)
extra_err_message = "Dataset not downloaded within the last 6 hours so forcing full re-download attempt."
attempt_download = True

bds_dataset["head_error_message"] = (
"Last successful download within {} hours, "
"but HEAD request to check ETag/Last-Modified "
"return non-200 status. {} "
"HEAD request exception details: {}".format(download_within_hours, extra_err_message, e)
bds_dataset["head_error_message"] = json.dumps(
{
"bds_message": (
"Last successful download within {} hours, "
"but HEAD request to check ETag/Last-Modified "
"return non-200 status. {} "
"HEAD request exception details: {}".format(download_within_hours, extra_err_message, e)
),
"message": "{}".format(e),
}
)

context["logger"].warning("dataset id: {} - {}".format(bds_dataset["id"], bds_dataset["head_error_message"]))
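Both error fields are now stored as JSON rather than plain strings, so downstream consumers can distinguish the Bulk Data Service's own summary (`bds_message`) from the underlying HTTP or exception detail (`message`). A hypothetical example of what ends up in `download_error_message` (the field names come from the diff; the values are invented for illustration):

```python
import json

# Hypothetical record stored in download_error_message.
download_error_message = json.dumps(
    {
        "bds_message": "Download of IATI XML produced EXCEPTION with GET request",
        "message": "Connection timed out",
    }
)

parsed = json.loads(download_error_message)
assert parsed["bds_message"].startswith("Download of IATI XML")
```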
8 changes: 7 additions & 1 deletion src/bulk_data_service/zipper.py
@@ -5,6 +5,7 @@
import uuid

from azure.storage.blob import BlobServiceClient, ContentSettings
from azure.core.exceptions import ResourceNotFoundError

from bulk_data_service.dataset_indexing import get_index_name
from utilities.azure import azure_download_blob, get_azure_blob_name, get_azure_container_name
@@ -140,7 +141,12 @@ def download_new_or_updated_to_working_dir(context: dict, updated_datasets: dict

context["logger"].info("dataset id: {} - Downloading".format(dataset["id"]))

azure_download_blob(az_blob_service, xml_container_name, get_azure_blob_name(dataset, "xml"), filename)
try:
azure_download_blob(az_blob_service, xml_container_name, get_azure_blob_name(dataset, "xml"), filename)
except ResourceNotFoundError as e:
context["logger"].error(
"dataset id: {} - Failed to download from Azure: {}".format(dataset["id"], e).replace("\n", " ")
)

az_blob_service.close()

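A minimal sketch of the pattern introduced above: a missing XML blob is now logged and skipped rather than aborting the whole zip run. The logger name and helper function here are illustrative stand-ins, not part of the codebase:

```python
import logging

from azure.core.exceptions import ResourceNotFoundError

logger = logging.getLogger("bulk-data-service")

def try_download(download_blob, dataset_id: str) -> bool:
    # download_blob is any zero-argument callable that fetches the blob.
    try:
        download_blob()
        return True
    except ResourceNotFoundError as e:
        logger.error(
            "dataset id: {} - Failed to download from Azure: {}".format(dataset_id, e).replace("\n", " ")
        )
        return False
```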
4 changes: 2 additions & 2 deletions src/config/config.py
@@ -3,7 +3,7 @@
_config_variables = [
"DATA_REGISTRATION",
"DATA_REGISTRY_BASE_URL",
"BLOB_STORAGE_BASE_PUBLIC_URL",
"WEB_BASE_URL",
"NUMBER_DOWNLOADER_THREADS",
"FORCE_REDOWNLOAD_AFTER_HOURS",
"REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS",
@@ -25,6 +25,6 @@
def get_config() -> dict[str, str]:
config = {env_var: os.getenv(env_var, "") for env_var in _config_variables}

config["BLOB_STORAGE_BASE_PUBLIC_URL"] = config["BLOB_STORAGE_BASE_PUBLIC_URL"].strip("/")
config["WEB_BASE_URL"] = config["WEB_BASE_URL"].strip("/")

return config
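Because `strip("/")` only trims the ends of the string, a configured `WEB_BASE_URL` behaves the same with or without a trailing slash, while the `https://` scheme is left intact. For example:

```python
# Trailing slash removed; scheme untouched.
assert "https://bulk-data.iatistandard.org/".strip("/") == "https://bulk-data.iatistandard.org"
assert "https://bulk-data.iatistandard.org".strip("/") == "https://bulk-data.iatistandard.org"
```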
10 changes: 7 additions & 3 deletions src/utilities/azure.py
@@ -38,6 +38,7 @@ def create_azure_blob_containers(context: dict):
try:
if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"] not in container_names:
blob_service.create_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"])
container_names.append(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML"])
if context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"] not in container_names:
blob_service.create_container(context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP"])
except Exception as e:
@@ -93,8 +94,11 @@ def get_azure_blob_name(dataset: dict, iati_blob_type: str) -> str:


def get_azure_blob_public_url(context: dict, dataset: dict, iati_blob_type: str) -> str:
return "{}/{}/{}".format(
context["BLOB_STORAGE_BASE_PUBLIC_URL"],
context["AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_" + iati_blob_type.upper()],
blob_name = get_azure_container_name(context, iati_blob_type)
blob_name_for_url = "{}/".format(blob_name) if blob_name != "$web" else ""

return "{}/{}{}".format(
context["WEB_BASE_URL"],
blob_name_for_url,
get_azure_blob_name(dataset, iati_blob_type),
)
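The net effect is that blobs served from the `$web` static-website container get no container segment in their public URL, while named containers (for example when running locally against Azurite) keep it. A sketch of the resulting URL shapes, mirroring the logic above (the blob name is illustrative):

```python
def public_url(web_base_url: str, container_name: str, blob_name: str) -> str:
    # Mirrors get_azure_blob_public_url: the "$web" container is dropped from the path.
    container_part = f"{container_name}/" if container_name != "$web" else ""
    return f"{web_base_url}/{container_part}{blob_name}"

assert (
    public_url("https://bulk-data.iatistandard.org", "$web", "example-org-dataset.xml")
    == "https://bulk-data.iatistandard.org/example-org-dataset.xml"
)
assert (
    public_url("http://127.0.0.1:10000/devstoreaccount1", "iati-xml", "example-org-dataset.xml")
    == "http://127.0.0.1:10000/devstoreaccount1/iati-xml/example-org-dataset.xml"
)
```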