
Commit

Merge pull request #25 from IATI/fix_handling_null_values_
fix: unify handling of empty values in index
simon-20 authored Oct 29, 2024
2 parents 9b53334 + 02d649c commit 4168380
Showing 7 changed files with 106 additions and 14 deletions.
.env-example (2 additions, 0 deletions)
@@ -11,6 +11,8 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24

REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72

+ZIP_WORKING_DIR=/tmp/bulk-data-service-zip
+
# Sample local setup - values read by docker compose (for simple Postgres DB
# creation), and used by the app
DB_NAME=bulk_data_service_db
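
This diff doesn't show how the app picks up the new variable; below is a minimal sketch, assuming `ZIP_WORKING_DIR` is read from the environment like the other settings in this file (the fallback is just the example value above, not a real default):

```
import os

# Assumption: the zipper reads ZIP_WORKING_DIR from the environment; the
# fallback is the example value from .env-example, not the app's real default.
zip_working_dir = os.environ.get("ZIP_WORKING_DIR", "/tmp/bulk-data-service-zip")

# Make sure the scratch directory exists before assembling ZIP files in it.
os.makedirs(zip_working_dir, exist_ok=True)
print(f"ZIP files will be assembled under {zip_working_dir}")
```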
README.md (10 additions, 1 deletion)
@@ -59,12 +59,21 @@ docker compose up

The example `.env` file (`.env-example`) is configured to use the above docker compose setup. If you don't use the docker compose setup, then you will need to change the values in the `.env` file accordingly.

-Once the docker compose setup is running, start the bulk download app with:
+Once the docker compose setup is running, you can run the dataset updater part of the app with (this will download the datasets and upload them to Azurite):

```
dotenv run python src/iati_bulk_data_service.py -- --operation checker --single-run --run-for-n-datasets=50
```

+You can run the zipper operation with:
+
+```
+dotenv run python src/iati_bulk_data_service.py -- --operation zipper --single-run
+```
+
+It will store the ZIP files in the directory defined in the `ZIP_WORKING_DIR` environment variable.


*Note:* not all versions of `dotenv` require a `run` subcommand.

## Development on the app
pyproject.toml (1 addition, 1 deletion)
@@ -1,6 +1,6 @@
[project]
name = "bulk-data-service"
version = "0.0.1"
version = "0.1.6"
requires-python = ">= 3.12"
readme = "README.md"
dependencies = [
src/bulk_data_service/dataset_indexing.py (4 additions, 4 deletions)
@@ -58,8 +58,8 @@ def get_index_entry(context: dict, dataset: dict, index_type: str) -> dict[str,
else:
dataset_index_entry = get_full_index_entry_from_dataset(context, dataset)

dataset_index_entry["url_xml"] = ""
dataset_index_entry["url_zip"] = ""
dataset_index_entry["url_xml"] = None
dataset_index_entry["url_zip"] = None

if dataset_index_entry["last_successful_download"] is not None:
dataset_index_entry["url_xml"] = get_azure_blob_public_url(context, dataset, "xml")
Expand All @@ -80,7 +80,7 @@ def get_minimal_index_entry_from_dataset(context: dict, dataset: dict) -> dict:


def get_full_index_entry_from_dataset(context: dict, dataset: dict) -> dict:
-full_index_entry = {k: v for k, v in dataset.items() if k in get_full_index_fields(context)}
+full_index_entry = {k: v for k, v in dataset.items() if k in get_full_index_source_fields(context)}

field_from_json_str_to_object(full_index_entry, "download_error_message", "download_error_details")

@@ -96,7 +96,7 @@ def field_from_json_str_to_object(entry: dict, source_field: str, dest_field: st
del entry[source_field]


-def get_full_index_fields(context: dict) -> list[str]:
+def get_full_index_source_fields(context: dict) -> list[str]:
return [
"id",
"name",
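
The `url_xml`/`url_zip` change above means index entries for datasets with no successful download are now initialised to `None` rather than `""`, so both index types serialize "no file available" as JSON `null`. A minimal sketch of what a consumer sees (the entry dicts are illustrative, not the app's real index structure):

```
import json

# Illustrative entries for a dataset that has never been downloaded.
before = {"name": "example-org-dataset", "url_xml": "", "url_zip": ""}
after = {"name": "example-org-dataset", "url_xml": None, "url_zip": None}

print(json.dumps(before))  # ... "url_xml": "", "url_zip": ""
print(json.dumps(after))   # ... "url_xml": null, "url_zip": null

# Consumers now have one unambiguous "no file available" check:
if after["url_xml"] is None:
    print("no XML available for this dataset")
```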
src/bulk_data_service/dataset_updater.py (2 additions, 1 deletion)
@@ -214,7 +214,8 @@ def check_dataset_etag_last_mod_header(
"return non-200 status. {} "
"HEAD request exception details: {}".format(download_within_hours, extra_err_message, e)
)
-} | e.args[0]
+}
+| e.args[0]
)

context["logger"].warning("dataset id: {} - {}".format(bds_dataset["id"], bds_dataset["head_error_message"]))
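
For context on the reformatted expression: it uses Python's dict union operator (`|`, available since Python 3.9; `pyproject.toml` above requires >= 3.12) to fold the detail dict carried in the exception's first argument into the error record. A small sketch with hypothetical keys:

```
# Hypothetical error record, merged with detail carried on the exception.
base_error = {"head_error_message": "HEAD request to dataset URL failed"}

try:
    raise RuntimeError({"status_code": 503, "retries": 3})
except RuntimeError as e:
    # `|` returns a new dict; on duplicate keys the right operand wins.
    merged = base_error | e.args[0]

print(merged)
# {'head_error_message': 'HEAD request to dataset URL failed', 'status_code': 503, 'retries': 3}
```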
src/config/config.py (1 addition, 1 deletion)
@@ -20,7 +20,7 @@
"AZURE_STORAGE_CONNECTION_STRING",
"AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML",
"AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP",
"CHECKER_LOOP_WAIT_MINS"
"CHECKER_LOOP_WAIT_MINS",
]


tests/integration/test_dataset_indexing.py (86 additions, 6 deletions)
@@ -4,12 +4,12 @@
from azure.storage.blob import BlobServiceClient

from bulk_data_service.checker import checker_run
-from bulk_data_service.dataset_indexing import get_full_index_fields, get_index_name, get_minimal_index_fields
+from bulk_data_service.dataset_indexing import get_full_index_source_fields, get_index_name, get_minimal_index_fields
from helpers.helpers import get_and_clear_up_context # noqa: F401
from utilities.azure import get_azure_container_name


-def test_index_creation(get_and_clear_up_context):  # noqa: F811
+def test_index_uploaded_to_blob_storage(get_and_clear_up_context):  # noqa: F811

    context = get_and_clear_up_context

@@ -32,7 +32,7 @@ def test_index_creation(get_and_clear_up_context): # noqa: F811
    blob_service_client.close()


-def test_minimal_index_creation(get_and_clear_up_context):  # noqa: F811
+def test_minimal_index_creation_for_download_success(get_and_clear_up_context):  # noqa: F811
    context = get_and_clear_up_context

    context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01"
@@ -64,7 +64,45 @@ def test_minimal_index_creation(get_and_clear_up_context): # noqa: F811
    blob_service_client.close()


-def test_full_index_creation(get_and_clear_up_context):  # noqa: F811
+def test_minimal_index_creation_for_download_failure(get_and_clear_up_context):  # noqa: F811
+    context = get_and_clear_up_context
+
+    context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03"
+    datasets_in_bds = {}
+    checker_run(context, datasets_in_bds)
+
+    blob_service_client = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+    zip_container_name = get_azure_container_name(context, "zip")
+
+    minimal_index_name = get_index_name(context, "minimal")
+
+    minimal_index_blob = blob_service_client.get_blob_client(zip_container_name, minimal_index_name)
+
+    minimal_index = json.loads(minimal_index_blob.download_blob().readall())
+
+    dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")]
+
+    assert dataset["name"] in minimal_index["datasets"]
+
+    index_item = minimal_index["datasets"][dataset["name"]]
+
+    for field in get_minimal_index_fields(context):
+        if isinstance(dataset[field], uuid.UUID):
+            assert uuid.UUID(index_item[field]) == dataset[field]
+        elif dataset[field] is None or isinstance(dataset[field], str):
+            assert index_item[field] == dataset[field]
+
+    assert index_item["url_xml"] is None
+    assert index_item["url_zip"] is None
+
+    # -2 because of the two autogenerated fields
+    assert len(index_item.keys()) - 2 == len(get_minimal_index_fields(context))
+
+    blob_service_client.close()
+
+
+def test_full_index_creation_for_download_success(get_and_clear_up_context):  # noqa: F811
    context = get_and_clear_up_context

    context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01"
@@ -87,13 +125,55 @@ def test_full_index_creation(get_and_clear_up_context): # noqa: F811

    full_index_item = full_index["datasets"][dataset["name"]]

-    for field in get_full_index_fields(context):
+    for field in get_full_index_source_fields(context):
        if field == "download_error_message" or field == "head_error_message":
            continue
        if isinstance(dataset[field], uuid.UUID):
            assert uuid.UUID(full_index_item[field]) == dataset[field]
        elif isinstance(dataset[field], str):
            assert full_index_item[field] == dataset[field]

    # -2 because of the two autogenerated fields
-    assert len(full_index_item.keys()) - 2 == len(get_full_index_fields(context))
+    assert len(full_index_item.keys()) - 2 == len(get_full_index_source_fields(context))

    blob_service_client.close()


+def test_full_index_creation_for_download_failure(get_and_clear_up_context):  # noqa: F811
+    context = get_and_clear_up_context
+
+    context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03"
+    datasets_in_bds = {}
+    checker_run(context, datasets_in_bds)
+
+    blob_service_client = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+    zip_container_name = get_azure_container_name(context, "zip")
+
+    full_index_name = get_index_name(context, "full")
+
+    full_index_blob = blob_service_client.get_blob_client(zip_container_name, full_index_name)
+
+    full_index = json.loads(full_index_blob.download_blob().readall())
+
+    dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")]
+
+    assert dataset["name"] in full_index["datasets"]
+
+    full_index_item = full_index["datasets"][dataset["name"]]
+
+    for field in get_full_index_source_fields(context):
+        if field == "download_error_message" or field == "head_error_message":
+            continue
+        if isinstance(dataset[field], uuid.UUID):
+            assert uuid.UUID(full_index_item[field]) == dataset[field]
+        elif dataset[field] is None or isinstance(dataset[field], str):
+            assert full_index_item[field] == dataset[field]
+
+    assert full_index_item["url_xml"] is None
+    assert full_index_item["url_zip"] is None
+
+    # -2 because of the two autogenerated fields
+    assert len(full_index_item.keys()) - 2 == len(get_full_index_source_fields(context))
+
+    blob_service_client.close()
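
On the `- 2` in the length assertions: from the `get_index_entry` change earlier in this diff, the two autogenerated fields are presumably `url_xml` and `url_zip`, which are set on every entry on top of the source fields. In sketch form (the field list is abridged and hypothetical):

```
# Sketch of the arithmetic behind `len(entry.keys()) - 2 == len(source_fields)`.
source_fields = ["id", "name", "last_successful_download"]  # abridged
entry = {field: None for field in source_fields}
entry["url_xml"] = None  # autogenerated
entry["url_zip"] = None  # autogenerated
assert len(entry.keys()) - 2 == len(source_fields)
```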

