
Commit

Merge pull request #25 from IATI/fix_handling_null_values_
fix: unify handling of empty values in index
simon-20 authored Oct 29, 2024
2 parents 9b53334 + 02d649c commit 4168380
Showing 7 changed files with 106 additions and 14 deletions.
.env-example (2 additions, 0 deletions)
@@ -11,6 +11,8 @@ FORCE_REDOWNLOAD_AFTER_HOURS=24

REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS=72

+ZIP_WORKING_DIR=/tmp/bulk-data-service-zip
+
# Sample local setup - values read by docker compose (for simple Postgres DB
# creation), and used by the app
DB_NAME=bulk_data_service_db
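
This diff doesn't show how the app picks up the new variable; below is a minimal sketch, assuming `ZIP_WORKING_DIR` is read from the environment like the other settings in this file (the fallback is just the example value above, not a real default):

```
import os

# Assumption: the zipper reads ZIP_WORKING_DIR from the environment; the
# fallback is the example value from .env-example, not the app's real default.
zip_working_dir = os.environ.get("ZIP_WORKING_DIR", "/tmp/bulk-data-service-zip")

# Make sure the scratch directory exists before assembling ZIP files in it.
os.makedirs(zip_working_dir, exist_ok=True)
print(f"ZIP files will be assembled under {zip_working_dir}")
```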
README.md (10 additions, 1 deletion)
@@ -59,12 +59,21 @@ docker compose up

The example `.env` file (`.env-example`) is configured to use the above docker compose setup. If you don't use the docker compose setup, then you will need to change the values in the `.env` file accordingly.

-Once the docker compose setup is running, start the bulk download app with:
+Once the docker compose setup is running, you can run the dataset updater part of the app with (this will download the datasets and upload them to Azurite):

```
dotenv run python src/iati_bulk_data_service.py -- --operation checker --single-run --run-for-n-datasets=50
```

+You can run the zipper operation with:
+
+```
+dotenv run python src/iati_bulk_data_service.py -- --operation zipper --single-run
+```
+
+It will store the ZIP files in the directory defined in the `ZIP_WORKING_DIR` environment variable.


*Note:* not all versions of `dotenv` require a `run` subcommand.

## Development on the app
pyproject.toml (1 addition, 1 deletion)
@@ -1,6 +1,6 @@
[project]
name = "bulk-data-service"
version = "0.0.1"
version = "0.1.6"
requires-python = ">= 3.12"
readme = "README.md"
dependencies = [
src/bulk_data_service/dataset_indexing.py (4 additions, 4 deletions)
@@ -58,8 +58,8 @@ def get_index_entry(context: dict, dataset: dict, index_type: str) -> dict[str,
else:
dataset_index_entry = get_full_index_entry_from_dataset(context, dataset)

dataset_index_entry["url_xml"] = ""
dataset_index_entry["url_zip"] = ""
dataset_index_entry["url_xml"] = None
dataset_index_entry["url_zip"] = None

if dataset_index_entry["last_successful_download"] is not None:
dataset_index_entry["url_xml"] = get_azure_blob_public_url(context, dataset, "xml")
Expand All @@ -80,7 +80,7 @@ def get_minimal_index_entry_from_dataset(context: dict, dataset: dict) -> dict:


def get_full_index_entry_from_dataset(context: dict, dataset: dict) -> dict:
-full_index_entry = {k: v for k, v in dataset.items() if k in get_full_index_fields(context)}
+full_index_entry = {k: v for k, v in dataset.items() if k in get_full_index_source_fields(context)}

field_from_json_str_to_object(full_index_entry, "download_error_message", "download_error_details")

@@ -96,7 +96,7 @@ def field_from_json_str_to_object(entry: dict, source_field: str, dest_field: st
del entry[source_field]


-def get_full_index_fields(context: dict) -> list[str]:
+def get_full_index_source_fields(context: dict) -> list[str]:
return [
"id",
"name",
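
The `url_xml`/`url_zip` change above means index entries for datasets with no successful download are now initialised to `None` rather than `""`, so both index types serialize "no file available" as JSON `null`. A minimal sketch of what a consumer sees (the entry dicts are illustrative, not the app's real index structure):

```
import json

# Illustrative entries for a dataset that has never been downloaded.
before = {"name": "example-org-dataset", "url_xml": "", "url_zip": ""}
after = {"name": "example-org-dataset", "url_xml": None, "url_zip": None}

print(json.dumps(before))  # ... "url_xml": "", "url_zip": ""
print(json.dumps(after))   # ... "url_xml": null, "url_zip": null

# Consumers now have one unambiguous "no file available" check:
if after["url_xml"] is None:
    print("no XML available for this dataset")
```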
src/bulk_data_service/dataset_updater.py (2 additions, 1 deletion)
@@ -214,7 +214,8 @@ def check_dataset_etag_last_mod_header(
"return non-200 status. {} "
"HEAD request exception details: {}".format(download_within_hours, extra_err_message, e)
)
-} | e.args[0]
+}
+| e.args[0]
)

context["logger"].warning("dataset id: {} - {}".format(bds_dataset["id"], bds_dataset["head_error_message"]))
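
For context on the reformatted expression: it uses Python's dict union operator (`|`, available since Python 3.9; `pyproject.toml` above requires >= 3.12) to fold the detail dict carried in the exception's first argument into the error record. A small sketch with hypothetical keys:

```
# Hypothetical error record, merged with detail carried on the exception.
base_error = {"head_error_message": "HEAD request to dataset URL failed"}

try:
    raise RuntimeError({"status_code": 503, "retries": 3})
except RuntimeError as e:
    # `|` returns a new dict; on duplicate keys the right operand wins.
    merged = base_error | e.args[0]

print(merged)
# {'head_error_message': 'HEAD request to dataset URL failed', 'status_code': 503, 'retries': 3}
```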
src/config/config.py (1 addition, 1 deletion)
@@ -20,7 +20,7 @@
"AZURE_STORAGE_CONNECTION_STRING",
"AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_XML",
"AZURE_STORAGE_BLOB_CONTAINER_NAME_IATI_ZIP",
"CHECKER_LOOP_WAIT_MINS"
"CHECKER_LOOP_WAIT_MINS",
]


tests/integration/test_dataset_indexing.py (86 additions, 6 deletions)
@@ -4,12 +4,12 @@
from azure.storage.blob import BlobServiceClient

from bulk_data_service.checker import checker_run
-from bulk_data_service.dataset_indexing import get_full_index_fields, get_index_name, get_minimal_index_fields
+from bulk_data_service.dataset_indexing import get_full_index_source_fields, get_index_name, get_minimal_index_fields
from helpers.helpers import get_and_clear_up_context # noqa: F401
from utilities.azure import get_azure_container_name


-def test_index_creation(get_and_clear_up_context):  # noqa: F811
+def test_index_uploaded_to_blob_storage(get_and_clear_up_context):  # noqa: F811

    context = get_and_clear_up_context

@@ -32,7 +32,7 @@ def test_index_creation(get_and_clear_up_context): # noqa: F811
    blob_service_client.close()


-def test_minimal_index_creation(get_and_clear_up_context):  # noqa: F811
+def test_minimal_index_creation_for_download_success(get_and_clear_up_context):  # noqa: F811
    context = get_and_clear_up_context

    context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01"
@@ -64,7 +64,45 @@ def test_minimal_index_creation(get_and_clear_up_context): # noqa: F811
    blob_service_client.close()


-def test_full_index_creation(get_and_clear_up_context):  # noqa: F811
+def test_minimal_index_creation_for_download_failure(get_and_clear_up_context):  # noqa: F811
+    context = get_and_clear_up_context
+
+    context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03"
+    datasets_in_bds = {}
+    checker_run(context, datasets_in_bds)
+
+    blob_service_client = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+    zip_container_name = get_azure_container_name(context, "zip")
+
+    minimal_index_name = get_index_name(context, "minimal")
+
+    minimal_index_blob = blob_service_client.get_blob_client(zip_container_name, minimal_index_name)
+
+    minimal_index = json.loads(minimal_index_blob.download_blob().readall())
+
+    dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")]
+
+    assert dataset["name"] in minimal_index["datasets"]
+
+    index_item = minimal_index["datasets"][dataset["name"]]
+
+    for field in get_minimal_index_fields(context):
+        if isinstance(dataset[field], uuid.UUID):
+            assert uuid.UUID(index_item[field]) == dataset[field]
+        elif dataset[field] is None or isinstance(dataset[field], str):
+            assert index_item[field] == dataset[field]
+
+    assert index_item["url_xml"] is None
+    assert index_item["url_zip"] is None
+
+    # -2 because of the two autogenerated fields
+    assert len(index_item.keys()) - 2 == len(get_minimal_index_fields(context))
+
+    blob_service_client.close()
+
+
+def test_full_index_creation_for_download_success(get_and_clear_up_context):  # noqa: F811
    context = get_and_clear_up_context

    context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-01"
@@ -87,13 +125,55 @@ def test_full_index_creation(get_and_clear_up_context): # noqa: F811

    full_index_item = full_index["datasets"][dataset["name"]]

-    for field in get_full_index_fields(context):
+    for field in get_full_index_source_fields(context):
        if field == "download_error_message" or field == "head_error_message":
            continue
        if isinstance(dataset[field], uuid.UUID):
            assert uuid.UUID(full_index_item[field]) == dataset[field]
        elif isinstance(dataset[field], str):
            assert full_index_item[field] == dataset[field]

    # -2 because of the two autogenerated fields
-    assert len(full_index_item.keys()) - 2 == len(get_full_index_fields(context))
+    assert len(full_index_item.keys()) - 2 == len(get_full_index_source_fields(context))

    blob_service_client.close()


+def test_full_index_creation_for_download_failure(get_and_clear_up_context):  # noqa: F811
+    context = get_and_clear_up_context
+
+    context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/registration/datasets-03"
+    datasets_in_bds = {}
+    checker_run(context, datasets_in_bds)
+
+    blob_service_client = BlobServiceClient.from_connection_string(context["AZURE_STORAGE_CONNECTION_STRING"])
+
+    zip_container_name = get_azure_container_name(context, "zip")
+
+    full_index_name = get_index_name(context, "full")
+
+    full_index_blob = blob_service_client.get_blob_client(zip_container_name, full_index_name)
+
+    full_index = json.loads(full_index_blob.download_blob().readall())
+
+    dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")]
+
+    assert dataset["name"] in full_index["datasets"]
+
+    full_index_item = full_index["datasets"][dataset["name"]]
+
+    for field in get_full_index_source_fields(context):
+        if field == "download_error_message" or field == "head_error_message":
+            continue
+        if isinstance(dataset[field], uuid.UUID):
+            assert uuid.UUID(full_index_item[field]) == dataset[field]
+        elif dataset[field] is None or isinstance(dataset[field], str):
+            assert full_index_item[field] == dataset[field]
+
+    assert full_index_item["url_xml"] is None
+    assert full_index_item["url_zip"] is None
+
+    # -2 because of the two autogenerated fields
+    assert len(full_index_item.keys()) - 2 == len(get_full_index_source_fields(context))
+
+    blob_service_client.close()
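
On the `- 2` in the length assertions: from the `get_index_entry` change earlier in this diff, the two autogenerated fields are presumably `url_xml` and `url_zip`, which are set on every entry on top of the source fields. In sketch form (the field list is abridged and hypothetical):

```
# Sketch of the arithmetic behind `len(entry.keys()) - 2 == len(source_fields)`.
source_fields = ["id", "name", "last_successful_download"]  # abridged
entry = {field: None for field in source_fields}
entry["url_xml"] = None  # autogenerated
entry["url_zip"] = None  # autogenerated
assert len(entry.keys()) - 2 == len(source_fields)
```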

