From d8410bdc435b126b087c0e5bbcf6ebc1802741b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 30 May 2024 17:39:07 +0200 Subject: [PATCH 01/62] Disable the documentation generator on pull requests --- .github/workflows/schemas-doc-generator.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/schemas-doc-generator.yml b/.github/workflows/schemas-doc-generator.yml index 93867e72..b4ec8ebe 100644 --- a/.github/workflows/schemas-doc-generator.yml +++ b/.github/workflows/schemas-doc-generator.yml @@ -1,7 +1,6 @@ name: schemas-doc-generator on: - pull_request: push: branches: [main] From 690544c3a447c3478b79cfc528b216a2f26c20f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 31 May 2024 15:44:31 +0200 Subject: [PATCH 02/62] Make Workflow Run RO-Crate parsing more conformant --- wfexs_backend/utils/rocrate.py | 59 ++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 56db4dbd..5822c41f 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -600,7 +600,7 @@ def identifyROCrate( # This compound query is much faster when each of the UNION components # is evaluated separately OBTAIN_WORKFLOW_INPUTS_SPARQL: "Final[str]" = """\ -SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type +SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type ?fileid WHERE { ?main_entity bsworkflow:input ?inputfp . ?inputfp @@ -612,6 +612,7 @@ def identifyROCrate( # A file, which is a schema.org MediaObject ?input a s:MediaObject . + BIND (?input AS ?fileid) OPTIONAL { ?input s:contentUrl ?fileuri . @@ -624,6 +625,7 @@ def identifyROCrate( # A directory, which is a schema.org Dataset ?input a s:Dataset . + BIND (?input AS ?fileid) OPTIONAL { ?input s:contentUrl ?fileuri . @@ -655,6 +657,7 @@ def identifyROCrate( s:hasPart+ ?component . ?component a ?leaf_type . + BIND (?component AS ?fileid) OPTIONAL { ?component s:contentUrl ?fileuri . } @@ -671,7 +674,7 @@ def identifyROCrate( # This compound query is much faster when each of the UNION components # is evaluated separately OBTAIN_WORKFLOW_ENV_SPARQL: "Final[str]" = """\ -SELECT ?env ?name ?name_env ?envfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type +SELECT ?env ?name ?name_env ?envfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type ?fileid WHERE { ?main_entity wrterm:environment ?envfp . ?envfp @@ -684,6 +687,7 @@ def identifyROCrate( ?env a s:MediaObject ; s:name ?name_env . + BIND (?env AS ?fileid) OPTIONAL { ?env s:contentUrl ?fileuri . @@ -697,6 +701,7 @@ def identifyROCrate( ?env a s:Dataset ; s:name ?name_env . + BIND (?env AS ?fileid) OPTIONAL { ?env s:contentUrl ?fileuri . @@ -730,6 +735,7 @@ def identifyROCrate( s:hasPart+ ?component . ?component a ?leaf_type . + BIND (?component AS ?fileid) OPTIONAL { ?component s:contentUrl ?fileuri . } @@ -775,7 +781,7 @@ def identifyROCrate( # This compound query is much faster when each of the UNION components # is evaluated separately OBTAIN_EXECUTION_INPUTS_SPARQL: "Final[str]" = """\ -SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type +SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type ?fileid WHERE { ?execution s:object ?input . 
{ @@ -784,6 +790,7 @@ def identifyROCrate( ?input a s:MediaObject ; s:exampleOfWork ?inputfp . + BIND (?input AS ?fileid) OPTIONAL { ?input s:contentUrl ?fileuri . @@ -802,6 +809,7 @@ def identifyROCrate( ?input a s:Dataset ; s:exampleOfWork ?inputfp . + BIND (?input AS ?fileid) OPTIONAL { ?input s:contentUrl ?fileuri . @@ -849,6 +857,7 @@ def identifyROCrate( s:additionalType ?additional_type . ?component a ?leaf_type . + BIND (?component AS ?fileid) OPTIONAL { ?component s:contentUrl ?fileuri . } @@ -865,7 +874,7 @@ def identifyROCrate( # This compound query is much faster when each of the UNION components # is evaluated separately OBTAIN_EXECUTION_ENV_SPARQL: "Final[str]" = """\ -SELECT ?env ?name ?name_env ?envfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type +SELECT ?env ?name ?name_env ?envfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type ?fileid WHERE { ?execution wrterm:environment ?env . { @@ -875,6 +884,7 @@ def identifyROCrate( a s:MediaObject ; s:name ?name_env ; s:exampleOfWork ?envfp . + BIND (?env AS ?fileid) OPTIONAL { ?env s:contentUrl ?fileuri . @@ -894,6 +904,7 @@ def identifyROCrate( a s:Dataset ; s:name ?name_env ; s:exampleOfWork ?envfp . + BIND (?env AS ?fileid) OPTIONAL { ?env s:contentUrl ?fileuri . @@ -943,6 +954,7 @@ def identifyROCrate( s:additionalType ?additional_type . ?component a ?leaf_type . + BIND (?component AS ?fileid) OPTIONAL { ?component s:contentUrl ?fileuri . } @@ -1505,10 +1517,6 @@ def __parseInputsResults( # Is it a file or a directory? if additional_type in ("File", "Dataset"): - if inputrow.fileuri is None and inputrow.filepid is None: - errmsg = f"Input parameter {inputrow.name} from {public_name} is of type {additional_type}, but no associated `contentUrl` or `identifier` were found. Stopping." - self.logger.error(errmsg) - raise ROCrateToolboxException(errmsg) valobj = base.setdefault( param_last, { @@ -1519,18 +1527,25 @@ def __parseInputsResults( ) if isinstance(valobj, dict): - licences = self._getLicences(g, inputrow.input, public_name) - if len(licences) == 0: - licences = default_licences the_uri: "str" if inputrow.fileuri is not None: the_uri = str(inputrow.fileuri) elif inputrow.filepid is not None: the_uri = str(inputrow.filepid) else: - raise ROCrateToolboxException( - "FATAL RO-Crate workflow input processing error. Check the code of WfExS" - ) + the_uri = str(inputrow.fileid) + + # Check it is not an originally relative URI + parsed_uri = urllib.parse.urlparse(the_uri) + if parsed_uri.scheme in ("", self.RELATIVE_ROCRATE_SCHEME): + errmsg = f"Input parameter {inputrow.name} from {public_name} is of type {additional_type}, but no associated `contentUrl` or `identifier` were found, and its @id is a relative URI. Stopping." 
+ self.logger.error(errmsg) + raise ROCrateToolboxException(errmsg) + + # Now, the licence + licences = self._getLicences(g, inputrow.input, public_name) + if len(licences) == 0: + licences = default_licences the_url: "Union[str, Mapping[str, Any]]" if len(licences) == 0: @@ -1673,6 +1688,22 @@ def __parseEnvResults( ) if isinstance(valobj, dict): + the_uri: "str" + if envrow.fileuri is not None: + the_uri = str(envrow.fileuri) + elif envrow.filepid is not None: + the_uri = str(envrow.filepid) + else: + the_uri = str(envrow.fileid) + + # Check it is not an originally relative URI + parsed_uri = urllib.parse.urlparse(the_uri) + if parsed_uri.scheme in ("", self.RELATIVE_ROCRATE_SCHEME): + errmsg = f"Environment variable {env_name} from {public_name} is of type {additional_type}, but no associated `contentUrl` or `identifier` were found, and its @id is a relative URI. Stopping." + self.logger.error(errmsg) + raise ROCrateToolboxException(errmsg) + + # Now, the licence licences = self._getLicences(g, envrow.env, public_name) if len(licences) == 0: licences = default_licences From 20649b2eb859ff913d684f9eacd1cfce26cba90d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 31 May 2024 19:07:45 +0200 Subject: [PATCH 03/62] Fixed corner case in associated container images handling for CWL. When a WRROC is generated for a CWL workflow execution, two workflows are registered in it: the original one and the consolidated one. The original one is used to generated the consolidated one, and the consolidated one is used to run the workflow. But the containers were not properly associated to the executions of this last workflow. --- wfexs_backend/ro_crate.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index 84f9bca8..89afb33c 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -728,7 +728,7 @@ def __init__( # This is used to avoid including twice the very same value # in the RO-Crate self._item_hash: "MutableMapping[bytes, rocrate.model.entity.Entity]" = {} - self._added_containers: "MutableSequence[Container]" = [] + self._added_container_images: "MutableMapping[int, ContainerImage]" = {} self._wf_to_containers: "MutableMapping[str, MutableSequence[ContainerImage]]" = ( {} ) @@ -1123,10 +1123,6 @@ def _add_containers( if len(containers) > 0: do_attach = CratableItem.Containers in self.payloads for container in containers: - # Skip early what it was already included in the crate - if container in self._added_containers: - continue - container_type_metadata = ContainerTypeMetadataDetails[container.type] crate_cont_type = self.cached_cts.get(container.type) if crate_cont_type is None: @@ -1180,6 +1176,11 @@ def _add_containers( crate_source_cont_type = self.crate.add(container_source_type) self.cached_cts[container.source_type] = crate_source_cont_type + # Skip early what it was already included in the crate + if id(container) in self._added_container_images: + added_containers.append(self._added_container_images[id(container)]) + continue + software_container: "ContainerImage" registry, tag_name, tag_label = container.decompose_docker_tagged_name original_container_type = ( @@ -1268,9 +1269,6 @@ def _add_containers( crate_cont = self.crate.dereference(software_container.id) if crate_cont is None: - # Record the container - self._added_containers.append(container) - # Now, add container metadata, which is going to be # consumed by WfExS or third 
parties metadataLocalPath: "Optional[str]" = None @@ -1322,6 +1320,11 @@ def _add_containers( "softwareRequirements", crate_cont, compact=True ) + # Record the container image + self._added_container_images[id(container)] = cast( + "ContainerImage", crate_cont + ) + added_containers.append(cast("ContainerImage", crate_cont)) return added_containers From daa9c9c5caf4f2d71006cf0342adddc01d70581c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 4 Jun 2024 03:10:56 +0200 Subject: [PATCH 04/62] Container caching infrastructure overhaul (1/3). * Added ContainerCacheHandler, where all the needed methods to manage container image snapshots are included. * Simplified several parameters passed and used to instantiate ContainerFactory and its descendants. * Rewritten SingularityContainerFactory, so it uses new instance of ContainerCacheHandler. --- wfexs_backend/container_factories/__init__.py | 289 +++++++++++++++-- .../container_factories/docker_container.py | 19 +- .../container_factories/no_container.py | 6 +- .../container_factories/podman_container.py | 19 +- .../singularity_container.py | 290 ++++++------------ wfexs_backend/workflow_engines/__init__.py | 20 +- .../workflow_engines/nextflow_engine.py | 21 +- 7 files changed, 396 insertions(+), 268 deletions(-) diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index 090d659d..29ee6279 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -99,6 +99,14 @@ class DockerManifestMetadata(TypedDict): from .. import common +from ..utils.contents import ( + link_or_copy, + real_unlink_if_exists, +) + +from ..utils.digests import ComputeDigestFromFile + + # A couple of constants needed for several fixes DOCKER_SCHEME: "Final[str]" = "docker" DOCKER_URI_PREFIX: "Final[str]" = DOCKER_SCHEME + ":" @@ -235,26 +243,255 @@ class ContainerNotFoundException(ContainerFactoryException): pass +class ContainerCacheHandler: + """ + This class abstracts all the common caching handling + """ + + def __init__( + self, + containers_cache_dir: "Optional[AbsPath]", + engine_name: "str", + simple_file_name_method: "ContainerFileNamingMethod", + ): + # Getting a logger focused on specific classes + self.logger = logging.getLogger( + dict(inspect.getmembers(self))["__module__"] + + "::" + + self.__class__.__name__ + ) + + # TODO: create caching database??? 
+ # containers_cache_dir + if containers_cache_dir is None: + containers_cache_dir = cast( + "AbsPath", tempfile.mkdtemp(prefix="wfexs", suffix="backend") + ) + # Assuring this temporal directory is removed at the end + atexit.register(shutil.rmtree, containers_cache_dir) + else: + os.makedirs(containers_cache_dir, exist_ok=True) + + # But, for materialized containers, we should use common directories + # This for the containers themselves + self.containersCacheDir = containers_cache_dir + + # This for the symlinks to the containers, following the engine convention + self.engineContainersSymlinkDir = cast( + "AbsPath", os.path.join(self.containersCacheDir, engine_name) + ) + os.makedirs(self.engineContainersSymlinkDir, exist_ok=True) + + self.simpleFileNameMethod = simple_file_name_method + + def _genContainerPaths( + self, container: "ContainerTaggedName" + ) -> "Tuple[AbsPath, AbsPath]": + containerFilename = self.simpleFileNameMethod( + cast("URIType", container.origTaggedName) + ) + containerFilenameMeta = containerFilename + META_JSON_POSTFIX + localContainerPath = cast( + "AbsPath", + os.path.join(self.engineContainersSymlinkDir, containerFilename), + ) + localContainerPathMeta = cast( + "AbsPath", + os.path.join(self.engineContainersSymlinkDir, containerFilenameMeta), + ) + + return localContainerPath, localContainerPathMeta + + def _computeFingerprint(self, image_path: "AnyPath") -> "Fingerprint": + return cast("Fingerprint", ComputeDigestFromFile(image_path)) + + def _computeCanonicalImagePath( + self, image_path: "AbsPath" + ) -> "Tuple[AbsPath, Fingerprint]": + imageSignature = self._computeFingerprint(image_path) + + # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols + canonical_image_path = os.path.join( + self.containersCacheDir, + imageSignature.replace("=", "~").replace("/", "-").replace("+", "_"), + ) + + return cast("AbsPath", canonical_image_path), imageSignature + + def query( + self, container: "ContainerTaggedName" + ) -> "Tuple[bool, AbsPath, AbsPath, Optional[Fingerprint]]": + """ + This method checks whether the container snapshot and its + metadata are in the caching directory + """ + localContainerPath, localContainerPathMeta = self._genContainerPaths(container) + + trusted_copy = False + imageSignature: "Optional[Fingerprint]" = None + if os.path.isfile(localContainerPath): + if os.path.islink(localContainerPath): + # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols + unlinkedContainerPath = os.readlink(localContainerPath) + fsImageSignature = os.path.basename(unlinkedContainerPath) + imageSignature = cast( + "Fingerprint", + fsImageSignature.replace("~", "=") + .replace("-", "/") + .replace("_", "+"), + ) + + # Do not trust paths outside the caching directory + canonicalContainerPath = os.path.join( + self.containersCacheDir, + fsImageSignature, + ) + + trusted_copy = os.path.samefile( + os.path.realpath(localContainerPath), + os.path.realpath(canonicalContainerPath), + ) + else: + ( + canonicalContainerPath, + imageSignature, + ) = self._computeCanonicalImagePath(localContainerPath) + + if os.path.samefile(localContainerPath, canonicalContainerPath): + trusted_copy = True + elif os.path.isfile(canonicalContainerPath): + canonicalImageSignature = self._computeFingerprint( + canonicalContainerPath + ) + + trusted_copy = canonicalImageSignature == imageSignature + + if trusted_copy: + trusted_copy = os.path.isfile(localContainerPathMeta) + + return trusted_copy, localContainerPath, 
localContainerPathMeta, imageSignature + + def transfer( + self, + container: "ContainerTaggedName", + stagedContainersDir: "AnyPath", + force: "bool" = False, + ) -> "Optional[Tuple[AbsPath, AbsPath]]": + """ + This method is used to transfer both the container snapshot and + its metadata from the caching directory to stagedContainersDir + """ + # First, get the local paths + ( + trusted_copy, + localContainerPath, + localContainerPathMeta, + imageSignature, + ) = self.query(container) + if not trusted_copy: + return None + + # Last, but not the least important + # Hardlink or copy the container and its metadata + containerFilename = self.simpleFileNameMethod( + cast("URIType", container.origTaggedName) + ) + containerFilenameMeta = containerFilename + META_JSON_POSTFIX + + os.makedirs(stagedContainersDir, exist_ok=True) + containerPath = cast( + "AbsPath", os.path.join(stagedContainersDir, containerFilename) + ) + + containerPathMeta = cast( + "AbsPath", os.path.join(stagedContainersDir, containerFilenameMeta) + ) + + if force or not os.path.exists(containerPath): + link_or_copy(localContainerPath, containerPath) + if force or not os.path.exists(containerPathMeta): + link_or_copy(localContainerPathMeta, containerPathMeta) + + return (containerPath, containerPathMeta) + + def update( + self, + container: "ContainerTaggedName", + image_path: "AbsPath", + image_metadata_path: "AbsPath", + do_move: "bool" = True, + ) -> "None": + # First, let's remove what it is still there + self.invalidate(container) + + # Then, get the local paths + localContainerPath, localContainerPathMeta = self._genContainerPaths(container) + + # Now, compute the hash + canonicalContainerPath, imageSignature = self._computeCanonicalImagePath( + image_path + ) + canonicalContainerPathMeta = cast( + "AbsPath", canonicalContainerPath + META_JSON_POSTFIX + ) + + # And ..... transfer!!! + if do_move: + shutil.move(image_path, canonicalContainerPath) + shutil.move(image_metadata_path, canonicalContainerPathMeta) + else: + link_or_copy(image_path, canonicalContainerPath, force_copy=True) + link_or_copy( + image_metadata_path, canonicalContainerPathMeta, force_copy=True + ) + + # Last, the symbolic links + os.symlink( + os.path.relpath(canonicalContainerPath, self.engineContainersSymlinkDir), + localContainerPath, + ) + + os.symlink( + os.path.relpath( + canonicalContainerPathMeta, self.engineContainersSymlinkDir + ), + localContainerPathMeta, + ) + + def invalidate(self, container: "ContainerTaggedName") -> "None": + # First, get the local paths + localContainerPath, localContainerPathMeta = self._genContainerPaths(container) + + # Let's remove what it is still there + real_unlink_if_exists(localContainerPath) + real_unlink_if_exists(localContainerPathMeta) + + class ContainerFactory(abc.ABC): # Is this implementation enabled? 
ENABLED: "ClassVar[bool]" = True def __init__( self, - cacheDir: "Optional[AnyPath]" = None, + simpleFileNameMethod: "ContainerFileNamingMethod", + containersCacheDir: "Optional[AnyPath]" = None, stagedContainersDir: "Optional[AnyPath]" = None, - local_config: "Optional[ContainerLocalConfig]" = None, + tools_config: "Optional[ContainerLocalConfig]" = None, engine_name: "str" = "unset", tempDir: "Optional[AnyPath]" = None, ): """ Abstract init method - + containersCacheDir: Base directory where """ - if local_config is None: - local_config = dict() - self.local_config = local_config + # This factory was created by the workflow engine, which + # provides its file naming method + self.simpleFileNameMethod = simpleFileNameMethod + + if tools_config is None: + tools_config = dict() + self.tools_config = tools_config # Getting a logger focused on specific classes self.logger = logging.getLogger( @@ -263,17 +500,19 @@ def __init__( + self.__class__.__name__ ) - # cacheDir - if cacheDir is None: - cacheDir = local_config.get("cacheDir") - if cacheDir: - os.makedirs(cacheDir, exist_ok=True) - else: - cacheDir = cast( - "AbsPath", tempfile.mkdtemp(prefix="wfexs", suffix="backend") - ) - # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, cacheDir) + # But, for materialized containers, we should use common directories + # This for the containers themselves + # containersCacheDir + if containersCacheDir is None: + self.containersCacheDir = cast( + "AbsPath", tempfile.mkdtemp(prefix="wfexs", suffix="backend") + ) + # Assuring this temporal directory is removed at the end + atexit.register(shutil.rmtree, self.containersCacheDir) + else: + self.containersCacheDir = cast( + "AbsPath", os.path.abspath(containersCacheDir) + ) if tempDir is None: tempDir = cast( @@ -285,11 +524,13 @@ def __init__( # This directory might be needed by temporary processes, like # image materialization in singularity or podman self.tempDir = tempDir - # But, for materialized containers, we should use common directories - # This for the containers themselves - self.containersCacheDir = cast( - "AnyPath", os.path.join(cacheDir, "containers", self.__class__.__name__) + + self.cc_handler = ContainerCacheHandler( + self.containersCacheDir, + engine_name=engine_name, + simple_file_name_method=simpleFileNameMethod, ) + # stagedContainersDir if stagedContainersDir is None: stagedContainersDir = self.containersCacheDir @@ -408,7 +649,6 @@ def architecture(self) -> "Tuple[ContainerOperatingSystem, ProcessorArchitecture def materializeContainers( self, tagList: "Sequence[ContainerTaggedName]", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[AnyPath]" = None, offline: "bool" = False, force: "bool" = False, @@ -425,7 +665,6 @@ def materializeContainers( if self.AcceptsContainer(tag): container = self.materializeSingleContainer( tag, - simpleFileNameMethod, containers_dir=containers_dir, offline=offline, force=force, @@ -447,7 +686,6 @@ def materializeContainers( def materializeSingleContainer( self, tag: "ContainerTaggedName", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[AnyPath]" = None, offline: "bool" = False, force: "bool" = False, @@ -460,7 +698,6 @@ def materializeSingleContainer( def deployContainers( self, containers_list: "Sequence[Container]", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[AnyPath]" = None, force: "bool" = False, ) -> "Sequence[Container]": @@ -475,7 +712,6 @@ def deployContainers( 
if self.AcceptsContainer(container): was_redeployed = self.deploySingleContainer( container, - simpleFileNameMethod, containers_dir=containers_dir, force=force, ) @@ -488,7 +724,6 @@ def deployContainers( def deploySingleContainer( self, container: "Container", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[AnyPath]" = None, force: "bool" = False, ) -> "bool": diff --git a/wfexs_backend/container_factories/docker_container.py b/wfexs_backend/container_factories/docker_container.py index 53ddb730..dee9dc3f 100644 --- a/wfexs_backend/container_factories/docker_container.py +++ b/wfexs_backend/container_factories/docker_container.py @@ -88,21 +88,22 @@ def trimmable_manifest_keys(cls) -> "Sequence[str]": def __init__( self, - cacheDir: "Optional[AnyPath]" = None, + simpleFileNameMethod: "ContainerFileNamingMethod", + containersCacheDir: "Optional[AnyPath]" = None, stagedContainersDir: "Optional[AnyPath]" = None, - local_config: "Optional[ContainerLocalConfig]" = None, + tools_config: "Optional[ContainerLocalConfig]" = None, engine_name: "str" = "unset", tempDir: "Optional[AnyPath]" = None, ): super().__init__( - cacheDir=cacheDir, + simpleFileNameMethod=simpleFileNameMethod, + containersCacheDir=containersCacheDir, stagedContainersDir=stagedContainersDir, - local_config=local_config, + tools_config=tools_config, engine_name=engine_name, tempDir=tempDir, ) - tools = local_config.get("tools", {}) if local_config else {} - self.runtime_cmd = tools.get("dockerCommand", DEFAULT_DOCKER_CMD) + self.runtime_cmd = self.tools_config.get("dockerCommand", DEFAULT_DOCKER_CMD) @classmethod def ContainerType(cls) -> "ContainerType": @@ -149,7 +150,6 @@ def architecture(self) -> "Tuple[ContainerOperatingSystem, ProcessorArchitecture def materializeSingleContainer( self, tag: "ContainerTaggedName", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[AnyPath]" = None, offline: "bool" = False, force: "bool" = False, @@ -182,7 +182,7 @@ def materializeSingleContainer( self.logger.info(f"downloading docker container: {tag_name} => {dockerTag}") # These are the paths to the copy of the saved container - containerFilename = simpleFileNameMethod(cast("URIType", tag_name)) + containerFilename = self.simpleFileNameMethod(cast("URIType", tag_name)) containerFilenameMeta = containerFilename + META_JSON_POSTFIX localContainerPath = cast( "AbsPath", @@ -503,7 +503,6 @@ def materializeSingleContainer( def deploySingleContainer( self, container: "Container", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[AnyPath]" = None, force: "bool" = False, ) -> "bool": @@ -514,7 +513,7 @@ def deploySingleContainer( tag_name = container.origTaggedName # These are the paths to the copy of the saved container - containerFilename = simpleFileNameMethod(cast("URIType", tag_name)) + containerFilename = self.simpleFileNameMethod(cast("URIType", tag_name)) containerFilenameMeta = containerFilename + META_JSON_POSTFIX # Keep a copy outside the cache directory diff --git a/wfexs_backend/container_factories/no_container.py b/wfexs_backend/container_factories/no_container.py index 710006dd..5402c1d7 100644 --- a/wfexs_backend/container_factories/no_container.py +++ b/wfexs_backend/container_factories/no_container.py @@ -63,8 +63,8 @@ class NoContainerFactory(ContainerFactory): The 'no container approach', for development and local installed software """ - # def __init__(self, cacheDir=None, local_config=None, engine_name='unset'): - # 
super().__init__(cacheDir=cacheDir, local_config=local_config, engine_name=engine_name) + # def __init__(self, containersCacheDir=None, tools_config=None, engine_name='unset'): + # super().__init__(containersCacheDir=containersCacheDir, tools_config=tools_config, engine_name=engine_name) AcceptedContainerTypes = set([common.ContainerType.NoContainer]) @classmethod @@ -86,7 +86,6 @@ def engine_version(self) -> "ContainerEngineVersionStr": def materializeSingleContainer( self, tag: "ContainerTaggedName", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[Union[RelPath, AbsPath]]" = None, offline: "bool" = False, force: "bool" = False, @@ -99,7 +98,6 @@ def materializeSingleContainer( def deploySingleContainer( self, container: "Container", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[AnyPath]" = None, force: "bool" = False, ) -> "bool": diff --git a/wfexs_backend/container_factories/podman_container.py b/wfexs_backend/container_factories/podman_container.py index ab2d8a82..62c32e24 100644 --- a/wfexs_backend/container_factories/podman_container.py +++ b/wfexs_backend/container_factories/podman_container.py @@ -91,21 +91,22 @@ def trimmable_manifest_keys(cls) -> "Sequence[str]": def __init__( self, - cacheDir: "Optional[AnyPath]" = None, + simpleFileNameMethod: "ContainerFileNamingMethod", + containersCacheDir: "Optional[AnyPath]" = None, stagedContainersDir: "Optional[AnyPath]" = None, - local_config: "Optional[ContainerLocalConfig]" = None, + tools_config: "Optional[ContainerLocalConfig]" = None, engine_name: "str" = "unset", tempDir: "Optional[AnyPath]" = None, ): super().__init__( - cacheDir=cacheDir, + simpleFileNameMethod=simpleFileNameMethod, + containersCacheDir=containersCacheDir, stagedContainersDir=stagedContainersDir, - local_config=local_config, + tools_config=tools_config, engine_name=engine_name, tempDir=tempDir, ) - tools = local_config.get("tools", {}) if local_config else {} - self.runtime_cmd = tools.get("podmanCommand", DEFAULT_PODMAN_CMD) + self.runtime_cmd = self.tools_config.get("podmanCommand", DEFAULT_PODMAN_CMD) self._environment.update( { @@ -167,7 +168,6 @@ def architecture(self) -> "Tuple[ContainerOperatingSystem, ProcessorArchitecture def materializeSingleContainer( self, tag: "ContainerTaggedName", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[AnyPath]" = None, offline: "bool" = False, force: "bool" = False, @@ -209,7 +209,7 @@ def materializeSingleContainer( self.logger.info(f"downloading podman container: {tag_name} => {podmanPullTag}") # These are the paths to the copy of the saved container - containerFilename = simpleFileNameMethod(cast("URIType", tag_name)) + containerFilename = self.simpleFileNameMethod(cast("URIType", tag_name)) containerFilenameMeta = containerFilename + META_JSON_POSTFIX localContainerPath = cast( "AbsPath", @@ -531,7 +531,6 @@ def materializeSingleContainer( def deploySingleContainer( self, container: "Container", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[AnyPath]" = None, force: "bool" = False, ) -> "bool": @@ -542,7 +541,7 @@ def deploySingleContainer( tag_name = container.origTaggedName # These are the paths to the copy of the saved container - containerFilename = simpleFileNameMethod(cast("URIType", tag_name)) + containerFilename = self.simpleFileNameMethod(cast("URIType", tag_name)) containerFilenameMeta = containerFilename + META_JSON_POSTFIX # Keep a copy outside the cache directory diff --git 
a/wfexs_backend/container_factories/singularity_container.py b/wfexs_backend/container_factories/singularity_container.py index efc749b5..5ebd5200 100644 --- a/wfexs_backend/container_factories/singularity_container.py +++ b/wfexs_backend/container_factories/singularity_container.py @@ -91,7 +91,6 @@ class SingularityManifest(TypedDict): ) from ..utils.contents import link_or_copy -from ..utils.digests import ComputeDigestFromFile, nihDigester from ..utils.docker import DockerHelper @@ -122,21 +121,24 @@ class SingularityContainerFactory(ContainerFactory): def __init__( self, - cacheDir: "Optional[AnyPath]" = None, + simpleFileNameMethod: "ContainerFileNamingMethod", + containersCacheDir: "Optional[AnyPath]" = None, stagedContainersDir: "Optional[AnyPath]" = None, - local_config: "Optional[ContainerLocalConfig]" = None, + tools_config: "Optional[ContainerLocalConfig]" = None, engine_name: "str" = "unset", tempDir: "Optional[AnyPath]" = None, ): super().__init__( - cacheDir=cacheDir, + simpleFileNameMethod=simpleFileNameMethod, + containersCacheDir=containersCacheDir, stagedContainersDir=stagedContainersDir, - local_config=local_config, + tools_config=tools_config, engine_name=engine_name, tempDir=tempDir, ) - tools = local_config.get("tools", {}) if local_config else {} - self.runtime_cmd = tools.get("singularityCommand", DEFAULT_SINGULARITY_CMD) + self.runtime_cmd = self.tools_config.get( + "singularityCommand", DEFAULT_SINGULARITY_CMD + ) # This is needed due a bug in singularity 3.6, where # singularity pull --disable-cache does not create a container @@ -352,7 +354,6 @@ def _getContainerArchitecture( def materializeSingleContainer( self, tag: "ContainerTaggedName", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[Union[RelPath, AbsPath]]" = None, offline: "bool" = False, force: "bool" = False, @@ -362,7 +363,6 @@ def materializeSingleContainer( """ the_cont = self._materializeSingleContainerSing( tag, - simpleFileNameMethod, containers_dir=containers_dir, offline=offline, force=force, @@ -373,7 +373,6 @@ def materializeSingleContainer( def _materializeSingleContainerSing( self, tag: "ContainerTaggedName", - simpleFileNameMethod: "ContainerFileNamingMethod", matEnv: "Mapping[str, str]" = {}, dhelp: "DockerHelper" = DockerHelper(), containers_dir: "Optional[AnyPath]" = None, @@ -421,86 +420,77 @@ def _materializeSingleContainerSing( parsedTag = parse.urlparse(singTag) # Last case, it already has a registry declared - containerFilename = simpleFileNameMethod(cast("URIType", tag_name)) - containerFilenameMeta = containerFilename + META_JSON_POSTFIX - localContainerPath = cast( - "AbsPath", - os.path.join(self.engineContainersSymlinkDir, containerFilename), - ) - localContainerPathMeta = cast( - "AbsPath", - os.path.join(self.engineContainersSymlinkDir, containerFilenameMeta), - ) - - self.logger.info( - f"downloading singularity container: {tag_name} => {localContainerPath}" - ) - - # First, let's materialize the container image if it is needed - tmpContainerPath = None - - # Does the metadata exist? 
- fetch_metadata = force or not os.path.isfile(localContainerPathMeta) - - imageSignature = None - canonicalContainerPath = None - - # Now it is time to check the local cache of the container - if not force and os.path.isfile(localContainerPath): - trusted_copy = False - if os.path.islink(localContainerPath): - # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - unlinkedContainerPath = os.readlink(localContainerPath) - fsImageSignature = os.path.basename(unlinkedContainerPath) - imageSignature = cast( - "Fingerprint", - fsImageSignature.replace("~", "=") - .replace("-", "/") - .replace("_", "+"), - ) - - canonicalContainerPath = os.path.join( - self.containersCacheDir, - fsImageSignature, - ) - - trusted_copy = os.path.samefile( - os.path.realpath(localContainerPath), - os.path.realpath(canonicalContainerPath), - ) - else: - imageSignature = cast( - "Fingerprint", ComputeDigestFromFile(localContainerPath) - ) - # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - canonicalContainerPath = os.path.join( - self.containersCacheDir, - imageSignature.replace("=", "~") - .replace("/", "-") - .replace("+", "_"), - ) + fetch_metadata = True + trusted_copy = False + localContainerPath: "Optional[AbsPath]" = None + localContainerPathMeta: "Optional[AbsPath]" = None + imageSignature: "Optional[Fingerprint]" = None + fingerprint: "Optional[Fingerprint]" = None + if not force: + ( + trusted_copy, + localContainerPath, + localContainerPathMeta, + imageSignature, + ) = self.cc_handler.query(tag) + + if trusted_copy: + try: + with open( + localContainerPathMeta, mode="r", encoding="utf8" + ) as tcpm: + raw_metadata = json.load(tcpm) + if isinstance(raw_metadata, dict) and ( + "registryServer" in raw_metadata + ): + metadata = cast("SingularityManifest", raw_metadata) + registryServer = metadata["registryServer"] + registryType = metadata.get("registryType", "docker") + repo = metadata["repo"] + alias = metadata.get("alias") + partial_fingerprint = metadata.get("dcd") + imageSignature_in_metadata = metadata.get("image_signature") + manifest = metadata.get("manifest") + if partial_fingerprint is not None: + fingerprint = cast( + # Maybe in the future registryServer + '/' + repo + "@" + partial_fingerprint + "Fingerprint", + repo + "@" + partial_fingerprint, + ) + else: + # TODO: is there a better alternative? + fingerprint = cast("Fingerprint", tag_name) + + if imageSignature_in_metadata is not None: + # Do the signatures match? + fetch_metadata = ( + imageSignature != imageSignature_in_metadata + ) + else: + registryServer = "" + registryType = None + repo = "" + alias = "" + partial_fingerprint = "" + imageSignature_in_metadata = None + manifest = None + fingerprint = cast("Fingerprint", tag_name) - if os.path.isfile(canonicalContainerPath): - canonicalImageSignature = cast( - "Fingerprint", ComputeDigestFromFile(canonicalContainerPath) + except Exception as e: + # Some problem happened parsing the existing metadata + self.logger.exception( + f"Error while reading or parsing {localContainerPathMeta}. Discarding it" ) - trusted_copy = canonicalImageSignature == imageSignature - - if not trusted_copy: - self.logger.warning( - f"Unable to trust Singularity container {singTag} cached copy at {unlinkedContainerPath} pointed from {localContainerPath}. 
Discarding" - ) - os.unlink(localContainerPath) + self.logger.info(f"downloading singularity container: {tag_name}") # Now, time to fetch the container itself # (if it is needed) - if force or not os.path.isfile(localContainerPath): + tmpContainerPath: "Optional[str]" = None + if not trusted_copy: if offline: raise ContainerFactoryException( - "Cannot download containers in offline mode from {} to {}".format( - tag_name, localContainerPath - ) + f"Cannot download containers in offline mode from {tag_name}" ) with tempfile.NamedTemporaryFile() as s_out, tempfile.NamedTemporaryFile() as s_err: @@ -540,48 +530,10 @@ def _materializeSingleContainerSing( ) ) - imageSignature = cast( - "Fingerprint", ComputeDigestFromFile(tmpContainerPath) - ) - # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - canonicalContainerPath = os.path.join( - self.containersCacheDir, - imageSignature.replace("=", "~") - .replace("/", "-") - .replace("+", "_"), + # This is needed for the metadata + imageSignature = self.cc_handler._computeFingerprint( + cast("AnyPath", tmpContainerPath) ) - - # There was already a materialized container - if os.path.exists(canonicalContainerPath): - tmpSize = os.path.getsize(tmpContainerPath) - canonicalSize = os.path.getsize(canonicalContainerPath) - - if tmpSize != canonicalSize: - # If files were not the same complain - # This should not happen!!!!! - raise ContainerFactoryException( - f"FATAL ERROR: Singularity cache collision for {imageSignature}, with differing sizes ({tag_name} => local {canonicalSize} != remote {tmpSize})" - ) - else: - # Remove the temporary one, as the name contains the digest - os.unlink(tmpContainerPath) - else: - shutil.move(tmpContainerPath, canonicalContainerPath) - # The metadata should be invalidated - if not fetch_metadata: - fetch_metadata = True - - # At this point, the container is in the right canonical path - # Now, create the relative symbolic link - if os.path.lexists(localContainerPath): - os.unlink(localContainerPath) - os.symlink( - os.path.relpath( - canonicalContainerPath, self.engineContainersSymlinkDir - ), - localContainerPath, - ) - else: errstr = """Could not materialize singularity image {}. Retval {} ====== @@ -609,57 +561,10 @@ def _materializeSingleContainerSing( # At this point we should always have a image signature assert imageSignature is not None - assert canonicalContainerPath is not None - - fingerprint: "Optional[Fingerprint]" = None - if not fetch_metadata: - try: - with open(localContainerPathMeta, mode="r", encoding="utf8") as tcpm: - raw_metadata = json.load(tcpm) - if isinstance(raw_metadata, dict) and ( - "registryServer" in raw_metadata - ): - metadata = cast("SingularityManifest", raw_metadata) - registryServer = metadata["registryServer"] - registryType = metadata.get("registryType", "docker") - repo = metadata["repo"] - alias = metadata.get("alias") - partial_fingerprint = metadata.get("dcd") - imageSignature_in_metadata = metadata.get("image_signature") - manifest = metadata.get("manifest") - if partial_fingerprint is not None: - fingerprint = cast( - # Maybe in the future registryServer + '/' + repo + "@" + partial_fingerprint - "Fingerprint", - repo + "@" + partial_fingerprint, - ) - else: - # TODO: is there a better alternative? - fingerprint = cast("Fingerprint", tag_name) - - if imageSignature_in_metadata is not None: - # Do the signatures match? 
- fetch_metadata = ( - imageSignature != imageSignature_in_metadata - ) - else: - registryServer = "" - registryType = None - repo = "" - alias = "" - partial_fingerprint = "" - imageSignature_in_metadata = None - manifest = None - fingerprint = cast("Fingerprint", tag_name) - except Exception as e: - # Some problem happened parsing the existing metadata - self.logger.exception( - f"Error while reading or parsing {localContainerPathMeta}. Discarding it" - ) - fetch_metadata = True # When no metadata exists, we are bringing the metadata # to a temporary path + tmpContainerPathMeta: "Optional[str]" = None if fetch_metadata: if offline: raise ContainerFactoryException( @@ -667,9 +572,11 @@ def _materializeSingleContainerSing( ) if tmpContainerPath is None: + assert localContainerPath is not None tmpContainerPath = os.path.join( self.containersCacheDir, str(uuid.uuid4()) ) + link_or_copy(localContainerPath, cast("AbsPath", tmpContainerPath)) tmpContainerPathMeta = tmpContainerPath + META_JSON_POSTFIX self.logger.debug( @@ -712,37 +619,25 @@ def _materializeSingleContainerSing( fingerprint = cast("Fingerprint", tag_name) json.dump(tmp_meta, tcpm) - canonicalContainerPathMeta = cast( - "AbsPath", canonicalContainerPath + META_JSON_POSTFIX - ) - shutil.move(tmpContainerPathMeta, canonicalContainerPathMeta) - - if os.path.lexists(localContainerPathMeta): - os.unlink(localContainerPathMeta) - os.symlink( - os.path.relpath( - canonicalContainerPathMeta, self.engineContainersSymlinkDir - ), - localContainerPathMeta, - ) - # Last, but not the least important # Hardlink or copy the container and its metadata + if tmpContainerPath is not None and tmpContainerPathMeta is not None: + self.cc_handler.update( + tag, + image_path=cast("AbsPath", tmpContainerPath), + image_metadata_path=cast("AbsPath", tmpContainerPathMeta), + do_move=True, + ) + if containers_dir is None: containers_dir = self.stagedContainersDir - containerPath = cast("AbsPath", os.path.join(containers_dir, containerFilename)) - # Do not allow overwriting in offline mode - if not offline: - containerPathMeta = cast( - "AbsPath", os.path.join(containers_dir, containerFilenameMeta) - ) - os.makedirs(containers_dir, exist_ok=True) - if force or not os.path.exists(containerPath): - link_or_copy(localContainerPath, containerPath) - if force or not os.path.exists(containerPathMeta): - link_or_copy(localContainerPathMeta, containerPathMeta) + transferred_image = self.cc_handler.transfer( + tag, stagedContainersDir=containers_dir, force=force and not offline + ) + assert transferred_image is not None, f"Unexpected cache miss for {tag}" + containerPath, containerPathMeta = transferred_image return Container( origTaggedName=tag_name, @@ -761,7 +656,6 @@ def _materializeSingleContainerSing( def materializeContainers( self, tagList: "Sequence[ContainerTaggedName]", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[AnyPath]" = None, offline: "bool" = False, force: "bool" = False, @@ -785,7 +679,6 @@ def materializeContainers( continue matched_container = self._materializeSingleContainerSing( tag, - simpleFileNameMethod=simpleFileNameMethod, matEnv=matEnv, dhelp=dhelp, containers_dir=containers_dir, @@ -815,7 +708,6 @@ def materializeContainers( def deploySingleContainer( self, container: "Container", - simpleFileNameMethod: "ContainerFileNamingMethod", containers_dir: "Optional[AnyPath]" = None, force: "bool" = False, ) -> "bool": diff --git a/wfexs_backend/workflow_engines/__init__.py 
b/wfexs_backend/workflow_engines/__init__.py index 746d9cc4..1f5c9491 100644 --- a/wfexs_backend/workflow_engines/__init__.py +++ b/wfexs_backend/workflow_engines/__init__.py @@ -505,10 +505,9 @@ def __init__( self.stagedContainersDir = cast("AbsPath", stagedContainersDir) # Setting up common properties - self.docker_cmd = local_config.get("tools", {}).get( - "dockerCommand", DEFAULT_DOCKER_CMD - ) - engine_mode = local_config.get("tools", {}).get("engineMode") + tools_config = local_config.get("tools", {}) + self.docker_cmd = tools_config.get("dockerCommand", DEFAULT_DOCKER_CMD) + engine_mode = tools_config.get("engineMode") if engine_mode is None: engine_mode = DEFAULT_ENGINE_MODE else: @@ -527,10 +526,17 @@ def __init__( ) self.logger.debug(f"Instantiating container type {container_type}") + # For materialized containers, we should use common directories + # This for the containers themselves + containersCacheDir = cast( + "AnyPath", + os.path.join(cacheDir, "containers", container_factory_clazz.__name__), + ) self.container_factory = container_factory_clazz( - cacheDir=cacheDir, + simpleFileNameMethod=self.simpleContainerFileName, + containersCacheDir=containersCacheDir, stagedContainersDir=stagedContainersDir, - local_config=local_config, + tools_config=tools_config, engine_name=self.__class__.__name__, tempDir=self.tempDir, ) @@ -770,7 +776,6 @@ def materialize_containers( self.container_factory.engine_version(), self.container_factory.materializeContainers( listOfContainerTags, - self.simpleContainerFileName, containers_dir=containersDir, offline=offline, ), @@ -788,7 +793,6 @@ def deploy_containers( return self.container_factory.deployContainers( containers_list=containers_list, - simpleFileNameMethod=self.simpleContainerFileName, containers_dir=containersDir, force=force, ) diff --git a/wfexs_backend/workflow_engines/nextflow_engine.py b/wfexs_backend/workflow_engines/nextflow_engine.py index 04d41f55..6e85833b 100644 --- a/wfexs_backend/workflow_engines/nextflow_engine.py +++ b/wfexs_backend/workflow_engines/nextflow_engine.py @@ -743,15 +743,18 @@ def runNextflowCommand( self, nextflow_version: "EngineVersion", commandLine: "Sequence[str]", + containers_path: "Optional[AnyPath]" = None, workdir: "Optional[AbsPath]" = None, intermediateDir: "Optional[AbsPath]" = None, nextflow_path: "Optional[EnginePath]" = None, - containers_path: "Optional[AnyPath]" = None, stdoutFilename: "Optional[AbsPath]" = None, stderrFilename: "Optional[AbsPath]" = None, runEnv: "Optional[Mapping[str, str]]" = None, ) -> "Tuple[ExitVal, Optional[str], Optional[str]]": self.logger.debug("Command => nextflow " + " ".join(commandLine)) + + if containers_path is None: + containers_path = self.container_factory.cacheDir if self.engine_mode == EngineMode.Docker: ( retval, @@ -760,9 +763,9 @@ def runNextflowCommand( ) = self.runNextflowCommandInDocker( nextflow_version, commandLine, - workdir, - intermediateDir=intermediateDir, containers_path=containers_path, + workdir=workdir, + intermediateDir=intermediateDir, stdoutFilename=stdoutFilename, stderrFilename=stderrFilename, runEnv=runEnv, @@ -771,9 +774,9 @@ def runNextflowCommand( retval, nxf_run_stdout_v, nxf_run_stderr_v = self.runLocalNextflowCommand( nextflow_version, commandLine, - workdir, - intermediateDir=intermediateDir, containers_path=containers_path, + workdir=workdir, + intermediateDir=intermediateDir, nextflow_install_dir=nextflow_path, stdoutFilename=stdoutFilename, stderrFilename=stderrFilename, @@ -792,10 +795,10 @@ def runLocalNextflowCommand( 
self, nextflow_version: "EngineVersion", commandLine: "Sequence[str]", + containers_path: "AnyPath", workdir: "Optional[AbsPath]" = None, intermediateDir: "Optional[AbsPath]" = None, nextflow_install_dir: "Optional[EnginePath]" = None, - containers_path: "Optional[AnyPath]" = None, stdoutFilename: "Optional[AbsPath]" = None, stderrFilename: "Optional[AbsPath]" = None, runEnv: "Optional[Mapping[str, str]]" = None, @@ -851,8 +854,6 @@ def runLocalNextflowCommand( instEnv["TMPDIR"] = self.tempDir # This is needed to have Nextflow using the cached contents - if containers_path is None: - containers_path = self.container_factory.cacheDir if self.container_factory.containerType == ContainerType.Singularity: # See https://github.com/nextflow-io/nextflow/commit/91e9ee7c3c2ed4e63559339ae1a1d2c7d5f25953 if nextflow_version >= "21.09.0-edge": @@ -940,9 +941,9 @@ def runNextflowCommandInDocker( self, nextflow_version: "EngineVersion", commandLine: "Sequence[str]", + containers_path: "AnyPath", workdir: "Optional[AbsPath]" = None, intermediateDir: "Optional[AbsPath]" = None, - containers_path: "Optional[AnyPath]" = None, stdoutFilename: "Optional[AbsPath]" = None, stderrFilename: "Optional[AbsPath]" = None, runEnv: "Optional[Mapping[str, str]]" = None, @@ -1904,10 +1905,10 @@ def wfexs_allParams() launch_retval, launch_stdout, launch_stderr = self.runNextflowCommand( matWfEng.version, nxf_params, + containers_path=matWfEng.containers_path, workdir=outputsDir, intermediateDir=intermediateDir, nextflow_path=matWfEng.engine_path, - containers_path=matWfEng.containers_path, stdoutFilename=stdoutFilename, stderrFilename=stderrFilename, runEnv=runEnv, From eff89159adc40a76aa882122641ea7d838904b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 4 Jun 2024 04:05:54 +0200 Subject: [PATCH 05/62] Container caching infrastructure overhaul (2/3). * Rewritten DockerContainerFactory, so it uses new instance of ContainerCacheHandler. * Added an additional helper method, to know which files to expect in the staged working directories. 
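
A minimal, illustrative sketch of how the reworked caching API is meant to be
driven (the FakeTag stand-in, the naming callable and the directory names are
assumptions made for the example, not code from this series; only the
ContainerCacheHandler methods mirror these patches):

    # Illustrative only: exercise the ContainerCacheHandler introduced in this
    # overhaul. Everything marked as a stand-in below is hypothetical.
    from dataclasses import dataclass

    from wfexs_backend.container_factories import ContainerCacheHandler

    @dataclass
    class FakeTag:
        # Duck-typed stand-in: the cache handler only reads .origTaggedName
        origTaggedName: str

    def simple_file_name(tagged_name: str) -> str:
        # Hypothetical naming method (normally supplied by the workflow engine)
        return tagged_name.replace("/", "_").replace(":", "-") + ".sif"

    handler = ContainerCacheHandler(
        containers_cache_dir=None,     # None => temporary cache removed at exit
        engine_name="example_engine",  # only used as the symlink subdirectory
        simple_file_name_method=simple_file_name,
    )

    tag = FakeTag("docker.io/library/busybox:stable")

    # 1. Is a trusted snapshot (image + metadata) already in the cache?
    trusted, image_path, meta_path, signature = handler.query(tag)

    # 2. If not, a factory materializes the image into temporary files and then
    #    registers them, e.g.:
    #    handler.update(tag, image_path=tmp_img, image_metadata_path=tmp_meta)

    # 3. Hardlink or copy the cached snapshot into a staged containers directory:
    #    handler.transfer(tag, stagedContainersDir="/path/to/staged")

    # The helper added in this patch reports which file names to expect in a
    # staged working directory for a given container:
    image_dest, meta_dest = handler.genStagedContainersDirPaths(tag, "/path/to/staged")
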
--- wfexs_backend/container_factories/__init__.py | 40 ++- .../container_factories/docker_container.py | 271 +++++------------- .../singularity_container.py | 10 +- .../hello/hello_cwl_http.wfex.stage | 13 + .../hello/hellow_cwl_docker.wfex.stage | 14 + .../hello/hellow_cwl_podman.wfex.stage | 14 + 6 files changed, 147 insertions(+), 215 deletions(-) create mode 100644 workflow_examples/hello/hello_cwl_http.wfex.stage create mode 100644 workflow_examples/hello/hellow_cwl_docker.wfex.stage create mode 100644 workflow_examples/hello/hellow_cwl_podman.wfex.stage diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index 29ee6279..febb538b 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -28,6 +28,7 @@ import abc import logging import inspect +import uuid from typing import ( cast, @@ -284,6 +285,12 @@ def __init__( self.simpleFileNameMethod = simple_file_name_method + def _genTmpContainerPath(self) -> "AbsPath": + """ + This is a helper method + """ + return cast("AbsPath", os.path.join(self.containersCacheDir, str(uuid.uuid4()))) + def _genContainerPaths( self, container: "ContainerTaggedName" ) -> "Tuple[AbsPath, AbsPath]": @@ -371,6 +378,26 @@ def query( return trusted_copy, localContainerPath, localContainerPathMeta, imageSignature + def genStagedContainersDirPaths( + self, + container: "ContainerTaggedName", + stagedContainersDir: "AnyPath", + ) -> "Tuple[AbsPath, AbsPath]": + containerFilename = self.simpleFileNameMethod( + cast("URIType", container.origTaggedName) + ) + containerFilenameMeta = containerFilename + META_JSON_POSTFIX + + containerPath = cast( + "AbsPath", os.path.join(stagedContainersDir, containerFilename) + ) + + containerPathMeta = cast( + "AbsPath", os.path.join(stagedContainersDir, containerFilenameMeta) + ) + + return containerPath, containerPathMeta + def transfer( self, container: "ContainerTaggedName", @@ -393,20 +420,11 @@ def transfer( # Last, but not the least important # Hardlink or copy the container and its metadata - containerFilename = self.simpleFileNameMethod( - cast("URIType", container.origTaggedName) + containerPath, containerPathMeta = self.genStagedContainersDirPaths( + container, stagedContainersDir ) - containerFilenameMeta = containerFilename + META_JSON_POSTFIX os.makedirs(stagedContainersDir, exist_ok=True) - containerPath = cast( - "AbsPath", os.path.join(stagedContainersDir, containerFilename) - ) - - containerPathMeta = cast( - "AbsPath", os.path.join(stagedContainersDir, containerFilenameMeta) - ) - if force or not os.path.exists(containerPath): link_or_copy(localContainerPath, containerPath) if force or not os.path.exists(containerPathMeta): diff --git a/wfexs_backend/container_factories/docker_container.py b/wfexs_backend/container_factories/docker_container.py index dee9dc3f..25b58ec3 100644 --- a/wfexs_backend/container_factories/docker_container.py +++ b/wfexs_backend/container_factories/docker_container.py @@ -181,169 +181,72 @@ def materializeSingleContainer( # Last case, it already has a registry declared self.logger.info(f"downloading docker container: {tag_name} => {dockerTag}") - # These are the paths to the copy of the saved container - containerFilename = self.simpleFileNameMethod(cast("URIType", tag_name)) - containerFilenameMeta = containerFilename + META_JSON_POSTFIX - localContainerPath = cast( - "AbsPath", - os.path.join(self.engineContainersSymlinkDir, containerFilename), - ) - localContainerPathMeta = 
cast( - "AbsPath", - os.path.join(self.engineContainersSymlinkDir, containerFilenameMeta), - ) - - # Keep a copy outside the cache directory - if containers_dir is None: - containers_dir = self.stagedContainersDir - containerPath = cast("AbsPath", os.path.join(containers_dir, containerFilename)) - containerPathMeta = cast( - "AbsPath", os.path.join(containers_dir, containerFilenameMeta) - ) - # Now it is time to check whether the local cache of the container - # does exist and it is right + fetch_metadata = True trusted_copy = False - image_id: "Optional[Fingerprint]" = None + localContainerPath: "Optional[AbsPath]" = None + localContainerPathMeta: "Optional[AbsPath]" = None imageSignature: "Optional[Fingerprint]" = None + image_id: "Optional[Fingerprint]" = None manifestsImageSignature: "Optional[Fingerprint]" = None manifests = None manifest = None - if not force and os.path.isfile(localContainerPathMeta): - trusted_copy = True - try: - with open(localContainerPathMeta, mode="r", encoding="utf-8") as mH: - signaturesAndManifest = cast( - "DockerManifestMetadata", json.load(mH) - ) - image_id = signaturesAndManifest["image_id"] - imageSignature = signaturesAndManifest["image_signature"] - manifestsImageSignature = signaturesAndManifest[ - "manifests_signature" - ] - manifests = signaturesAndManifest["manifests"] - - # Check the status of the gathered manifests - trusted_copy = ( - manifestsImageSignature - == self._gen_trimmed_manifests_signature(manifests) - ) - except Exception as e: - self.logger.exception( - f"Problems extracting docker metadata at {localContainerPathMeta}" - ) - trusted_copy = False - - # Let's check metadata coherence - if trusted_copy: - trusted_copy = ( - imageSignature is not None - and manifestsImageSignature is not None - and manifests is not None - ) - - if trusted_copy: - assert manifestsImageSignature is not None - if os.path.islink(localContainerPathMeta): - # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - unlinkedContainerPathMeta = os.readlink(localContainerPathMeta) - fsImageSignatureMeta = os.path.basename(unlinkedContainerPathMeta) - if fsImageSignatureMeta.endswith(META_JSON_POSTFIX): - fsImageSignatureMeta = fsImageSignatureMeta[ - : -len(META_JSON_POSTFIX) - ] - putativeManifestsImageSignature = ( - fsImageSignatureMeta.replace("~", "=") - .replace("-", "/") - .replace("_", "+") - ) - - trusted_copy = ( - putativeManifestsImageSignature == manifestsImageSignature - ) - if trusted_copy: - canonicalContainerPathMeta = os.path.join( - self.containersCacheDir, - fsImageSignatureMeta + META_JSON_POSTFIX, - ) - - trusted_copy = os.path.samefile( - os.path.realpath(localContainerPathMeta), - os.path.realpath(canonicalContainerPathMeta), - ) - else: - # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - putativeCanonicalContainerPathMeta = os.path.join( - self.containersCacheDir, - manifestsImageSignature.replace("=", "~") - .replace("/", "-") - .replace("+", "_"), - ) - - # This is to detect poisoned caches - trusted_copy = os.path.samefile( - localContainerPathMeta, putativeCanonicalContainerPathMeta - ) - - # Now, let's check the image itself - if trusted_copy and os.path.isfile(localContainerPath): - trusted_copy = imageSignature == ComputeDigestFromFile( - localContainerPath - ) + if not force: + ( + trusted_copy, + localContainerPath, + localContainerPathMeta, + imageSignature, + ) = self.cc_handler.query(tag) if trusted_copy: - assert imageSignature is not None - if 
os.path.islink(localContainerPath): - # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - unlinkedContainerPath = os.readlink(localContainerPath) - fsImageSignature = os.path.basename(unlinkedContainerPath) - putativeImageSignature = ( - fsImageSignature.replace("~", "=") - .replace("-", "/") - .replace("_", "+") - ) - - trusted_copy = putativeImageSignature == manifestsImageSignature - if trusted_copy: - canonicalContainerPath = os.path.join( - self.containersCacheDir, - fsImageSignature, + try: + with open(localContainerPathMeta, mode="r", encoding="utf-8") as mH: + signaturesAndManifest = cast( + "DockerManifestMetadata", json.load(mH) ) + image_id = signaturesAndManifest["image_id"] + imageSignature_in_metadata = signaturesAndManifest[ + "image_signature" + ] + manifestsImageSignature = signaturesAndManifest[ + "manifests_signature" + ] + manifests = signaturesAndManifest["manifests"] - trusted_copy = os.path.samefile( - os.path.realpath(localContainerPath), - os.path.realpath(canonicalContainerPath), + # Check the status of the gathered manifests + trusted_copy = ( + manifestsImageSignature + == self._gen_trimmed_manifests_signature(manifests) ) - else: - # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - putativeCanonicalContainerPath = os.path.join( - self.containersCacheDir, - imageSignature.replace("=", "~") - .replace("/", "-") - .replace("+", "_"), - ) - - # This is to detect poisoned caches - trusted_copy = os.path.samefile( - localContainerPath, putativeCanonicalContainerPath + if trusted_copy: + trusted_copy = imageSignature == imageSignature_in_metadata + fetch_metadata = not trusted_copy + except Exception as e: + self.logger.exception( + f"Problems extracting docker metadata at {localContainerPathMeta}" ) + trusted_copy = False # And now, the final judgement! - if force or not trusted_copy: + if not trusted_copy: if offline: raise ContainerFactoryException( f"Banned remove docker containers in offline mode from {tag_name}" ) - if os.path.exists(localContainerPathMeta) or os.path.exists( - localContainerPath + if ( + localContainerPathMeta is not None + and localContainerPath is not None + and ( + os.path.exists(localContainerPathMeta) + or os.path.exists(localContainerPath) + ) ): self.logger.warning( f"Unable to trust Docker container {tag_name} => {dockerTag} . 
Discarding cached contents" ) - real_unlink_if_exists(localContainerPathMeta) - real_unlink_if_exists(localContainerPath) # Blindly remove _, _, _ = self._rmi(dockerTag, matEnv) @@ -374,37 +277,23 @@ def materializeSingleContainer( manifests = cast("Sequence[Mapping[str, Any]]", json.loads(d_out_v)) manifest = manifests[0] image_id = cast("Fingerprint", manifest["Id"]) + manifestsImageSignature = self._gen_trimmed_manifests_signature( + manifests + ) except Exception as e: raise ContainerFactoryException( f"FATAL ERROR: Docker finished properly but it did not properly materialize {tag_name}: {e}" ) self.logger.info( - "saving docker container (for reproducibility matters): {} => {}".format( - tag_name, localContainerPath - ) + f"saving docker container (for reproducibility matters): {tag_name}" ) # Let's materialize the container image for preservation - manifestsImageSignature = self._gen_trimmed_manifests_signature(manifests) - canonicalContainerPath = os.path.join( - self.containersCacheDir, - manifestsImageSignature.replace("=", "~") - .replace("/", "-") - .replace("+", "_"), - ) - - # Being sure the paths do not exist - if os.path.exists(canonicalContainerPath): - os.unlink(canonicalContainerPath) - canonicalContainerPathMeta = canonicalContainerPath + META_JSON_POSTFIX - if os.path.exists(canonicalContainerPathMeta): - os.unlink(canonicalContainerPathMeta) + tmpContainerPath = self.cc_handler._genTmpContainerPath() # Now, save the image as such - d_retval, d_err_ev = self._save( - dockerTag, cast("AbsPath", canonicalContainerPath), matEnv - ) + d_retval, d_err_ev = self._save(dockerTag, tmpContainerPath, matEnv) self.logger.debug("docker save retval: {}".format(d_retval)) self.logger.debug("docker save stderr: {}".format(d_err_v)) @@ -418,19 +307,22 @@ def materializeSingleContainer( ) # Removing partial dumps - if os.path.exists(canonicalContainerPath): + if os.path.exists(tmpContainerPath): try: - os.unlink(canonicalContainerPath) + os.unlink(tmpContainerPath) except: pass raise ContainerEngineException(errstr) - imageSignature = cast( - "Fingerprint", ComputeDigestFromFile(canonicalContainerPath) + # This is needed for the metadata + imageSignature = self.cc_handler._computeFingerprint( + cast("AnyPath", tmpContainerPath) ) + tmpContainerPathMeta = tmpContainerPath + META_JSON_POSTFIX + # Last, save the metadata itself for further usage - with open(canonicalContainerPathMeta, mode="w", encoding="utf-8") as tcpM: + with open(tmpContainerPathMeta, mode="w", encoding="utf-8") as tcpM: manifest_metadata: "DockerManifestMetadata" = { "image_id": image_id, "image_signature": imageSignature, @@ -439,40 +331,30 @@ def materializeSingleContainer( } json.dump(manifest_metadata, tcpM) - # Now, check the relative symbolic link of image - if os.path.lexists(localContainerPath): - os.unlink(localContainerPath) - - os.symlink( - os.path.relpath( - canonicalContainerPath, self.engineContainersSymlinkDir - ), - localContainerPath, + # And update the cache + self.cc_handler.update( + tag, + image_path=tmpContainerPath, + image_metadata_path=cast("AbsPath", tmpContainerPathMeta), + do_move=True, ) - # Now, check the relative symbolic link of metadata - if os.path.lexists(localContainerPathMeta): - os.unlink(localContainerPathMeta) - os.symlink( - os.path.relpath( - canonicalContainerPathMeta, self.engineContainersSymlinkDir - ), - localContainerPathMeta, - ) + if containers_dir is None: + containers_dir = self.stagedContainersDir + + # Do not allow overwriting in offline mode + transferred_image = 
self.cc_handler.transfer( + tag, stagedContainersDir=containers_dir, force=force and not offline + ) + assert transferred_image is not None, f"Unexpected cache miss for {tag}" + containerPath, containerPathMeta = transferred_image assert manifestsImageSignature is not None assert manifests is not None if manifest is None: manifest = manifests[0] - # Do not allow overwriting in offline mode - if not offline or not os.path.exists(containerPath): - link_or_copy(localContainerPath, containerPath) - if not offline or not os.path.exists(containerPathMeta): - link_or_copy(localContainerPathMeta, containerPathMeta) - # Now the image is not loaded here, but later in deploySingleContainer - # Then, compute the fingerprint fingerprint = None if len(manifest["RepoDigests"]) > 0: @@ -513,15 +395,10 @@ def deploySingleContainer( tag_name = container.origTaggedName # These are the paths to the copy of the saved container - containerFilename = self.simpleFileNameMethod(cast("URIType", tag_name)) - containerFilenameMeta = containerFilename + META_JSON_POSTFIX - - # Keep a copy outside the cache directory if containers_dir is None: containers_dir = self.stagedContainersDir - containerPath = cast("AbsPath", os.path.join(containers_dir, containerFilename)) - containerPathMeta = cast( - "AbsPath", os.path.join(containers_dir, containerFilenameMeta) + containerPath, containerPathMeta = self.cc_handler.genStagedContainersDirPaths( + container, containers_dir ) imageSignature: "Optional[Fingerprint]" = None @@ -529,7 +406,7 @@ def deploySingleContainer( manifests = None manifest = None if not os.path.isfile(containerPathMeta): - errmsg = f"FATAL ERROR: Docker saved image {containerFilenameMeta} is not in the staged working dir for {tag_name}" + errmsg = f"FATAL ERROR: Docker saved image {os.path.basename(containerPathMeta)} is not in the staged working dir for {tag_name}" self.logger.error(errmsg) raise ContainerFactoryException(errmsg) diff --git a/wfexs_backend/container_factories/singularity_container.py b/wfexs_backend/container_factories/singularity_container.py index 5ebd5200..f030d768 100644 --- a/wfexs_backend/container_factories/singularity_container.py +++ b/wfexs_backend/container_factories/singularity_container.py @@ -494,9 +494,7 @@ def _materializeSingleContainerSing( ) with tempfile.NamedTemporaryFile() as s_out, tempfile.NamedTemporaryFile() as s_err: - tmpContainerPath = os.path.join( - self.containersCacheDir, str(uuid.uuid4()) - ) + tmpContainerPath = self.cc_handler._genTmpContainerPath() self.logger.debug( f"downloading temporary container: {tag_name} => {tmpContainerPath}" @@ -573,10 +571,8 @@ def _materializeSingleContainerSing( if tmpContainerPath is None: assert localContainerPath is not None - tmpContainerPath = os.path.join( - self.containersCacheDir, str(uuid.uuid4()) - ) - link_or_copy(localContainerPath, cast("AbsPath", tmpContainerPath)) + tmpContainerPath = self.cc_handler._genTmpContainerPath() + link_or_copy(localContainerPath, tmpContainerPath) tmpContainerPathMeta = tmpContainerPath + META_JSON_POSTFIX self.logger.debug( diff --git a/workflow_examples/hello/hello_cwl_http.wfex.stage b/workflow_examples/hello/hello_cwl_http.wfex.stage new file mode 100644 index 00000000..011dae29 --- /dev/null +++ b/workflow_examples/hello/hello_cwl_http.wfex.stage @@ -0,0 +1,13 @@ +workflow_id: https://raw.githubusercontent.com/inab/hello-workflows/b0afc5871c6fdbd66576fcc5a3813ea49aca5104/cwl/hello-workflow.cwl +workflow_config: + secure: false +# All the inputs must be URLs or CURIEs from 
identifiers.org +params: + an_input: + c-l-a-s-s: File + url: https://raw.githubusercontent.com/inab/hello-workflows/b0afc5871c6fdbd66576fcc5a3813ea49aca5104/cwl/hello.yml +environment: + SECRET_VARIABLE: "The secret content" +outputs: + hello_output: + c-l-a-s-s: File diff --git a/workflow_examples/hello/hellow_cwl_docker.wfex.stage b/workflow_examples/hello/hellow_cwl_docker.wfex.stage new file mode 100644 index 00000000..331471c8 --- /dev/null +++ b/workflow_examples/hello/hellow_cwl_docker.wfex.stage @@ -0,0 +1,14 @@ +workflow_id: github:inab/hello-workflows/b0afc5871c6fdbd66576fcc5a3813ea49aca5104/cwl/hello-workflow.cwl +workflow_config: + secure: false + containerType: docker +# All the inputs must be URLs or CURIEs from identifiers.org +params: + an_input: + c-l-a-s-s: File + url: github:inab/hello-workflows/b0afc5871c6fdbd66576fcc5a3813ea49aca5104/cwl/hello.yml +environment: + SECRET_VARIABLE: "The secret content" +outputs: + hello_output: + c-l-a-s-s: File diff --git a/workflow_examples/hello/hellow_cwl_podman.wfex.stage b/workflow_examples/hello/hellow_cwl_podman.wfex.stage new file mode 100644 index 00000000..5b15f7f2 --- /dev/null +++ b/workflow_examples/hello/hellow_cwl_podman.wfex.stage @@ -0,0 +1,14 @@ +workflow_id: github:inab/hello-workflows/b0afc5871c6fdbd66576fcc5a3813ea49aca5104/cwl/hello-workflow.cwl +workflow_config: + secure: false + containerType: podman +# All the inputs must be URLs or CURIEs from identifiers.org +params: + an_input: + c-l-a-s-s: File + url: github:inab/hello-workflows/b0afc5871c6fdbd66576fcc5a3813ea49aca5104/cwl/hello.yml +environment: + SECRET_VARIABLE: "The secret content" +outputs: + hello_output: + c-l-a-s-s: File From b018b25903d04245a95c582ac4df799ee5e3344a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 4 Jun 2024 04:28:14 +0200 Subject: [PATCH 06/62] Container caching infrastructure overhaul (3/3). * Rewritten PodmanContainerFactory, so it uses new instance of ContainerCacheHandler. 
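
For context, this is a rough sketch of the per-tag flow that the rewritten
factories now delegate to the shared ContainerCacheHandler, reconstructed
only from the calls visible in the hunks below. It is not verbatim WfExS
code: `save_image` stands in for the engine-specific `podman save` /
`docker save` step, and `meta_postfix` stands in for the META_JSON_POSTFIX
constant.

    def materialize_with_cache(cc_handler, tag, save_image, containers_dir,
                               meta_postfix=".meta.json",
                               offline=False, force=False):
        # 1. Ask the cache whether a signature-checked copy already exists
        trusted, local_path, local_meta, image_signature = cc_handler.query(tag)

        if force or not trusted:
            if offline:
                raise RuntimeError(f"{tag.origTaggedName} is not cached (offline mode)")
            # 2. Dump the freshly pulled image into a temporary path,
            #    fingerprint it, and write the sidecar metadata JSON
            tmp_path = cc_handler._genTmpContainerPath()
            save_image(tmp_path)
            image_signature = cc_handler._computeFingerprint(tmp_path)
            # ... manifest metadata is written to tmp_path + meta_postfix here ...
            # 3. Move both files to their canonical, signature-derived cache names
            cc_handler.update(
                tag,
                image_path=tmp_path,
                image_metadata_path=tmp_path + meta_postfix,
                do_move=True,
            )

        # 4. Link or copy the cached pair into the staged containers directory
        transferred = cc_handler.transfer(
            tag, stagedContainersDir=containers_dir, force=force and not offline
        )
        return transferred  # (containerPath, containerPathMeta), or None on a miss

The engine-specific factories keep only the pull/save/inspect logic, while
file naming, signature verification and staging moves live in the handler.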
--- .../container_factories/podman_container.py | 271 +++++------------- 1 file changed, 74 insertions(+), 197 deletions(-) diff --git a/wfexs_backend/container_factories/podman_container.py b/wfexs_backend/container_factories/podman_container.py index 62c32e24..e9f4d1b8 100644 --- a/wfexs_backend/container_factories/podman_container.py +++ b/wfexs_backend/container_factories/podman_container.py @@ -208,169 +208,72 @@ def materializeSingleContainer( # Last case, it already has a registry declared self.logger.info(f"downloading podman container: {tag_name} => {podmanPullTag}") - # These are the paths to the copy of the saved container - containerFilename = self.simpleFileNameMethod(cast("URIType", tag_name)) - containerFilenameMeta = containerFilename + META_JSON_POSTFIX - localContainerPath = cast( - "AbsPath", - os.path.join(self.engineContainersSymlinkDir, containerFilename), - ) - localContainerPathMeta = cast( - "AbsPath", - os.path.join(self.engineContainersSymlinkDir, containerFilenameMeta), - ) - - # Keep a copy outside the cache directory - if containers_dir is None: - containers_dir = self.stagedContainersDir - containerPath = cast("AbsPath", os.path.join(containers_dir, containerFilename)) - containerPathMeta = cast( - "AbsPath", os.path.join(containers_dir, containerFilenameMeta) - ) - # Now it is time to check whether the local cache of the container - # does exist and it is right + fetch_metadata = True trusted_copy = False - image_id: "Optional[Fingerprint]" = None + localContainerPath: "Optional[AbsPath]" = None + localContainerPathMeta: "Optional[AbsPath]" = None imageSignature: "Optional[Fingerprint]" = None + image_id: "Optional[Fingerprint]" = None manifestsImageSignature: "Optional[Fingerprint]" = None manifests = None manifest = None - if not force and os.path.isfile(localContainerPathMeta): - trusted_copy = True - try: - with open(localContainerPathMeta, mode="r", encoding="utf-8") as mH: - signaturesAndManifest = cast( - "DockerManifestMetadata", json.load(mH) - ) - image_id = signaturesAndManifest["image_id"] - imageSignature = signaturesAndManifest["image_signature"] - manifestsImageSignature = signaturesAndManifest[ - "manifests_signature" - ] - manifests = signaturesAndManifest["manifests"] - - # Check the status of the gathered manifests - trusted_copy = ( - manifestsImageSignature - == self._gen_trimmed_manifests_signature(manifests) - ) - except Exception as e: - self.logger.exception( - f"Problems extracting podman metadata at {localContainerPathMeta}" - ) - trusted_copy = False - - # Let's check metadata coherence - if trusted_copy: - trusted_copy = ( - imageSignature is not None - and manifestsImageSignature is not None - and manifests is not None - ) - - if trusted_copy: - assert manifestsImageSignature is not None - if os.path.islink(localContainerPathMeta): - # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - unlinkedContainerPathMeta = os.readlink(localContainerPathMeta) - fsImageSignatureMeta = os.path.basename(unlinkedContainerPathMeta) - if fsImageSignatureMeta.endswith(META_JSON_POSTFIX): - fsImageSignatureMeta = fsImageSignatureMeta[ - : -len(META_JSON_POSTFIX) - ] - putativeManifestsImageSignature = ( - fsImageSignatureMeta.replace("~", "=") - .replace("-", "/") - .replace("_", "+") - ) - - trusted_copy = ( - putativeManifestsImageSignature == manifestsImageSignature - ) - if trusted_copy: - canonicalContainerPathMeta = os.path.join( - self.containersCacheDir, - fsImageSignatureMeta + META_JSON_POSTFIX, - ) - - 
trusted_copy = os.path.samefile( - os.path.realpath(localContainerPathMeta), - os.path.realpath(canonicalContainerPathMeta), - ) - else: - # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - putativeCanonicalContainerPathMeta = os.path.join( - self.containersCacheDir, - manifestsImageSignature.replace("=", "~") - .replace("/", "-") - .replace("+", "_"), - ) - - # This is to detect poisoned caches - trusted_copy = os.path.samefile( - localContainerPathMeta, putativeCanonicalContainerPathMeta - ) - - # Now, let's check the image itself - if trusted_copy and os.path.isfile(localContainerPath): - trusted_copy = imageSignature == ComputeDigestFromFile( - localContainerPath - ) + if not force: + ( + trusted_copy, + localContainerPath, + localContainerPathMeta, + imageSignature, + ) = self.cc_handler.query(tag) if trusted_copy: - assert imageSignature is not None - if os.path.islink(localContainerPath): - # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - unlinkedContainerPath = os.readlink(localContainerPath) - fsImageSignature = os.path.basename(unlinkedContainerPath) - putativeImageSignature = ( - fsImageSignature.replace("~", "=") - .replace("-", "/") - .replace("_", "+") - ) - - trusted_copy = putativeImageSignature == manifestsImageSignature - if trusted_copy: - canonicalContainerPath = os.path.join( - self.containersCacheDir, - fsImageSignature, + try: + with open(localContainerPathMeta, mode="r", encoding="utf-8") as mH: + signaturesAndManifest = cast( + "DockerManifestMetadata", json.load(mH) ) + image_id = signaturesAndManifest["image_id"] + imageSignature_in_metadata = signaturesAndManifest[ + "image_signature" + ] + manifestsImageSignature = signaturesAndManifest[ + "manifests_signature" + ] + manifests = signaturesAndManifest["manifests"] - trusted_copy = os.path.samefile( - os.path.realpath(localContainerPath), - os.path.realpath(canonicalContainerPath), + # Check the status of the gathered manifests + trusted_copy = ( + manifestsImageSignature + == self._gen_trimmed_manifests_signature(manifests) ) - else: - # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - putativeCanonicalContainerPath = os.path.join( - self.containersCacheDir, - imageSignature.replace("=", "~") - .replace("/", "-") - .replace("+", "_"), - ) - - # This is to detect poisoned caches - trusted_copy = os.path.samefile( - localContainerPath, putativeCanonicalContainerPath + if trusted_copy: + trusted_copy = imageSignature == imageSignature_in_metadata + fetch_metadata = not trusted_copy + except Exception as e: + self.logger.exception( + f"Problems extracting docker metadata at {localContainerPathMeta}" ) + trusted_copy = False # And now, the final judgement! - if force or not trusted_copy: + if not trusted_copy: if offline: raise ContainerFactoryException( f"Banned remove podman containers in offline mode from {tag_name}" ) - if os.path.exists(localContainerPathMeta) or os.path.exists( - localContainerPath + if ( + localContainerPathMeta is not None + and localContainerPath is not None + and ( + os.path.exists(localContainerPathMeta) + or os.path.exists(localContainerPath) + ) ): self.logger.warning( f"Unable to trust Podman container {dockerTag} => {podmanPullTag} . 
Discarding cached contents" ) - real_unlink_if_exists(localContainerPathMeta) - real_unlink_if_exists(localContainerPath) # Blindly remove _, _, _ = self._rmi(dockerTag, matEnv) @@ -401,37 +304,23 @@ def materializeSingleContainer( manifests = cast("Sequence[Mapping[str, Any]]", json.loads(d_out_v)) manifest = manifests[0] image_id = cast("Fingerprint", manifest["Id"]) + manifestsImageSignature = self._gen_trimmed_manifests_signature( + manifests + ) except Exception as e: raise ContainerFactoryException( f"FATAL ERROR: Podman finished properly but it did not properly materialize {tag_name}: {e}" ) self.logger.info( - "saving podman container (for reproducibility matters): {} => {}".format( - tag_name, localContainerPath - ) + f"saving podman container (for reproducibility matters): {tag_name}" ) # Let's materialize the container image for preservation - manifestsImageSignature = self._gen_trimmed_manifests_signature(manifests) - canonicalContainerPath = os.path.join( - self.containersCacheDir, - manifestsImageSignature.replace("=", "~") - .replace("/", "-") - .replace("+", "_"), - ) - - # Being sure the paths do not exist - if os.path.exists(canonicalContainerPath): - os.unlink(canonicalContainerPath) - canonicalContainerPathMeta = canonicalContainerPath + META_JSON_POSTFIX - if os.path.exists(canonicalContainerPathMeta): - os.unlink(canonicalContainerPathMeta) + tmpContainerPath = self.cc_handler._genTmpContainerPath() # Now, save the image as such - d_retval, d_err_ev = self._save( - dockerTag, cast("AbsPath", canonicalContainerPath), matEnv - ) + d_retval, d_err_ev = self._save(dockerTag, tmpContainerPath, matEnv) self.logger.debug("podman save retval: {}".format(d_retval)) self.logger.debug("podman save stderr: {}".format(d_err_v)) @@ -445,19 +334,22 @@ def materializeSingleContainer( ) # Removing partial dumps - if os.path.exists(canonicalContainerPath): + if os.path.exists(tmpContainerPath): try: - os.unlink(canonicalContainerPath) + os.unlink(tmpContainerPath) except: pass raise ContainerEngineException(errstr) - imageSignature = cast( - "Fingerprint", ComputeDigestFromFile(canonicalContainerPath) + # This is needed for the metadata + imageSignature = self.cc_handler._computeFingerprint( + cast("AnyPath", tmpContainerPath) ) + tmpContainerPathMeta = tmpContainerPath + META_JSON_POSTFIX + # Last, save the metadata itself for further usage - with open(canonicalContainerPathMeta, mode="w", encoding="utf-8") as tcpM: + with open(tmpContainerPathMeta, mode="w", encoding="utf-8") as tcpM: manifest_metadata: "DockerManifestMetadata" = { "image_id": image_id, "image_signature": imageSignature, @@ -466,40 +358,30 @@ def materializeSingleContainer( } json.dump(manifest_metadata, tcpM) - # Now, check the relative symbolic link of image - if os.path.lexists(localContainerPath): - os.unlink(localContainerPath) - - os.symlink( - os.path.relpath( - canonicalContainerPath, self.engineContainersSymlinkDir - ), - localContainerPath, + # And update the cache + self.cc_handler.update( + tag, + image_path=tmpContainerPath, + image_metadata_path=cast("AbsPath", tmpContainerPathMeta), + do_move=True, ) - # Now, check the relative symbolic link of metadata - if os.path.lexists(localContainerPathMeta): - os.unlink(localContainerPathMeta) - os.symlink( - os.path.relpath( - canonicalContainerPathMeta, self.engineContainersSymlinkDir - ), - localContainerPathMeta, - ) + if containers_dir is None: + containers_dir = self.stagedContainersDir + + # Do not allow overwriting in offline mode + transferred_image = 
self.cc_handler.transfer( + tag, stagedContainersDir=containers_dir, force=force and not offline + ) + assert transferred_image is not None, f"Unexpected cache miss for {tag}" + containerPath, containerPathMeta = transferred_image assert manifestsImageSignature is not None assert manifests is not None if manifest is None: manifest = manifests[0] - # Do not allow overwriting in offline mode - if not offline or not os.path.exists(containerPath): - link_or_copy(localContainerPath, containerPath) - if not offline or not os.path.exists(containerPathMeta): - link_or_copy(localContainerPathMeta, containerPathMeta) - # Now the image is not loaded here, but later in deploySingleContainer - # Then, compute the fingerprint based on remote repo's information fingerprint = None if len(manifest["RepoDigests"]) > 0: @@ -541,15 +423,10 @@ def deploySingleContainer( tag_name = container.origTaggedName # These are the paths to the copy of the saved container - containerFilename = self.simpleFileNameMethod(cast("URIType", tag_name)) - containerFilenameMeta = containerFilename + META_JSON_POSTFIX - - # Keep a copy outside the cache directory if containers_dir is None: containers_dir = self.stagedContainersDir - containerPath = cast("AbsPath", os.path.join(containers_dir, containerFilename)) - containerPathMeta = cast( - "AbsPath", os.path.join(containers_dir, containerFilenameMeta) + containerPath, containerPathMeta = self.cc_handler.genStagedContainersDirPaths( + container, containers_dir ) imageSignature: "Optional[Fingerprint]" = None @@ -557,7 +434,7 @@ def deploySingleContainer( manifests = None manifest = None if not os.path.isfile(containerPathMeta): - errmsg = f"FATAL ERROR: Podman saved image {containerFilenameMeta} is not in the staged working dir for {tag_name}" + errmsg = f"FATAL ERROR: Podman saved image {os.path.basename(containerPathMeta)} is not in the staged working dir for {tag_name}" self.logger.error(errmsg) raise ContainerFactoryException(errmsg) From c5d59ebafe1df80faf6e51a41d6057dc9af5b98a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 4 Jun 2024 04:29:19 +0200 Subject: [PATCH 07/62] Added minimal implementation of `deploySingleContainer` for SingularityContainerFactory. 
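
The added method boils down to an existence check on the already staged
SIF image, returning False to signal that nothing had to be redeployed.
Condensed (and slightly simplified) from the hunk below:

    if containers_dir is None:
        containers_dir = self.stagedContainersDir
    containerPath, containerPathMeta = self.cc_handler.genStagedContainersDirPaths(
        container, containers_dir
    )
    if not os.path.isfile(containerPath):
        # Nothing was staged for this tag, so it cannot be deployed
        raise ContainerFactoryException(
            f"SIF saved image {os.path.basename(containerPath)} is not in the staged working dir"
        )
    return False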
--- .../container_factories/singularity_container.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/wfexs_backend/container_factories/singularity_container.py b/wfexs_backend/container_factories/singularity_container.py index f030d768..fef8ae81 100644 --- a/wfexs_backend/container_factories/singularity_container.py +++ b/wfexs_backend/container_factories/singularity_container.py @@ -711,4 +711,15 @@ def deploySingleContainer( This is almost no-op, but it should check the integrity of the local images """ - return force + if containers_dir is None: + containers_dir = self.stagedContainersDir + containerPath, containerPathMeta = self.cc_handler.genStagedContainersDirPaths( + container, containers_dir + ) + + if not os.path.isfile(containerPath): + errmsg = f"FATAL ERROR: SIF saved image {os.path.basename(containerPath)} is not in the staged working dir for {container.origTaggedName}" + self.logger.error(errmsg) + raise ContainerFactoryException(errmsg) + + return False From d987897bfe1a5cbe9681e6f9b80f6cfa2f478877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 4 Jun 2024 13:37:17 +0200 Subject: [PATCH 08/62] Container materialization and deployment mechanisms have been uplifted, in order to support injected container images in the near commits. --- wfexs_backend/container_factories/__init__.py | 36 ++-- .../container_factories/docker_container.py | 98 ++++++++--- .../container_factories/no_container.py | 8 +- .../container_factories/podman_container.py | 98 ++++++++--- .../singularity_container.py | 158 +++++++++++++++--- 5 files changed, 317 insertions(+), 81 deletions(-) diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index febb538b..e0be0bd0 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -681,17 +681,23 @@ def materializeContainers( containers_dir = self.stagedContainersDir for tag in tagList: if self.AcceptsContainer(tag): - container = self.materializeSingleContainer( - tag, - containers_dir=containers_dir, - offline=offline, - force=force, - ) - if container is not None: - if container not in materialized_containers: - materialized_containers.append(container) - else: - not_found_containers.append(tag.origTaggedName) + container: "Optional[Container]" + try: + container, was_redeployed = self.deploySingleContainer( + tag, containers_dir=containers_dir, force=force + ) + except ContainerFactoryException as cfe: + container = self.materializeSingleContainer( + tag, + containers_dir=containers_dir, + offline=offline, + force=force, + ) + if container is not None: + if container not in materialized_containers: + materialized_containers.append(container) + else: + not_found_containers.append(tag.origTaggedName) if len(not_found_containers) > 0: raise ContainerNotFoundException( @@ -728,12 +734,12 @@ def deployContainers( containers_dir = self.stagedContainersDir for container in containers_list: if self.AcceptsContainer(container): - was_redeployed = self.deploySingleContainer( + deployed_container, was_redeployed = self.deploySingleContainer( container, containers_dir=containers_dir, force=force, ) - if was_redeployed is not None: + if was_redeployed: redeployed_containers.append(container) return redeployed_containers @@ -741,10 +747,10 @@ def deployContainers( @abc.abstractmethod def deploySingleContainer( self, - container: "Container", + container: "ContainerTaggedName", containers_dir: 
"Optional[AnyPath]" = None, force: "bool" = False, - ) -> "bool": + ) -> "Tuple[Container, bool]": """ It is assured the container is properly deployed """ diff --git a/wfexs_backend/container_factories/docker_container.py b/wfexs_backend/container_factories/docker_container.py index 25b58ec3..7702797b 100644 --- a/wfexs_backend/container_factories/docker_container.py +++ b/wfexs_backend/container_factories/docker_container.py @@ -147,20 +147,10 @@ def architecture(self) -> "Tuple[ContainerOperatingSystem, ProcessorArchitecture "Ill-formed answer from docker version" ) from je - def materializeSingleContainer( + def _genDockerTag( self, tag: "ContainerTaggedName", - containers_dir: "Optional[AnyPath]" = None, - offline: "bool" = False, - force: "bool" = False, - ) -> "Optional[Container]": - """ - It is assured the containers are materialized - """ - matEnv = dict(os.environ) - matEnv.update(self.environment) - - # It is an absolute URL, we are removing the docker:// + ) -> "URIType": tag_name = tag.origTaggedName dockerTag = ( tag_name[len(DOCKER_PROTO) :] @@ -180,6 +170,25 @@ def materializeSingleContainer( dockerTag = f"{registry}/{dockerTag}" # Last case, it already has a registry declared + return cast("URIType", dockerTag) + + def materializeSingleContainer( + self, + tag: "ContainerTaggedName", + containers_dir: "Optional[AnyPath]" = None, + offline: "bool" = False, + force: "bool" = False, + ) -> "Optional[Container]": + """ + It is assured the containers are materialized + """ + matEnv = dict(os.environ) + matEnv.update(self.environment) + + # It is an absolute URL, we are removing the docker:// + tag_name = tag.origTaggedName + dockerTag = self._genDockerTag(tag) + self.logger.info(f"downloading docker container: {tag_name} => {dockerTag}") fetch_metadata = True @@ -369,7 +378,7 @@ def materializeSingleContainer( # And add to the list of containers return Container( origTaggedName=tag_name, - taggedName=cast("URIType", dockerTag), + taggedName=dockerTag, signature=image_id, fingerprint=fingerprint, architecture=architecture, @@ -384,14 +393,13 @@ def materializeSingleContainer( def deploySingleContainer( self, - container: "Container", + container: "ContainerTaggedName", containers_dir: "Optional[AnyPath]" = None, force: "bool" = False, - ) -> "bool": + ) -> "Tuple[Container, bool]": # Should we load the image? 
matEnv = dict(os.environ) matEnv.update(self.environment) - dockerTag = container.taggedName tag_name = container.origTaggedName # These are the paths to the copy of the saved container @@ -405,22 +413,72 @@ def deploySingleContainer( manifestsImageSignature: "Optional[Fingerprint]" = None manifests = None manifest = None + if not os.path.isfile(containerPath): + errmsg = f"Docker saved image {os.path.basename(containerPath)} is not in the staged working dir for {tag_name}" + self.logger.warning(errmsg) + raise ContainerFactoryException(errmsg) + if not os.path.isfile(containerPathMeta): - errmsg = f"FATAL ERROR: Docker saved image {os.path.basename(containerPathMeta)} is not in the staged working dir for {tag_name}" - self.logger.error(errmsg) + errmsg = f"Docker saved image metadata {os.path.basename(containerPathMeta)} is not in the staged working dir for {tag_name}" + self.logger.warning(errmsg) raise ContainerFactoryException(errmsg) try: with open(containerPathMeta, mode="r", encoding="utf-8") as mH: signaturesAndManifest = cast("DockerManifestMetadata", json.load(mH)) - imageSignature = signaturesAndManifest["image_signature"] + imageSignature_in_metadata = signaturesAndManifest["image_signature"] manifestsImageSignature = signaturesAndManifest["manifests_signature"] manifests = signaturesAndManifest["manifests"] + + if isinstance(container, Container): + # Reuse the input container instance + rebuilt_container = container + dockerTag = rebuilt_container.taggedName + else: + manifest = manifests[0] + + dockerTag = self._genDockerTag(container) + + image_id = signaturesAndManifest["image_id"] + + # Then, compute the fingerprint + fingerprint = None + if len(manifest["RepoDigests"]) > 0: + fingerprint = manifest["RepoDigests"][0] + + # Learning about the intended processor architecture and variant + architecture = manifest.get("Architecture") + if architecture is not None: + variant = manifest.get("Variant") + if variant is not None: + architecture += "/" + variant + + rebuilt_container = Container( + origTaggedName=container.origTaggedName, + taggedName=dockerTag, + signature=image_id, + fingerprint=fingerprint, + architecture=architecture, + operatingSystem=manifest.get("Os"), + type=self.containerType, + localPath=containerPath, + registries=container.registries, + metadataLocalPath=containerPathMeta, + source_type=container.type, + image_signature=imageSignature_in_metadata, + ) except Exception as e: errmsg = f"Problems extracting docker metadata at {containerPathMeta}" self.logger.exception(errmsg) raise ContainerFactoryException(errmsg) + imageSignature = self.cc_handler._computeFingerprint(containerPath) + + if imageSignature != imageSignature_in_metadata: + errmsg = f"Image signature recorded in {os.path.basename(containerPathMeta)} does not match image signature of {os.path.basename(containerPath)}" + self.logger.exception(errmsg) + raise ContainerFactoryException(errmsg) + d_retval, d_out_v, d_err_v = self._inspect(dockerTag, matEnv) # d_retval, d_out_v, d_err_v = self._images(matEnv) @@ -472,4 +530,4 @@ def deploySingleContainer( self.logger.error(errstr) raise ContainerEngineException(errstr) - return do_redeploy + return rebuilt_container, do_redeploy diff --git a/wfexs_backend/container_factories/no_container.py b/wfexs_backend/container_factories/no_container.py index 5402c1d7..a1d9671a 100644 --- a/wfexs_backend/container_factories/no_container.py +++ b/wfexs_backend/container_factories/no_container.py @@ -97,11 +97,13 @@ def materializeSingleContainer( def 
deploySingleContainer( self, - container: "Container", + container: "ContainerTaggedName", containers_dir: "Optional[AnyPath]" = None, force: "bool" = False, - ) -> "bool": + ) -> "Tuple[Container, bool]": """ This is a no-op """ - return False + assert isinstance(container, Container) + + return container, False diff --git a/wfexs_backend/container_factories/podman_container.py b/wfexs_backend/container_factories/podman_container.py index e9f4d1b8..e26f53ca 100644 --- a/wfexs_backend/container_factories/podman_container.py +++ b/wfexs_backend/container_factories/podman_container.py @@ -165,20 +165,10 @@ def architecture(self) -> "Tuple[ContainerOperatingSystem, ProcessorArchitecture "Ill-formed answer from podman version" ) from e - def materializeSingleContainer( + def _genPodmanTag( self, tag: "ContainerTaggedName", - containers_dir: "Optional[AnyPath]" = None, - offline: "bool" = False, - force: "bool" = False, - ) -> "Optional[Container]": - """ - It is assured the containers are materialized - """ - - matEnv = dict(os.environ) - matEnv.update(self.environment) - + ) -> "Tuple[URIType, str]": # It is an absolute URL, we are removing the docker:// tag_name = tag.origTaggedName if tag_name.startswith(DOCKER_PROTO): @@ -207,6 +197,26 @@ def materializeSingleContainer( podmanPullTag = DOCKER_PROTO + dockerTag # Last case, it already has a registry declared + return cast("URIType", dockerTag), podmanPullTag + + def materializeSingleContainer( + self, + tag: "ContainerTaggedName", + containers_dir: "Optional[AnyPath]" = None, + offline: "bool" = False, + force: "bool" = False, + ) -> "Optional[Container]": + """ + It is assured the containers are materialized + """ + + matEnv = dict(os.environ) + matEnv.update(self.environment) + + # It is an absolute URL, we are removing the docker:// + tag_name = tag.origTaggedName + dockerTag, podmanPullTag = self._genPodmanTag(tag) + self.logger.info(f"downloading podman container: {tag_name} => {podmanPullTag}") fetch_metadata = True @@ -397,7 +407,7 @@ def materializeSingleContainer( # And add to the list of containers return Container( origTaggedName=tag_name, - taggedName=cast("URIType", dockerTag), + taggedName=dockerTag, signature=image_id, fingerprint=fingerprint, architecture=architecture, @@ -412,14 +422,13 @@ def materializeSingleContainer( def deploySingleContainer( self, - container: "Container", + container: "ContainerTaggedName", containers_dir: "Optional[AnyPath]" = None, force: "bool" = False, - ) -> "bool": + ) -> "Tuple[Container, bool]": # Should we load the image? 
matEnv = dict(os.environ) matEnv.update(self.environment) - dockerTag = container.taggedName tag_name = container.origTaggedName # These are the paths to the copy of the saved container @@ -433,22 +442,73 @@ def deploySingleContainer( manifestsImageSignature: "Optional[Fingerprint]" = None manifests = None manifest = None + if not os.path.isfile(containerPath): + errmsg = f"Podman saved image {os.path.basename(containerPath)} is not in the staged working dir for {tag_name}" + self.logger.warning(errmsg) + raise ContainerFactoryException(errmsg) + if not os.path.isfile(containerPathMeta): - errmsg = f"FATAL ERROR: Podman saved image {os.path.basename(containerPathMeta)} is not in the staged working dir for {tag_name}" + errmsg = f"FATAL ERROR: Podman saved image metadata {os.path.basename(containerPathMeta)} is not in the staged working dir for {tag_name}" self.logger.error(errmsg) raise ContainerFactoryException(errmsg) try: with open(containerPathMeta, mode="r", encoding="utf-8") as mH: signaturesAndManifest = cast("DockerManifestMetadata", json.load(mH)) - imageSignature = signaturesAndManifest["image_signature"] + imageSignature_in_metadata = signaturesAndManifest["image_signature"] manifestsImageSignature = signaturesAndManifest["manifests_signature"] manifests = signaturesAndManifest["manifests"] + + if isinstance(container, Container): + # Reuse the input container instance + rebuilt_container = container + dockerTag = rebuilt_container.taggedName + else: + manifest = manifests[0] + + dockerTag, podmanPullTag = self._genPodmanTag(container) + + image_id = signaturesAndManifest["image_id"] + + # Then, compute the fingerprint based on remote repo's information + fingerprint = None + if len(manifest["RepoDigests"]) > 0: + fingerprint = manifest["RepoDigests"][0] + + # Learning about the intended processor architecture and variant + architecture = manifest.get("Architecture") + # As of version 4.5.0, podman does not report the architecture variant + if architecture is not None: + variant = manifest.get("Variant") + if variant is not None: + architecture += "/" + variant + + rebuilt_container = Container( + origTaggedName=container.origTaggedName, + taggedName=dockerTag, + signature=image_id, + fingerprint=fingerprint, + architecture=architecture, + operatingSystem=manifest.get("Os"), + type=self.containerType, + localPath=containerPath, + registries=container.registries, + metadataLocalPath=containerPathMeta, + source_type=container.type, + image_signature=imageSignature_in_metadata, + ) except Exception as e: errmsg = f"Problems extracting podman metadata at {containerPathMeta}" self.logger.exception(errmsg) raise ContainerFactoryException(errmsg) + imageSignature = self.cc_handler._computeFingerprint(containerPath) + + if imageSignature != imageSignature_in_metadata: + errmsg = f"Image signature recorded in {os.path.basename(containerPathMeta)} does not match image signature of {os.path.basename(containerPath)}" + self.logger.exception(errmsg) + raise ContainerFactoryException(errmsg) + d_retval, d_out_v, d_err_v = self._inspect(dockerTag, matEnv) # d_retval, d_out_v, d_err_v = self._images(matEnv) @@ -500,4 +560,4 @@ def deploySingleContainer( self.logger.error(errstr) raise ContainerEngineException(errstr) - return do_redeploy + return rebuilt_container, do_redeploy diff --git a/wfexs_backend/container_factories/singularity_container.py b/wfexs_backend/container_factories/singularity_container.py index fef8ae81..ae01a6b3 100644 --- 
a/wfexs_backend/container_factories/singularity_container.py +++ b/wfexs_backend/container_factories/singularity_container.py @@ -47,6 +47,7 @@ Optional, Sequence, Set, + Tuple, Union, ) from typing_extensions import ( @@ -198,6 +199,10 @@ def AcceptsContainerType( def _getContainerArchitecture( self, container_filename: "AnyPath", matEnv: "Mapping[str, str]" = {} ) -> "Optional[ProcessorArchitecture]": + if len(matEnv) == 0: + matEnv = dict(os.environ) + matEnv.update(self.environment) + with tempfile.NamedTemporaryFile() as s_out, tempfile.NamedTemporaryFile() as s_err: self.logger.debug( f"Checking {container_filename} looks like a singularity container" @@ -370,20 +375,45 @@ def materializeSingleContainer( return the_cont if isinstance(the_cont, Container) else None - def _materializeSingleContainerSing( + def _genSingTag( self, tag: "ContainerTaggedName", - matEnv: "Mapping[str, str]" = {}, - dhelp: "DockerHelper" = DockerHelper(), - containers_dir: "Optional[AnyPath]" = None, - offline: "bool" = False, - force: "bool" = False, - ) -> "Union[Container, FailedContainerTag]": - if len(matEnv) == 0: - matEnvNew = dict(os.environ) - matEnvNew.update(self.environment) - matEnv = matEnvNew + ) -> "Tuple[str, parse.ParseResult, bool]": + # It is not an absolute URL, we are prepending the docker:// + tag_name = tag.origTaggedName + parsedTag = parse.urlparse(tag_name) + if parsedTag.scheme in self.ACCEPTED_SING_SCHEMES: + singTag = tag_name + isDocker = parsedTag.scheme == DOCKER_SCHEME + else: + if parsedTag.scheme == "": + singTag = "docker://" + tag_name + parsedTag = parse.urlparse(singTag) + else: + parsedTag = parsedTag._replace( + scheme=DOCKER_SCHEME, + netloc=parsedTag.scheme + ":" + parsedTag.path, + path="", + ) + singTag = parse.urlunparse(parsedTag) + # Assuming it is docker + isDocker = True + # Should we enrich the tag with the registry? 
+ if ( + isDocker + and isinstance(tag.registries, dict) + and (common.ContainerType.Docker in tag.registries) + ): + registry = tag.registries[common.ContainerType.Docker] + # Bare case + if len(parsedTag.path) <= 1: + singTag = f"docker://{registry}/library/{parsedTag.netloc}" + parsedTag = parse.urlparse(singTag) + elif "/" not in parsedTag.path[1:]: + singTag = f"docker://{registry}/{parsedTag.netloc}{parsedTag.path}" + parsedTag = parse.urlparse(singTag) + # Last case, it already has a registry declared # It is not an absolute URL, we are prepending the docker:// tag_name = tag.origTaggedName parsedTag = parse.urlparse(tag_name) @@ -420,6 +450,25 @@ def _materializeSingleContainerSing( parsedTag = parse.urlparse(singTag) # Last case, it already has a registry declared + return singTag, parsedTag, isDocker + + def _materializeSingleContainerSing( + self, + tag: "ContainerTaggedName", + matEnv: "Mapping[str, str]" = {}, + dhelp: "DockerHelper" = DockerHelper(), + containers_dir: "Optional[AnyPath]" = None, + offline: "bool" = False, + force: "bool" = False, + ) -> "Union[Container, FailedContainerTag]": + if len(matEnv) == 0: + matEnvNew = dict(os.environ) + matEnvNew.update(self.environment) + matEnv = matEnvNew + + tag_name = tag.origTaggedName + singTag, parsedTag, isDocker = self._genSingTag(tag) + fetch_metadata = True trusted_copy = False localContainerPath: "Optional[AbsPath]" = None @@ -673,14 +722,21 @@ def materializeContainers( # If we cannot materialize it we cannot accept it if not self.AcceptsContainer(tag): continue - matched_container = self._materializeSingleContainerSing( - tag, - matEnv=matEnv, - dhelp=dhelp, - containers_dir=containers_dir, - offline=offline, - force=force, - ) + + matched_container: "Union[Container, FailedContainerTag]" + try: + matched_container, was_redeployed = self.deploySingleContainer( + tag, containers_dir=containers_dir, force=force + ) + except ContainerFactoryException as cfe: + matched_container = self._materializeSingleContainerSing( + tag, + matEnv=matEnv, + dhelp=dhelp, + containers_dir=containers_dir, + offline=offline, + force=force, + ) if isinstance(matched_container, Container): if matched_container not in containersList: @@ -703,10 +759,10 @@ def materializeContainers( def deploySingleContainer( self, - container: "Container", + container: "ContainerTaggedName", containers_dir: "Optional[AnyPath]" = None, force: "bool" = False, - ) -> "bool": + ) -> "Tuple[Container, bool]": """ This is almost no-op, but it should check the integrity of the local images @@ -718,8 +774,62 @@ def deploySingleContainer( ) if not os.path.isfile(containerPath): - errmsg = f"FATAL ERROR: SIF saved image {os.path.basename(containerPath)} is not in the staged working dir for {container.origTaggedName}" - self.logger.error(errmsg) + errmsg = f"SIF saved image {os.path.basename(containerPath)} is not in the staged working dir for {container.origTaggedName}" + self.logger.warning(errmsg) + raise ContainerFactoryException(errmsg) + + if not os.path.isfile(containerPathMeta): + errmsg = f"SIF saved image metadata {os.path.basename(containerPathMeta)} is not in the staged working dir for {container.origTaggedName}" + self.logger.warning(errmsg) + raise ContainerFactoryException(errmsg) + + try: + with open(containerPathMeta, mode="r", encoding="utf-8") as mH: + signaturesAndManifest = cast("SingularityManifest", json.load(mH)) + imageSignature_in_metadata = signaturesAndManifest["image_signature"] + + if isinstance(container, Container): + # Reuse the input 
container instance + rebuilt_container = container + else: + singTag, parsedTag, isDocker = self._genSingTag(container) + + partial_fingerprint = signaturesAndManifest.get("dcd") + repo = signaturesAndManifest["repo"] + if partial_fingerprint is not None: + fingerprint = cast( + # Maybe in the future registryServer + '/' + repo + "@" + partial_fingerprint + "Fingerprint", + repo + "@" + partial_fingerprint, + ) + else: + # TODO: is there a better alternative? + fingerprint = cast("Fingerprint", container.origTaggedName) + + rebuilt_container = Container( + origTaggedName=container.origTaggedName, + taggedName=cast("URIType", singTag), + signature=imageSignature_in_metadata, + fingerprint=fingerprint, + architecture=self._getContainerArchitecture(containerPath), + type=self.containerType, + localPath=containerPath, + registries=container.registries, + metadataLocalPath=containerPathMeta, + source_type=container.type, + image_signature=imageSignature_in_metadata, + ) + + except Exception as e: + errmsg = f"Problems extracting SIF metadata at {containerPathMeta} or {containerPath}" + self.logger.exception(errmsg) + raise ContainerFactoryException(errmsg) + + imageSignature = self.cc_handler._computeFingerprint(containerPath) + + if imageSignature != imageSignature_in_metadata: + errmsg = f"Image signature recorded in {os.path.basename(containerPathMeta)} does not match image signature of {os.path.basename(containerPath)}" + self.logger.exception(errmsg) raise ContainerFactoryException(errmsg) - return False + return rebuilt_container, False From 6e7a9a6a0223c43242f2ca69e5ed32d31d137cf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 5 Jun 2024 17:02:38 +0200 Subject: [PATCH 09/62] Minor fix for cases where the full tag includes the server port for custom docker registries --- wfexs_backend/utils/docker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wfexs_backend/utils/docker.py b/wfexs_backend/utils/docker.py index 6903dc82..41436e42 100644 --- a/wfexs_backend/utils/docker.py +++ b/wfexs_backend/utils/docker.py @@ -235,10 +235,10 @@ def query_tag(self, tag: "str") -> "Optional[DockerTagMetadata]": pathToParse = parsedTag.netloc + parsedTag.path splitSep = "@sha256:" - splitPos = pathToParse.find(splitSep) + splitPos = pathToParse.rfind(splitSep) if splitPos == -1: splitSep = ":" - splitPos = pathToParse.find(splitSep) + splitPos = pathToParse.rfind(splitSep) if splitPos != -1: repo = pathToParse[0:splitPos] From 2dedcd2be14b62e9e151786cc9b716ca8dbebb60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 5 Jun 2024 17:08:48 +0200 Subject: [PATCH 10/62] Improve docker tag parsing capabilities, so tags using sha256 instead of a label are properly parsed. Also, remove duplicated declaration, add an abstract declaration for local image manifests and add an additional check of the image digest. 
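
To illustrate the intended behaviour, here is a self-contained re-statement
of the new label-splitting rule. It is not the actual
decompose_docker_tagged_name helper (which also derives the registry part),
and the sample tags below are made up.

    from typing import Optional, Tuple

    def split_label(name_and_label: "str") -> "Tuple[str, Optional[str]]":
        sha_pos = name_and_label.rfind("@sha256:")
        if sha_pos > 0:
            # A digest pins a concrete image, so there is no tag label
            return name_and_label[:sha_pos], None
        colon_pos = name_and_label.rfind(":")
        slash_pos = name_and_label.rfind("/")
        if colon_pos > slash_pos:
            # The colon introduces a label, not a registry port
            return name_and_label[:colon_pos], name_and_label[colon_pos + 1 :]
        # Default label
        return name_and_label, "latest"

    assert split_label("busybox") == ("busybox", "latest")
    assert split_label("busybox:1.36") == ("busybox", "1.36")
    assert split_label("registry.example.org:5000/sometool") == (
        "registry.example.org:5000/sometool",
        "latest",
    )
    assert split_label("busybox@sha256:9ae97d36d265") == ("busybox", None)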
--- wfexs_backend/container_factories/__init__.py | 49 +++++++++++++------ 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index e0be0bd0..8e06d1be 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -19,6 +19,7 @@ import copy from dataclasses import dataclass +import json import os import tempfile import atexit @@ -60,6 +61,7 @@ ) from typing_extensions import ( + NotRequired, TypeAlias, TypedDict, Final, @@ -85,14 +87,8 @@ ContainerOperatingSystem = NewType("ContainerOperatingSystem", str) ProcessorArchitecture = NewType("ProcessorArchitecture", str) - DockerLikeManifest: TypeAlias = Mapping[str, Any] - MutableDockerLikeManifest: TypeAlias = MutableMapping[str, Any] - - class DockerManifestMetadata(TypedDict): - image_id: Fingerprint - image_signature: Fingerprint - manifests_signature: Fingerprint - manifests: Sequence[DockerLikeManifest] + class AbstractImageManifestMetadata(TypedDict): + image_signature: NotRequired[Fingerprint] import yaml @@ -173,7 +169,7 @@ def decompose_docker_tagged_name( # Now ... registry: "str" tag_name: "str" - tag_label: "str" + tag_label: "Optional[str]" # Is it a fully qualified docker tag? left_slash_pos = tagged_name.find("/") @@ -184,13 +180,21 @@ def decompose_docker_tagged_name( registry = DEFAULT_DOCKER_REGISTRY # Now, the tag label - right_colon_pos = tagged_name.rfind(":") - if right_colon_pos < 0: - tag_name = tagged_name - tag_label = "latest" + right_sha256_pos = tagged_name.rfind("@sha256:") + if right_sha256_pos > 0: + tag_name = tagged_name[0:right_sha256_pos] + # No tag label, as it is an specific layer + tag_label = None else: - tag_name = tagged_name[0:right_colon_pos] - tag_label = tagged_name[right_colon_pos + 1 :] + right_colon_pos = tagged_name.rfind(":") + right_slash_pos = tagged_name.rfind("/") + if right_colon_pos > right_slash_pos: + tag_name = tagged_name[0:right_colon_pos] + tag_label = tagged_name[right_colon_pos + 1 :] + else: + tag_name = tagged_name + # Default + tag_label = "latest" return registry, tag_name, tag_label else: @@ -374,7 +378,20 @@ def query( trusted_copy = canonicalImageSignature == imageSignature if trusted_copy: - trusted_copy = os.path.isfile(localContainerPathMeta) + if os.path.isfile(localContainerPathMeta): + try: + with open(localContainerPathMeta, mode="r", encoding="utf-8") as mH: + signaturesAndManifest = cast( + "AbstractImageManifestMetadata", json.load(mH) + ) + imageSignature_in_metadata = signaturesAndManifest.get( + "image_signature" + ) + trusted_copy = imageSignature_in_metadata == imageSignature + except: + trusted_copy = False + else: + trusted_copy = False return trusted_copy, localContainerPath, localContainerPathMeta, imageSignature From d09bab1b50f2617af651b153b601183618291c82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 6 Jun 2024 04:54:20 +0200 Subject: [PATCH 11/62] Although not accessible from end user side, propagate `force` value flag --- wfexs_backend/workflow_engines/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wfexs_backend/workflow_engines/__init__.py b/wfexs_backend/workflow_engines/__init__.py index 1f5c9491..5c80e363 100644 --- a/wfexs_backend/workflow_engines/__init__.py +++ b/wfexs_backend/workflow_engines/__init__.py @@ -249,6 +249,7 @@ def materialize_containers( listOfContainerTags: "Sequence[ContainerTaggedName]", containersDir: 
"AnyPath", offline: "bool" = False, + force: "bool" = False, ) -> "Tuple[ContainerEngineVersionStr, Sequence[Container], ContainerOperatingSystem, ProcessorArchitecture]": pass @@ -768,6 +769,7 @@ def materialize_containers( listOfContainerTags: "Sequence[ContainerTaggedName]", containersDir: "Optional[AnyPath]" = None, offline: "bool" = False, + force: "bool" = False, ) -> "Tuple[ContainerEngineVersionStr, Sequence[Container], ContainerOperatingSystem, ProcessorArchitecture]": if containersDir is None: containersDir = self.stagedContainersDir @@ -778,6 +780,7 @@ def materialize_containers( listOfContainerTags, containers_dir=containersDir, offline=offline, + force=force, ), *self.container_factory.architecture, ) From 3306dab6ae67db78596be2adfa7d48ce119a906d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 6 Jun 2024 04:55:47 +0200 Subject: [PATCH 12/62] Fixed corner case where a layer's sha is provided instead of a tag label, and no metadata could be fetched back. --- wfexs_backend/utils/docker.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/wfexs_backend/utils/docker.py b/wfexs_backend/utils/docker.py index 41436e42..249a0a14 100644 --- a/wfexs_backend/utils/docker.py +++ b/wfexs_backend/utils/docker.py @@ -239,6 +239,9 @@ def query_tag(self, tag: "str") -> "Optional[DockerTagMetadata]": if splitPos == -1: splitSep = ":" splitPos = pathToParse.rfind(splitSep) + else: + # We need to include 'sha256:' prefix in alias + splitSep = "@" if splitPos != -1: repo = pathToParse[0:splitPos] @@ -280,6 +283,7 @@ def query_tag(self, tag: "str") -> "Optional[DockerTagMetadata]": assert partial_fingerprint is not None except Exception as e: + self.logger.exception(f"Unable to obtain fingerprint from {tag}") raise DockerHelperException( f"Unable to obtain fingerprint from {tag}. Reason {e}" ) from e From ac13edffcb291a6e1ecf199f4b4aa2d079afcdcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 6 Jun 2024 04:58:42 +0200 Subject: [PATCH 13/62] Added correction fix for cases where the ContainerTaggedName instance is really a Container one --- wfexs_backend/container_factories/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index 8e06d1be..86771520 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -606,7 +606,10 @@ def ContainerType(cls) -> "common.ContainerType": @classmethod def AcceptsContainer(cls, container: "ContainerTaggedName") -> "bool": - return cls.AcceptsContainerType(container.type) + if isinstance(container, Container) and container.source_type is not None: + return cls.AcceptsContainerType(container.source_type) + else: + return cls.AcceptsContainerType(container.type) @classmethod @abc.abstractmethod From e72c0731ee53680c951d7117309b7ada5b7be1ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 6 Jun 2024 05:23:28 +0200 Subject: [PATCH 14/62] Now container factories honor Container instances, so the materialized container image is more correct. 
Also, added initial Singularity container factory tests --- tests/containers/__init__.py | 0 tests/containers/test_singularity.py | 233 ++++++++++++++++++ .../abstract_docker_container.py | 33 ++- .../container_factories/docker_container.py | 38 ++- .../container_factories/podman_container.py | 43 +++- .../singularity_container.py | 204 +++++++++------ 6 files changed, 453 insertions(+), 98 deletions(-) create mode 100644 tests/containers/__init__.py create mode 100644 tests/containers/test_singularity.py diff --git a/tests/containers/__init__.py b/tests/containers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/containers/test_singularity.py b/tests/containers/test_singularity.py new file mode 100644 index 00000000..9efc2e92 --- /dev/null +++ b/tests/containers/test_singularity.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import logging + +from typing import ( + cast, + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from wfexs_backend.common import ( + Fingerprint, + RelPath, + URIType, + ) + + from wfexs_backend.container_factories import ( + ProcessorArchitecture, + ) + +from wfexs_backend.common import ( + ContainerTaggedName, + ContainerType, +) + +from wfexs_backend.container_factories import ( + Container, + ContainerEngineException, + ContainerFactoryException, +) + +from wfexs_backend.container_factories.singularity_container import ( + SingularityContainerFactory, +) + + +# Enable logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def simpleTestContainerFileName(imageUrl: "URIType") -> "RelPath": + """ + This method was borrowed from + https://github.com/nextflow-io/nextflow/blob/539a22b68c114c94eaf4a88ea8d26b7bfe2d0c39/modules/nextflow/src/main/groovy/nextflow/container/SingularityCache.groovy#L80 + and translated to Python + """ + p = imageUrl.find("://") + name = imageUrl[p + 3 :] if p != -1 else imageUrl + extension = ".img" + if ".sif:" in name: + extension = ".sif" + name = name.replace(".sif:", "-") + elif name.endswith(".sif"): + extension = ".sif" + name = name[:-4] + + name = name.replace(":", "-").replace("/", "-") + + return cast("RelPath", name + extension) + + +def test_singularity_basic(tmpdir) -> "None": # type: ignore[no-untyped-def] + """ + Check singularity container factory instantiation + """ + temppath = tmpdir.mkdir("TEMP") + cachepath = tmpdir.mkdir("CACHE") + stagedpath = tmpdir.mkdir("STAGED") + scf = SingularityContainerFactory( + simpleFileNameMethod=simpleTestContainerFileName, + containersCacheDir=cachepath.strpath, + stagedContainersDir=stagedpath.strpath, + tempDir=temppath.strpath, + ) + + +TAGGED_TESTBED = pytest.mark.parametrize( + ["cont_tagged"], + [ + ( + ContainerTaggedName( + origTaggedName="busybox:stable", + type=ContainerType.Docker, + ), + ), + ( + ContainerTaggedName( + 
origTaggedName="quay/busybox:latest", + type=ContainerType.Docker, + registries={ + ContainerType.Docker: "quay.io", + }, + ), + ), + ( + Container( + origTaggedName="busybox:stable", + type=ContainerType.Singularity, + taggedName=cast("URIType", "docker://busybox:stable"), + architecture=cast("ProcessorArchitecture", "amd64"), + signature=cast( + "Fingerprint", "sha256=sBYlsI2WjxCwGm9juxawq1ryW3OpivFxFUWxEvQ9vBU=" + ), + fingerprint=cast( + "Fingerprint", + "library/busybox@sha256:9ae97d36d26566ff84e8893c64a6dc4fe8ca6d1144bf5b87b2b85a32def253c7", + ), + source_type=ContainerType.Docker, + image_signature=cast( + "Fingerprint", "sha256=sBYlsI2WjxCwGm9juxawq1ryW3OpivFxFUWxEvQ9vBU=" + ), + ), + ), + ( + Container( + origTaggedName="quay/busybox:latest", + type=ContainerType.Singularity, + registries={ + ContainerType.Docker: "quay.io", + }, + taggedName=cast("URIType", "docker://quay.io/quay/busybox:latest"), + architecture=cast("ProcessorArchitecture", "amd64"), + signature=cast( + "Fingerprint", "sha256=WTkWLbkE2f3HvwpLcWIOMaW85YxuZBCPXSffBez6hKY=" + ), + fingerprint=cast( + "Fingerprint", + "quay/busybox@sha256:92f3298bf80a1ba949140d77987f5de081f010337880cd771f7e7fc928f8c74d", + ), + source_type=ContainerType.Docker, + image_signature=cast( + "Fingerprint", "sha256=WTkWLbkE2f3HvwpLcWIOMaW85YxuZBCPXSffBez6hKY=" + ), + ), + ), + ], +) + + +@TAGGED_TESTBED +def test_singularity_container_tagged_name(cont_tagged: "ContainerTaggedName", tmpdir) -> "None": # type: ignore[no-untyped-def] + """ + Check singularity container factory instantiation + """ + temppath = tmpdir.mkdir("TEMP") + cachepath = tmpdir.mkdir("CACHE") + stagedpath = tmpdir.mkdir("STAGED") + scf = SingularityContainerFactory( + simpleFileNameMethod=simpleTestContainerFileName, + containersCacheDir=cachepath.strpath, + stagedContainersDir=stagedpath.strpath, + tempDir=temppath.strpath, + ) + containers = scf.materializeContainers(tagList=[cont_tagged]) + if isinstance(cont_tagged, Container): + for attr in ( + "origTaggedName", + "type", + "registries", + "taggedName", + "architecture", + "operatingSystem", + "fingerprint", + "source_type", + ): + assert getattr(cont_tagged, attr) == getattr(containers[0], attr) + + +@TAGGED_TESTBED +def test_singularity_container_tagged_name_fail(cont_tagged: "ContainerTaggedName", tmpdir) -> "None": # type: ignore[no-untyped-def] + """ + Check singularity container factory instantiation + """ + temppath = tmpdir.mkdir("TEMP") + cachepath = tmpdir.mkdir("CACHE") + stagedpath = tmpdir.mkdir("STAGED") + scf = SingularityContainerFactory( + simpleFileNameMethod=simpleTestContainerFileName, + containersCacheDir=cachepath.strpath, + stagedContainersDir=stagedpath.strpath, + tempDir=temppath.strpath, + ) + with pytest.raises(ContainerFactoryException): + containers = scf.materializeContainers(tagList=[cont_tagged], offline=True) + logger.info(containers) + + +@TAGGED_TESTBED +def test_singularity_container_tagged_name_cached(cont_tagged: "ContainerTaggedName", tmpdir) -> "None": # type: ignore[no-untyped-def] + """ + Check singularity container factory instantiation + """ + temppath = tmpdir.mkdir("TEMP") + cachepath = tmpdir.mkdir("CACHE") + stagedpath = tmpdir.mkdir("STAGED") + scf = SingularityContainerFactory( + simpleFileNameMethod=simpleTestContainerFileName, + containersCacheDir=cachepath.strpath, + stagedContainersDir=stagedpath.strpath, + tempDir=temppath.strpath, + ) + containers = scf.materializeContainers(tagList=[cont_tagged]) + containers2 = 
scf.materializeContainers(tagList=[cont_tagged], offline=True) + for container, container2 in zip(containers, containers2): + for attr in ( + "origTaggedName", + "type", + "registries", + "taggedName", + "architecture", + "operatingSystem", + "fingerprint", + "source_type", + ): + assert getattr(container, attr) == getattr(container2, attr) diff --git a/wfexs_backend/container_factories/abstract_docker_container.py b/wfexs_backend/container_factories/abstract_docker_container.py index d03a0e7b..47bc4464 100644 --- a/wfexs_backend/container_factories/abstract_docker_container.py +++ b/wfexs_backend/container_factories/abstract_docker_container.py @@ -79,15 +79,15 @@ ) from . import ( + AbstractImageManifestMetadata, Container, ) DockerLikeManifest: TypeAlias = Mapping[str, Any] MutableDockerLikeManifest: TypeAlias = MutableMapping[str, Any] - class DockerManifestMetadata(TypedDict): + class DockerManifestMetadata(AbstractImageManifestMetadata): image_id: "Fingerprint" - image_signature: "Fingerprint" manifests_signature: "Fingerprint" manifests: "Sequence[DockerLikeManifest]" @@ -229,6 +229,35 @@ def _pull( return cast("ExitVal", d_retval), d_out_v, d_err_v + def _tag( + self, dockerPullTag: "str", dockerTag: "str", matEnv: "Mapping[str, str]" + ) -> "Tuple[ExitVal, str, str]": + with tempfile.NamedTemporaryFile() as d_out, tempfile.NamedTemporaryFile() as d_err: + self.logger.debug( + f"tagging {self.variant_name()} container {dockerPullTag} as {dockerTag}" + ) + d_retval = subprocess.Popen( + [self.runtime_cmd, "tag", dockerPullTag, dockerTag], + env=matEnv, + stdout=d_out, + stderr=d_err, + ).wait() + + self.logger.debug( + f"{self.variant_name()} tag {dockerPullTag} {dockerTag} retval: {d_retval}" + ) + + with open(d_out.name, mode="r") as c_stF: + d_out_v = c_stF.read() + with open(d_err.name, "r") as c_stF: + d_err_v = c_stF.read() + + self.logger.debug(f"{self.variant_name()} pull stdout: {d_out_v}") + + self.logger.debug(f"{self.variant_name()} pull stderr: {d_err_v}") + + return cast("ExitVal", d_retval), d_out_v, d_err_v + def _rmi( self, dockerTag: "str", matEnv: "Mapping[str, str]" ) -> "Tuple[ExitVal, str, str]": diff --git a/wfexs_backend/container_factories/docker_container.py b/wfexs_backend/container_factories/docker_container.py index 7702797b..fe5b500d 100644 --- a/wfexs_backend/container_factories/docker_container.py +++ b/wfexs_backend/container_factories/docker_container.py @@ -51,10 +51,13 @@ ContainerFileNamingMethod, ContainerLocalConfig, ContainerOperatingSystem, - DockerManifestMetadata, ProcessorArchitecture, ) + from .abstract_docker_container import ( + DockerManifestMetadata, + ) + from ..common import ( ContainerType, DEFAULT_DOCKER_CMD, @@ -150,7 +153,7 @@ def architecture(self) -> "Tuple[ContainerOperatingSystem, ProcessorArchitecture def _genDockerTag( self, tag: "ContainerTaggedName", - ) -> "URIType": + ) -> "Tuple[URIType, str]": tag_name = tag.origTaggedName dockerTag = ( tag_name[len(DOCKER_PROTO) :] @@ -170,7 +173,23 @@ def _genDockerTag( dockerTag = f"{registry}/{dockerTag}" # Last case, it already has a registry declared - return cast("URIType", dockerTag) + if isinstance(tag, Container) and tag.signature is not None: + shapos = dockerTag.rfind("@sha256:") + if shapos != -1: + # The sha256 tag takes precedence over the recorded signature + dockerPullTag = dockerTag + else: + colonpos = dockerTag.rfind(":") + slashpos = dockerTag.rfind("/") + if colonpos > slashpos: + dockerPullTag = dockerTag[:colonpos] + else: + dockerPullTag = dockerTag + 
dockerPullTag += "@sha256:" + tag.signature + else: + dockerPullTag = dockerTag + + return cast("URIType", dockerTag), dockerPullTag def materializeSingleContainer( self, @@ -187,7 +206,7 @@ def materializeSingleContainer( # It is an absolute URL, we are removing the docker:// tag_name = tag.origTaggedName - dockerTag = self._genDockerTag(tag) + dockerTag, dockerPullTag = self._genDockerTag(tag) self.logger.info(f"downloading docker container: {tag_name} => {dockerTag}") @@ -261,9 +280,14 @@ def materializeSingleContainer( _, _, _ = self._rmi(dockerTag, matEnv) # And now, let's materialize the new world - d_retval, d_out_v, d_err_v = self._pull(dockerTag, matEnv) - if d_retval == 0: + d_retval, d_out_v, d_err_v = self._pull(dockerPullTag, matEnv) + + if d_retval == 0 and dockerTag != dockerPullTag: # Second try + d_retval, d_out_v, d_err_v = self._tag(dockerPullTag, dockerTag, matEnv) + + if d_retval == 0: + # Third try d_retval, d_out_v, d_err_v = self._inspect(dockerTag, matEnv) if d_retval != 0: @@ -437,7 +461,7 @@ def deploySingleContainer( else: manifest = manifests[0] - dockerTag = self._genDockerTag(container) + dockerTag, dockerPullTag = self._genDockerTag(container) image_id = signaturesAndManifest["image_id"] diff --git a/wfexs_backend/container_factories/podman_container.py b/wfexs_backend/container_factories/podman_container.py index e26f53ca..48dc4e10 100644 --- a/wfexs_backend/container_factories/podman_container.py +++ b/wfexs_backend/container_factories/podman_container.py @@ -51,10 +51,13 @@ ContainerFileNamingMethod, ContainerLocalConfig, ContainerOperatingSystem, - DockerManifestMetadata, ProcessorArchitecture, ) + from .abstract_docker_container import ( + DockerManifestMetadata, + ) + from ..common import ( ContainerType, DEFAULT_PODMAN_CMD, @@ -168,15 +171,13 @@ def architecture(self) -> "Tuple[ContainerOperatingSystem, ProcessorArchitecture def _genPodmanTag( self, tag: "ContainerTaggedName", - ) -> "Tuple[URIType, str]": + ) -> "Tuple[URIType, str, str]": # It is an absolute URL, we are removing the docker:// tag_name = tag.origTaggedName if tag_name.startswith(DOCKER_PROTO): dockerTag = tag_name[len(DOCKER_PROTO) :] - podmanPullTag = tag_name else: dockerTag = tag_name - podmanPullTag = DOCKER_PROTO + tag_name # Should we enrich the tag with the registry? if isinstance(tag.registries, dict) and ( @@ -191,13 +192,30 @@ def _genPodmanTag( # Bare case if "/" not in dockerTag: dockerTag = f"{registry}/library/{dockerTag}" - podmanPullTag = DOCKER_PROTO + dockerTag elif dockerTag.find("/") == dockerTag.rfind("/"): dockerTag = f"{registry}/{dockerTag}" - podmanPullTag = DOCKER_PROTO + dockerTag # Last case, it already has a registry declared - return cast("URIType", dockerTag), podmanPullTag + # This is needed .... 
+ if isinstance(tag, Container) and tag.signature is not None: + shapos = dockerTag.rfind("@sha256:") + if shapos != -1: + # The sha256 tag takes precedence over the recorded signature + dockerPullTag = dockerTag + else: + colonpos = dockerTag.rfind(":") + slashpos = dockerTag.rfind("/") + if colonpos > slashpos: + dockerPullTag = dockerTag[:colonpos] + else: + dockerPullTag = dockerTag + dockerPullTag += "@sha256:" + tag.signature + else: + dockerPullTag = dockerTag + + podmanPullTag = DOCKER_PROTO + dockerPullTag + + return cast("URIType", dockerTag), dockerPullTag, podmanPullTag def materializeSingleContainer( self, @@ -215,7 +233,7 @@ def materializeSingleContainer( # It is an absolute URL, we are removing the docker:// tag_name = tag.origTaggedName - dockerTag, podmanPullTag = self._genPodmanTag(tag) + dockerTag, dockerPullTag, podmanPullTag = self._genPodmanTag(tag) self.logger.info(f"downloading podman container: {tag_name} => {podmanPullTag}") @@ -290,6 +308,11 @@ def materializeSingleContainer( # And now, let's materialize the new world d_retval, d_out_v, d_err_v = self._pull(podmanPullTag, matEnv) + + if d_retval == 0 and dockerTag != dockerPullTag: + # Second try + d_retval, d_out_v, d_err_v = self._tag(dockerPullTag, dockerTag, matEnv) + if d_retval == 0: # Second try d_retval, d_out_v, d_err_v = self._inspect(dockerTag, matEnv) @@ -466,7 +489,9 @@ def deploySingleContainer( else: manifest = manifests[0] - dockerTag, podmanPullTag = self._genPodmanTag(container) + dockerTag, dockerPullTag, podmanPullTag = self._genPodmanTag( + container + ) image_id = signaturesAndManifest["image_id"] diff --git a/wfexs_backend/container_factories/singularity_container.py b/wfexs_backend/container_factories/singularity_container.py index ae01a6b3..74d53ad1 100644 --- a/wfexs_backend/container_factories/singularity_container.py +++ b/wfexs_backend/container_factories/singularity_container.py @@ -61,25 +61,26 @@ AbsPath, AnyPath, ContainerTaggedName, + ExitVal, Fingerprint, RelPath, URIType, ) from . import ( + AbstractImageManifestMetadata, ContainerFileNamingMethod, ContainerLocalConfig, ProcessorArchitecture, ) - class SingularityManifest(TypedDict): + class SingularityManifest(AbstractImageManifestMetadata): registryServer: Required[str] registryType: Required[str] repo: Required[str] alias: Required[Optional[str]] dcd: NotRequired[str] manifest: NotRequired[Mapping[str, Any]] - image_signature: NotRequired[Fingerprint] from . import ( @@ -220,18 +221,18 @@ def _getContainerArchitecture( s_out_v = c_stF.read() with open(s_err.name, "r") as c_stF: s_err_v = c_stF.read() - errstr = """Could not inspect singularity image {}. Retval {} + errstr = f"""\ +Could not inspect singularity image {container_filename}. Retval {s_retval} ====== STDOUT ====== -{} +{s_out_v} ====== STDERR ====== -{}""".format( - container_filename, s_retval, s_out_v, s_err_v - ) +{s_err_v}""" + self.logger.error(errstr) raise ContainerEngineException(errstr) @@ -257,18 +258,18 @@ def _getContainerArchitecture( self.logger.debug(f"singularity sif list stderr: {s_err_v}") if s_retval != 0: - errstr = """Could not describe singularity image {}. Retval {} + errstr = f"""\ +Could not describe singularity image {container_filename}. 
Retval {s_retval} ====== STDOUT ====== -{} +{s_out_v} ====== STDERR ====== -{}""".format( - container_filename, s_retval, s_out_v, s_err_v - ) +{s_err_v}""" + self.logger.warning(errstr) self.logger.warning( f"Most probably, image {container_filename} was built using singularity older than 3.0.0" @@ -328,18 +329,17 @@ def _getContainerArchitecture( self.logger.debug(f"singularity sif info stderr: {s_err_v}") if s_retval != 0: - errstr = """Could not describe bundle {} from singularity image {}. Retval {} + errstr = f"""\ +Could not describe bundle {data_bundle_id} from singularity image {container_filename}. Retval {s_retval} ====== STDOUT ====== -{} +{s_out_v} ====== STDERR ====== -{}""".format( - data_bundle_id, container_filename, s_retval, s_out_v, s_err_v - ) +{s_err_v}""" raise ContainerEngineException(errstr) # Learning the architecture @@ -378,7 +378,7 @@ def materializeSingleContainer( def _genSingTag( self, tag: "ContainerTaggedName", - ) -> "Tuple[str, parse.ParseResult, bool]": + ) -> "Tuple[str, parse.ParseResult, str, bool]": # It is not an absolute URL, we are prepending the docker:// tag_name = tag.origTaggedName parsedTag = parse.urlparse(tag_name) @@ -450,7 +450,57 @@ def _genSingTag( parsedTag = parse.urlparse(singTag) # Last case, it already has a registry declared - return singTag, parsedTag, isDocker + # Now, the singPullTag + if isDocker and isinstance(tag, Container) and tag.fingerprint is not None: + shapos = singTag.rfind("@sha256:") + if shapos != -1: + # The sha256 tag takes precedence over the recorded signature + singPullTag = singTag + else: + atpos = tag.fingerprint.rfind("@") + if atpos > 0: + partial_fingerprint = tag.fingerprint[atpos:] + colonpos = singTag.rfind(":") + slashpos = singTag.rfind("/") + if colonpos > slashpos: + singPullTag = singTag[:colonpos] + else: + singPullTag = singTag + + singPullTag += partial_fingerprint + else: + singPullTag = singTag + + return singTag, parsedTag, singPullTag, isDocker + + def _pull( + self, singTag: "str", tmpContainerPath: "str", matEnv: "Mapping[str, str]" + ) -> "Tuple[ExitVal, str, str]": + with tempfile.NamedTemporaryFile() as s_out, tempfile.NamedTemporaryFile() as s_err: + self.logger.debug( + f"downloading temporary container: {singTag} => {tmpContainerPath}" + ) + # Singularity command line borrowed from + # https://github.com/nextflow-io/nextflow/blob/539a22b68c114c94eaf4a88ea8d26b7bfe2d0c39/modules/nextflow/src/main/groovy/nextflow/container/SingularityCache.groovy#L221 + s_retval = subprocess.Popen( + [self.runtime_cmd, "pull", "--name", tmpContainerPath, singTag], + env=matEnv, + stdout=s_out, + stderr=s_err, + ).wait() + + self.logger.debug(f"singularity pull retval: {s_retval}") + + with open(s_out.name, "r") as c_stF: + s_out_v = c_stF.read() + with open(s_err.name, "r") as c_stF: + s_err_v = c_stF.read() + + self.logger.debug(f"singularity pull stdout: {s_out_v}") + + self.logger.debug(f"singularity pull stderr: {s_err_v}") + + return cast("ExitVal", s_retval), s_out_v, s_err_v def _materializeSingleContainerSing( self, @@ -467,7 +517,7 @@ def _materializeSingleContainerSing( matEnv = matEnvNew tag_name = tag.origTaggedName - singTag, parsedTag, isDocker = self._genSingTag(tag) + singTag, parsedTag, singPullTag, isDocker = self._genSingTag(tag) fetch_metadata = True trusted_copy = False @@ -542,69 +592,48 @@ def _materializeSingleContainerSing( f"Cannot download containers in offline mode from {tag_name}" ) - with tempfile.NamedTemporaryFile() as s_out, tempfile.NamedTemporaryFile() as s_err: - 
tmpContainerPath = self.cc_handler._genTmpContainerPath() - - self.logger.debug( - f"downloading temporary container: {tag_name} => {tmpContainerPath}" - ) - # Singularity command line borrowed from - # https://github.com/nextflow-io/nextflow/blob/539a22b68c114c94eaf4a88ea8d26b7bfe2d0c39/modules/nextflow/src/main/groovy/nextflow/container/SingularityCache.groovy#L221 - s_retval = subprocess.Popen( - [self.runtime_cmd, "pull", "--name", tmpContainerPath, singTag], - env=matEnv, - stdout=s_out, - stderr=s_err, - ).wait() - - self.logger.debug(f"singularity pull retval: {s_retval}") - - with open(s_out.name, "r") as c_stF: - s_out_v = c_stF.read() - with open(s_err.name, "r") as c_stF: - s_err_v = c_stF.read() - - self.logger.debug(f"singularity pull stdout: {s_out_v}") - - self.logger.debug(f"singularity pull stderr: {s_err_v}") + tmpContainerPath = self.cc_handler._genTmpContainerPath() + s_retval, s_out_v, s_err_v = self._pull( + singPullTag, tmpContainerPath, matEnv + ) - # Reading the output and error for the report - if s_retval == 0: - if not os.path.exists(tmpContainerPath): - raise ContainerFactoryException( - "FATAL ERROR: Singularity finished properly but it did not materialize {} into {}".format( - tag_name, tmpContainerPath - ) + # Reading the output and error for the report + if s_retval == 0: + if not os.path.exists(tmpContainerPath): + raise ContainerFactoryException( + "FATAL ERROR: Singularity finished properly but it did not materialize {} into {}".format( + tag_name, tmpContainerPath ) - - # This is needed for the metadata - imageSignature = self.cc_handler._computeFingerprint( - cast("AnyPath", tmpContainerPath) ) - else: - errstr = """Could not materialize singularity image {}. Retval {} + + # This is needed for the metadata + imageSignature = self.cc_handler._computeFingerprint( + cast("AnyPath", tmpContainerPath) + ) + else: + errstr = f"""\ +Could not materialize singularity image {singTag} ({singPullTag}). 
Retval {s_retval} ====== STDOUT ====== -{} +{s_out_v} ====== STDERR ====== -{}""".format( - singTag, s_retval, s_out_v, s_err_v - ) - if os.path.exists(tmpContainerPath): - try: - os.unlink(tmpContainerPath) - except: - pass - self.logger.error(errstr) - - return FailedContainerTag( - tag=tag_name, - sing_tag=singTag, - ) +{s_err_v}""" + + if os.path.exists(tmpContainerPath): + try: + os.unlink(tmpContainerPath) + except: + pass + self.logger.error(errstr) + + return FailedContainerTag( + tag=tag_name, + sing_tag=singPullTag, + ) # At this point we should always have a image signature assert imageSignature is not None @@ -633,24 +662,37 @@ def _materializeSingleContainerSing( if isDocker: tag_details = dhelp.query_tag(singTag) if tag_details is None: + self.logger.error(f"FALLA {singTag}") return FailedContainerTag(tag=tag_name, sing_tag=singTag) + if singTag != singPullTag: + tag_pull_details = dhelp.query_tag(singPullTag) + if tag_pull_details is None: + self.logger.error(f"CANALLA {singPullTag}") + return FailedContainerTag(tag=tag_name, sing_tag=singPullTag) + else: + tag_pull_details = tag_details + else: + tag_pull_details = tag_details # Save the temporary metadata with open(tmpContainerPathMeta, mode="w", encoding="utf8") as tcpm: tmp_meta: "SingularityManifest" if tag_details is not None: + assert tag_pull_details is not None tmp_meta = { "image_signature": imageSignature, - "registryServer": tag_details.registryServer, + "registryServer": tag_pull_details.registryServer, "registryType": "docker", - "repo": tag_details.repo, + "repo": tag_pull_details.repo, "alias": tag_details.alias, - "dcd": tag_details.partial_fingerprint, - "manifest": tag_details.manifest, + "dcd": tag_pull_details.partial_fingerprint, + "manifest": tag_pull_details.manifest, } fingerprint = cast( "Fingerprint", - tag_details.repo + "@" + tag_details.partial_fingerprint, + tag_pull_details.repo + + "@" + + tag_pull_details.partial_fingerprint, ) else: # TODO: Which metadata could we add for other schemes? @@ -694,7 +736,7 @@ def _materializeSingleContainerSing( localPath=containerPath, registries=tag.registries, metadataLocalPath=containerPathMeta, - source_type=tag.type, + source_type=tag.source_type if isinstance(tag, Container) else tag.type, image_signature=imageSignature, ) @@ -792,7 +834,9 @@ def deploySingleContainer( # Reuse the input container instance rebuilt_container = container else: - singTag, parsedTag, isDocker = self._genSingTag(container) + singTag, parsedTag, singPullTag, isDocker = self._genSingTag( + container + ) partial_fingerprint = signaturesAndManifest.get("dcd") repo = signaturesAndManifest["repo"] From 38af41e2f534136ba5e7dd6148fee8731a3e1016 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 6 Jun 2024 13:53:45 +0200 Subject: [PATCH 15/62] Fixed generated fingerprint for Singularity containers, so it always includes the registry. 
Also, shared method with other container tests has been moved to `tests.containers.util` --- tests/containers/test_singularity.py | 34 +++------- tests/containers/util.py | 49 +++++++++++++++ .../singularity_container.py | 62 ++++++++++++++----- 3 files changed, 105 insertions(+), 40 deletions(-) create mode 100644 tests/containers/util.py diff --git a/tests/containers/test_singularity.py b/tests/containers/test_singularity.py index 9efc2e92..ed4711e4 100644 --- a/tests/containers/test_singularity.py +++ b/tests/containers/test_singularity.py @@ -50,33 +50,13 @@ SingularityContainerFactory, ) +from tests.containers.util import simpleTestContainerFileName # Enable logging logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -def simpleTestContainerFileName(imageUrl: "URIType") -> "RelPath": - """ - This method was borrowed from - https://github.com/nextflow-io/nextflow/blob/539a22b68c114c94eaf4a88ea8d26b7bfe2d0c39/modules/nextflow/src/main/groovy/nextflow/container/SingularityCache.groovy#L80 - and translated to Python - """ - p = imageUrl.find("://") - name = imageUrl[p + 3 :] if p != -1 else imageUrl - extension = ".img" - if ".sif:" in name: - extension = ".sif" - name = name.replace(".sif:", "-") - elif name.endswith(".sif"): - extension = ".sif" - name = name[:-4] - - name = name.replace(":", "-").replace("/", "-") - - return cast("RelPath", name + extension) - - def test_singularity_basic(tmpdir) -> "None": # type: ignore[no-untyped-def] """ Check singularity container factory instantiation @@ -121,7 +101,7 @@ def test_singularity_basic(tmpdir) -> "None": # type: ignore[no-untyped-def] ), fingerprint=cast( "Fingerprint", - "library/busybox@sha256:9ae97d36d26566ff84e8893c64a6dc4fe8ca6d1144bf5b87b2b85a32def253c7", + "docker.io/library/busybox@sha256:9ae97d36d26566ff84e8893c64a6dc4fe8ca6d1144bf5b87b2b85a32def253c7", ), source_type=ContainerType.Docker, image_signature=cast( @@ -143,7 +123,7 @@ def test_singularity_basic(tmpdir) -> "None": # type: ignore[no-untyped-def] ), fingerprint=cast( "Fingerprint", - "quay/busybox@sha256:92f3298bf80a1ba949140d77987f5de081f010337880cd771f7e7fc928f8c74d", + "quay.io/quay/busybox@sha256:92f3298bf80a1ba949140d77987f5de081f010337880cd771f7e7fc928f8c74d", ), source_type=ContainerType.Docker, image_signature=cast( @@ -181,7 +161,9 @@ def test_singularity_container_tagged_name(cont_tagged: "ContainerTaggedName", t "fingerprint", "source_type", ): - assert getattr(cont_tagged, attr) == getattr(containers[0], attr) + assert getattr(cont_tagged, attr) == getattr( + containers[0], attr + ), f"Expected and obtainer container '{attr}' do not match: {getattr(cont_tagged, attr)} vs {getattr(containers[0], attr)}" @TAGGED_TESTBED @@ -230,4 +212,6 @@ def test_singularity_container_tagged_name_cached(cont_tagged: "ContainerTaggedN "fingerprint", "source_type", ): - assert getattr(container, attr) == getattr(container2, attr) + assert getattr(cont_tagged, attr) == getattr( + containers[0], attr + ), f"Expected and obtainer container '{attr}' do not match: {getattr(cont_tagged, attr)} vs {getattr(containers[0], attr)}" diff --git a/tests/containers/util.py b/tests/containers/util.py new file mode 100644 index 00000000..ca3968b0 --- /dev/null +++ b/tests/containers/util.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import ( + cast, + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from wfexs_backend.common import ( + RelPath, + URIType, + ) + + +def simpleTestContainerFileName(imageUrl: "URIType") -> "RelPath": + """ + This method was borrowed from + https://github.com/nextflow-io/nextflow/blob/539a22b68c114c94eaf4a88ea8d26b7bfe2d0c39/modules/nextflow/src/main/groovy/nextflow/container/SingularityCache.groovy#L80 + and translated to Python + """ + p = imageUrl.find("://") + name = imageUrl[p + 3 :] if p != -1 else imageUrl + extension = ".img" + if ".sif:" in name: + extension = ".sif" + name = name.replace(".sif:", "-") + elif name.endswith(".sif"): + extension = ".sif" + name = name[:-4] + + name = name.replace(":", "-").replace("/", "-") + + return cast("RelPath", name + extension) diff --git a/wfexs_backend/container_factories/singularity_container.py b/wfexs_backend/container_factories/singularity_container.py index 74d53ad1..01b05151 100644 --- a/wfexs_backend/container_factories/singularity_container.py +++ b/wfexs_backend/container_factories/singularity_container.py @@ -89,6 +89,7 @@ class SingularityManifest(AbstractImageManifestMetadata): ContainerEngineException, ContainerFactoryException, ContainerNotFoundException, + DEFAULT_DOCKER_REGISTRY, DOCKER_SCHEME, ) @@ -453,21 +454,20 @@ def _genSingTag( # Now, the singPullTag if isDocker and isinstance(tag, Container) and tag.fingerprint is not None: shapos = singTag.rfind("@sha256:") - if shapos != -1: + atpos = tag.fingerprint.rfind("@") + if shapos != -1 or atpos <= 0: # The sha256 tag takes precedence over the recorded signature singPullTag = singTag else: - atpos = tag.fingerprint.rfind("@") - if atpos > 0: - partial_fingerprint = tag.fingerprint[atpos:] - colonpos = singTag.rfind(":") - slashpos = singTag.rfind("/") - if colonpos > slashpos: - singPullTag = singTag[:colonpos] - else: - singPullTag = singTag + partial_fingerprint = tag.fingerprint[atpos:] + colonpos = singTag.rfind(":") + slashpos = singTag.rfind("/") + if colonpos > slashpos: + singPullTag = singTag[:colonpos] + else: + singPullTag = singTag - singPullTag += partial_fingerprint + singPullTag += partial_fingerprint else: singPullTag = singTag @@ -551,10 +551,21 @@ def _materializeSingleContainerSing( imageSignature_in_metadata = metadata.get("image_signature") manifest = metadata.get("manifest") if partial_fingerprint is not None: + usableRegistryServer = ( + DEFAULT_DOCKER_REGISTRY + if registryServer.endswith( + "." + DEFAULT_DOCKER_REGISTRY + ) + else registryServer + ) fingerprint = cast( # Maybe in the future registryServer + '/' + repo + "@" + partial_fingerprint "Fingerprint", - repo + "@" + partial_fingerprint, + usableRegistryServer + + "/" + + repo + + "@" + + partial_fingerprint, ) else: # TODO: is there a better alternative? @@ -688,9 +699,18 @@ def _materializeSingleContainerSing( "dcd": tag_pull_details.partial_fingerprint, "manifest": tag_pull_details.manifest, } + usableRegistryServer = ( + DEFAULT_DOCKER_REGISTRY + if tag_pull_details.registryServer.endswith( + "." 
+ DEFAULT_DOCKER_REGISTRY + ) + else tag_pull_details.registryServer + ) fingerprint = cast( "Fingerprint", - tag_pull_details.repo + usableRegistryServer + + "/" + + tag_pull_details.repo + "@" + tag_pull_details.partial_fingerprint, ) @@ -841,10 +861,20 @@ def deploySingleContainer( partial_fingerprint = signaturesAndManifest.get("dcd") repo = signaturesAndManifest["repo"] if partial_fingerprint is not None: + registryServer = signaturesAndManifest["registryServer"] + usableRegistryServer = ( + DEFAULT_DOCKER_REGISTRY + if registryServer.endswith("." + DEFAULT_DOCKER_REGISTRY) + else registryServer + ) fingerprint = cast( # Maybe in the future registryServer + '/' + repo + "@" + partial_fingerprint "Fingerprint", - repo + "@" + partial_fingerprint, + usableRegistryServer + + "/" + + repo + + "@" + + partial_fingerprint, ) else: # TODO: is there a better alternative? @@ -860,7 +890,9 @@ def deploySingleContainer( localPath=containerPath, registries=container.registries, metadataLocalPath=containerPathMeta, - source_type=container.type, + source_type=container.source_type + if isinstance(container, Container) + else container.type, image_signature=imageSignature_in_metadata, ) From 2c6be81233d7c5a39d4b187ae3f658252da14870 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 6 Jun 2024 13:55:22 +0200 Subject: [PATCH 16/62] In docker (and podman) containers, use `docker image inspect` instead of `docker inspect`, to avoid possible collisions with other inspectable objects. --- .../container_factories/abstract_docker_container.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wfexs_backend/container_factories/abstract_docker_container.py b/wfexs_backend/container_factories/abstract_docker_container.py index 47bc4464..4d160a7c 100644 --- a/wfexs_backend/container_factories/abstract_docker_container.py +++ b/wfexs_backend/container_factories/abstract_docker_container.py @@ -181,14 +181,14 @@ def _inspect( with tempfile.NamedTemporaryFile() as d_out, tempfile.NamedTemporaryFile() as d_err: self.logger.debug(f"querying {self.variant_name()} container {dockerTag}") d_retval = subprocess.Popen( - [self.runtime_cmd, "inspect", dockerTag], + [self.runtime_cmd, "image", "inspect", dockerTag], env=matEnv, stdout=d_out, stderr=d_err, ).wait() self.logger.debug( - f"{self.variant_name()} inspect {dockerTag} retval: {d_retval}" + f"{self.variant_name()} image inspect {dockerTag} retval: {d_retval}" ) with open(d_out.name, mode="rb") as c_stF: From bcac2dbdb585265b845cef6692b012d68ad4e1fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 6 Jun 2024 13:57:30 +0200 Subject: [PATCH 17/62] Added tests for both podman and docker container factories. Also, fixed several misbehaviours and inconsistencies among them. 
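
For reference, the registry enrichment added as `_enrichFingerprint` in
`wfexs_backend/container_factories/docker_container.py` roughly behaves like the
standalone sketch below (the helper name and the assumption that the imported
`DEFAULT_DOCKER_REGISTRY` constant resolves to "docker.io" are illustrative, not
part of the patch):

    def enrich_fingerprint(fingerprint: str, registry: str = "docker.io") -> str:
        # Bare repository name:
        #   "busybox@sha256:..." -> "docker.io/library/busybox@sha256:..."
        if "/" not in fingerprint:
            return f"{registry}/library/{fingerprint}"
        # Single path component: the configured registry is prepended,
        #   e.g. "quay/busybox@sha256:..." -> "quay.io/quay/busybox@sha256:..."
        #   when the tag declares registry "quay.io"
        if fingerprint.find("/") == fingerprint.rfind("/"):
            return f"{registry}/{fingerprint}"
        # Already qualified with a registry: left untouched
        return fingerprint

The new tests in `tests/containers/test_docker.py` and `tests/containers/test_podman.py`
expect fingerprints in this fully qualified form.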
--- tests/containers/test_docker.py | 220 ++++++++++++++++++ tests/containers/test_podman.py | 220 ++++++++++++++++++ .../container_factories/docker_container.py | 45 +++- .../container_factories/podman_container.py | 14 +- 4 files changed, 485 insertions(+), 14 deletions(-) create mode 100644 tests/containers/test_docker.py create mode 100644 tests/containers/test_podman.py diff --git a/tests/containers/test_docker.py b/tests/containers/test_docker.py new file mode 100644 index 00000000..c5a89efc --- /dev/null +++ b/tests/containers/test_docker.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import logging + +from typing import ( + cast, + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from wfexs_backend.common import ( + Fingerprint, + RelPath, + URIType, + ) + + from wfexs_backend.container_factories import ( + ContainerOperatingSystem, + ProcessorArchitecture, + ) + +from wfexs_backend.common import ( + ContainerTaggedName, + ContainerType, +) + +from wfexs_backend.container_factories import ( + Container, + ContainerEngineException, + ContainerFactoryException, +) + +from wfexs_backend.container_factories.docker_container import ( + DockerContainerFactory, +) + +from tests.containers.util import simpleTestContainerFileName + +# Enable logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def test_docker_basic(tmpdir) -> "None": # type: ignore[no-untyped-def] + """ + Check docker container factory instantiation + """ + temppath = tmpdir.mkdir("TEMP") + cachepath = tmpdir.mkdir("CACHE") + stagedpath = tmpdir.mkdir("STAGED") + dcf = DockerContainerFactory( + simpleFileNameMethod=simpleTestContainerFileName, + containersCacheDir=cachepath.strpath, + stagedContainersDir=stagedpath.strpath, + tempDir=temppath.strpath, + ) + + +TAGGED_TESTBED = pytest.mark.parametrize( + ["cont_tagged"], + [ + ( + ContainerTaggedName( + origTaggedName="busybox:stable", + type=ContainerType.Docker, + ), + ), + ( + ContainerTaggedName( + origTaggedName="quay/busybox:latest", + type=ContainerType.Docker, + registries={ + ContainerType.Docker: "quay.io", + }, + ), + ), + ( + Container( + origTaggedName="busybox:stable", + type=ContainerType.Docker, + taggedName=cast("URIType", "busybox:stable"), + operatingSystem=cast("ContainerOperatingSystem", "linux"), + architecture=cast("ProcessorArchitecture", "amd64"), + signature=cast( + "Fingerprint", "sha256=sBYlsI2WjxCwGm9juxawq1ryW3OpivFxFUWxEvQ9vBU=" + ), + fingerprint=cast( + "Fingerprint", + "docker.io/library/busybox@sha256:9ae97d36d26566ff84e8893c64a6dc4fe8ca6d1144bf5b87b2b85a32def253c7", + ), + source_type=ContainerType.Docker, + image_signature=cast( + "Fingerprint", "sha256=sBYlsI2WjxCwGm9juxawq1ryW3OpivFxFUWxEvQ9vBU=" + ), + ), + ), + ( + Container( + origTaggedName="quay/busybox:latest", + type=ContainerType.Docker, + registries={ + 
ContainerType.Docker: "quay.io", + }, + taggedName=cast("URIType", "quay.io/quay/busybox:latest"), + operatingSystem=cast("ContainerOperatingSystem", "linux"), + architecture=cast("ProcessorArchitecture", "amd64"), + signature=cast( + "Fingerprint", "sha256=WTkWLbkE2f3HvwpLcWIOMaW85YxuZBCPXSffBez6hKY=" + ), + fingerprint=cast( + "Fingerprint", + "quay.io/quay/busybox@sha256:92f3298bf80a1ba949140d77987f5de081f010337880cd771f7e7fc928f8c74d", + ), + source_type=ContainerType.Docker, + image_signature=cast( + "Fingerprint", "sha256=WTkWLbkE2f3HvwpLcWIOMaW85YxuZBCPXSffBez6hKY=" + ), + ), + ), + ], +) + + +@TAGGED_TESTBED +def test_docker_container_tagged_name(cont_tagged: "ContainerTaggedName", tmpdir) -> "None": # type: ignore[no-untyped-def] + """ + Check docker container factory instantiation + """ + temppath = tmpdir.mkdir("TEMP") + cachepath = tmpdir.mkdir("CACHE") + stagedpath = tmpdir.mkdir("STAGED") + dcf = DockerContainerFactory( + simpleFileNameMethod=simpleTestContainerFileName, + containersCacheDir=cachepath.strpath, + stagedContainersDir=stagedpath.strpath, + tempDir=temppath.strpath, + ) + containers = dcf.materializeContainers(tagList=[cont_tagged]) + if isinstance(cont_tagged, Container): + for attr in ( + "origTaggedName", + "type", + "registries", + "taggedName", + "architecture", + "operatingSystem", + "fingerprint", + "source_type", + ): + assert getattr(cont_tagged, attr) == getattr( + containers[0], attr + ), f"Expected and obtainer container '{attr}' do not match: {getattr(cont_tagged, attr)} vs {getattr(containers[0], attr)}" + + +@TAGGED_TESTBED +def test_docker_container_tagged_name_fail(cont_tagged: "ContainerTaggedName", tmpdir) -> "None": # type: ignore[no-untyped-def] + """ + Check docker container factory instantiation + """ + temppath = tmpdir.mkdir("TEMP") + cachepath = tmpdir.mkdir("CACHE") + stagedpath = tmpdir.mkdir("STAGED") + dcf = DockerContainerFactory( + simpleFileNameMethod=simpleTestContainerFileName, + containersCacheDir=cachepath.strpath, + stagedContainersDir=stagedpath.strpath, + tempDir=temppath.strpath, + ) + with pytest.raises(ContainerFactoryException): + containers = dcf.materializeContainers(tagList=[cont_tagged], offline=True) + logger.info(containers) + + +@TAGGED_TESTBED +def test_docker_container_tagged_name_cached(cont_tagged: "ContainerTaggedName", tmpdir) -> "None": # type: ignore[no-untyped-def] + """ + Check docker container factory instantiation + """ + temppath = tmpdir.mkdir("TEMP") + cachepath = tmpdir.mkdir("CACHE") + stagedpath = tmpdir.mkdir("STAGED") + dcf = DockerContainerFactory( + simpleFileNameMethod=simpleTestContainerFileName, + containersCacheDir=cachepath.strpath, + stagedContainersDir=stagedpath.strpath, + tempDir=temppath.strpath, + ) + containers = dcf.materializeContainers(tagList=[cont_tagged]) + containers2 = dcf.materializeContainers(tagList=[cont_tagged], offline=True) + for container, container2 in zip(containers, containers2): + for attr in ( + "origTaggedName", + "type", + "registries", + "taggedName", + "architecture", + "operatingSystem", + "fingerprint", + "source_type", + ): + assert getattr(cont_tagged, attr) == getattr( + containers[0], attr + ), f"Expected and obtainer container '{attr}' do not match: {getattr(cont_tagged, attr)} vs {getattr(containers[0], attr)}" diff --git a/tests/containers/test_podman.py b/tests/containers/test_podman.py new file mode 100644 index 00000000..35bda517 --- /dev/null +++ b/tests/containers/test_podman.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +# -*- coding: utf-8 
-*- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import logging + +from typing import ( + cast, + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from wfexs_backend.common import ( + Fingerprint, + RelPath, + URIType, + ) + + from wfexs_backend.container_factories import ( + ContainerOperatingSystem, + ProcessorArchitecture, + ) + +from wfexs_backend.common import ( + ContainerTaggedName, + ContainerType, +) + +from wfexs_backend.container_factories import ( + Container, + ContainerEngineException, + ContainerFactoryException, +) + +from wfexs_backend.container_factories.podman_container import ( + PodmanContainerFactory, +) + +from tests.containers.util import simpleTestContainerFileName + +# Enable logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def test_podman_basic(tmpdir) -> "None": # type: ignore[no-untyped-def] + """ + Check podman container factory instantiation + """ + temppath = tmpdir.mkdir("TEMP") + cachepath = tmpdir.mkdir("CACHE") + stagedpath = tmpdir.mkdir("STAGED") + pcf = PodmanContainerFactory( + simpleFileNameMethod=simpleTestContainerFileName, + containersCacheDir=cachepath.strpath, + stagedContainersDir=stagedpath.strpath, + tempDir=temppath.strpath, + ) + + +TAGGED_TESTBED = pytest.mark.parametrize( + ["cont_tagged"], + [ + ( + ContainerTaggedName( + origTaggedName="busybox:stable", + type=ContainerType.Docker, + ), + ), + ( + ContainerTaggedName( + origTaggedName="quay/busybox:latest", + type=ContainerType.Docker, + registries={ + ContainerType.Docker: "quay.io", + }, + ), + ), + ( + Container( + origTaggedName="busybox:stable", + type=ContainerType.Podman, + taggedName=cast("URIType", "busybox:stable"), + operatingSystem=cast("ContainerOperatingSystem", "linux"), + architecture=cast("ProcessorArchitecture", "amd64"), + signature=cast( + "Fingerprint", "sha256=sBYlsI2WjxCwGm9juxawq1ryW3OpivFxFUWxEvQ9vBU=" + ), + fingerprint=cast( + "Fingerprint", + "docker.io/library/busybox@sha256:50aa4698fa6262977cff89181b2664b99d8a56dbca847bf62f2ef04854597cf8", + ), + source_type=ContainerType.Docker, + image_signature=cast( + "Fingerprint", "sha256=sBYlsI2WjxCwGm9juxawq1ryW3OpivFxFUWxEvQ9vBU=" + ), + ), + ), + ( + Container( + origTaggedName="quay/busybox:latest", + type=ContainerType.Podman, + registries={ + ContainerType.Docker: "quay.io", + }, + taggedName=cast("URIType", "quay.io/quay/busybox:latest"), + operatingSystem=cast("ContainerOperatingSystem", "linux"), + architecture=cast("ProcessorArchitecture", "amd64"), + signature=cast( + "Fingerprint", "sha256=WTkWLbkE2f3HvwpLcWIOMaW85YxuZBCPXSffBez6hKY=" + ), + fingerprint=cast( + "Fingerprint", + "quay.io/quay/busybox@sha256:92f3298bf80a1ba949140d77987f5de081f010337880cd771f7e7fc928f8c74d", + ), + source_type=ContainerType.Docker, + image_signature=cast( + "Fingerprint", "sha256=WTkWLbkE2f3HvwpLcWIOMaW85YxuZBCPXSffBez6hKY=" + ), + ), + ), + ], +) + + +@TAGGED_TESTBED +def 
test_podman_container_tagged_name(cont_tagged: "ContainerTaggedName", tmpdir) -> "None": # type: ignore[no-untyped-def] + """ + Check podman container factory instantiation + """ + temppath = tmpdir.mkdir("TEMP") + cachepath = tmpdir.mkdir("CACHE") + stagedpath = tmpdir.mkdir("STAGED") + pcf = PodmanContainerFactory( + simpleFileNameMethod=simpleTestContainerFileName, + containersCacheDir=cachepath.strpath, + stagedContainersDir=stagedpath.strpath, + tempDir=temppath.strpath, + ) + containers = pcf.materializeContainers(tagList=[cont_tagged]) + if isinstance(cont_tagged, Container): + for attr in ( + "origTaggedName", + "type", + "registries", + "taggedName", + "architecture", + "operatingSystem", + "fingerprint", + "source_type", + ): + assert getattr(cont_tagged, attr) == getattr( + containers[0], attr + ), f"Expected and obtainer container '{attr}' do not match: {getattr(cont_tagged, attr)} vs {getattr(containers[0], attr)}" + + +@TAGGED_TESTBED +def test_podman_container_tagged_name_fail(cont_tagged: "ContainerTaggedName", tmpdir) -> "None": # type: ignore[no-untyped-def] + """ + Check podman container factory instantiation + """ + temppath = tmpdir.mkdir("TEMP") + cachepath = tmpdir.mkdir("CACHE") + stagedpath = tmpdir.mkdir("STAGED") + pcf = PodmanContainerFactory( + simpleFileNameMethod=simpleTestContainerFileName, + containersCacheDir=cachepath.strpath, + stagedContainersDir=stagedpath.strpath, + tempDir=temppath.strpath, + ) + with pytest.raises(ContainerFactoryException): + containers = pcf.materializeContainers(tagList=[cont_tagged], offline=True) + logger.info(containers) + + +@TAGGED_TESTBED +def test_podman_container_tagged_name_cached(cont_tagged: "ContainerTaggedName", tmpdir) -> "None": # type: ignore[no-untyped-def] + """ + Check podman container factory instantiation + """ + temppath = tmpdir.mkdir("TEMP") + cachepath = tmpdir.mkdir("CACHE") + stagedpath = tmpdir.mkdir("STAGED") + pcf = PodmanContainerFactory( + simpleFileNameMethod=simpleTestContainerFileName, + containersCacheDir=cachepath.strpath, + stagedContainersDir=stagedpath.strpath, + tempDir=temppath.strpath, + ) + containers = pcf.materializeContainers(tagList=[cont_tagged]) + containers2 = pcf.materializeContainers(tagList=[cont_tagged], offline=True) + for container, container2 in zip(containers, containers2): + for attr in ( + "origTaggedName", + "type", + "registries", + "taggedName", + "architecture", + "operatingSystem", + "fingerprint", + "source_type", + ): + assert getattr(cont_tagged, attr) == getattr( + containers[0], attr + ), f"Expected and obtainer container '{attr}' do not match: {getattr(cont_tagged, attr)} vs {getattr(containers[0], attr)}" diff --git a/wfexs_backend/container_factories/docker_container.py b/wfexs_backend/container_factories/docker_container.py index fe5b500d..dd2e9785 100644 --- a/wfexs_backend/container_factories/docker_container.py +++ b/wfexs_backend/container_factories/docker_container.py @@ -68,6 +68,7 @@ Container, ContainerEngineException, ContainerFactoryException, + DEFAULT_DOCKER_REGISTRY, ) from .abstract_docker_container import ( AbstractDockerContainerFactory, @@ -173,24 +174,46 @@ def _genDockerTag( dockerTag = f"{registry}/{dockerTag}" # Last case, it already has a registry declared - if isinstance(tag, Container) and tag.signature is not None: + # This is needed .... 
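+        # When the tag comes from a previously recorded Container whose fingerprint
+        # carries a repo digest, the pull reference is pinned to that digest (unless
+        # the tag is already digest-pinned), so the very same image is re-materialized.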
+ if isinstance(tag, Container) and tag.fingerprint is not None: shapos = dockerTag.rfind("@sha256:") - if shapos != -1: + atpos = tag.fingerprint.rfind("@") + if shapos != -1 or atpos <= 0: # The sha256 tag takes precedence over the recorded signature dockerPullTag = dockerTag else: + partial_fingerprint = tag.fingerprint[atpos:] colonpos = dockerTag.rfind(":") slashpos = dockerTag.rfind("/") if colonpos > slashpos: dockerPullTag = dockerTag[:colonpos] else: dockerPullTag = dockerTag - dockerPullTag += "@sha256:" + tag.signature + dockerPullTag += partial_fingerprint else: dockerPullTag = dockerTag return cast("URIType", dockerTag), dockerPullTag + def _enrichFingerprint( + self, fingerprint: "str", tag: "ContainerTaggedName" + ) -> "Fingerprint": + # Should we enrich it? + if isinstance(tag.registries, dict) and ( + ContainerType.Docker in tag.registries + ): + registry = tag.registries[ContainerType.Docker] + else: + registry = DEFAULT_DOCKER_REGISTRY + # Bare case + if "/" not in fingerprint: + fingerprint = f"{registry}/library/{fingerprint}" + elif fingerprint.find("/") == fingerprint.rfind("/"): + fingerprint = f"{registry}/{fingerprint}" + # Last case, it already has a registry declared + + return cast("Fingerprint", fingerprint) + def materializeSingleContainer( self, tag: "ContainerTaggedName", @@ -389,9 +412,9 @@ def materializeSingleContainer( # Now the image is not loaded here, but later in deploySingleContainer # Then, compute the fingerprint - fingerprint = None + fingerprint: "Optional[Fingerprint]" = None if len(manifest["RepoDigests"]) > 0: - fingerprint = manifest["RepoDigests"][0] + fingerprint = self._enrichFingerprint(manifest["RepoDigests"][0], tag) # Learning about the intended processor architecture and variant architecture = manifest.get("Architecture") @@ -411,7 +434,7 @@ def materializeSingleContainer( localPath=containerPath, registries=tag.registries, metadataLocalPath=containerPathMeta, - source_type=tag.type, + source_type=tag.source_type if isinstance(tag, Container) else tag.type, image_signature=imageSignature, ) @@ -466,9 +489,11 @@ def deploySingleContainer( image_id = signaturesAndManifest["image_id"] # Then, compute the fingerprint - fingerprint = None + fingerprint: "Optional[Fingerprint]" = None if len(manifest["RepoDigests"]) > 0: - fingerprint = manifest["RepoDigests"][0] + fingerprint = self._enrichFingerprint( + manifest["RepoDigests"][0], container + ) # Learning about the intended processor architecture and variant architecture = manifest.get("Architecture") @@ -488,7 +513,9 @@ def deploySingleContainer( localPath=containerPath, registries=container.registries, metadataLocalPath=containerPathMeta, - source_type=container.type, + source_type=container.source_type + if isinstance(container, Container) + else container.type, image_signature=imageSignature_in_metadata, ) except Exception as e: diff --git a/wfexs_backend/container_factories/podman_container.py b/wfexs_backend/container_factories/podman_container.py index 48dc4e10..f48c424a 100644 --- a/wfexs_backend/container_factories/podman_container.py +++ b/wfexs_backend/container_factories/podman_container.py @@ -197,19 +197,21 @@ def _genPodmanTag( # Last case, it already has a registry declared # This is needed .... 
- if isinstance(tag, Container) and tag.signature is not None: + if isinstance(tag, Container) and tag.fingerprint is not None: shapos = dockerTag.rfind("@sha256:") - if shapos != -1: + atpos = tag.fingerprint.rfind("@") + if shapos != -1 or atpos <= 0: # The sha256 tag takes precedence over the recorded signature dockerPullTag = dockerTag else: + partial_fingerprint = tag.fingerprint[atpos:] colonpos = dockerTag.rfind(":") slashpos = dockerTag.rfind("/") if colonpos > slashpos: dockerPullTag = dockerTag[:colonpos] else: dockerPullTag = dockerTag - dockerPullTag += "@sha256:" + tag.signature + dockerPullTag += partial_fingerprint else: dockerPullTag = dockerTag @@ -439,7 +441,7 @@ def materializeSingleContainer( localPath=containerPath, registries=tag.registries, metadataLocalPath=containerPathMeta, - source_type=tag.type, + source_type=tag.source_type if isinstance(tag, Container) else tag.type, image_signature=imageSignature, ) @@ -519,7 +521,9 @@ def deploySingleContainer( localPath=containerPath, registries=container.registries, metadataLocalPath=containerPathMeta, - source_type=container.type, + source_type=container.source_type + if isinstance(container, Container) + else container.type, image_signature=imageSignature_in_metadata, ) except Exception as e: From 82e22fb04ed8841b4f5e8867c13b99f761badf2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 6 Jun 2024 14:06:22 +0200 Subject: [PATCH 18/62] Fixed container caching tests, which were using wrong values for their comparisons --- tests/containers/test_docker.py | 6 +++--- tests/containers/test_podman.py | 6 +++--- tests/containers/test_singularity.py | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/containers/test_docker.py b/tests/containers/test_docker.py index c5a89efc..c153252c 100644 --- a/tests/containers/test_docker.py +++ b/tests/containers/test_docker.py @@ -215,6 +215,6 @@ def test_docker_container_tagged_name_cached(cont_tagged: "ContainerTaggedName", "fingerprint", "source_type", ): - assert getattr(cont_tagged, attr) == getattr( - containers[0], attr - ), f"Expected and obtainer container '{attr}' do not match: {getattr(cont_tagged, attr)} vs {getattr(containers[0], attr)}" + assert getattr(container, attr) == getattr( + container2, attr + ), f"Expected and obtainer container '{attr}' do not match: {getattr(container, attr)} vs {getattr(container2, attr)}" diff --git a/tests/containers/test_podman.py b/tests/containers/test_podman.py index 35bda517..fc5b51f9 100644 --- a/tests/containers/test_podman.py +++ b/tests/containers/test_podman.py @@ -215,6 +215,6 @@ def test_podman_container_tagged_name_cached(cont_tagged: "ContainerTaggedName", "fingerprint", "source_type", ): - assert getattr(cont_tagged, attr) == getattr( - containers[0], attr - ), f"Expected and obtainer container '{attr}' do not match: {getattr(cont_tagged, attr)} vs {getattr(containers[0], attr)}" + assert getattr(container, attr) == getattr( + container2, attr + ), f"Expected and obtainer container '{attr}' do not match: {getattr(container, attr)} vs {getattr(container2, attr)}" diff --git a/tests/containers/test_singularity.py b/tests/containers/test_singularity.py index ed4711e4..bb8f6513 100644 --- a/tests/containers/test_singularity.py +++ b/tests/containers/test_singularity.py @@ -212,6 +212,6 @@ def test_singularity_container_tagged_name_cached(cont_tagged: "ContainerTaggedN "fingerprint", "source_type", ): - assert getattr(cont_tagged, attr) == getattr( - containers[0], attr 
- ), f"Expected and obtainer container '{attr}' do not match: {getattr(cont_tagged, attr)} vs {getattr(containers[0], attr)}" + assert getattr(container, attr) == getattr( + container2, attr + ), f"Expected and obtainer container '{attr}' do not match: {getattr(container, attr)} vs {getattr(container2, attr)}" From 0866d064b9af49b644cfce9fca9e10d510523be6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 7 Jun 2024 02:09:56 +0200 Subject: [PATCH 19/62] Added command-line parameters for the reproducibility level and for the strictness. Also, added all the needed intermediate method parameters to propagate the values of these new parameters to the WF constructor. --- wfexs_backend/__main__.py | 31 ++++++++++++- wfexs_backend/wfexs_backend.py | 9 ++++ wfexs_backend/workflow.py | 82 +++++++++++++++++++++++++++++++++- 3 files changed, 120 insertions(+), 2 deletions(-) diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py index b46e4fe0..5ebc6b2f 100644 --- a/wfexs_backend/__main__.py +++ b/wfexs_backend/__main__.py @@ -80,7 +80,10 @@ class BasicLoggingConfigDict(TypedDict): from .security_context import SecurityContextVault from .wfexs_backend import WfExSBackend -from .workflow import WF +from .workflow import ( + ReproducibilityLevel, + WF, +) from . import get_WfExS_version_str from .utils.licences import LicenceMatcherSingleton from .utils.misc import DatetimeEncoder @@ -274,6 +277,28 @@ def genParserSub( help="Make secured working directory (default)", ) + ap_.add_argument( + "--strict-reproducibility", + dest="strict_reproducibility_level", + action="store_true", + default=False, + help="Strict reproducibility", + ) + ap_.add_argument( + "--no-strict-reproducibility", + dest="strict_reproducibility_level", + action="store_false", + help="Permissive reproducibility", + ) + + ap_.add_argument( + "--reproducibility-level", + dest="reproducibility_level", + default=ReproducibilityLevel.Metadata, + choices=ReproducibilityLevel, + help="Max reproducibility level to be tried", + ) + if preStageParams or exportParams or command == WfExS_Commands.ReStage: ap_.add_argument( "-Z", @@ -1433,6 +1458,8 @@ def main() -> None: private_key_passphrase=private_key_passphrase, orcids=op_orcids, secure=args.secure, + reproducibility_level=args.reproducibility_level, + strict_reproducibility_level=args.strict_reproducibility_level, ) else: print( @@ -1469,6 +1496,8 @@ def main() -> None: private_key_passphrase=private_key_passphrase, orcids=op_orcids, secure=args.secure, + reproducibility_level=args.reproducibility_level, + strict_reproducibility_level=args.strict_reproducibility_level, ) wfSetup = wfInstance.getStagedSetup() diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 34d12f3c..94494edd 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -138,6 +138,7 @@ from .pushers import AbstractExportPlugin from .workflow import ( + ReproducibilityLevel, WF, WFException, ) @@ -1312,6 +1313,8 @@ def fromPreviousInstanceDeclaration( private_key_passphrase: "Optional[str]" = None, secure: "bool" = True, paranoidMode: "bool" = False, + reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, + strict_reproducibility_level: "bool" = False, ) -> "WF": return WF.FromPreviousInstanceDeclaration( self, @@ -1325,6 +1328,8 @@ def fromPreviousInstanceDeclaration( private_key_passphrase=private_key_passphrase, secure=secure, paranoidMode=paranoidMode, + 
reproducibility_level=reproducibility_level, + strict_reproducibility_level=strict_reproducibility_level, ) def fromPreviousROCrate( @@ -1339,6 +1344,8 @@ def fromPreviousROCrate( private_key_passphrase: "Optional[str]" = None, secure: "bool" = True, paranoidMode: "bool" = False, + reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, + strict_reproducibility_level: "bool" = False, ) -> "WF": # Let's check whether it is a local file # or a remote RO-Crate @@ -1370,6 +1377,8 @@ def fromPreviousROCrate( private_key_passphrase=private_key_passphrase, secure=secure, paranoidMode=paranoidMode, + reproducibility_level=reproducibility_level, + strict_reproducibility_level=strict_reproducibility_level, ) def parseAndValidateSecurityContextFile( diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index a09557b1..1d3c1906 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -20,6 +20,7 @@ import atexit import copy import datetime +import enum import inspect import json import logging @@ -40,6 +41,7 @@ cast, Dict, NamedTuple, + # This one might be needed for proper unmarshalling Pattern, TYPE_CHECKING, TypeVar, @@ -77,7 +79,6 @@ MutableMapping, MutableSequence, Optional, - Pattern, Sequence, Set, Tuple, @@ -338,6 +339,12 @@ from exceptiongroup import ExceptionGroup +class ReproducibilityLevel(enum.IntEnum): + Minimal = enum.auto() # Minimal / no reproducibility is requested + Metadata = enum.auto() # Metadata reproducibility is requested + Strict = enum.auto() # Strict reproducibility (metadata + payload) is required") + + # Related export namedtuples class ExportItem(NamedTuple): type: "ExportItemType" @@ -459,6 +466,12 @@ def __init__( private_key_filename: "Optional[AnyPath]" = None, private_key_passphrase: "Optional[str]" = None, fail_ok: "bool" = False, + cached_workflow: "Optional[LocalWorkflow]" = None, + cached_inputs: "Optional[Sequence[MaterializedInput]]" = None, + cached_environment: "Optional[Sequence[MaterializedInput]]" = None, + preferred_containers: "Sequence[Container]" = [], + reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, + strict_reproducibility_level: "bool" = False, ): """ Init function @@ -508,6 +521,17 @@ def __init__( ) self.wfexs = wfexs + + # These internal variables are needed for imports. 
+ # They are not preserved in the marshalled staging state, so + # their effects are only in the initial session + self.cached_workflow = cached_workflow + self.cached_inputs = cached_inputs + self.cached_environment = cached_environment + self.preferred_containers = copy.copy(preferred_containers) + self.reproducibility_level = reproducibility_level + self.strict_reproducibility_level = strict_reproducibility_level + self.encWorkDir: "Optional[AbsPath]" = None self.workDir: "Optional[AbsPath]" = None @@ -1170,6 +1194,21 @@ def getMarshallingStatus(self, reread_stats: "bool" = False) -> "MarshallingStat else [], ) + def getMaterializedWorkflow(self) -> "Optional[LocalWorkflow]": + return ( + self.localWorkflow + if self.materializedEngine is None + else self.materializedEngine.workflow + ) + + def getMaterializedContainers(self) -> "Sequence[Container]": + containers: "Sequence[Container]" = [] + if self.materializedEngine is not None: + if self.materializedEngine.containers is not None: + containers = self.materializedEngine.containers + + return containers + def enableParanoidMode(self) -> None: self.paranoidMode = True @@ -1288,6 +1327,7 @@ def FromFiles( private_key_filename=private_key_filename, private_key_passphrase=private_key_passphrase, paranoidMode=paranoidMode, + reproducibility_level=ReproducibilityLevel.Minimal, ) @classmethod @@ -1302,6 +1342,12 @@ def FromStagedRecipe( private_key_filename: "Optional[AnyPath]" = None, private_key_passphrase: "Optional[str]" = None, paranoidMode: "bool" = False, + cached_workflow: "Optional[LocalWorkflow]" = None, + cached_inputs: "Optional[Sequence[MaterializedInput]]" = None, + cached_environment: "Optional[Sequence[MaterializedInput]]" = None, + preferred_containers: "Sequence[Container]" = [], + reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, + strict_reproducibility_level: "bool" = False, ) -> "WF": """ This class method creates a new staged working directory @@ -1333,6 +1379,12 @@ def FromStagedRecipe( public_key_filenames=public_key_filenames, private_key_filename=private_key_filename, private_key_passphrase=private_key_passphrase, + cached_workflow=cached_workflow, + cached_inputs=cached_inputs, + cached_environment=cached_environment, + preferred_containers=preferred_containers, + reproducibility_level=reproducibility_level, + strict_reproducibility_level=strict_reproducibility_level, ) @classmethod @@ -1349,6 +1401,8 @@ def FromPreviousInstanceDeclaration( private_key_passphrase: "Optional[str]" = None, secure: "bool" = True, paranoidMode: "bool" = False, + reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, + strict_reproducibility_level: "bool" = False, ) -> "WF": """ This class method creates a new staged working directory @@ -1385,6 +1439,12 @@ def FromPreviousInstanceDeclaration( private_key_filename=private_key_filename, private_key_passphrase=private_key_passphrase, paranoidMode=paranoidMode, + cached_workflow=wfInstance.getMaterializedWorkflow(), + cached_inputs=wfInstance.materializedParams, + cached_environment=wfInstance.materializedEnvironment, + preferred_containers=wfInstance.getMaterializedContainers(), + reproducibility_level=reproducibility_level, + strict_reproducibility_level=strict_reproducibility_level, ) @classmethod @@ -1402,6 +1462,8 @@ def FromPreviousROCrate( private_key_passphrase: "Optional[str]" = None, secure: "bool" = True, paranoidMode: "bool" = False, + reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, + 
strict_reproducibility_level: "bool" = False, ) -> "WF": """ This class method creates a new staged working directory @@ -1463,6 +1525,12 @@ def FromPreviousROCrate( private_key_filename=private_key_filename, private_key_passphrase=private_key_passphrase, paranoidMode=paranoidMode, + # cached_workflow= , + # cached_inputs= , + # cached_environment= , + preferred_containers=the_containers, + reproducibility_level=reproducibility_level, + strict_reproducibility_level=strict_reproducibility_level, ) @classmethod @@ -1476,6 +1544,12 @@ def FromDescription( private_key_filename: "Optional[AnyPath]" = None, private_key_passphrase: "Optional[str]" = None, paranoidMode: "bool" = False, + cached_workflow: "Optional[LocalWorkflow]" = None, + cached_inputs: "Optional[Sequence[MaterializedInput]]" = None, + cached_environment: "Optional[Sequence[MaterializedInput]]" = None, + preferred_containers: "Sequence[Container]" = [], + reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, + strict_reproducibility_level: "bool" = False, ) -> "WF": """ This class method might create a new staged working directory @@ -1514,6 +1588,12 @@ def FromDescription( private_key_filename=private_key_filename, private_key_passphrase=private_key_passphrase, paranoid_mode=paranoidMode, + cached_workflow=cached_workflow, + cached_inputs=cached_inputs, + cached_environment=cached_environment, + preferred_containers=preferred_containers, + reproducibility_level=reproducibility_level, + strict_reproducibility_level=strict_reproducibility_level, ) @classmethod From bd9c20fe7362a9aab39b25e3b7700d669b882fa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 7 Jun 2024 02:58:29 +0200 Subject: [PATCH 20/62] Added the scaffolding code needed to provide the rescued cached workflow, container images, cached inputs and environment. --- wfexs_backend/__main__.py | 6 ++++-- wfexs_backend/utils/rocrate.py | 20 ++++++++++++++++++-- wfexs_backend/wfexs_backend.py | 5 ++++- wfexs_backend/workflow.py | 23 +++++++++++------------ 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py index 5ebc6b2f..4a5b8b1c 100644 --- a/wfexs_backend/__main__.py +++ b/wfexs_backend/__main__.py @@ -79,9 +79,11 @@ class BasicLoggingConfigDict(TypedDict): from yaml import Loader as YAMLLoader, Dumper as YAMLDumper from .security_context import SecurityContextVault +from .utils.rocrate import ( + ReproducibilityLevel, +) from .wfexs_backend import WfExSBackend from .workflow import ( - ReproducibilityLevel, WF, ) from . 
import get_WfExS_version_str @@ -1268,7 +1270,7 @@ def main() -> None: localConfigFilename = args.localConfigFilename if localConfigFilename and os.path.exists(localConfigFilename): with open(localConfigFilename, mode="r", encoding="utf-8") as cf: - local_config = yaml.load(cf, Loader=YAMLLoader) + local_config = yaml.safe_load(cf) else: local_config = {} if localConfigFilename and not os.path.exists(localConfigFilename): diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 5822c41f..40f9b8c8 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -95,6 +95,8 @@ from ..common import ( ContainerType, ContentKind, + LocalWorkflow, + MaterializedInput, ) from ..container_factories import ( @@ -118,6 +120,12 @@ # import magic +class ReproducibilityLevel(enum.IntEnum): + Minimal = enum.auto() # Minimal / no reproducibility is requested + Metadata = enum.auto() # Metadata reproducibility is requested + Strict = enum.auto() # Strict reproducibility (metadata + payload) is required") + + class ContainerTypeMetadata(NamedTuple): sa_id: "str" applicationCategory: "str" @@ -1899,7 +1907,8 @@ def generateWorkflowMetaFromJSONLD( jsonld_obj: "Mapping[str, Any]", public_name: "str", retrospective_first: "bool" = True, - ) -> "Tuple[RemoteRepo, WorkflowType, ContainerType, Sequence[Container], ParamsBlock, EnvironmentBlock, OutputsBlock]": + reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, + ) -> "Tuple[RemoteRepo, WorkflowType, ContainerType, ParamsBlock, EnvironmentBlock, OutputsBlock, Optional[LocalWorkflow], Sequence[Container], Optional[Sequence[MaterializedInput]], Optional[Sequence[MaterializedInput]]]": matched_crate, g = self.identifyROCrate(jsonld_obj, public_name) # Is it an RO-Crate? 
if matched_crate is None: @@ -1922,6 +1931,10 @@ def generateWorkflowMetaFromJSONLD( f"JSON-LD from {public_name} is not a WRROC Workflow" ) + cached_workflow: "Optional[LocalWorkflow]" = None + cached_inputs: "Optional[Sequence[MaterializedInput]]" = None + cached_environment: "Optional[Sequence[MaterializedInput]]" = None + # The default crate licences crate_licences = self._getLicences(g, matched_crate.mainentity, public_name) @@ -2068,8 +2081,11 @@ def generateWorkflowMetaFromJSONLD( repo, workflow_type, container_type, - the_containers, params, environment, outputs, + cached_workflow, + the_containers, + cached_inputs, + cached_environment, ) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 94494edd..3a1d74f9 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -137,8 +137,11 @@ from .pushers import AbstractExportPlugin -from .workflow import ( +from .utils.rocrate import ( ReproducibilityLevel, +) + +from .workflow import ( WF, WFException, ) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 1d3c1906..b45f5d9d 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -20,7 +20,6 @@ import atexit import copy import datetime -import enum import inspect import json import logging @@ -248,6 +247,7 @@ ) from .utils.rocrate import ( ReadROCrateMetadata, + ReproducibilityLevel, ) from .security_context import ( @@ -339,12 +339,6 @@ from exceptiongroup import ExceptionGroup -class ReproducibilityLevel(enum.IntEnum): - Minimal = enum.auto() # Minimal / no reproducibility is requested - Metadata = enum.auto() # Metadata reproducibility is requested - Strict = enum.auto() # Strict reproducibility (metadata + payload) is required") - - # Related export namedtuples class ExportItem(NamedTuple): type: "ExportItemType" @@ -1476,12 +1470,17 @@ def FromPreviousROCrate( repo, workflow_type, container_type, - the_containers, params, environment, outputs, + cached_workflow, + the_containers, + cached_inputs, + cached_environment, ) = wfexs.rocrate_toolbox.generateWorkflowMetaFromJSONLD( - jsonld_obj, public_name + jsonld_obj, + public_name, + reproducibility_level=reproducibility_level, ) workflow_pid = wfexs.gen_workflow_pid(repo) @@ -1525,9 +1524,9 @@ def FromPreviousROCrate( private_key_filename=private_key_filename, private_key_passphrase=private_key_passphrase, paranoidMode=paranoidMode, - # cached_workflow= , - # cached_inputs= , - # cached_environment= , + cached_workflow=cached_workflow, + cached_inputs=cached_inputs, + cached_environment=cached_environment, preferred_containers=the_containers, reproducibility_level=reproducibility_level, strict_reproducibility_level=strict_reproducibility_level, From 3be57ef241ee4cef799e325a5b7be2c4c3044934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 7 Jun 2024 14:56:11 +0200 Subject: [PATCH 21/62] Added backport of zipfile.Path for Python 3.7. This is needed to properly use zipfile.Path in several classes. 
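An illustrative usage sketch of the backported class (not part of the patch; the
archive name and its contents are placeholders), assuming the module is importable
as wfexs_backend.utils.zipfile_path:

    import zipfile
    from wfexs_backend.utils.zipfile_path import Path as ZipfilePath

    # Build a tiny throwaway archive to navigate
    with zipfile.ZipFile("crate.zip", mode="w") as zf:
        zf.writestr("ro-crate-metadata.json", "{}")

    # The backported Path offers pathlib-like navigation over the archive
    root = ZipfilePath("crate.zip")
    meta = root / "ro-crate-metadata.json"
    assert meta.is_file()
    raw = meta.read_bytes()  # b'{}'

The same navigation is available on Python 3.8+ through the standard
zipfile.Path, which is why only older interpreters need the backport.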
--- wfexs_backend/utils/zipfile_path.py | 386 ++++++++++++++++++++++++++++ 1 file changed, 386 insertions(+) create mode 100644 wfexs_backend/utils/zipfile_path.py diff --git a/wfexs_backend/utils/zipfile_path.py b/wfexs_backend/utils/zipfile_path.py new file mode 100644 index 00000000..b775b787 --- /dev/null +++ b/wfexs_backend/utils/zipfile_path.py @@ -0,0 +1,386 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# These fragments of code are borrowed from Python 3.10.14 distribution, +# from Lib/zipfile.py , in order to backport zipfile.Path to Python 3.7 +# Method body from path_relative_to is borrowed from Python 3.12 +# zipfile.Path.relative_to method. + +# SPDX-License-Identifier: PSF-2.0 +# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +# 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, +# 2023 Python Software Foundation; All Rights Reserved + +import contextlib +import functools +import io +import itertools +import pathlib +import posixpath +from typing import ( + cast, + TYPE_CHECKING, +) +from zipfile import ( + ZipFile, + ZipInfo, +) + + +if TYPE_CHECKING: + from os import ( + PathLike, + ) + + from typing import ( + Any, + Dict, + IO, + Iterable, + Iterator, + List, + Mapping, + Optional, + Sequence, + Set, + Tuple, + Union, + ) + + +def _parents(path: "str") -> "Iterator[str]": + """ + Given a path with elements separated by + posixpath.sep, generate all parents of that path. + + >>> list(_parents('b/d')) + ['b'] + >>> list(_parents('/b/d/')) + ['/b'] + >>> list(_parents('b/d/f/')) + ['b/d', 'b'] + >>> list(_parents('b')) + [] + >>> list(_parents('')) + [] + """ + return itertools.islice(_ancestry(path), 1, None) + + +def _ancestry(path: "str") -> "Iterator[str]": + """ + Given a path with elements separated by + posixpath.sep, generate all elements of that path + + >>> list(_ancestry('b/d')) + ['b/d', 'b'] + >>> list(_ancestry('/b/d/')) + ['/b/d', '/b'] + >>> list(_ancestry('b/d/f/')) + ['b/d/f', 'b/d', 'b'] + >>> list(_ancestry('b')) + ['b'] + >>> list(_ancestry('')) + [] + """ + path = path.rstrip(posixpath.sep) + while path and path != posixpath.sep: + yield path + path, tail = posixpath.split(path) + + +_dedupe = dict.fromkeys +"""Deduplicate an iterable in original order""" + + +def _difference( + minuend: "Iterable[str]", subtrahend: "Iterable[str]" +) -> "Iterator[str]": + """ + Return items in minuend not in subtrahend, retaining order + with O(1) lookup. + """ + return itertools.filterfalse(set(subtrahend).__contains__, minuend) + + +class CompleteDirs(ZipFile): + """ + A ZipFile subclass that ensures that implied directories + are always included in the namelist. + """ + + @staticmethod + def _implied_dirs(names: "Sequence[str]") -> "Mapping[str, Optional[str]]": + parents = itertools.chain.from_iterable(map(_parents, names)) + as_dirs = (p + posixpath.sep for p in parents) + return _dedupe(_difference(as_dirs, names)) + + def namelist(self) -> "List[str]": + names = super(CompleteDirs, self).namelist() + return names + list(self._implied_dirs(names)) + + def _name_set(self) -> "Set[str]": + return set(self.namelist()) + + def resolve_dir(self, name: "str") -> "str": + """ + If the name represents a directory, return that name + as a directory (with the trailing slash). + """ + names = self._name_set() + dirname = name + "/" + dir_match = name not in names and dirname in names + return dirname if dir_match else name + + def getinfo(self, name: "str") -> "ZipInfo": + """ + Supplement getinfo for implied dirs. 
+ """ + try: + return super().getinfo(name) + except KeyError: + if not name.endswith("/") or name not in self._name_set(): + raise + return ZipInfo(filename=name) + + @classmethod + def make( + cls, source: "Union[CompleteDirs, ZipFile, str, PathLike[str]]" + ) -> "CompleteDirs": + """ + Given a source (filename or zipfile), return an + appropriate CompleteDirs subclass. + """ + if isinstance(source, CompleteDirs): + return source + + if not isinstance(source, ZipFile): + return cls(source) + + # Only allow for FastPath when supplied zipfile is read-only + if "r" not in source.mode: + cls = CompleteDirs + + res = cls.__new__(cls) + return res + + +class FastLookup(CompleteDirs): + """ + ZipFile subclass to ensure implicit + dirs exist and are resolved rapidly. + """ + + def namelist(self) -> "List[str]": + self.__names: "List[str]" + with contextlib.suppress(AttributeError): + return self.__names # pylint: disable=access-member-before-definition + self.__names = super(FastLookup, self).namelist() + return self.__names + + def _name_set(self) -> "Set[str]": + self.__lookup: "Set[str]" + with contextlib.suppress(AttributeError): + return self.__lookup # pylint: disable=access-member-before-definition + self.__lookup = super(FastLookup, self)._name_set() + return self.__lookup + + +def _extract_text_encoding( + encoding: "Optional[str]" = None, *args: "Any", **kwargs: "Any" +) -> "Tuple[str, Tuple[Any], Dict[str, Any]]": + # stacklevel=3 so that the caller of the caller see any warning. + return io.text_encoding(encoding, 3), args, kwargs + + +def path_relative_to( + path: "Union[Path, pathlib.Path]", + other: "Union[Path, pathlib.Path]", + *extra: "Union[str, PathLike[str]]" +) -> "str": + # Method body is borrowed from Python 3.12 + # zipfile.Path.relative_to method. + return posixpath.relpath(str(path), str(other.joinpath(*extra))) + + +class Path: + """ + A pathlib-compatible interface for zip files. + + Consider a zip file with this structure:: + + . + ├── a.txt + └── b + ├── c.txt + └── d + └── e.txt + + >>> data = io.BytesIO() + >>> zf = ZipFile(data, 'w') + >>> zf.writestr('a.txt', 'content of a') + >>> zf.writestr('b/c.txt', 'content of c') + >>> zf.writestr('b/d/e.txt', 'content of e') + >>> zf.filename = 'mem/abcde.zip' + + Path accepts the zipfile object itself or a filename + + >>> root = Path(zf) + + From there, several path operations are available. + + Directory iteration (including the zip file itself): + + >>> a, b = root.iterdir() + >>> a + Path('mem/abcde.zip', 'a.txt') + >>> b + Path('mem/abcde.zip', 'b/') + + name property: + + >>> b.name + 'b' + + join with divide operator: + + >>> c = b / 'c.txt' + >>> c + Path('mem/abcde.zip', 'b/c.txt') + >>> c.name + 'c.txt' + + Read text: + + >>> c.read_text() + 'content of c' + + existence: + + >>> c.exists() + True + >>> (b / 'missing.txt').exists() + False + + Coercion to string: + + >>> import os + >>> str(c).replace(os.sep, posixpath.sep) + 'mem/abcde.zip/b/c.txt' + + At the root, ``name``, ``filename``, and ``parent`` + resolve to the zipfile. Note these attributes are not + valid and will raise a ``ValueError`` if the zipfile + has no filename. + + >>> root.name + 'abcde.zip' + >>> str(root.filename).replace(os.sep, posixpath.sep) + 'mem/abcde.zip' + >>> str(root.parent) + 'mem' + """ + + __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})" + + def __init__( + self, root: "Union[str, CompleteDirs, PathLike[str], ZipFile]", at: "str" = "" + ): + """ + Construct a Path from a ZipFile or filename. 
+ + Note: When the source is an existing ZipFile object, + its type (__class__) will be mutated to a + specialized type. If the caller wishes to retain the + original type, the caller should either create a + separate ZipFile object or pass a filename. + """ + self.root = FastLookup.make(root) + self.at = at + + def open( + self, + mode: "str" = "r", + *args: "Any", + pwd: "Optional[bytes]" = None, + **kwargs: "Any" + ) -> "Union[IO[str], IO[bytes]]": + """ + Open this entry as text or binary following the semantics + of ``pathlib.Path.open()`` by passing arguments through + to io.TextIOWrapper(). + """ + if self.is_dir(): + raise IsADirectoryError(self) + zip_mode = mode[0] + if not self.exists() and zip_mode == "r": + raise FileNotFoundError(self) + stream = self.root.open(self.at, zip_mode, pwd=pwd) + if "b" in mode: + if args or kwargs: + raise ValueError("encoding args invalid for binary operation") + return stream + # Text mode: + encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) + return io.TextIOWrapper(stream, encoding, *args, **kwargs) + + @property + def name(self) -> "str": + return pathlib.Path(self.at).name or self.filename.name + + @property + def filename(self) -> "pathlib.Path": + assert self.root.filename is not None + return pathlib.Path(self.root.filename).joinpath(self.at) + + def read_text(self, *args: "Any", **kwargs: "Any") -> "str": + encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) + with self.open("r", encoding, *args, **kwargs) as strm: + return cast("str", strm.read()) + + def read_bytes(self) -> "bytes": + with self.open("rb") as strm: + return cast("bytes", strm.read()) + + def _is_child(self, path: "Path") -> "bool": + return posixpath.dirname(path.at.rstrip("/")) == self.at.rstrip("/") + + def _next(self, at: "str") -> "Path": + return self.__class__(self.root, at) + + def is_dir(self) -> "bool": + return not self.at or self.at.endswith("/") + + def is_file(self) -> "bool": + return self.exists() and not self.is_dir() + + def exists(self) -> "bool": + return self.at in self.root._name_set() + + def iterdir(self) -> "Iterator[Path]": + if not self.is_dir(): + raise ValueError("Can't listdir a file") + subs = map(self._next, self.root.namelist()) + return filter(self._is_child, subs) + + def __str__(self) -> "str": + assert self.root.filename is not None + return posixpath.join(self.root.filename, self.at) + + def __repr__(self) -> "str": + return self.__repr.format(self=self) + + def joinpath(self, *other: "Union[str, PathLike[str]]") -> "Path": + next = posixpath.join(self.at, *other) + return self._next(self.root.resolve_dir(next)) + + __truediv__ = joinpath + + @property + def parent(self) -> "Union[Path, pathlib.Path]": + if not self.at: + return self.filename.parent + parent_at = posixpath.dirname(self.at.rstrip("/")) + if parent_at: + parent_at += "/" + return self._next(parent_at) From 084b647b86f5c3710b435d9bab3c426260477e00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 7 Jun 2024 21:07:18 +0200 Subject: [PATCH 22/62] Cosmetic fixes scheduling shutil.rmtree at exit --- wfexs_backend/__main__.py | 2 +- wfexs_backend/container_factories/__init__.py | 6 +++--- wfexs_backend/fetchers/git.py | 2 +- wfexs_backend/fetchers/swh.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py index 4a5b8b1c..7f9b622e 100644 --- a/wfexs_backend/__main__.py +++ b/wfexs_backend/__main__.py @@ -1292,7 +1292,7 @@ def main() -> 
None: cacheDir = tempfile.mkdtemp(prefix="wfexs", suffix="tmpcache") local_config["cacheDir"] = cacheDir # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, cacheDir) + atexit.register(shutil.rmtree, cacheDir, True) print( f"[WARNING] Cache directory not defined. Created a temporary one at {cacheDir}", file=sys.stderr, diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index 86771520..6f3baad9 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -273,7 +273,7 @@ def __init__( "AbsPath", tempfile.mkdtemp(prefix="wfexs", suffix="backend") ) # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, containers_cache_dir) + atexit.register(shutil.rmtree, containers_cache_dir, True) else: os.makedirs(containers_cache_dir, exist_ok=True) @@ -543,7 +543,7 @@ def __init__( "AbsPath", tempfile.mkdtemp(prefix="wfexs", suffix="backend") ) # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, self.containersCacheDir) + atexit.register(shutil.rmtree, self.containersCacheDir, True) else: self.containersCacheDir = cast( "AbsPath", os.path.abspath(containersCacheDir) @@ -554,7 +554,7 @@ def __init__( "AbsPath", tempfile.mkdtemp(prefix="WfExS-container", suffix="tempdir") ) # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, tempDir) + atexit.register(shutil.rmtree, tempDir, True) # This directory might be needed by temporary processes, like # image materialization in singularity or podman diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index 061a36ce..43985cc7 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -672,7 +672,7 @@ def materialize_repo( repo_tag_destdir = cast( "AbsPath", tempfile.mkdtemp(prefix="wfexs", suffix=".git") ) - atexit.register(shutil.rmtree, repo_tag_destdir) + atexit.register(shutil.rmtree, repo_tag_destdir, True) else: repo_hashed_id = hashlib.sha1(repoURL.encode("utf-8")).hexdigest() repo_destdir = os.path.join(base_repo_destdir, repo_hashed_id) diff --git a/wfexs_backend/fetchers/swh.py b/wfexs_backend/fetchers/swh.py index 647cdf66..bf295dbd 100644 --- a/wfexs_backend/fetchers/swh.py +++ b/wfexs_backend/fetchers/swh.py @@ -449,7 +449,7 @@ def materialize_repo( "AbsPath", tempfile.mkdtemp(prefix="wfexs", suffix=".swh"), ) - atexit.register(shutil.rmtree, repo_tag_destdir) + atexit.register(shutil.rmtree, repo_tag_destdir, True) else: repo_hashed_id = hashlib.sha1( repoURL.encode("utf-8") @@ -478,7 +478,7 @@ def materialize_repo( # These steps are needed because the bundle has its contents in the parent extract_dir = tempfile.mkdtemp(prefix="wfexs", suffix=".swh") - atexit.register(shutil.rmtree, extract_dir) + atexit.register(shutil.rmtree, extract_dir, True) with tarfile.open( tmp_targz_filename.name, mode="r|*", bufsize=10 * 1024 * 1024 ) as tF: From cac198dc39fffd1c607c0a54778c7740d3ba45c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 7 Jun 2024 21:07:54 +0200 Subject: [PATCH 23/62] Cosmetic fixes scheduling shutil.rmtree at exit --- wfexs_backend/workflow_engines/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wfexs_backend/workflow_engines/__init__.py b/wfexs_backend/workflow_engines/__init__.py index 5c80e363..d0b1c0e0 100644 --- a/wfexs_backend/workflow_engines/__init__.py +++ 
b/wfexs_backend/workflow_engines/__init__.py @@ -401,7 +401,7 @@ def __init__( "AbsPath", tempfile.mkdtemp(prefix="WfExS", suffix="backend") ) # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, cacheDir) + atexit.register(shutil.rmtree, cacheDir, True) else: if not os.path.isabs(cacheDir): cacheDir = cast( @@ -436,7 +436,7 @@ def __init__( "AbsPath", tempfile.mkdtemp(prefix="WfExS-exec", suffix="workdir") ) # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, workDir) + atexit.register(shutil.rmtree, workDir, True) self.workDir = workDir # This directory should hold intermediate workflow steps results @@ -492,7 +492,7 @@ def __init__( "AbsPath", tempfile.mkdtemp(prefix="WfExS-exec", suffix="tempdir") ) # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, tempDir) + atexit.register(shutil.rmtree, tempDir, True) self.tempDir = tempDir # This directory will hold the staged containers to be used From 051bcc7185fe2213e798f8b5d4055712387e92f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 7 Jun 2024 21:08:29 +0200 Subject: [PATCH 24/62] Fixes on type annotation of `path_relative_to` --- wfexs_backend/utils/zipfile_path.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/wfexs_backend/utils/zipfile_path.py b/wfexs_backend/utils/zipfile_path.py index b775b787..88200be6 100644 --- a/wfexs_backend/utils/zipfile_path.py +++ b/wfexs_backend/utils/zipfile_path.py @@ -21,6 +21,7 @@ cast, TYPE_CHECKING, ) +import sys from zipfile import ( ZipFile, ZipInfo, @@ -47,6 +48,8 @@ Union, ) + import zipfile + def _parents(path: "str") -> "Iterator[str]": """ @@ -194,8 +197,8 @@ def _extract_text_encoding( def path_relative_to( - path: "Union[Path, pathlib.Path]", - other: "Union[Path, pathlib.Path]", + path: "Union[Path, pathlib.Path, zipfile.Path]", + other: "Union[Path, pathlib.Path, zipfile.Path]", *extra: "Union[str, PathLike[str]]" ) -> "str": # Method body is borrowed from Python 3.12 @@ -384,3 +387,10 @@ def parent(self) -> "Union[Path, pathlib.Path]": if parent_at: parent_at += "/" return self._next(parent_at) + + +# Older versions of Python do not have zipfile.Path +if sys.version_info[:2] < (3, 8): + import zipfile + + zipfile.Path = Path From 09486600af212e2851860039fc827db3c04fea5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 7 Jun 2024 21:48:49 +0200 Subject: [PATCH 25/62] Wired up RO-Crate payload contents postprocessing, needed when the payload is in a zip archive --- wfexs_backend/utils/rocrate.py | 20 +++- wfexs_backend/wfexs_backend.py | 10 +- wfexs_backend/workflow.py | 164 ++++++++++++++++++++++++++++++++- 3 files changed, 186 insertions(+), 8 deletions(-) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 40f9b8c8..d69325d1 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -24,10 +24,15 @@ import json import logging import os.path +import pathlib import sys import urllib.parse import zipfile +# Older versions of Python do not have zipfile.Path +if sys.version_info[:2] < (3, 8): + from .zipfile_path import Path as ZipfilePath + from typing import ( cast, NamedTuple, @@ -82,6 +87,8 @@ WorkflowType, ) + from .zipfile_path import Path as ZipfilePath + # Needed by pyld to detect it import aiohttp import pyld # type: ignore[import, import-untyped] @@ -123,7 +130,7 @@ class 
ReproducibilityLevel(enum.IntEnum): Minimal = enum.auto() # Minimal / no reproducibility is requested Metadata = enum.auto() # Metadata reproducibility is requested - Strict = enum.auto() # Strict reproducibility (metadata + payload) is required") + Full = enum.auto() # Full reproducibility (metadata + payload) is required") class ContainerTypeMetadata(NamedTuple): @@ -206,9 +213,12 @@ class ROCrateToolboxException(Exception): LEGACY_ROCRATE_JSONLD_FILENAME: "Final[str]" = "ro-crate-metadata.jsonld" -def ReadROCrateMetadata(workflowROCrateFilename: "str", public_name: "str") -> "Any": +def ReadROCrateMetadata( + workflowROCrateFilename: "str", public_name: "str" +) -> "Tuple[Any, Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]]": # Is it a bare file or an archive? jsonld_filename: "Optional[str]" = None + payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None if os.path.isdir(workflowROCrateFilename): possible_jsonld_filename = os.path.join( workflowROCrateFilename, ROCRATE_JSONLD_FILENAME @@ -224,6 +234,7 @@ def ReadROCrateMetadata(workflowROCrateFilename: "str", public_name: "str") -> " raise ROCrateToolboxException( f"{public_name} does not contain a member {ROCRATE_JSONLD_FILENAME} or {LEGACY_ROCRATE_JSONLD_FILENAME}" ) + payload_dir = pathlib.Path(workflowROCrateFilename) elif os.path.isfile(workflowROCrateFilename): jsonld_filename = workflowROCrateFilename else: @@ -259,6 +270,7 @@ def ReadROCrateMetadata(workflowROCrateFilename: "str", public_name: "str") -> " raise ROCrateToolboxException( f"{ROCRATE_JSONLD_FILENAME} from within {public_name} has unmanagable MIME {putative_mime_ld}" ) + payload_dir = zipfile.Path(workflowROCrateFilename) else: raise ROCrateToolboxException( f"The RO-Crate parsing code does not know how to parse {public_name} with MIME {putative_mime}" @@ -272,7 +284,7 @@ def ReadROCrateMetadata(workflowROCrateFilename: "str", public_name: "str") -> " f"Content from {public_name} is not a valid JSON" ) from jde - return jsonld_obj + return jsonld_obj, payload_dir class ROCrateToolbox(abc.ABC): @@ -1908,6 +1920,8 @@ def generateWorkflowMetaFromJSONLD( public_name: "str", retrospective_first: "bool" = True, reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, + strict_reproducibility_level: "bool" = False, + payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, ) -> "Tuple[RemoteRepo, WorkflowType, ContainerType, ParamsBlock, EnvironmentBlock, OutputsBlock, Optional[LocalWorkflow], Sequence[Container], Optional[Sequence[MaterializedInput]], Optional[Sequence[MaterializedInput]]]": matched_crate, g = self.identifyROCrate(jsonld_obj, public_name) # Is it an RO-Crate? 
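For orientation, a hedged sketch (not part of the patch) of how the two values now
returned by ReadROCrateMetadata are meant to be consumed; the crate filename is a
placeholder and error handling is omitted:

    from wfexs_backend.utils.rocrate import ReadROCrateMetadata

    jsonld_obj, payload_dir = ReadROCrateMetadata(
        "run.crate.zip", public_name="run.crate.zip"
    )
    # payload_dir is a pathlib.Path when the crate is an unpacked directory,
    # a (possibly backported) zipfile.Path when it is a zip archive, and
    # None when only the bare JSON-LD file was supplied.
    if payload_dir is not None:
        for entry in payload_dir.iterdir():
            print(entry.name)

Keeping the payload handle optional lets the callers changed below fall back to
metadata-only behaviour when no payload directory is available.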
diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 3a1d74f9..b0168b1e 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -576,7 +576,7 @@ def __init__( else: cacheDir = tempfile.mkdtemp(prefix="WfExS", suffix="backend") # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, cacheDir) + atexit.register(shutil.rmtree, cacheDir, True) # Setting up caching directories self.cacheDir = cacheDir @@ -609,7 +609,7 @@ def __init__( else: baseWorkDir = tempfile.mkdtemp(prefix="WfExS-workdir", suffix="backend") # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, baseWorkDir) + atexit.register(shutil.rmtree, baseWorkDir, True) self.baseWorkDir = baseWorkDir self.defaultParanoidMode = False @@ -2136,7 +2136,7 @@ def getWorkflowRepoFromTRS( "AbsPath", tempfile.mkdtemp(prefix="WfExS", suffix="TRSFetched") ) # Assuring this temporal directory is removed at the end - atexit.register(shutil.rmtree, meta_dir) + atexit.register(shutil.rmtree, meta_dir, True) else: # Assuring the destination directory does exist os.makedirs(meta_dir, exist_ok=True) @@ -2594,7 +2594,9 @@ def getWorkflowRepoFromROCrateFile( """ public_name = roCrateFile - jsonld_obj = ReadROCrateMetadata(roCrateFile, public_name=public_name) + jsonld_obj, payload_dir = ReadROCrateMetadata( + roCrateFile, public_name=public_name + ) matched_crate, g = self.rocrate_toolbox.identifyROCrate( jsonld_obj, public_name=public_name ) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index b45f5d9d..b31de735 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -19,6 +19,7 @@ import atexit import copy +import dataclasses import datetime import inspect import json @@ -69,6 +70,10 @@ ) if TYPE_CHECKING: + from os import ( + PathLike, + ) + from typing import ( Any, ClassVar, @@ -220,6 +225,9 @@ WorkflowMetaConfigBlock: TypeAlias = Mapping[str, Any] WritableWorkflowMetaConfigBlock: TypeAlias = MutableMapping[str, Any] + from .utils.zipfile_path import Path as ZipfilePath + + import urllib.parse # This is needed to assure yaml.safe_load unmarshalls gives no error @@ -326,6 +334,7 @@ ) from .utils.marshalling_handling import marshall_namedtuple, unmarshall_namedtuple from .utils.misc import config_validate +from .utils.zipfile_path import path_relative_to from .fetchers.trs_files import ( TRS_SCHEME_PREFIX, @@ -1441,6 +1450,53 @@ def FromPreviousInstanceDeclaration( strict_reproducibility_level=strict_reproducibility_level, ) + @staticmethod + def _transferInputs( + payload_dir: "Union[pathlib.Path, ZipfilePath, zipfile.Path]", + inputs_dir: "pathlib.Path", + cached_inputs: "Sequence[MaterializedInput]", + ) -> "Sequence[MaterializedInput]": + new_cached_inputs = [] + for cached_input in cached_inputs: + new_cached_input = cached_input + if len(new_cached_input.values) > 0 and isinstance( + new_cached_input.values[0], MaterializedContent + ): + new_values: "MutableSequence[MaterializedContent]" = [] + for value in cast( + "Sequence[MaterializedContent]", new_cached_input.values + ): + source_file = payload_dir / value.local + dest_file = inputs_dir / path_relative_to(source_file, payload_dir) + new_value = value._replace( + local=cast("AbsPath", dest_file.as_posix()) + ) + new_values.append(new_value) + + new_cached_input = new_cached_input._replace(values=new_values) + + if ( + new_cached_input.secondaryInputs is not None + and len(new_cached_input.secondaryInputs) > 0 + and 
isinstance(new_cached_input.secondaryInputs[0], MaterializedContent) + ): + new_secondaryInputs: "MutableSequence[MaterializedContent]" = [] + for secondaryInput in new_cached_input.secondaryInputs: + source_file = payload_dir / secondaryInput.local + dest_file = inputs_dir / path_relative_to(source_file, payload_dir) + new_secondaryInput = secondaryInput._replace( + local=cast("AbsPath", dest_file.as_posix()) + ) + new_secondaryInputs.append(new_secondaryInput) + + new_cached_input = new_cached_input._replace( + secondaryInputs=new_secondaryInputs + ) + + new_cached_inputs.append(new_cached_input) + + return new_cached_inputs + @classmethod def FromPreviousROCrate( cls, @@ -1464,7 +1520,9 @@ def FromPreviousROCrate( based on the declaration of an existing one """ - jsonld_obj = ReadROCrateMetadata(workflowROCrateFilename, public_name) + jsonld_obj, payload_dir = ReadROCrateMetadata( + workflowROCrateFilename, public_name + ) ( repo, @@ -1481,8 +1539,112 @@ def FromPreviousROCrate( jsonld_obj, public_name, reproducibility_level=reproducibility_level, + strict_reproducibility_level=strict_reproducibility_level, + payload_dir=payload_dir, ) + # Now, some postprocessing... + if ( + reproducibility_level >= ReproducibilityLevel.Full + and payload_dir is not None + and not isinstance(payload_dir, pathlib.Path) + ): + # This one is needed when the payload_dir is defined and not a + # local path, like within a zip archive. + materialized_payload_dir = pathlib.Path( + tempfile.mkdtemp(prefix="wfexs", suffix="import") + ) + atexit.register(shutil.rmtree, materialized_payload_dir, True) + + # Fix cached workflow + if cached_workflow is not None: + workflow_dir = materialized_payload_dir / WORKDIR_WORKFLOW_RELDIR + workflow_dir.mkdir(parents=True, exist_ok=True) + + # Transfer entrypoint + if cached_workflow.relPath is not None: + dest_entrypoint = workflow_dir / cached_workflow.relPath + dest_entrypoint.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2( + cast("PathLike[str]", payload_dir / cached_workflow.relPath), + dest_entrypoint, + ) + if ( + cached_workflow.relPathFiles is not None + and len(cached_workflow.relPathFiles) > 0 + ): + # And all the elements + for rel_file in cached_workflow.relPathFiles: + if rel_file == cached_workflow.relPath: + continue + p_rel_file = urllib.parse.urlparse(rel_file) + if p_rel_file.scheme != "": + continue + + dest_file = workflow_dir / rel_file + dest_file.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2( + cast("PathLike[str]", payload_dir / rel_file), dest_file + ) + + # Last, the reference + cached_workflow = cached_workflow._replace( + dir=cast("AbsPath", workflow_dir.as_posix()) + ) + + # Fix containers + if len(the_containers) > 0: + containers_dir = materialized_payload_dir / WORKDIR_CONTAINERS_RELDIR + containers_dir.mkdir(parents=True, exist_ok=True) + new_containers = [] + for the_container in the_containers: + new_container = the_container + + if new_container.localPath is not None: + source_image = payload_dir / new_container.localPath + dest_image = containers_dir / path_relative_to( + source_image, payload_dir + ) + dest_image.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(cast("PathLike[str]", source_image), dest_image) + + new_container = dataclasses.replace( + new_container, + localPath=cast("AbsPath", dest_image.as_posix()), + ) + + if new_container.metadataLocalPath is not None: + source_meta = payload_dir / new_container.metadataLocalPath + dest_meta = containers_dir / path_relative_to( + source_meta, payload_dir + 
) + dest_meta.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(cast("PathLike[str]", source_meta), dest_meta) + + new_container = dataclasses.replace( + new_container, + metadataLocalPath=cast("AbsPath", dest_meta.as_posix()), + ) + + new_containers.append(new_container) + + the_containers = new_containers + + # Fix inputs + inputs_dir = materialized_payload_dir / WORKDIR_INPUTS_RELDIR + if cached_inputs is not None and len(cached_inputs) > 0: + inputs_dir.mkdir(parents=True, exist_ok=True) + cached_inputs = cls._transferInputs( + payload_dir, inputs_dir, cached_inputs + ) + + # Fix environment + if cached_environment is not None and len(cached_environment) > 0: + inputs_dir.mkdir(parents=True, exist_ok=True) + cached_environment = cls._transferInputs( + payload_dir, inputs_dir, cached_environment + ) + workflow_pid = wfexs.gen_workflow_pid(repo) logging.debug( f"Repo {repo} workflow type {workflow_type} container factory {container_type}" From 94f206d7c6d0b9802bd9da8ca897169f8b8ffa5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 8 Jun 2024 05:36:47 +0200 Subject: [PATCH 26/62] Container payloads are properly located and validated within the RO-Crate --- wfexs_backend/__main__.py | 9 +- wfexs_backend/utils/rocrate.py | 348 ++++++++++++++++++++++++++------- wfexs_backend/workflow.py | 1 - 3 files changed, 287 insertions(+), 71 deletions(-) diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py index 7f9b622e..d2ae4492 100644 --- a/wfexs_backend/__main__.py +++ b/wfexs_backend/__main__.py @@ -297,7 +297,10 @@ def genParserSub( "--reproducibility-level", dest="reproducibility_level", default=ReproducibilityLevel.Metadata, - choices=ReproducibilityLevel, + choices=range( + min(ReproducibilityLevel).value, max(ReproducibilityLevel).value + 1 + ), + type=int, help="Max reproducibility level to be tried", ) @@ -1460,7 +1463,7 @@ def main() -> None: private_key_passphrase=private_key_passphrase, orcids=op_orcids, secure=args.secure, - reproducibility_level=args.reproducibility_level, + reproducibility_level=ReproducibilityLevel(args.reproducibility_level), strict_reproducibility_level=args.strict_reproducibility_level, ) else: @@ -1498,7 +1501,7 @@ def main() -> None: private_key_passphrase=private_key_passphrase, orcids=op_orcids, secure=args.secure, - reproducibility_level=args.reproducibility_level, + reproducibility_level=ReproducibilityLevel(args.reproducibility_level), strict_reproducibility_level=args.strict_reproducibility_level, ) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index d69325d1..f671a476 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -44,6 +44,7 @@ if TYPE_CHECKING: from typing import ( Any, + IO, Mapping, MutableMapping, MutableSequence, @@ -58,6 +59,7 @@ ) from ..common import ( + AbsPath, Fingerprint, RelPath, RepoURL, @@ -112,6 +114,7 @@ ) from .digests import ( + ComputeDigestFromFileLike, stringifyDigest, ) @@ -570,21 +573,21 @@ def identifyROCrate( """ OBTAIN_RUN_CONTAINERS_SPARQL: "Final[str]" = """\ -SELECT ?container ?container_additional_type ?type_of_container ?type_of_container_type ?container_registry ?container_name ?container_tag ?container_sha256 ?container_platform ?container_arch +SELECT DISTINCT ?container ?container_snapshot_size ?container_snapshot_sha256 ?container_additional_type ?type_of_container ?type_of_container_type ?source_container ?source_container_additional_type ?source_container_registry ?source_container_name 
?source_container_tag ?source_container_sha256 ?source_container_platform ?source_container_arch ?source_container_metadata ?source_container_metadata_size ?source_container_metadata_sha256 WHERE { { - ?execution wrterm:containerImage ?container . + ?execution wrterm:containerImage ?source_container . } UNION { - ?entity s:softwareAddOn ?container. + ?entity s:softwareAddOn ?source_container. } - ?container + ?source_container a wrterm:ContainerImage ; - s:additionalType ?container_additional_type . + s:additionalType ?source_container_additional_type . OPTIONAL { - ?container - s:softwareRequirements ?container_type ; + ?source_container + s:softwareRequirements ?source_container_type ; s:applicationCategory ?type_of_container . - ?container_type + ?source_container_type a s:SoftwareApplication ; s:applicationCategory ?type_of_container_type . FILTER( @@ -592,27 +595,58 @@ def identifyROCrate( STRSTARTS(str(?type_of_container_type), str(wikidata:)) ) . } + OPTIONAL { - ?container wrterm:registry ?container_registry . + ?create_snapshot_container + a s:CreateAction ; + s:object ?source_container ; + s:result ?container . + ?container + a s:MediaObject ; + a wrterm:ContainerImage ; + s:additionalType ?container_additional_type . + OPTIONAL { + ?container wrterm:sha256 ?container_snapshot_sha256 . + } + OPTIONAL { + ?container + s:contentSize ?container_snapshot_size . + } } OPTIONAL { - ?container s:name ?container_name . + ?source_container wrterm:registry ?source_container_registry . } OPTIONAL { - ?container wrterm:tag ?container_tag . + ?source_container s:name ?source_container_name . } OPTIONAL { - ?container wrterm:sha256 ?container_sha256 . + ?source_container wrterm:tag ?source_container_tag . } OPTIONAL { - ?container + ?source_container wrterm:sha256 ?source_container_sha256 . + } + OPTIONAL { + ?source_container a s:SoftwareApplication ; - s:operatingSystem ?container_platform . + s:operatingSystem ?source_container_platform . } OPTIONAL { - ?container + ?source_container a s:SoftwareApplication ; - s:processorRequirements ?container_arch . + s:processorRequirements ?source_container_arch . + } + OPTIONAL { + ?source_container_metadata + a s:MediaObject ; + s:about ?source_container . + OPTIONAL { + ?source_container_metadata + s:contentSize ?source_container_metadata_size . + } + OPTIONAL { + ?source_container_metadata + s:sha256 ?source_container_metadata_sha256 . 
+ } } } """ @@ -1102,6 +1136,7 @@ def _parseContainersFromWorkflow( self, g: "rdflib.graph.Graph", main_entity: "rdflib.term.Identifier", + payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, ) -> "Optional[Tuple[ContainerType, Sequence[Container]]]": # Get the list of containers qcontainers = rdflib.plugins.sparql.prepareQuery( @@ -1116,13 +1151,16 @@ def _parseContainersFromWorkflow( }, ) - return self.__parseContainersResults(qcontainersres, main_entity) + return self.__parseContainersResults( + qcontainersres, main_entity, payload_dir=payload_dir + ) def _parseContainersFromExecution( self, g: "rdflib.graph.Graph", execution: "rdflib.term.Identifier", main_entity: "rdflib.term.Identifier", + payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, ) -> "Optional[Tuple[ContainerType, Sequence[Container]]]": # Get the list of containers qcontainers = rdflib.plugins.sparql.prepareQuery( @@ -1137,12 +1175,15 @@ def _parseContainersFromExecution( }, ) - return self.__parseContainersResults(qcontainersres, main_entity) + return self.__parseContainersResults( + qcontainersres, main_entity, payload_dir=payload_dir + ) def __parseContainersResults( self, qcontainersres: "rdflib.query.Result", main_entity: "rdflib.term.Identifier", + payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, ) -> "Optional[Tuple[ContainerType, Sequence[Container]]]": container_type: "Optional[ContainerType]" = None additional_container_type: "Optional[ContainerType]" = None @@ -1216,99 +1257,264 @@ def __parseContainersResults( assert isinstance( containerrow, rdflib.query.ResultRow ), "Check the SPARQL code, as it should be a SELECT query" - self.logger.debug( - f"""\ -Container {containerrow.container} -{containerrow.container_additional_type} -{containerrow.type_of_container} -{containerrow.type_of_container_type} -{containerrow.container_registry} -{containerrow.container_name} -{containerrow.container_tag} -{containerrow.container_sha256} -{containerrow.container_platform} -{containerrow.container_arch} -""" - ) - if ( - containerrow.container_additional_type is not None - and containerrow.container_name is not None - ): + self.logger.debug("\nTuple") + for key, val in containerrow.asdict().items(): + self.logger.debug(f"{key} => {val}") + + source_container_type: "Optional[ContainerType]" = None + if containerrow.source_container_additional_type is not None: try: - putative_additional_container_image_additional_type = ( + putative_additional_source_container_image_additional_type = ( StrContainerAdditionalType2ContainerImageAdditionalType.get( - str(containerrow.container_additional_type) + str(containerrow.source_container_additional_type) ) ) - putative_additional_container_type = ( + source_container_type = ( None - if putative_additional_container_image_additional_type is None + if putative_additional_source_container_image_additional_type + is None else ( AdditionalType2ContainerType.get( - putative_additional_container_image_additional_type + putative_additional_source_container_image_additional_type ) ) ) + except Exception as e: + self.logger.error( + f"Unable to map additional type {str(containerrow.source_container_additional_type)} for {str(containerrow.source_container)}" + ) + + if source_container_type is None: + source_container_type = container_type + + if containerrow.source_container_name is not None: + try: registries: "Optional[Mapping[ContainerType, str]]" = None fingerprint = None origTaggedName = "" taggedName = 
"" image_signature = None - if putative_additional_container_type == ContainerType.Docker: + if source_container_type == ContainerType.Docker: the_registry = ( - str(containerrow.container_registry) - if containerrow.container_registry is not None + str(containerrow.source_container_registry) + if containerrow.source_container_registry is not None else DEFAULT_DOCKER_REGISTRY ) registries = { ContainerType.Docker: the_registry, } - container_identifier = str(containerrow.container_name) - assert containerrow.container_tag is not None - if containerrow.container_sha256 is not None: - fingerprint = f"{the_registry}/{container_identifier}@sha256:{str(containerrow.container_sha256)}" + container_identifier = str(containerrow.source_container_name) + if "/" not in container_identifier: + container_identifier = "library/" + container_identifier + assert containerrow.source_container_tag is not None + if containerrow.source_container_sha256 is not None: + fingerprint = f"{the_registry}/{container_identifier}@sha256:{str(containerrow.source_container_sha256)}" else: - fingerprint = f"{the_registry}/{container_identifier}:{str(containerrow.container_tag)}" - origTaggedName = ( - f"{container_identifier}:{str(containerrow.container_tag)}" - ) - taggedName = f"docker://{the_registry}/{container_identifier}:{str(containerrow.container_tag)}" - # Disable for now - # image_signature = stringifyDigest("sha256", bytes.fromhex(str(containerrow.container_sha256))) - elif ( - putative_additional_container_type == ContainerType.Singularity - ): - origTaggedName = str(containerrow.container_name) + fingerprint = f"{the_registry}/{container_identifier}:{str(containerrow.source_container_tag)}" + origTaggedName = f"{container_identifier}:{str(containerrow.source_container_tag)}" + taggedName = f"docker://{the_registry}/{container_identifier}:{str(containerrow.source_container_tag)}" + elif source_container_type == ContainerType.Singularity: + origTaggedName = str(containerrow.source_container_name) taggedName = origTaggedName fingerprint = origTaggedName + container_image_path: "Optional[str]" = None + metadata_container_image_path: "Optional[str]" = None + if payload_dir is not None: + if containerrow.container != containerrow.source_container: + container_image_uri = str(containerrow.container) + container_image_parsed_uri = urllib.parse.urlparse( + container_image_uri + ) + if ( + container_image_parsed_uri.scheme + == self.RELATIVE_ROCRATE_SCHEME + ): + container_image_path = container_image_parsed_uri.path + if container_image_path.startswith("/"): + container_image_path = container_image_path[1:] + + located_snapshot = payload_dir / container_image_path + if located_snapshot.exists(): + if containerrow.container_snapshot_size is not None: + if hasattr(located_snapshot, "stat"): + the_size = located_snapshot.stat().st_size + else: + the_size = located_snapshot.root.getinfo( + container_image_path + ).file_size + if isinstance( + containerrow.container_snapshot_size, + rdflib.term.Literal, + ): + container_snapshot_size = int( + containerrow.container_snapshot_size.value + ) + else: + container_snapshot_size = int( + str( + containerrow.container_snapshot_size + ) + ) + if the_size == container_snapshot_size: + with located_snapshot.open(mode="rb") as lS: + computed_image_signature = ( + ComputeDigestFromFileLike( + cast("IO[bytes]", lS), + digestAlgorithm="sha256", + ) + ) + if ( + containerrow.container_snapshot_sha256 + is not None + ): + image_signature = stringifyDigest( + "sha256", + bytes.fromhex( + 
str( + containerrow.container_snapshot_sha256 + ) + ), + ) + + if ( + image_signature + != computed_image_signature + ): + self.logger.warning( + f"Discarding payload {container_image_path} for {origTaggedName} (mismatching digest)" + ) + container_image_path = None + else: + image_signature = ( + computed_image_signature + ) + else: + self.logger.warning( + f"Discarding payload {container_image_path} for {origTaggedName} (mismatching file size)" + ) + container_image_path = None + else: + self.logger.warning( + f"Discarding payload {container_image_path} (not found)" + ) + container_image_path = None + + if containerrow.source_container_metadata is not None: + container_metadata_uri = str( + containerrow.source_container_metadata + ) + container_metadata_parsed_uri = urllib.parse.urlparse( + container_metadata_uri + ) + if ( + container_metadata_parsed_uri.scheme + == self.RELATIVE_ROCRATE_SCHEME + ): + metadata_container_image_path = ( + container_metadata_parsed_uri.path + ) + if metadata_container_image_path.startswith("/"): + metadata_container_image_path = ( + metadata_container_image_path[1:] + ) + + located_metadata = ( + payload_dir / metadata_container_image_path + ) + if located_metadata.exists(): + if ( + containerrow.source_container_metadata_size + is not None + ): + if hasattr(located_metadata, "stat"): + the_size = located_metadata.stat().st_size + else: + the_size = located_metadata.root.getinfo( + metadata_container_image_path + ).file_size + if isinstance( + containerrow.source_container_metadata_size, + rdflib.term.Literal, + ): + source_container_metadata_size = int( + containerrow.source_container_metadata_size.value + ) + else: + source_container_metadata_size = int( + str( + containerrow.source_container_metadata_size + ) + ) + if the_size == source_container_metadata_size: + with located_metadata.open(mode="rb") as lM: + computed_source_container_metadata_signature = ComputeDigestFromFileLike( + cast("IO[bytes]", lM), + digestAlgorithm="sha256", + ) + if ( + containerrow.source_container_metadata_sha256 + is not None + ): + source_container_metadata_signature = stringifyDigest( + "sha256", + bytes.fromhex( + str( + containerrow.source_container_metadata_sha256 + ) + ), + ) + + if ( + source_container_metadata_signature + != computed_source_container_metadata_signature + ): + self.logger.warning( + f"Discarding payload {metadata_container_image_path} for {origTaggedName} (mismatching digest)" + ) + metadata_container_image_path = None + else: + self.logger.warning( + f"Discarding payload {metadata_container_image_path} for {origTaggedName} (mismatching file size)" + ) + metadata_container_image_path = None + else: + self.logger.warning( + f"Discarding payload {metadata_container_image_path} (not found)" + ) + metadata_container_image_path = None + the_containers.append( Container( origTaggedName=origTaggedName, type=container_type, registries=registries, taggedName=cast("URIType", taggedName), + localPath=cast("AbsPath", container_image_path), + metadataLocalPath=cast( + "AbsPath", metadata_container_image_path + ), architecture=None - if containerrow.container_arch is None + if containerrow.source_container_arch is None else cast( "ProcessorArchitecture", - str(containerrow.container_arch), + str(containerrow.source_container_arch), ), operatingSystem=None - if containerrow.container_platform is None + if containerrow.source_container_platform is None else cast( "ContainerOperatingSystem", - str(containerrow.container_platform), + 
str(containerrow.source_container_platform), ), fingerprint=cast("Fingerprint", fingerprint), - source_type=putative_additional_container_type, + source_type=source_container_type, image_signature=image_signature, ) ) except Exception as e: self.logger.exception( - f"Unable to assign from additional type {str(containerrow.container_additional_type)} for {str(containerrow.container)}" + f"Unable to assign from additional type {str(containerrow.source_container_additional_type)} for {str(containerrow.source_container)}" ) return container_type, the_containers @@ -1976,7 +2182,7 @@ def generateWorkflowMetaFromJSONLD( # also supported. # So, we are starting with the retrospective provenance # gathering the list of containers, to learn - # whi. + # which. try: qexecs = rdflib.plugins.sparql.prepareQuery( self.OBTAIN_RUNS_SPARQL, @@ -1996,7 +2202,12 @@ def generateWorkflowMetaFromJSONLD( self.logger.debug(f"\tExecution {execrow.execution}") contresult = self._parseContainersFromExecution( - g, execrow.execution, main_entity=matched_crate.mainentity + g, + execrow.execution, + main_entity=matched_crate.mainentity, + payload_dir=payload_dir + if reproducibility_level >= ReproducibilityLevel.Full + else None, ) # TODO: deal with more than one execution if contresult is None: @@ -2041,6 +2252,9 @@ def generateWorkflowMetaFromJSONLD( contresult = self._parseContainersFromWorkflow( g, main_entity=matched_crate.mainentity, + payload_dir=payload_dir + if reproducibility_level >= ReproducibilityLevel.Full + else None, ) # TODO: deal with more than one execution if contresult is not None: diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index b31de735..86337f9e 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -1649,7 +1649,6 @@ def FromPreviousROCrate( logging.debug( f"Repo {repo} workflow type {workflow_type} container factory {container_type}" ) - logging.debug(f"Containers {the_containers}") workflow_meta: "WritableWorkflowMetaConfigBlock" = { "workflow_id": workflow_pid, "workflow_type": workflow_type.shortname, From 42280d1d9b03d805b30e3af503f2a9b7b881b11f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 8 Jun 2024 21:51:46 +0200 Subject: [PATCH 27/62] Fixed case where only source containers are declared --- wfexs_backend/utils/rocrate.py | 87 +++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 6 deletions(-) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index f671a476..6e75a54c 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -573,7 +573,7 @@ def identifyROCrate( """ OBTAIN_RUN_CONTAINERS_SPARQL: "Final[str]" = """\ -SELECT DISTINCT ?container ?container_snapshot_size ?container_snapshot_sha256 ?container_additional_type ?type_of_container ?type_of_container_type ?source_container ?source_container_additional_type ?source_container_registry ?source_container_name ?source_container_tag ?source_container_sha256 ?source_container_platform ?source_container_arch ?source_container_metadata ?source_container_metadata_size ?source_container_metadata_sha256 +SELECT DISTINCT ?container ?container_snapshot_size ?container_snapshot_sha256 ?container_additional_type ?type_of_container ?type_of_container_type ?source_container ?source_container_additional_type ?source_container_registry ?source_container_name ?source_container_tag ?source_container_sha256 ?source_container_platform ?source_container_arch ?source_container_metadata 
?source_container_metadata_size ?source_container_metadata_sha256 ?type_of_source_container ?type_of_source_container_type WHERE { { ?execution wrterm:containerImage ?source_container . @@ -586,13 +586,13 @@ def identifyROCrate( OPTIONAL { ?source_container s:softwareRequirements ?source_container_type ; - s:applicationCategory ?type_of_container . + s:applicationCategory ?type_of_source_container . ?source_container_type a s:SoftwareApplication ; - s:applicationCategory ?type_of_container_type . + s:applicationCategory ?type_of_source_container_type . FILTER( - STRSTARTS(str(?type_of_container), str(wikidata:)) && - STRSTARTS(str(?type_of_container_type), str(wikidata:)) + STRSTARTS(str(?type_of_source_container), str(wikidata:)) && + STRSTARTS(str(?type_of_source_container_type), str(wikidata:)) ) . } @@ -612,6 +612,18 @@ def identifyROCrate( ?container s:contentSize ?container_snapshot_size . } + OPTIONAL { + ?container + s:softwareRequirements ?container_type ; + s:applicationCategory ?type_of_container . + ?container_type + a s:SoftwareApplication ; + s:applicationCategory ?type_of_container_type . + FILTER( + STRSTARTS(str(?type_of_container), str(wikidata:)) && + STRSTARTS(str(?type_of_container_type), str(wikidata:)) + ) . + } } OPTIONAL { ?source_container wrterm:registry ?source_container_registry . @@ -1186,7 +1198,9 @@ def __parseContainersResults( payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, ) -> "Optional[Tuple[ContainerType, Sequence[Container]]]": container_type: "Optional[ContainerType]" = None + source_container_type: "Optional[ContainerType]" = None additional_container_type: "Optional[ContainerType]" = None + additional_source_container_type: "Optional[ContainerType]" = None the_containers: "MutableSequence[Container]" = [] # This is the first pass, to learn about the kind of # container factory to use @@ -1210,6 +1224,20 @@ def __parseContainersResults( f"Not all the containers of execution {main_entity} were materialized with {container_type} factory (also found {putative_container_type})" ) + if containerrow.type_of_source_container is not None: + putative_source_container_type = ApplicationCategory2ContainerType.get( + str(containerrow.type_of_source_container) + ) + if source_container_type is None: + source_container_type = putative_source_container_type + elif ( + putative_source_container_type is not None + and putative_source_container_type != source_container_type + ): + self.logger.warning( + f"Not all the source containers of execution {main_entity} were materialized with {source_container_type} factory (also found {putative_source_container_type})" + ) + # These hints should be left by any compliant WRROC # implementation if containerrow.container_additional_type is not None: @@ -1243,11 +1271,58 @@ def __parseContainersResults( f"Unable to map additional type {str(containerrow.container_additional_type)} for {str(containerrow.container)}" ) + # These hints should be left by any compliant WRROC + # implementation + if containerrow.source_container_additional_type is not None: + try: + putative_additional_source_container_image_additional_type = ( + StrContainerAdditionalType2ContainerImageAdditionalType.get( + str(containerrow.source_container_additional_type) + ) + ) + putative_additional_source_container_type = ( + None + if putative_additional_source_container_image_additional_type + is None + else ( + AdditionalType2ContainerType.get( + putative_additional_source_container_image_additional_type + ) + ) + ) + if 
additional_source_container_type is None: + additional_source_container_type = ( + putative_additional_source_container_type + ) + elif ( + putative_additional_source_container_type is not None + and putative_additional_source_container_type + not in (source_container_type, additional_source_container_type) + ): + self.logger.warning( + f"Not all the source containers of execution {main_entity} were labelled with {additional_source_container_type} factory (also found {putative_additional_source_container_type})" + ) + except Exception as e: + self.logger.error( + f"Unable to map additional type {str(containerrow.source_container_additional_type)} for {str(containerrow.source_container)}" + ) + + # Assigning this, as it is going to be used later to + # build the list of containers + if ( + source_container_type is None + and additional_source_container_type is not None + ): + source_container_type = additional_source_container_type + # Assigning this, as it is going to be used later to # build the list of containers if container_type is None and additional_container_type is not None: container_type = additional_container_type + if container_type is None and source_container_type is not None: + container_type = source_container_type + if container_type is None: return None @@ -1262,7 +1337,7 @@ def __parseContainersResults( for key, val in containerrow.asdict().items(): self.logger.debug(f"{key} => {val}") - source_container_type: "Optional[ContainerType]" = None + source_container_type = None if containerrow.source_container_additional_type is not None: try: putative_additional_source_container_image_additional_type = ( From f1c50d680ca9aaa14c7596f04e578f122f47d6e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 8 Jun 2024 22:11:02 +0200 Subject: [PATCH 28/62] Added the scaffolding code needed to propagate the rescued cached inputs and environment --- wfexs_backend/utils/rocrate.py | 60 +++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 6e75a54c..3d05071b 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -1741,7 +1741,8 @@ def _parseInputsFromExecution( main_entity: "rdflib.term.Identifier", default_licences: "Sequence[str]", public_name: "str", - ) -> "ParamsBlock": + payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + ) -> "Tuple[ParamsBlock, Optional[Sequence[MaterializedInput]]]": # Get the list of inputs qinputs = rdflib.plugins.sparql.prepareQuery( self.OBTAIN_EXECUTION_INPUTS_SPARQL, @@ -1754,7 +1755,9 @@ def _parseInputsFromExecution( }, ) - return self.__parseInputsResults(qinputsres, g, default_licences, public_name) + return self.__parseInputsResults( + qinputsres, g, default_licences, public_name, payload_dir=payload_dir + ) def _parseInputsFromMainEntity( self, @@ -1762,7 +1765,8 @@ def _parseInputsFromMainEntity( main_entity: "rdflib.term.Identifier", default_licences: "Sequence[str]", public_name: "str", - ) -> "ParamsBlock": + payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + ) -> "Tuple[ParamsBlock, Optional[Sequence[MaterializedInput]]]": # Get the list of inputs qwinputs = rdflib.plugins.sparql.prepareQuery( self.OBTAIN_WORKFLOW_INPUTS_SPARQL, @@ -1775,7 +1779,9 @@ def _parseInputsFromMainEntity( }, ) - return self.__parseInputsResults(qwinputsres, g, default_licences, public_name) + return self.__parseInputsResults( + 
qwinputsres, g, default_licences, public_name, payload_dir=payload_dir + ) def __parseInputsResults( self, @@ -1783,9 +1789,11 @@ def __parseInputsResults( g: "rdflib.graph.Graph", default_licences: "Sequence[str]", public_name: "str", - ) -> "ParamsBlock": + payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + ) -> "Tuple[ParamsBlock, Optional[Sequence[MaterializedInput]]]": # TODO: implement this params: "MutableParamsBlock" = {} + cached_inputs: "MutableSequence[MaterializedInput]" = [] for inputrow in qinputsres: assert isinstance( inputrow, rdflib.query.ResultRow @@ -1897,7 +1905,7 @@ def __parseInputsResults( else: base[param_last] = the_value - return params + return params, cached_inputs if payload_dir is not None else None def _parseEnvFromExecution( self, @@ -1906,7 +1914,8 @@ def _parseEnvFromExecution( main_entity: "rdflib.term.Identifier", default_licences: "Sequence[str]", public_name: "str", - ) -> "EnvironmentBlock": + payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + ) -> "Tuple[EnvironmentBlock, Optional[Sequence[MaterializedInput]]]": # Get the list of inputs qenv = rdflib.plugins.sparql.prepareQuery( self.OBTAIN_EXECUTION_ENV_SPARQL, @@ -1919,7 +1928,9 @@ def _parseEnvFromExecution( }, ) - return self.__parseEnvResults(qenvres, g, default_licences, public_name) + return self.__parseEnvResults( + qenvres, g, default_licences, public_name, payload_dir=payload_dir + ) def _parseEnvFromMainEntity( self, @@ -1927,7 +1938,8 @@ def _parseEnvFromMainEntity( main_entity: "rdflib.term.Identifier", default_licences: "Sequence[str]", public_name: "str", - ) -> "EnvironmentBlock": + payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + ) -> "Tuple[EnvironmentBlock, Optional[Sequence[MaterializedInput]]]": # Get the list of inputs qwenv = rdflib.plugins.sparql.prepareQuery( self.OBTAIN_WORKFLOW_ENV_SPARQL, @@ -1940,7 +1952,9 @@ def _parseEnvFromMainEntity( }, ) - return self.__parseEnvResults(qwenvres, g, default_licences, public_name) + return self.__parseEnvResults( + qwenvres, g, default_licences, public_name, payload_dir=payload_dir + ) def __parseEnvResults( self, @@ -1948,12 +1962,14 @@ def __parseEnvResults( g: "rdflib.graph.Graph", default_licences: "Sequence[str]", public_name: "str", - ) -> "EnvironmentBlock": + payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + ) -> "Tuple[EnvironmentBlock, Optional[Sequence[MaterializedInput]]]": """ This method is (almost) identical to __parseInputsResults """ # TODO: implement this environment: "MutableMapping[str, Any]" = {} + cached_environment: "MutableSequence[MaterializedInput]" = [] for envrow in qenvres: assert isinstance( envrow, rdflib.query.ResultRow @@ -2057,7 +2073,7 @@ def __parseEnvResults( else: environment[env_name] = the_value - return environment + return environment, cached_environment if payload_dir is not None else None def _getLicences( self, @@ -2292,20 +2308,26 @@ def generateWorkflowMetaFromJSONLD( # TODO: which are the needed inputs, to be integrated # into the latter workflow_meta? 
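             # A minimal sketch of the gating idiom used in the calls below
             # (reusing only names already present in this hunk): the payload
             # directory is forwarded solely when full reproducibility was
             # requested, so cached payloads stay ignored at lower levels.
             #
             #   effective_payload_dir = (
             #       payload_dir
             #       if reproducibility_level >= ReproducibilityLevel.Full
             #       else None
             #   )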
- params = self._parseInputsFromExecution( + params, cached_inputs = self._parseInputsFromExecution( g, execrow.execution, main_entity=matched_crate.mainentity, default_licences=crate_licences, public_name=public_name, + payload_dir=payload_dir + if reproducibility_level >= ReproducibilityLevel.Full + else None, ) - environment = self._parseEnvFromExecution( + environment, cached_environment = self._parseEnvFromExecution( g, execrow.execution, main_entity=matched_crate.mainentity, default_licences=crate_licences, public_name=public_name, + payload_dir=payload_dir + if reproducibility_level >= ReproducibilityLevel.Full + else None, ) outputs = self._parseOutputsFromExecution( @@ -2335,18 +2357,24 @@ def generateWorkflowMetaFromJSONLD( if contresult is not None: container_type, the_containers = contresult - params = self._parseInputsFromMainEntity( + params, cached_inputs = self._parseInputsFromMainEntity( g, main_entity=matched_crate.mainentity, default_licences=crate_licences, public_name=public_name, + payload_dir=payload_dir + if reproducibility_level >= ReproducibilityLevel.Full + else None, ) - environment = self._parseEnvFromMainEntity( + environment, cached_environment = self._parseEnvFromMainEntity( g, main_entity=matched_crate.mainentity, default_licences=crate_licences, public_name=public_name, + payload_dir=payload_dir + if reproducibility_level >= ReproducibilityLevel.Full + else None, ) if len(outputs) == 0: From feafafcee6d259e1ad7d30cfcddd20ec15ca93c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sun, 9 Jun 2024 23:33:07 +0200 Subject: [PATCH 29/62] Added both input and environment payload processing. * First, both input and environment SPARLQ queries related to executions and staged workflows have been rebuilt, to improve them and give the row elements the very same column names. * Then, a common method which performs input file validation has been developed, in order to discard "smelly" content. * Last, the MaterializedInput instances are properly built by the common method. --- wfexs_backend/utils/rocrate.py | 364 ++++++++++++++++++++++----------- 1 file changed, 242 insertions(+), 122 deletions(-) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 3d05071b..db9d897e 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -64,6 +64,7 @@ RelPath, RepoURL, RepoTag, + SymbolicParamName, URIType, ) @@ -104,7 +105,10 @@ from ..common import ( ContainerType, ContentKind, + DefaultNoLicenceTuple, + LicensedURI, LocalWorkflow, + MaterializedContent, MaterializedInput, ) @@ -666,7 +670,7 @@ def identifyROCrate( # This compound query is much faster when each of the UNION components # is evaluated separately OBTAIN_WORKFLOW_INPUTS_SPARQL: "Final[str]" = """\ -SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type ?fileid +SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type ?fileid ?file_size ?file_sha256 WHERE { ?main_entity bsworkflow:input ?inputfp . ?inputfp @@ -679,27 +683,11 @@ def identifyROCrate( ?input a s:MediaObject . BIND (?input AS ?fileid) - OPTIONAL { - ?input - s:contentUrl ?fileuri . - } - OPTIONAL { - ?input - s:identifier ?filepid . - } } UNION { # A directory, which is a schema.org Dataset ?input a s:Dataset . BIND (?input AS ?fileid) - OPTIONAL { - ?input - s:contentUrl ?fileuri . - } - OPTIONAL { - ?input - s:identifier ?filepid . 
- } FILTER EXISTS { # subquery to determine it is not an empty Dataset SELECT ?dircomp @@ -712,9 +700,9 @@ def identifyROCrate( } } UNION { # A single property value, which can be either Integer, Text, Boolean or Float + BIND (?input AS ?fileid) ?input - a s:PropertyValue ; - s:value ?value . + a s:PropertyValue . } UNION { # A combination of files or directories or property values VALUES ( ?leaf_type ) { ( s:Integer ) ( s:Text ) ( s:Boolean ) ( s:Float ) ( s:MediaObject ) ( s:Dataset ) } @@ -724,15 +712,23 @@ def identifyROCrate( ?component a ?leaf_type . BIND (?component AS ?fileid) - OPTIONAL { - ?component s:contentUrl ?fileuri . - } - OPTIONAL { - ?component s:identifier ?filepid . - } - OPTIONAL { - ?component s:value ?value . - } + } + OPTIONAL { + ?fileid s:contentUrl ?fileuri . + } + OPTIONAL { + ?fileid s:identifier ?filepid . + } + OPTIONAL { + ?fileid + wrterm:sha256 ?file_sha256 . + } + OPTIONAL { + ?fileid + s:contentSize ?file_size . + } + OPTIONAL { + ?fileid s:value ?value . } } """ @@ -740,7 +736,7 @@ def identifyROCrate( # This compound query is much faster when each of the UNION components # is evaluated separately OBTAIN_WORKFLOW_ENV_SPARQL: "Final[str]" = """\ -SELECT ?env ?name ?name_env ?envfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type ?fileid +SELECT ?env ?name ?name_env ?envfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type ?fileid ?file_sha256 ?file_size WHERE { ?main_entity wrterm:environment ?envfp . ?envfp @@ -754,28 +750,12 @@ def identifyROCrate( a s:MediaObject ; s:name ?name_env . BIND (?env AS ?fileid) - OPTIONAL { - ?env - s:contentUrl ?fileuri . - } - OPTIONAL { - ?env - s:identifier ?filepid . - } } UNION { # A directory, which is a schema.org Dataset ?env a s:Dataset ; s:name ?name_env . BIND (?env AS ?fileid) - OPTIONAL { - ?env - s:contentUrl ?fileuri . - } - OPTIONAL { - ?env - s:identifier ?filepid . - } FILTER EXISTS { # subquery to determine it is not an empty Dataset SELECT ?dircomp @@ -790,8 +770,8 @@ def identifyROCrate( # A single property value, which can be either Integer, Text, Boolean or Float ?env a s:PropertyValue ; - s:name ?name_env ; - s:value ?value . + s:name ?name_env . + BIND (?env AS ?fileid) } UNION { # A combination of files or directories or property values VALUES ( ?leaf_type ) { ( s:Integer ) ( s:Text ) ( s:Boolean ) ( s:Float ) ( s:MediaObject ) ( s:Dataset ) } @@ -802,15 +782,23 @@ def identifyROCrate( ?component a ?leaf_type . BIND (?component AS ?fileid) - OPTIONAL { - ?component s:contentUrl ?fileuri . - } - OPTIONAL { - ?component s:identifer ?filepid . - } - OPTIONAL { - ?component s:value ?value . - } + } + OPTIONAL { + ?fileid s:contentUrl ?fileuri . + } + OPTIONAL { + ?fileid s:identifier ?filepid . + } + OPTIONAL { + ?fileid + wrterm:sha256 ?file_sha256 . + } + OPTIONAL { + ?fileid + s:contentSize ?file_size . + } + OPTIONAL { + ?fileid s:value ?value . } } """ @@ -847,7 +835,7 @@ def identifyROCrate( # This compound query is much faster when each of the UNION components # is evaluated separately OBTAIN_EXECUTION_INPUTS_SPARQL: "Final[str]" = """\ -SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type ?fileid +SELECT ?input ?name ?inputfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type ?fileid ?file_sha256 ?file_size WHERE { ?execution s:object ?input . { @@ -857,14 +845,6 @@ def identifyROCrate( a s:MediaObject ; s:exampleOfWork ?inputfp . BIND (?input AS ?fileid) - OPTIONAL { - ?input - s:contentUrl ?fileuri . 
- } - OPTIONAL { - ?input - s:identifier ?filepid . - } ?inputfp a bs:FormalParameter ; s:name ?name ; @@ -876,14 +856,6 @@ def identifyROCrate( a s:Dataset ; s:exampleOfWork ?inputfp . BIND (?input AS ?fileid) - OPTIONAL { - ?input - s:contentUrl ?fileuri . - } - OPTIONAL { - ?input - s:identifier ?filepid . - } ?inputfp a bs:FormalParameter ; s:name ?name ; @@ -903,8 +875,8 @@ def identifyROCrate( VALUES (?additional_type) { ( "Integer" ) ( "Text" ) ( "Boolean" ) ( "Float" ) } ?input a s:PropertyValue ; - s:exampleOfWork ?inputfp ; - s:value ?value . + s:exampleOfWork ?inputfp . + BIND (?input AS ?fileid) ?inputfp a bs:FormalParameter ; s:name ?name ; @@ -924,15 +896,23 @@ def identifyROCrate( ?component a ?leaf_type . BIND (?component AS ?fileid) - OPTIONAL { - ?component s:contentUrl ?fileuri . - } - OPTIONAL { - ?component s:identifier ?filepid . - } - OPTIONAL { - ?component s:value ?value . - } + } + OPTIONAL { + ?fileid s:contentUrl ?fileuri . + } + OPTIONAL { + ?fileid s:identifier ?filepid . + } + OPTIONAL { + ?fileid + wrterm:sha256 ?file_sha256 . + } + OPTIONAL { + ?fileid + s:contentSize ?file_size . + } + OPTIONAL { + ?fileid s:value ?value . } } """ @@ -940,7 +920,7 @@ def identifyROCrate( # This compound query is much faster when each of the UNION components # is evaluated separately OBTAIN_EXECUTION_ENV_SPARQL: "Final[str]" = """\ -SELECT ?env ?name ?name_env ?envfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type ?fileid +SELECT ?env ?name ?name_env ?envfp ?additional_type ?fileuri ?filepid ?value ?component ?leaf_type ?fileid ?file_sha256 ?file_size WHERE { ?execution wrterm:environment ?env . { @@ -951,14 +931,6 @@ def identifyROCrate( s:name ?name_env ; s:exampleOfWork ?envfp . BIND (?env AS ?fileid) - OPTIONAL { - ?env - s:contentUrl ?fileuri . - } - OPTIONAL { - ?env - s:identifier ?filepid . - } ?envfp a bs:FormalParameter ; s:name ?name ; @@ -971,14 +943,6 @@ def identifyROCrate( s:name ?name_env ; s:exampleOfWork ?envfp . BIND (?env AS ?fileid) - OPTIONAL { - ?env - s:contentUrl ?fileuri . - } - OPTIONAL { - ?env - s:identifier ?filepid . - } ?envfp a bs:FormalParameter ; s:name ?name ; @@ -999,8 +963,8 @@ def identifyROCrate( ?env a s:PropertyValue ; s:name ?name_env ; - s:exampleOfWork ?envfp ; - s:value ?value . + s:exampleOfWork ?envfp . + BIND (?env AS ?fileid) ?envfp a bs:FormalParameter ; s:name ?name ; @@ -1021,15 +985,23 @@ def identifyROCrate( ?component a ?leaf_type . BIND (?component AS ?fileid) - OPTIONAL { - ?component s:contentUrl ?fileuri . - } - OPTIONAL { - ?component s:identifier ?filepid . - } - OPTIONAL { - ?component s:value ?value . - } + } + OPTIONAL { + ?fileid s:contentUrl ?fileuri . + } + OPTIONAL { + ?fileid s:identifier ?filepid . + } + OPTIONAL { + ?fileid + wrterm:sha256 ?file_sha256 . + } + OPTIONAL { + ?fileid + s:contentSize ?file_size . + } + OPTIONAL { + ?fileid s:value ?value . 
} } """ @@ -1783,6 +1755,116 @@ def _parseInputsFromMainEntity( qwinputsres, g, default_licences, public_name, payload_dir=payload_dir ) + def __processPayloadInput( + self, + inputrow: "rdflib.query.ResultRow", + payload_dir: "Union[pathlib.Path, ZipfilePath, zipfile.Path]", + the_uri: "str", + licences: "Sequence[str]", + input_type: "str", + kindobj: "ContentKind", + cached_inputs_hash: "MutableMapping[str, MaterializedInput]", + ) -> "MutableMapping[str, MaterializedInput]": + input_uri = str(inputrow.fileid) + input_name = str(inputrow.name) + input_parsed_uri = urllib.parse.urlparse(input_uri) + if input_parsed_uri.scheme == self.RELATIVE_ROCRATE_SCHEME: + input_path = input_parsed_uri.path + if input_path.startswith("/"): + input_path = input_path[1:] + + located_input = payload_dir / input_path + include_input = located_input.exists() + if include_input: + # Is it what it was claimed? + include_input = ( + kindobj == ContentKind.File and located_input.is_file() + ) or (kindobj == ContentKind.Directory and located_input.is_dir()) + if not include_input: + self.logger.warning( + f"Discarding payload {input_path} for {input_type} {input_name} (not is a {kindobj.value})" + ) + else: + self.logger.warning( + f"Discarding payload {input_path} for {input_type} {input_name} (not found)" + ) + + if ( + include_input + and kindobj == ContentKind.File + and inputrow.file_size is not None + ): + # Does the recorded file size match? + if hasattr(located_input, "stat"): + the_size = located_input.stat().st_size + else: + the_size = located_input.root.getinfo(input_path).file_size + if isinstance( + inputrow.file_size, + rdflib.term.Literal, + ): + file_size = int(inputrow.file_size.value) + else: + file_size = int(str(inputrow.file_size)) + + include_input = the_size == file_size + if not include_input: + self.logger.warning( + f"Discarding payload {input_path} for {input_type} {input_name} (mismatching file size)" + ) + + if include_input and kindobj == ContentKind.File: + with located_input.open(mode="rb") as lI: + the_signature = ComputeDigestFromFileLike( + cast("IO[bytes]", lI), + digestAlgorithm="sha256", + ) + if inputrow.file_sha256 is not None: + file_signature = stringifyDigest( + "sha256", + bytes.fromhex(str(inputrow.file_sha256)), + ) + + include_input = file_signature == the_signature + if not include_input: + self.logger.warning( + f"Discarding payload {input_path} for {input_type} {input_name} (mismatching digest)" + ) + else: + file_signature = the_signature + + if include_input: + licences_tuple = ( + cast("Tuple[URIType, ...]", tuple(licences)) + if len(licences) > 0 + else DefaultNoLicenceTuple + ) + mat_content = MaterializedContent( + local=cast("AbsPath", input_path), + licensed_uri=LicensedURI( + uri=cast("URIType", the_uri), + licences=licences_tuple, + ), + # TODO: better inference, as it might have a side effect + prettyFilename=cast("RelPath", located_input.name), + kind=kindobj, + ) + cached_input = cached_inputs_hash.get(input_name) + if cached_input is None: + cached_input = MaterializedInput( + name=cast("SymbolicParamName", input_name), + values=[mat_content], + # implicit=, + ) + else: + cached_input = cached_input._replace( + values=[*cached_input.values, mat_content], + ) + + cached_inputs_hash[input_name] = cached_input + + return cached_inputs_hash + def __parseInputsResults( self, qinputsres: "rdflib.query.Result", @@ -1793,7 +1875,7 @@ def __parseInputsResults( ) -> "Tuple[ParamsBlock, Optional[Sequence[MaterializedInput]]]": # TODO: implement this params: 
"MutableParamsBlock" = {} - cached_inputs: "MutableSequence[MaterializedInput]" = [] + cached_inputs_hash: "MutableMapping[str, MaterializedInput]" = {} for inputrow in qinputsres: assert isinstance( inputrow, rdflib.query.ResultRow @@ -1812,6 +1894,7 @@ def __parseInputsResults( additional_type = str(inputrow.additional_type) valarr: "Optional[MutableSequence[Any]]" = None valobj: "Optional[MutableMapping[str, Any]]" = None + kindobj: "Optional[ContentKind]" = None # Is it a nested one? if additional_type == "Collection": leaf_type = str(inputrow.leaf_type) @@ -1826,12 +1909,15 @@ def __parseInputsResults( # Is it a file or a directory? if additional_type in ("File", "Dataset"): + kindobj = ( + ContentKind.Directory + if additional_type == "Dataset" + else ContentKind.File + ) valobj = base.setdefault( param_last, { - "c-l-a-s-s": ContentKind.Directory.name - if additional_type == "Dataset" - else ContentKind.File.name, + "c-l-a-s-s": kindobj.value, }, ) @@ -1874,6 +1960,18 @@ def __parseInputsResults( valurl.append(the_url) else: valobj["url"] = the_url + + if payload_dir is not None and inputrow.fileid is not None: + assert kindobj is not None + self.__processPayloadInput( + inputrow, + payload_dir, + the_uri, + licences, + "input", + kindobj, + cached_inputs_hash, + ) else: the_value_node: "rdflib.term.Identifier" = inputrow.value the_value: "Union[str, int, float, bool]" @@ -1905,7 +2003,10 @@ def __parseInputsResults( else: base[param_last] = the_value - return params, cached_inputs if payload_dir is not None else None + return ( + params, + list(cached_inputs_hash.values()) if payload_dir is not None else None, + ) def _parseEnvFromExecution( self, @@ -1969,7 +2070,7 @@ def __parseEnvResults( """ # TODO: implement this environment: "MutableMapping[str, Any]" = {} - cached_environment: "MutableSequence[MaterializedInput]" = [] + cached_environment_hash: "MutableMapping[str, MaterializedInput]" = {} for envrow in qenvres: assert isinstance( envrow, rdflib.query.ResultRow @@ -1981,6 +2082,7 @@ def __parseEnvResults( additional_type = str(envrow.additional_type) valarr: "Optional[MutableSequence[Any]]" = None valobj: "Optional[MutableMapping[str, Any]]" = None + kindobj: "Optional[ContentKind]" = None # Is it a nested one? if additional_type == "Collection": leaf_type = str(envrow.leaf_type) @@ -1995,12 +2097,15 @@ def __parseEnvResults( # Is it a file or a directory? 
if additional_type in ("File", "Dataset"): + kindobj = ( + ContentKind.Directory + if additional_type == "Dataset" + else ContentKind.File + ) valobj = environment.setdefault( env_name, { - "c-l-a-s-s": ContentKind.Directory.name - if additional_type == "Dataset" - else ContentKind.File.name, + "c-l-a-s-s": kindobj.value, }, ) @@ -2042,6 +2147,18 @@ def __parseEnvResults( valurl.append(the_url) else: valobj["url"] = the_url + + if payload_dir is not None and envrow.fileid is not None: + assert kindobj is not None + self.__processPayloadInput( + envrow, + payload_dir, + the_uri, + licences, + "environment variable", + kindobj, + cached_environment_hash, + ) else: the_value_node: "rdflib.term.Identifier" = envrow.value the_value: "Union[str, int, float, bool]" @@ -2073,7 +2190,10 @@ def __parseEnvResults( else: environment[env_name] = the_value - return environment, cached_environment if payload_dir is not None else None + return ( + environment, + list(cached_environment_hash.values()) if payload_dir is not None else None, + ) def _getLicences( self, From 05676d6135cca7c93b6b8d793d88d6d556a5a080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 10 Jun 2024 00:02:08 +0200 Subject: [PATCH 30/62] Better handling of container payload validation --- wfexs_backend/utils/rocrate.py | 250 ++++++++++++++++++--------------- 1 file changed, 140 insertions(+), 110 deletions(-) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index db9d897e..83535c48 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -1383,69 +1383,79 @@ def __parseContainersResults( container_image_path = container_image_path[1:] located_snapshot = payload_dir / container_image_path - if located_snapshot.exists(): - if containerrow.container_snapshot_size is not None: - if hasattr(located_snapshot, "stat"): - the_size = located_snapshot.stat().st_size - else: - the_size = located_snapshot.root.getinfo( - container_image_path - ).file_size - if isinstance( - containerrow.container_snapshot_size, - rdflib.term.Literal, - ): - container_snapshot_size = int( - containerrow.container_snapshot_size.value + include_container_image = located_snapshot.exists() + if include_container_image: + include_container_image = located_snapshot.is_file() + if not include_container_image: + self.logger.warning( + f"Discarding container image payload {container_image_path} for {origTaggedName} (is not a file)" + ) + else: + self.logger.warning( + f"Discarding container image payload {container_image_path} for {origTaggedName} (not found)" + ) + + if ( + include_container_image + and containerrow.container_snapshot_size is not None + ): + if hasattr(located_snapshot, "stat"): + the_size = located_snapshot.stat().st_size + else: + the_size = located_snapshot.root.getinfo( + container_image_path + ).file_size + if isinstance( + containerrow.container_snapshot_size, + rdflib.term.Literal, + ): + container_snapshot_size = int( + containerrow.container_snapshot_size.value + ) + else: + container_snapshot_size = int( + str(containerrow.container_snapshot_size) + ) + include_container_image = ( + the_size == container_snapshot_size + ) + if not include_container_image: + self.logger.warning( + f"Discarding container image payload {container_image_path} for {origTaggedName} (mismatching file size)" + ) + + if include_container_image: + with located_snapshot.open(mode="rb") as lS: + computed_image_signature = ( + ComputeDigestFromFileLike( + cast("IO[bytes]", lS), + 
digestAlgorithm="sha256", ) - else: - container_snapshot_size = int( + ) + if ( + containerrow.container_snapshot_sha256 + is not None + ): + image_signature = stringifyDigest( + "sha256", + bytes.fromhex( str( - containerrow.container_snapshot_size - ) - ) - if the_size == container_snapshot_size: - with located_snapshot.open(mode="rb") as lS: - computed_image_signature = ( - ComputeDigestFromFileLike( - cast("IO[bytes]", lS), - digestAlgorithm="sha256", - ) - ) - if ( - containerrow.container_snapshot_sha256 - is not None - ): - image_signature = stringifyDigest( - "sha256", - bytes.fromhex( - str( - containerrow.container_snapshot_sha256 - ) - ), + containerrow.container_snapshot_sha256 ) + ), + ) - if ( - image_signature - != computed_image_signature - ): - self.logger.warning( - f"Discarding payload {container_image_path} for {origTaggedName} (mismatching digest)" - ) - container_image_path = None - else: - image_signature = ( - computed_image_signature - ) - else: + include_container_image = ( + image_signature == computed_image_signature + ) + if not include_container_image: self.logger.warning( - f"Discarding payload {container_image_path} for {origTaggedName} (mismatching file size)" + f"Discarding payload {container_image_path} for {origTaggedName} (mismatching digest)" ) - container_image_path = None - else: - self.logger.warning( - f"Discarding payload {container_image_path} (not found)" - ) + else: + image_signature = computed_image_signature + + if not include_container_image: container_image_path = None if containerrow.source_container_metadata is not None: @@ -1470,66 +1480,86 @@ def __parseContainersResults( located_metadata = ( payload_dir / metadata_container_image_path ) - if located_metadata.exists(): + include_metadata_container_image = ( + located_metadata.exists() + ) + if include_metadata_container_image: + include_metadata_container_image = ( + located_metadata.is_file() + ) + if not include_metadata_container_image: + self.logger.warning( + f"Discarding container metadata payload {metadata_container_image_path} for {origTaggedName} (is not a file)" + ) + else: + self.logger.warning( + f"Discarding container metadata payload {metadata_container_image_path} for {origTaggedName} (not found)" + ) + + if ( + include_metadata_container_image + and containerrow.source_container_metadata_size + is not None + ): + if hasattr(located_metadata, "stat"): + the_size = located_metadata.stat().st_size + else: + the_size = located_metadata.root.getinfo( + metadata_container_image_path + ).file_size + if isinstance( + containerrow.source_container_metadata_size, + rdflib.term.Literal, + ): + source_container_metadata_size = int( + containerrow.source_container_metadata_size.value + ) + else: + source_container_metadata_size = int( + str( + containerrow.source_container_metadata_size + ) + ) + + include_metadata_container_image = ( + the_size == source_container_metadata_size + ) + if not include_metadata_container_image: + self.logger.warning( + f"Discarding container metadata payload {container_image_path} for {origTaggedName} (mismatching file size)" + ) + + if include_metadata_container_image: + with located_metadata.open(mode="rb") as lM: + computed_source_container_metadata_signature = ( + ComputeDigestFromFileLike( + cast("IO[bytes]", lM), + digestAlgorithm="sha256", + ) + ) if ( - containerrow.source_container_metadata_size + containerrow.source_container_metadata_sha256 is not None ): - if hasattr(located_metadata, "stat"): - the_size = located_metadata.stat().st_size - 
else: - the_size = located_metadata.root.getinfo( - metadata_container_image_path - ).file_size - if isinstance( - containerrow.source_container_metadata_size, - rdflib.term.Literal, - ): - source_container_metadata_size = int( - containerrow.source_container_metadata_size.value - ) - else: - source_container_metadata_size = int( + source_container_metadata_signature = stringifyDigest( + "sha256", + bytes.fromhex( str( - containerrow.source_container_metadata_size - ) - ) - if the_size == source_container_metadata_size: - with located_metadata.open(mode="rb") as lM: - computed_source_container_metadata_signature = ComputeDigestFromFileLike( - cast("IO[bytes]", lM), - digestAlgorithm="sha256", + containerrow.source_container_metadata_sha256 ) - if ( - containerrow.source_container_metadata_sha256 - is not None - ): - source_container_metadata_signature = stringifyDigest( - "sha256", - bytes.fromhex( - str( - containerrow.source_container_metadata_sha256 - ) - ), - ) - - if ( - source_container_metadata_signature - != computed_source_container_metadata_signature - ): - self.logger.warning( - f"Discarding payload {metadata_container_image_path} for {origTaggedName} (mismatching digest)" - ) - metadata_container_image_path = None - else: + ), + ) + + include_metadata_container_image = ( + source_container_metadata_signature + == computed_source_container_metadata_signature + ) + if not include_metadata_container_image: self.logger.warning( - f"Discarding payload {metadata_container_image_path} for {origTaggedName} (mismatching file size)" + f"Discarding container metadata payload {metadata_container_image_path} for {origTaggedName} (mismatching digest)" ) - metadata_container_image_path = None - else: - self.logger.warning( - f"Discarding payload {metadata_container_image_path} (not found)" - ) + + if not include_metadata_container_image: metadata_container_image_path = None the_containers.append( From 69136b1bbc1598e5670b96baf9847ae6541be078 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 15 Jun 2024 04:27:44 +0200 Subject: [PATCH 31/62] Make ZipfilePath a subclass of pathlib.Path , to simplify future code --- wfexs_backend/utils/zipfile_path.py | 116 +++++++++++++++++----------- 1 file changed, 70 insertions(+), 46 deletions(-) diff --git a/wfexs_backend/utils/zipfile_path.py b/wfexs_backend/utils/zipfile_path.py index 88200be6..4e0bfd9c 100644 --- a/wfexs_backend/utils/zipfile_path.py +++ b/wfexs_backend/utils/zipfile_path.py @@ -29,13 +29,12 @@ if TYPE_CHECKING: - from os import ( - PathLike, - ) + import os from typing import ( Any, Dict, + Generator, IO, Iterable, Iterator, @@ -148,7 +147,7 @@ def getinfo(self, name: "str") -> "ZipInfo": @classmethod def make( - cls, source: "Union[CompleteDirs, ZipFile, str, PathLike[str]]" + cls, source: "Union[CompleteDirs, ZipFile, str, os.PathLike[str]]" ) -> "CompleteDirs": """ Given a source (filename or zipfile), return an @@ -197,16 +196,16 @@ def _extract_text_encoding( def path_relative_to( - path: "Union[Path, pathlib.Path, zipfile.Path]", - other: "Union[Path, pathlib.Path, zipfile.Path]", - *extra: "Union[str, PathLike[str]]" + path: "pathlib.Path", other: "pathlib.Path", *extra: "Union[str, os.PathLike[str]]" ) -> "str": - # Method body is borrowed from Python 3.12 + # Method body is partially borrowed from Python 3.12 # zipfile.Path.relative_to method. 
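    # For example (illustrative):
    #   path_relative_to(pathlib.Path("a/b/c"), pathlib.Path("a")) -> "b/c"
    # The same holds for ZipfilePath entries, whose str() form embeds the
    # archive filename as a posix-style prefix.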
return posixpath.relpath(str(path), str(other.joinpath(*extra))) -class Path: +# Older versions of Python do not have zipfile.Path +# and newer are not compatible with pathlib.Path +class ZipfilePath(pathlib.Path): """ A pathlib-compatible interface for zip files. @@ -284,10 +283,12 @@ class Path: 'mem' """ - __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})" + __repr = "{self.__class__.__name__}({self._root.filename!r}, {self._at!r})" def __init__( - self, root: "Union[str, CompleteDirs, PathLike[str], ZipFile]", at: "str" = "" + self, + root: "Union[str, CompleteDirs, os.PathLike[str], ZipFile]", + at: "str" = "", ): """ Construct a Path from a ZipFile or filename. @@ -298,15 +299,16 @@ def __init__( original type, the caller should either create a separate ZipFile object or pass a filename. """ - self.root = FastLookup.make(root) - self.at = at + self._root = FastLookup.make(root) + self._at = at - def open( + def open( # type: ignore[override] self, mode: "str" = "r", - *args: "Any", pwd: "Optional[bytes]" = None, - **kwargs: "Any" + buffering: "int" = -1, + encoding: "Optional[str]" = None, + newline: "Optional[str]" = None, ) -> "Union[IO[str], IO[bytes]]": """ Open this entry as text or binary following the semantics @@ -318,79 +320,101 @@ def open( zip_mode = mode[0] if not self.exists() and zip_mode == "r": raise FileNotFoundError(self) - stream = self.root.open(self.at, zip_mode, pwd=pwd) + stream = self._root.open(self._at, mode=zip_mode, pwd=pwd) if "b" in mode: - if args or kwargs: - raise ValueError("encoding args invalid for binary operation") + # if args or kwargs: + # raise ValueError("encoding args invalid for binary operation") return stream # Text mode: - encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) - return io.TextIOWrapper(stream, encoding, *args, **kwargs) + # encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) + encoding = io.text_encoding(encoding, 2) + return io.TextIOWrapper( + stream, + encoding=encoding, + newline=newline, + line_buffering=(buffering > 0), + ) @property def name(self) -> "str": - return pathlib.Path(self.at).name or self.filename.name + return pathlib.Path(self._at).name or self.filename.name @property def filename(self) -> "pathlib.Path": - assert self.root.filename is not None - return pathlib.Path(self.root.filename).joinpath(self.at) + assert self._root.filename is not None + return pathlib.Path(self._root.filename).joinpath(self._at) def read_text(self, *args: "Any", **kwargs: "Any") -> "str": - encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) - with self.open("r", encoding, *args, **kwargs) as strm: + kwargs["mode"] = "r" + with self.open(*args, **kwargs) as strm: return cast("str", strm.read()) def read_bytes(self) -> "bytes": with self.open("rb") as strm: return cast("bytes", strm.read()) - def _is_child(self, path: "Path") -> "bool": - return posixpath.dirname(path.at.rstrip("/")) == self.at.rstrip("/") + def _is_child(self, path: "ZipfilePath") -> "bool": + return posixpath.dirname(path._at.rstrip("/")) == self._at.rstrip("/") - def _next(self, at: "str") -> "Path": - return self.__class__(self.root, at) + def _next(self, at: "str") -> "ZipfilePath": + return self.__class__(self._root, at) def is_dir(self) -> "bool": - return not self.at or self.at.endswith("/") + return not self._at or self._at.endswith("/") def is_file(self) -> "bool": return self.exists() and not self.is_dir() - def exists(self) -> "bool": - return self.at in self.root._name_set() + def 
exists(self, *, follow_symlinks: bool = False) -> "bool": + return self._at in self._root._name_set() - def iterdir(self) -> "Iterator[Path]": + def iterdir(self) -> "Generator[ZipfilePath, None, None]": if not self.is_dir(): raise ValueError("Can't listdir a file") - subs = map(self._next, self.root.namelist()) - return filter(self._is_child, subs) + subs = map(self._next, self._root.namelist()) + return cast("Generator[ZipfilePath, None, None]", filter(self._is_child, subs)) def __str__(self) -> "str": - assert self.root.filename is not None - return posixpath.join(self.root.filename, self.at) + assert self._root.filename is not None + return posixpath.join(self._root.filename, self._at) def __repr__(self) -> "str": return self.__repr.format(self=self) - def joinpath(self, *other: "Union[str, PathLike[str]]") -> "Path": - next = posixpath.join(self.at, *other) - return self._next(self.root.resolve_dir(next)) + def joinpath(self, *other: "Union[str, os.PathLike[str]]") -> "ZipfilePath": + next = posixpath.join(self._at, *other) + return self._next(self._root.resolve_dir(next)) __truediv__ = joinpath @property - def parent(self) -> "Union[Path, pathlib.Path]": - if not self.at: - return self.filename.parent - parent_at = posixpath.dirname(self.at.rstrip("/")) + def parent(self) -> "ZipfilePath": + if not self._at: + return self.filename.parent # type: ignore[return-value] + parent_at = posixpath.dirname(self._at.rstrip("/")) if parent_at: parent_at += "/" return self._next(parent_at) + @property + def zip_root(self) -> "ZipFile": + return self._root + + def relative_to( # type: ignore[override] + self, + other: "Union[str, os.PathLike[str]]", + /, + *_deprecated: "Union[str, os.PathLike[str]]", + walk_up: bool = False, + ) -> "pathlib.Path": + return pathlib.Path(path_relative_to(self, pathlib.Path(other))) + + def with_name(self, name: "Union[str, os.PathLike[str]]") -> "ZipfilePath": + return self.parent.joinpath(name) + # Older versions of Python do not have zipfile.Path if sys.version_info[:2] < (3, 8): import zipfile - zipfile.Path = Path + zipfile.Path = ZipfilePath From 72de80d584b8ec864ee22bca1b0b4b2db21348af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 15 Jun 2024 04:30:33 +0200 Subject: [PATCH 32/62] Added several changes to widen link_or_copy compatibility to also PathLike and pathlib.Path --- wfexs_backend/utils/contents.py | 63 ++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/wfexs_backend/utils/contents.py b/wfexs_backend/utils/contents.py index 89c7614f..f03f1463 100644 --- a/wfexs_backend/utils/contents.py +++ b/wfexs_backend/utils/contents.py @@ -20,6 +20,7 @@ import logging import os +import pathlib import shutil from typing import ( cast, @@ -234,46 +235,58 @@ def CWLDesc2Content( return matValues -def copy2_nofollow(src: "str", dest: "str") -> "None": +def copy2_nofollow( + src: "Union[str, os.PathLike[str]]", dest: "Union[str, os.PathLike[str]]" +) -> "None": shutil.copy2(src, dest, follow_symlinks=False) -def link_or_copy(src: "AnyPath", dest: "AnyPath", force_copy: "bool" = False) -> None: - assert os.path.exists( - src - ), f"File {src} must exist to be linked or copied {os.path.exists(src)} {os.path.lexists(src)}" +def link_or_copy( + src: "Union[AnyPath, os.PathLike[str]]", + dest: "Union[AnyPath, os.PathLike[str]]", + force_copy: "bool" = False, +) -> None: + link_or_copy_pathlib(pathlib.Path(src), pathlib.Path(dest), force_copy=force_copy) + + +def link_or_copy_pathlib( 
+ src: "pathlib.Path", dest: "pathlib.Path", force_copy: "bool" = False +) -> None: + assert ( + src.exists() + ), f"File {src.as_posix()} must exist to be linked or copied {src.exists()} {src.exists(follow_symlinks=False)}" # We should not deal with symlinks - src = cast("AbsPath", os.path.realpath(src)) - dest = cast("AbsPath", os.path.realpath(dest)) + src = src.resolve() + dest = dest.resolve() # Avoid losing everything by overwriting itself - if os.path.exists(dest) and os.path.samefile(src, dest): + dest_exists = dest.exists() + if dest_exists and src.samefile(dest): return # First, check whether inputs and content # are in the same filesystem # as of https://unix.stackexchange.com/a/44250 - dest_exists = os.path.exists(dest) dest_or_ancestor_exists = dest_exists dest_or_ancestor = dest while not dest_or_ancestor_exists: - dest_or_ancestor = cast("AbsPath", os.path.dirname(dest_or_ancestor)) - dest_or_ancestor_exists = os.path.exists(dest_or_ancestor) - dest_st_dev = os.lstat(dest_or_ancestor).st_dev + dest_or_ancestor = dest_or_ancestor.parent + dest_or_ancestor_exists = dest_or_ancestor.exists() + dest_st_dev = dest_or_ancestor.lstat().st_dev # It could be a subtree of not existing directories if not dest_exists: - dest_parent = os.path.dirname(dest) - if not os.path.isdir(dest_parent): - os.makedirs(dest_parent) + dest_parent = dest.parent + if not dest_parent.is_dir(): + dest_parent.mkdir(parents=True) # Now, link or copy - if os.lstat(src).st_dev == dest_st_dev and not force_copy: + if src.lstat().st_dev == dest_st_dev and not force_copy: try: - if os.path.isfile(src): + if src.is_file(): if dest_exists: - os.unlink(dest) - os.link(src, dest) + dest.unlink() + dest.hardlink_to(src) else: # Recursively hardlinking # as of https://stackoverflow.com/a/10778930 @@ -287,9 +300,9 @@ def link_or_copy(src: "AnyPath", dest: "AnyPath", force_copy: "bool" = False) -> # device, it can happen both paths are in different # bind mounts, which forbid hard links if ose.errno != 18: - if ose.errno == 1 and os.path.isfile(src): + if ose.errno == 1 and src.is_file(): try: - with open(src, mode="rb") as dummy: + with src.open(mode="rb") as dummy: readable = dummy.readable() except OSError as dummy_err: readable = False @@ -308,11 +321,11 @@ def link_or_copy(src: "AnyPath", dest: "AnyPath", force_copy: "bool" = False) -> force_copy = True if force_copy: - if os.path.isfile(src): + if src.is_file(): # Copying the content # as it is in a separated filesystem if dest_exists: - os.unlink(dest) + dest.unlink() shutil.copy2(src, dest) else: # Recursively copying the content @@ -322,7 +335,9 @@ def link_or_copy(src: "AnyPath", dest: "AnyPath", force_copy: "bool" = False) -> shutil.copytree(src, dest, copy_function=copy2_nofollow) -def real_unlink_if_exists(the_path: "AnyPath", fail_ok: "bool" = False) -> "None": +def real_unlink_if_exists( + the_path: "Union[AnyPath, os.PathLike[str]]", fail_ok: "bool" = False +) -> "None": if os.path.lexists(the_path): try: canonical_to_be_erased = os.path.realpath(the_path) From 2f1a20e5e1c117ff10c5ba3f43d6b869cd6222a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 15 Jun 2024 04:38:24 +0200 Subject: [PATCH 33/62] Added PathlibLike common type alias, in preparation for next commits. 
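The alias only exists under TYPE_CHECKING, so callers are expected to use it
as a quoted forward reference. A minimal sketch of the intended usage (not
introduced by this patch):

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        from wfexs_backend.common import PathlibLike

    def resolve_dest(dest: "PathlibLike") -> "PathlibLike":
        return dest.resolve()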
--- wfexs_backend/common.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/wfexs_backend/common.py b/wfexs_backend/common.py index ebe8d5fe..f71c941e 100644 --- a/wfexs_backend/common.py +++ b/wfexs_backend/common.py @@ -92,6 +92,14 @@ def create_augmented_context( # This is either a relative or an absolute path AnyPath: TypeAlias = Union[RelPath, AbsPath] + # These declarations are for "new world" + # MaterializedPathContent, LocalPathWorkflow + # and indirectly MaterializedPathInput + import pathlib + + PathlibLike: TypeAlias = pathlib.Path + + DEFAULT_DOCKER_CMD = cast("SymbolicName", "docker") DEFAULT_SINGULARITY_CMD = cast("SymbolicName", "singularity") DEFAULT_APPTAINER_CMD = cast("SymbolicName", "apptainer") From 7cd41d97afc662081cebc90388e49c2bac80f35c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 15 Jun 2024 04:38:41 +0200 Subject: [PATCH 34/62] Container factories have been refactored to use pathlib.Path instead of "glorified strings" --- wfexs_backend/container_factories/__init__.py | 186 +++++++++--------- .../abstract_docker_container.py | 9 +- .../container_factories/docker_container.py | 37 ++-- .../container_factories/no_container.py | 5 +- .../container_factories/podman_container.py | 37 ++-- .../singularity_container.py | 82 ++++---- 6 files changed, 179 insertions(+), 177 deletions(-) diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index 6f3baad9..0221bcaa 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -21,6 +21,7 @@ from dataclasses import dataclass import json import os +import pathlib import tempfile import atexit import platform @@ -72,6 +73,7 @@ AnyPath, ContainerTaggedName, Fingerprint, + PathlibLike, RelPath, URIType, ) @@ -92,12 +94,13 @@ class AbstractImageManifestMetadata(TypedDict): import yaml - YAMLLoader: TypeAlias = Union[yaml.Loader, yaml.CLoader] + AnyYAMLLoader: TypeAlias = Union[yaml.Loader, yaml.CLoader] from .. 
import common from ..utils.contents import ( link_or_copy, + link_or_copy_pathlib, real_unlink_if_exists, ) @@ -133,17 +136,19 @@ class Container(ContainerTaggedName): taggedName: "URIType" = cast("URIType", "") architecture: "Optional[ProcessorArchitecture]" = None operatingSystem: "Optional[ContainerOperatingSystem]" = None - localPath: "Optional[AbsPath]" = None + localPath: "Optional[PathlibLike]" = None signature: "Optional[Fingerprint]" = None fingerprint: "Optional[Fingerprint]" = None - metadataLocalPath: "Optional[AbsPath]" = None + metadataLocalPath: "Optional[PathlibLike]" = None source_type: "Optional[ContainerType]" = None image_signature: "Optional[Fingerprint]" = None def _value_defaults_fixes(self) -> None: # This code is needed for old working directories if self.metadataLocalPath is None and self.localPath is not None: - self.metadataLocalPath = cast("AbsPath", self.localPath + META_JSON_POSTFIX) + self.metadataLocalPath = self.localPath.with_name( + self.localPath.name + META_JSON_POSTFIX + ) # And this is to tell the kind of source container type if self.source_type is None: @@ -202,7 +207,9 @@ def decompose_docker_tagged_name( return None, tagged_name, None @classmethod - def ContainerYAMLConstructor(cls, loader: "YAMLLoader", node: "Any") -> "Container": + def ContainerYAMLConstructor( + cls, loader: "AnyYAMLLoader", node: "Any" + ) -> "Container": fields = loader.construct_mapping(node) # This could be a fix for old cases being parsed # where the concept of image_signature did not exist. @@ -213,7 +220,7 @@ def ContainerYAMLConstructor(cls, loader: "YAMLLoader", node: "Any") -> "Contain return cls(**fields) # type: ignore[misc] @classmethod - def RegisterYAMLConstructor(cls, loader: "Type[YAMLLoader]") -> None: + def RegisterYAMLConstructor(cls, loader: "Type[AnyYAMLLoader]") -> None: # yaml.add_constructor('!python/object:wfexs_backend.common.Container', container_yaml_constructor) # yaml.constructor.Constructor.add_constructor('tag:yaml.org,2002:python/object:wfexs_backend.common.Container', container_yaml_constructor) loader.add_constructor( @@ -222,6 +229,19 @@ def RegisterYAMLConstructor(cls, loader: "Type[YAMLLoader]") -> None: ) +REGISTER_CONSTRUCTOR = True +if REGISTER_CONSTRUCTOR: + YAMLLoader: "Type[AnyYAMLLoader]" + try: + from yaml import CLoader as YAMLLoader + except ImportError: + from yaml import Loader as YAMLLoader + # This is needed to keep backward compatibility + # with ancient working directories + Container.RegisterYAMLConstructor(YAMLLoader) + REGISTER_CONSTRUCTOR = False + + class ContainerFactoryException(AbstractWfExSException): """ Exceptions fired by instances of ContainerFactory @@ -255,7 +275,7 @@ class ContainerCacheHandler: def __init__( self, - containers_cache_dir: "Optional[AbsPath]", + containers_cache_dir: "Optional[pathlib.Path]", engine_name: "str", simple_file_name_method: "ContainerFileNamingMethod", ): @@ -269,69 +289,60 @@ def __init__( # TODO: create caching database??? 
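        # Cache layout, as implemented below: the top-level cache directory
        # keeps content-addressed snapshot files named after their digest,
        # while a per-engine subdirectory holds tag-named symlinks pointing
        # back at those canonical files.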
# containers_cache_dir if containers_cache_dir is None: - containers_cache_dir = cast( - "AbsPath", tempfile.mkdtemp(prefix="wfexs", suffix="backend") + containers_cache_dir = pathlib.Path( + tempfile.mkdtemp(prefix="wfexs", suffix="backend") ) # Assuring this temporal directory is removed at the end atexit.register(shutil.rmtree, containers_cache_dir, True) else: - os.makedirs(containers_cache_dir, exist_ok=True) + containers_cache_dir.mkdir(parents=True, exist_ok=True) # But, for materialized containers, we should use common directories # This for the containers themselves self.containersCacheDir = containers_cache_dir # This for the symlinks to the containers, following the engine convention - self.engineContainersSymlinkDir = cast( - "AbsPath", os.path.join(self.containersCacheDir, engine_name) - ) - os.makedirs(self.engineContainersSymlinkDir, exist_ok=True) + self.engineContainersSymlinkDir = self.containersCacheDir / engine_name + self.engineContainersSymlinkDir.mkdir(parents=True, exist_ok=True) self.simpleFileNameMethod = simple_file_name_method - def _genTmpContainerPath(self) -> "AbsPath": + def _genTmpContainerPath(self) -> "pathlib.Path": """ This is a helper method """ - return cast("AbsPath", os.path.join(self.containersCacheDir, str(uuid.uuid4()))) + return self.containersCacheDir / str(uuid.uuid4()) def _genContainerPaths( self, container: "ContainerTaggedName" - ) -> "Tuple[AbsPath, AbsPath]": + ) -> "Tuple[pathlib.Path, pathlib.Path]": containerFilename = self.simpleFileNameMethod( cast("URIType", container.origTaggedName) ) containerFilenameMeta = containerFilename + META_JSON_POSTFIX - localContainerPath = cast( - "AbsPath", - os.path.join(self.engineContainersSymlinkDir, containerFilename), - ) - localContainerPathMeta = cast( - "AbsPath", - os.path.join(self.engineContainersSymlinkDir, containerFilenameMeta), - ) + localContainerPath = self.engineContainersSymlinkDir / containerFilename + localContainerPathMeta = self.engineContainersSymlinkDir / containerFilenameMeta return localContainerPath, localContainerPathMeta - def _computeFingerprint(self, image_path: "AnyPath") -> "Fingerprint": - return cast("Fingerprint", ComputeDigestFromFile(image_path)) + def _computeFingerprint(self, image_path: "pathlib.Path") -> "Fingerprint": + return cast("Fingerprint", ComputeDigestFromFile(image_path.as_posix())) def _computeCanonicalImagePath( - self, image_path: "AbsPath" - ) -> "Tuple[AbsPath, Fingerprint]": + self, image_path: "pathlib.Path" + ) -> "Tuple[pathlib.Path, Fingerprint]": imageSignature = self._computeFingerprint(image_path) # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - canonical_image_path = os.path.join( - self.containersCacheDir, - imageSignature.replace("=", "~").replace("/", "-").replace("+", "_"), - ) + canonical_image_path = self.containersCacheDir / imageSignature.replace( + "=", "~" + ).replace("/", "-").replace("+", "_") - return cast("AbsPath", canonical_image_path), imageSignature + return canonical_image_path, imageSignature def query( self, container: "ContainerTaggedName" - ) -> "Tuple[bool, AbsPath, AbsPath, Optional[Fingerprint]]": + ) -> "Tuple[bool, pathlib.Path, pathlib.Path, Optional[Fingerprint]]": """ This method checks whether the container snapshot and its metadata are in the caching directory @@ -340,11 +351,11 @@ def query( trusted_copy = False imageSignature: "Optional[Fingerprint]" = None - if os.path.isfile(localContainerPath): - if os.path.islink(localContainerPath): + if 
localContainerPath.is_file(): + if localContainerPath.is_symlink(): # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - unlinkedContainerPath = os.readlink(localContainerPath) - fsImageSignature = os.path.basename(unlinkedContainerPath) + unlinkedContainerPath = localContainerPath.readlink() + fsImageSignature = unlinkedContainerPath.name imageSignature = cast( "Fingerprint", fsImageSignature.replace("~", "=") @@ -353,14 +364,10 @@ def query( ) # Do not trust paths outside the caching directory - canonicalContainerPath = os.path.join( - self.containersCacheDir, - fsImageSignature, - ) + canonicalContainerPath = self.containersCacheDir / fsImageSignature - trusted_copy = os.path.samefile( - os.path.realpath(localContainerPath), - os.path.realpath(canonicalContainerPath), + trusted_copy = localContainerPath.resolve().samefile( + canonicalContainerPath.resolve() ) else: ( @@ -368,9 +375,9 @@ def query( imageSignature, ) = self._computeCanonicalImagePath(localContainerPath) - if os.path.samefile(localContainerPath, canonicalContainerPath): + if localContainerPath.samefile(canonicalContainerPath): trusted_copy = True - elif os.path.isfile(canonicalContainerPath): + elif canonicalContainerPath.is_file(): canonicalImageSignature = self._computeFingerprint( canonicalContainerPath ) @@ -378,9 +385,9 @@ def query( trusted_copy = canonicalImageSignature == imageSignature if trusted_copy: - if os.path.isfile(localContainerPathMeta): + if localContainerPathMeta.is_file(): try: - with open(localContainerPathMeta, mode="r", encoding="utf-8") as mH: + with localContainerPathMeta.open(mode="r", encoding="utf-8") as mH: signaturesAndManifest = cast( "AbstractImageManifestMetadata", json.load(mH) ) @@ -398,29 +405,25 @@ def query( def genStagedContainersDirPaths( self, container: "ContainerTaggedName", - stagedContainersDir: "AnyPath", - ) -> "Tuple[AbsPath, AbsPath]": + stagedContainersDir: "pathlib.Path", + ) -> "Tuple[pathlib.Path, pathlib.Path]": containerFilename = self.simpleFileNameMethod( cast("URIType", container.origTaggedName) ) containerFilenameMeta = containerFilename + META_JSON_POSTFIX - containerPath = cast( - "AbsPath", os.path.join(stagedContainersDir, containerFilename) - ) + containerPath = stagedContainersDir / containerFilename - containerPathMeta = cast( - "AbsPath", os.path.join(stagedContainersDir, containerFilenameMeta) - ) + containerPathMeta = stagedContainersDir / containerFilenameMeta return containerPath, containerPathMeta def transfer( self, container: "ContainerTaggedName", - stagedContainersDir: "AnyPath", + stagedContainersDir: "pathlib.Path", force: "bool" = False, - ) -> "Optional[Tuple[AbsPath, AbsPath]]": + ) -> "Optional[Tuple[pathlib.Path, pathlib.Path]]": """ This method is used to transfer both the container snapshot and its metadata from the caching directory to stagedContainersDir @@ -437,23 +440,23 @@ def transfer( # Last, but not the least important # Hardlink or copy the container and its metadata + stagedContainersDir.mkdir(parents=True, exist_ok=True) containerPath, containerPathMeta = self.genStagedContainersDirPaths( container, stagedContainersDir ) - os.makedirs(stagedContainersDir, exist_ok=True) - if force or not os.path.exists(containerPath): - link_or_copy(localContainerPath, containerPath) - if force or not os.path.exists(containerPathMeta): - link_or_copy(localContainerPathMeta, containerPathMeta) + if force or not containerPath.exists(): + link_or_copy_pathlib(localContainerPath, containerPath) + if force or not 
containerPathMeta.exists(): + link_or_copy_pathlib(localContainerPathMeta, containerPathMeta) return (containerPath, containerPathMeta) def update( self, container: "ContainerTaggedName", - image_path: "AbsPath", - image_metadata_path: "AbsPath", + image_path: "pathlib.Path", + image_metadata_path: "pathlib.Path", do_move: "bool" = True, ) -> "None": # First, let's remove what it is still there @@ -466,8 +469,8 @@ def update( canonicalContainerPath, imageSignature = self._computeCanonicalImagePath( image_path ) - canonicalContainerPathMeta = cast( - "AbsPath", canonicalContainerPath + META_JSON_POSTFIX + canonicalContainerPathMeta = canonicalContainerPath.with_name( + canonicalContainerPath.name + META_JSON_POSTFIX ) # And ..... transfer!!! @@ -481,16 +484,12 @@ def update( ) # Last, the symbolic links - os.symlink( - os.path.relpath(canonicalContainerPath, self.engineContainersSymlinkDir), - localContainerPath, + localContainerPath.symlink_to( + os.path.relpath(canonicalContainerPath, self.engineContainersSymlinkDir) ) - os.symlink( - os.path.relpath( - canonicalContainerPathMeta, self.engineContainersSymlinkDir - ), - localContainerPathMeta, + localContainerPathMeta.symlink_to( + os.path.relpath(canonicalContainerPathMeta, self.engineContainersSymlinkDir) ) def invalidate(self, container: "ContainerTaggedName") -> "None": @@ -509,11 +508,11 @@ class ContainerFactory(abc.ABC): def __init__( self, simpleFileNameMethod: "ContainerFileNamingMethod", - containersCacheDir: "Optional[AnyPath]" = None, - stagedContainersDir: "Optional[AnyPath]" = None, + containersCacheDir: "Optional[pathlib.Path]" = None, + stagedContainersDir: "Optional[pathlib.Path]" = None, tools_config: "Optional[ContainerLocalConfig]" = None, engine_name: "str" = "unset", - tempDir: "Optional[AnyPath]" = None, + tempDir: "Optional[pathlib.Path]" = None, ): """ Abstract init method @@ -539,19 +538,17 @@ def __init__( # This for the containers themselves # containersCacheDir if containersCacheDir is None: - self.containersCacheDir = cast( - "AbsPath", tempfile.mkdtemp(prefix="wfexs", suffix="backend") + self.containersCacheDir = pathlib.Path( + tempfile.mkdtemp(prefix="wfexs", suffix="backend") ) # Assuring this temporal directory is removed at the end atexit.register(shutil.rmtree, self.containersCacheDir, True) else: - self.containersCacheDir = cast( - "AbsPath", os.path.abspath(containersCacheDir) - ) + self.containersCacheDir = containersCacheDir.absolute() if tempDir is None: - tempDir = cast( - "AbsPath", tempfile.mkdtemp(prefix="WfExS-container", suffix="tempdir") + tempDir = pathlib.Path( + tempfile.mkdtemp(prefix="WfExS-container", suffix="tempdir") ) # Assuring this temporal directory is removed at the end atexit.register(shutil.rmtree, tempDir, True) @@ -572,10 +569,7 @@ def __init__( self.stagedContainersDir = stagedContainersDir # This for the symlinks to the containers, following the engine convention - self.engineContainersSymlinkDir = cast( - "AbsPath", os.path.join(self.containersCacheDir, engine_name) - ) - os.makedirs(self.engineContainersSymlinkDir, exist_ok=True) + self.engineContainersSymlinkDir = self.cc_handler.engineContainersSymlinkDir # This variable contains the dictionary of set up environment # variables needed to run the tool with the proper setup @@ -631,7 +625,7 @@ def command(self) -> "str": return self.runtime_cmd @property - def cacheDir(self) -> "AbsPath": + def cacheDir(self) -> "pathlib.Path": """ This method returns the symlink dir instead of the cache dir as the entries following the 
naming convention of the engine @@ -687,7 +681,7 @@ def architecture(self) -> "Tuple[ContainerOperatingSystem, ProcessorArchitecture def materializeContainers( self, tagList: "Sequence[ContainerTaggedName]", - containers_dir: "Optional[AnyPath]" = None, + containers_dir: "Optional[pathlib.Path]" = None, offline: "bool" = False, force: "bool" = False, ) -> "Sequence[Container]": @@ -730,7 +724,7 @@ def materializeContainers( def materializeSingleContainer( self, tag: "ContainerTaggedName", - containers_dir: "Optional[AnyPath]" = None, + containers_dir: "Optional[pathlib.Path]" = None, offline: "bool" = False, force: "bool" = False, ) -> "Optional[Container]": @@ -742,7 +736,7 @@ def materializeSingleContainer( def deployContainers( self, containers_list: "Sequence[Container]", - containers_dir: "Optional[AnyPath]" = None, + containers_dir: "Optional[pathlib.Path]" = None, force: "bool" = False, ) -> "Sequence[Container]": """ @@ -768,7 +762,7 @@ def deployContainers( def deploySingleContainer( self, container: "ContainerTaggedName", - containers_dir: "Optional[AnyPath]" = None, + containers_dir: "Optional[pathlib.Path]" = None, force: "bool" = False, ) -> "Tuple[Container, bool]": """ diff --git a/wfexs_backend/container_factories/abstract_docker_container.py b/wfexs_backend/container_factories/abstract_docker_container.py index 4d160a7c..70315578 100644 --- a/wfexs_backend/container_factories/abstract_docker_container.py +++ b/wfexs_backend/container_factories/abstract_docker_container.py @@ -46,6 +46,7 @@ ) if TYPE_CHECKING: + import pathlib from types import ( ModuleType, ) @@ -287,7 +288,7 @@ def _rmi( def _load( self, - archivefile: "AbsPath", + archivefile: "pathlib.Path", dockerTag: "str", matEnv: "Mapping[str, str]", ) -> "Tuple[ExitVal, str, str]": @@ -303,7 +304,7 @@ def _load( ) with package.open( - archivefile, mode="rb" + archivefile.as_posix(), mode="rb" ) as d_in, tempfile.NamedTemporaryFile() as d_out, tempfile.NamedTemporaryFile() as d_err: self.logger.debug(f"loading {self.variant_name()} container {dockerTag}") with subprocess.Popen( @@ -334,11 +335,11 @@ def _load( def _save( self, dockerTag: "str", - destfile: "AbsPath", + destfile: "pathlib.Path", matEnv: "Mapping[str, str]", ) -> "Tuple[ExitVal, str]": with pgzip.open( - destfile, mode="wb" + destfile.as_posix(), mode="wb" ) as d_out, tempfile.NamedTemporaryFile() as d_err: self.logger.debug(f"saving {self.variant_name()} container {dockerTag}") with subprocess.Popen( diff --git a/wfexs_backend/container_factories/docker_container.py b/wfexs_backend/container_factories/docker_container.py index dd2e9785..3739aa35 100644 --- a/wfexs_backend/container_factories/docker_container.py +++ b/wfexs_backend/container_factories/docker_container.py @@ -25,6 +25,7 @@ ) if TYPE_CHECKING: + import pathlib from typing import ( Any, Mapping, @@ -93,11 +94,11 @@ def trimmable_manifest_keys(cls) -> "Sequence[str]": def __init__( self, simpleFileNameMethod: "ContainerFileNamingMethod", - containersCacheDir: "Optional[AnyPath]" = None, - stagedContainersDir: "Optional[AnyPath]" = None, + containersCacheDir: "Optional[pathlib.Path]" = None, + stagedContainersDir: "Optional[pathlib.Path]" = None, tools_config: "Optional[ContainerLocalConfig]" = None, engine_name: "str" = "unset", - tempDir: "Optional[AnyPath]" = None, + tempDir: "Optional[pathlib.Path]" = None, ): super().__init__( simpleFileNameMethod=simpleFileNameMethod, @@ -217,7 +218,7 @@ def _enrichFingerprint( def materializeSingleContainer( self, tag: "ContainerTaggedName", - 
containers_dir: "Optional[AnyPath]" = None, + containers_dir: "Optional[pathlib.Path]" = None, offline: "bool" = False, force: "bool" = False, ) -> "Optional[Container]": @@ -235,8 +236,8 @@ def materializeSingleContainer( fetch_metadata = True trusted_copy = False - localContainerPath: "Optional[AbsPath]" = None - localContainerPathMeta: "Optional[AbsPath]" = None + localContainerPath: "Optional[pathlib.Path]" = None + localContainerPathMeta: "Optional[pathlib.Path]" = None imageSignature: "Optional[Fingerprint]" = None image_id: "Optional[Fingerprint]" = None manifestsImageSignature: "Optional[Fingerprint]" = None @@ -371,14 +372,14 @@ def materializeSingleContainer( raise ContainerEngineException(errstr) # This is needed for the metadata - imageSignature = self.cc_handler._computeFingerprint( - cast("AnyPath", tmpContainerPath) - ) + imageSignature = self.cc_handler._computeFingerprint(tmpContainerPath) - tmpContainerPathMeta = tmpContainerPath + META_JSON_POSTFIX + tmpContainerPathMeta = tmpContainerPath.with_name( + tmpContainerPath.name + META_JSON_POSTFIX + ) # Last, save the metadata itself for further usage - with open(tmpContainerPathMeta, mode="w", encoding="utf-8") as tcpM: + with tmpContainerPathMeta.open(mode="w", encoding="utf-8") as tcpM: manifest_metadata: "DockerManifestMetadata" = { "image_id": image_id, "image_signature": imageSignature, @@ -391,7 +392,7 @@ def materializeSingleContainer( self.cc_handler.update( tag, image_path=tmpContainerPath, - image_metadata_path=cast("AbsPath", tmpContainerPathMeta), + image_metadata_path=tmpContainerPathMeta, do_move=True, ) @@ -441,7 +442,7 @@ def materializeSingleContainer( def deploySingleContainer( self, container: "ContainerTaggedName", - containers_dir: "Optional[AnyPath]" = None, + containers_dir: "Optional[pathlib.Path]" = None, force: "bool" = False, ) -> "Tuple[Container, bool]": # Should we load the image? 
@@ -460,18 +461,18 @@ def deploySingleContainer( manifestsImageSignature: "Optional[Fingerprint]" = None manifests = None manifest = None - if not os.path.isfile(containerPath): - errmsg = f"Docker saved image {os.path.basename(containerPath)} is not in the staged working dir for {tag_name}" + if not containerPath.is_file(): + errmsg = f"Docker saved image {containerPath.name} is not in the staged working dir for {tag_name}" self.logger.warning(errmsg) raise ContainerFactoryException(errmsg) - if not os.path.isfile(containerPathMeta): - errmsg = f"Docker saved image metadata {os.path.basename(containerPathMeta)} is not in the staged working dir for {tag_name}" + if not containerPathMeta.is_file(): + errmsg = f"Docker saved image metadata {containerPathMeta.name} is not in the staged working dir for {tag_name}" self.logger.warning(errmsg) raise ContainerFactoryException(errmsg) try: - with open(containerPathMeta, mode="r", encoding="utf-8") as mH: + with containerPathMeta.open(mode="r", encoding="utf-8") as mH: signaturesAndManifest = cast("DockerManifestMetadata", json.load(mH)) imageSignature_in_metadata = signaturesAndManifest["image_signature"] manifestsImageSignature = signaturesAndManifest["manifests_signature"] diff --git a/wfexs_backend/container_factories/no_container.py b/wfexs_backend/container_factories/no_container.py index a1d9671a..684fca46 100644 --- a/wfexs_backend/container_factories/no_container.py +++ b/wfexs_backend/container_factories/no_container.py @@ -23,6 +23,7 @@ ) if TYPE_CHECKING: + import pathlib from typing import ( Any, Mapping, @@ -86,7 +87,7 @@ def engine_version(self) -> "ContainerEngineVersionStr": def materializeSingleContainer( self, tag: "ContainerTaggedName", - containers_dir: "Optional[Union[RelPath, AbsPath]]" = None, + containers_dir: "Optional[pathlib.Path]" = None, offline: "bool" = False, force: "bool" = False, ) -> "Optional[Container]": @@ -98,7 +99,7 @@ def materializeSingleContainer( def deploySingleContainer( self, container: "ContainerTaggedName", - containers_dir: "Optional[AnyPath]" = None, + containers_dir: "Optional[pathlib.Path]" = None, force: "bool" = False, ) -> "Tuple[Container, bool]": """ diff --git a/wfexs_backend/container_factories/podman_container.py b/wfexs_backend/container_factories/podman_container.py index f48c424a..44090b98 100644 --- a/wfexs_backend/container_factories/podman_container.py +++ b/wfexs_backend/container_factories/podman_container.py @@ -25,6 +25,7 @@ ) if TYPE_CHECKING: + import pathlib from typing import ( Any, Mapping, @@ -95,11 +96,11 @@ def trimmable_manifest_keys(cls) -> "Sequence[str]": def __init__( self, simpleFileNameMethod: "ContainerFileNamingMethod", - containersCacheDir: "Optional[AnyPath]" = None, - stagedContainersDir: "Optional[AnyPath]" = None, + containersCacheDir: "Optional[pathlib.Path]" = None, + stagedContainersDir: "Optional[pathlib.Path]" = None, tools_config: "Optional[ContainerLocalConfig]" = None, engine_name: "str" = "unset", - tempDir: "Optional[AnyPath]" = None, + tempDir: "Optional[pathlib.Path]" = None, ): super().__init__( simpleFileNameMethod=simpleFileNameMethod, @@ -222,7 +223,7 @@ def _genPodmanTag( def materializeSingleContainer( self, tag: "ContainerTaggedName", - containers_dir: "Optional[AnyPath]" = None, + containers_dir: "Optional[pathlib.Path]" = None, offline: "bool" = False, force: "bool" = False, ) -> "Optional[Container]": @@ -241,8 +242,8 @@ def materializeSingleContainer( fetch_metadata = True trusted_copy = False - localContainerPath: 
"Optional[AbsPath]" = None - localContainerPathMeta: "Optional[AbsPath]" = None + localContainerPath: "Optional[pathlib.Path]" = None + localContainerPathMeta: "Optional[pathlib.Path]" = None imageSignature: "Optional[Fingerprint]" = None image_id: "Optional[Fingerprint]" = None manifestsImageSignature: "Optional[Fingerprint]" = None @@ -369,22 +370,22 @@ def materializeSingleContainer( ) # Removing partial dumps - if os.path.exists(tmpContainerPath): + if tmpContainerPath.exists(): try: - os.unlink(tmpContainerPath) + tmpContainerPath.unlink() except: pass raise ContainerEngineException(errstr) # This is needed for the metadata - imageSignature = self.cc_handler._computeFingerprint( - cast("AnyPath", tmpContainerPath) - ) + imageSignature = self.cc_handler._computeFingerprint(tmpContainerPath) - tmpContainerPathMeta = tmpContainerPath + META_JSON_POSTFIX + tmpContainerPathMeta = tmpContainerPath.with_name( + tmpContainerPath.name + META_JSON_POSTFIX + ) # Last, save the metadata itself for further usage - with open(tmpContainerPathMeta, mode="w", encoding="utf-8") as tcpM: + with tmpContainerPathMeta.open(mode="w", encoding="utf-8") as tcpM: manifest_metadata: "DockerManifestMetadata" = { "image_id": image_id, "image_signature": imageSignature, @@ -397,7 +398,7 @@ def materializeSingleContainer( self.cc_handler.update( tag, image_path=tmpContainerPath, - image_metadata_path=cast("AbsPath", tmpContainerPathMeta), + image_metadata_path=tmpContainerPathMeta, do_move=True, ) @@ -448,7 +449,7 @@ def materializeSingleContainer( def deploySingleContainer( self, container: "ContainerTaggedName", - containers_dir: "Optional[AnyPath]" = None, + containers_dir: "Optional[pathlib.Path]" = None, force: "bool" = False, ) -> "Tuple[Container, bool]": # Should we load the image? 
@@ -467,13 +468,13 @@ def deploySingleContainer( manifestsImageSignature: "Optional[Fingerprint]" = None manifests = None manifest = None - if not os.path.isfile(containerPath): - errmsg = f"Podman saved image {os.path.basename(containerPath)} is not in the staged working dir for {tag_name}" + if not containerPath.is_file(): + errmsg = f"Podman saved image {containerPath.name} is not in the staged working dir for {tag_name}" self.logger.warning(errmsg) raise ContainerFactoryException(errmsg) if not os.path.isfile(containerPathMeta): - errmsg = f"FATAL ERROR: Podman saved image metadata {os.path.basename(containerPathMeta)} is not in the staged working dir for {tag_name}" + errmsg = f"FATAL ERROR: Podman saved image metadata {containerPathMeta.name} is not in the staged working dir for {tag_name}" self.logger.error(errmsg) raise ContainerFactoryException(errmsg) diff --git a/wfexs_backend/container_factories/singularity_container.py b/wfexs_backend/container_factories/singularity_container.py index 01b05151..fb50cabf 100644 --- a/wfexs_backend/container_factories/singularity_container.py +++ b/wfexs_backend/container_factories/singularity_container.py @@ -20,6 +20,7 @@ import json import os import os.path +import pathlib import re import shutil import subprocess @@ -125,11 +126,11 @@ class SingularityContainerFactory(ContainerFactory): def __init__( self, simpleFileNameMethod: "ContainerFileNamingMethod", - containersCacheDir: "Optional[AnyPath]" = None, - stagedContainersDir: "Optional[AnyPath]" = None, + containersCacheDir: "Optional[pathlib.Path]" = None, + stagedContainersDir: "Optional[pathlib.Path]" = None, tools_config: "Optional[ContainerLocalConfig]" = None, engine_name: "str" = "unset", - tempDir: "Optional[AnyPath]" = None, + tempDir: "Optional[pathlib.Path]" = None, ): super().__init__( simpleFileNameMethod=simpleFileNameMethod, @@ -145,15 +146,15 @@ def __init__( # This is needed due a bug in singularity 3.6, where # singularity pull --disable-cache does not create a container - singularityCacheDir = os.path.join(self.stagedContainersDir, ".singularity") - os.makedirs(singularityCacheDir, exist_ok=True) + singularityCacheDir = self.stagedContainersDir / ".singularity" + singularityCacheDir.mkdir(parents=True, exist_ok=True) self._environment.update( { - "APPTAINER_TMPDIR": self.tempDir, - "APPTAINER_CACHEDIR": singularityCacheDir, - "SINGULARITY_TMPDIR": self.tempDir, - "SINGULARITY_CACHEDIR": singularityCacheDir, + "APPTAINER_TMPDIR": self.tempDir.as_posix(), + "APPTAINER_CACHEDIR": singularityCacheDir.as_posix(), + "SINGULARITY_TMPDIR": self.tempDir.as_posix(), + "SINGULARITY_CACHEDIR": singularityCacheDir.as_posix(), } ) @@ -199,7 +200,9 @@ def AcceptsContainerType( ) def _getContainerArchitecture( - self, container_filename: "AnyPath", matEnv: "Mapping[str, str]" = {} + self, + container_filename: "Union[AnyPath, os.PathLike[str]]", + matEnv: "Mapping[str, str]" = {}, ) -> "Optional[ProcessorArchitecture]": if len(matEnv) == 0: matEnv = dict(os.environ) @@ -360,7 +363,7 @@ def _getContainerArchitecture( def materializeSingleContainer( self, tag: "ContainerTaggedName", - containers_dir: "Optional[Union[RelPath, AbsPath]]" = None, + containers_dir: "Optional[pathlib.Path]" = None, offline: "bool" = False, force: "bool" = False, ) -> "Optional[Container]": @@ -474,7 +477,10 @@ def _genSingTag( return singTag, parsedTag, singPullTag, isDocker def _pull( - self, singTag: "str", tmpContainerPath: "str", matEnv: "Mapping[str, str]" + self, + singTag: "str", + tmpContainerPath: 
"Union[str, os.PathLike[str]]", + matEnv: "Mapping[str, str]", ) -> "Tuple[ExitVal, str, str]": with tempfile.NamedTemporaryFile() as s_out, tempfile.NamedTemporaryFile() as s_err: self.logger.debug( @@ -507,7 +513,7 @@ def _materializeSingleContainerSing( tag: "ContainerTaggedName", matEnv: "Mapping[str, str]" = {}, dhelp: "DockerHelper" = DockerHelper(), - containers_dir: "Optional[AnyPath]" = None, + containers_dir: "Optional[pathlib.Path]" = None, offline: "bool" = False, force: "bool" = False, ) -> "Union[Container, FailedContainerTag]": @@ -521,8 +527,8 @@ def _materializeSingleContainerSing( fetch_metadata = True trusted_copy = False - localContainerPath: "Optional[AbsPath]" = None - localContainerPathMeta: "Optional[AbsPath]" = None + localContainerPath: "Optional[pathlib.Path]" = None + localContainerPathMeta: "Optional[pathlib.Path]" = None imageSignature: "Optional[Fingerprint]" = None fingerprint: "Optional[Fingerprint]" = None if not force: @@ -535,9 +541,7 @@ def _materializeSingleContainerSing( if trusted_copy: try: - with open( - localContainerPathMeta, mode="r", encoding="utf8" - ) as tcpm: + with localContainerPathMeta.open(mode="r", encoding="utf8") as tcpm: raw_metadata = json.load(tcpm) if isinstance(raw_metadata, dict) and ( "registryServer" in raw_metadata @@ -596,7 +600,7 @@ def _materializeSingleContainerSing( # Now, time to fetch the container itself # (if it is needed) - tmpContainerPath: "Optional[str]" = None + tmpContainerPath: "Optional[pathlib.Path]" = None if not trusted_copy: if offline: raise ContainerFactoryException( @@ -610,7 +614,7 @@ def _materializeSingleContainerSing( # Reading the output and error for the report if s_retval == 0: - if not os.path.exists(tmpContainerPath): + if not tmpContainerPath.exists(): raise ContainerFactoryException( "FATAL ERROR: Singularity finished properly but it did not materialize {} into {}".format( tag_name, tmpContainerPath @@ -618,9 +622,7 @@ def _materializeSingleContainerSing( ) # This is needed for the metadata - imageSignature = self.cc_handler._computeFingerprint( - cast("AnyPath", tmpContainerPath) - ) + imageSignature = self.cc_handler._computeFingerprint(tmpContainerPath) else: errstr = f"""\ Could not materialize singularity image {singTag} ({singPullTag}). 
Retval {s_retval} @@ -634,9 +636,9 @@ def _materializeSingleContainerSing( ====== {s_err_v}""" - if os.path.exists(tmpContainerPath): + if tmpContainerPath.exists(): try: - os.unlink(tmpContainerPath) + tmpContainerPath.unlink() except: pass self.logger.error(errstr) @@ -651,7 +653,7 @@ def _materializeSingleContainerSing( # When no metadata exists, we are bringing the metadata # to a temporary path - tmpContainerPathMeta: "Optional[str]" = None + tmpContainerPathMeta: "Optional[pathlib.Path]" = None if fetch_metadata: if offline: raise ContainerFactoryException( @@ -662,7 +664,9 @@ def _materializeSingleContainerSing( assert localContainerPath is not None tmpContainerPath = self.cc_handler._genTmpContainerPath() link_or_copy(localContainerPath, tmpContainerPath) - tmpContainerPathMeta = tmpContainerPath + META_JSON_POSTFIX + tmpContainerPathMeta = tmpContainerPath.with_name( + tmpContainerPath.name + META_JSON_POSTFIX + ) self.logger.debug( f"downloading temporary container metadata: {tag_name} => {tmpContainerPathMeta}" @@ -686,7 +690,7 @@ def _materializeSingleContainerSing( tag_pull_details = tag_details # Save the temporary metadata - with open(tmpContainerPathMeta, mode="w", encoding="utf8") as tcpm: + with tmpContainerPathMeta.open(mode="w", encoding="utf8") as tcpm: tmp_meta: "SingularityManifest" if tag_details is not None: assert tag_pull_details is not None @@ -731,13 +735,13 @@ def _materializeSingleContainerSing( if tmpContainerPath is not None and tmpContainerPathMeta is not None: self.cc_handler.update( tag, - image_path=cast("AbsPath", tmpContainerPath), - image_metadata_path=cast("AbsPath", tmpContainerPathMeta), + image_path=tmpContainerPath, + image_metadata_path=tmpContainerPathMeta, do_move=True, ) if containers_dir is None: - containers_dir = self.stagedContainersDir + containers_dir = pathlib.Path(self.stagedContainersDir) # Do not allow overwriting in offline mode transferred_image = self.cc_handler.transfer( @@ -763,7 +767,7 @@ def _materializeSingleContainerSing( def materializeContainers( self, tagList: "Sequence[ContainerTaggedName]", - containers_dir: "Optional[AnyPath]" = None, + containers_dir: "Optional[pathlib.Path]" = None, offline: "bool" = False, force: "bool" = False, ) -> "Sequence[Container]": @@ -822,7 +826,7 @@ def materializeContainers( def deploySingleContainer( self, container: "ContainerTaggedName", - containers_dir: "Optional[AnyPath]" = None, + containers_dir: "Optional[pathlib.Path]" = None, force: "bool" = False, ) -> "Tuple[Container, bool]": """ @@ -835,18 +839,18 @@ def deploySingleContainer( container, containers_dir ) - if not os.path.isfile(containerPath): - errmsg = f"SIF saved image {os.path.basename(containerPath)} is not in the staged working dir for {container.origTaggedName}" + if not containerPath.is_file(): + errmsg = f"SIF saved image {containerPath.name} is not in the staged working dir for {container.origTaggedName}" self.logger.warning(errmsg) raise ContainerFactoryException(errmsg) - if not os.path.isfile(containerPathMeta): - errmsg = f"SIF saved image metadata {os.path.basename(containerPathMeta)} is not in the staged working dir for {container.origTaggedName}" + if not containerPathMeta.is_file(): + errmsg = f"SIF saved image metadata {containerPathMeta.name} is not in the staged working dir for {container.origTaggedName}" self.logger.warning(errmsg) raise ContainerFactoryException(errmsg) try: - with open(containerPathMeta, mode="r", encoding="utf-8") as mH: + with containerPathMeta.open(mode="r", encoding="utf-8") as 
mH: signaturesAndManifest = cast("SingularityManifest", json.load(mH)) imageSignature_in_metadata = signaturesAndManifest["image_signature"] @@ -904,7 +908,7 @@ def deploySingleContainer( imageSignature = self.cc_handler._computeFingerprint(containerPath) if imageSignature != imageSignature_in_metadata: - errmsg = f"Image signature recorded in {os.path.basename(containerPathMeta)} does not match image signature of {os.path.basename(containerPath)}" + errmsg = f"Image signature recorded in {containerPathMeta.name} does not match image signature of {containerPath.name}" self.logger.exception(errmsg) raise ContainerFactoryException(errmsg) From d955cee25b218a1abfa6da63b3a3f022756aa8c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 15 Jun 2024 04:41:18 +0200 Subject: [PATCH 35/62] wfexs_backend.workflow_engine has been updated to properly use new container factories --- wfexs_backend/workflow_engines/__init__.py | 26 +++++++++++-------- .../workflow_engines/nextflow_engine.py | 4 ++- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/wfexs_backend/workflow_engines/__init__.py b/wfexs_backend/workflow_engines/__init__.py index d0b1c0e0..567ff351 100644 --- a/wfexs_backend/workflow_engines/__init__.py +++ b/wfexs_backend/workflow_engines/__init__.py @@ -18,6 +18,7 @@ from __future__ import absolute_import import os +import pathlib import sys import tempfile import atexit @@ -503,7 +504,7 @@ def __init__( elif not os.path.isabs(stagedContainersDir): stagedContainersDir = cast("AbsPath", os.path.abspath(stagedContainersDir)) os.makedirs(stagedContainersDir, exist_ok=True) - self.stagedContainersDir = cast("AbsPath", stagedContainersDir) + self.stagedContainersDir = pathlib.Path(stagedContainersDir) # Setting up common properties tools_config = local_config.get("tools", {}) @@ -529,17 +530,16 @@ def __init__( self.logger.debug(f"Instantiating container type {container_type}") # For materialized containers, we should use common directories # This for the containers themselves - containersCacheDir = cast( - "AnyPath", - os.path.join(cacheDir, "containers", container_factory_clazz.__name__), + containersCacheDir = ( + pathlib.Path(cacheDir) / "containers" / container_factory_clazz.__name__ ) self.container_factory = container_factory_clazz( simpleFileNameMethod=self.simpleContainerFileName, containersCacheDir=containersCacheDir, - stagedContainersDir=stagedContainersDir, + stagedContainersDir=self.stagedContainersDir, tools_config=tools_config, engine_name=self.__class__.__name__, - tempDir=self.tempDir, + tempDir=pathlib.Path(self.tempDir), ) isUserNS = self.container_factory.supportsFeature("userns") @@ -772,13 +772,15 @@ def materialize_containers( force: "bool" = False, ) -> "Tuple[ContainerEngineVersionStr, Sequence[Container], ContainerOperatingSystem, ProcessorArchitecture]": if containersDir is None: - containersDir = self.stagedContainersDir + containersDirPath = self.stagedContainersDir + else: + containersDirPath = pathlib.Path(containersDir) return ( self.container_factory.engine_version(), self.container_factory.materializeContainers( listOfContainerTags, - containers_dir=containersDir, + containers_dir=containersDirPath, offline=offline, force=force, ), @@ -792,17 +794,19 @@ def deploy_containers( force: "bool" = False, ) -> "Sequence[Container]": if containersDir is None: - containersDir = self.stagedContainersDir + containersDirPath = self.stagedContainersDir + else: + containersDirPath = pathlib.Path(containersDir) return 
self.container_factory.deployContainers( containers_list=containers_list, - containers_dir=containersDir, + containers_dir=containersDirPath, force=force, ) @property def staged_containers_dir(self) -> "AnyPath": - return self.stagedContainersDir + return cast("AbsPath", self.stagedContainersDir.as_posix()) def create_job_directories(self) -> "Tuple[str, AbsPath, AbsPath, AbsPath]": outputDirPostfix = "_" + str(int(time.time())) + "_" + str(os.getpid()) diff --git a/wfexs_backend/workflow_engines/nextflow_engine.py b/wfexs_backend/workflow_engines/nextflow_engine.py index 6e85833b..0cfa194d 100644 --- a/wfexs_backend/workflow_engines/nextflow_engine.py +++ b/wfexs_backend/workflow_engines/nextflow_engine.py @@ -754,7 +754,9 @@ def runNextflowCommand( self.logger.debug("Command => nextflow " + " ".join(commandLine)) if containers_path is None: - containers_path = self.container_factory.cacheDir + containers_path = cast( + "AnyPath", self.container_factory.cacheDir.as_posix() + ) if self.engine_mode == EngineMode.Docker: ( retval, From 3fdbf12f591ea0c9fac66297ed6621a935868ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sat, 15 Jun 2024 04:43:30 +0200 Subject: [PATCH 36/62] Remaining changes to properly use new container factories --- wfexs_backend/ro_crate.py | 47 ++- wfexs_backend/utils/rocrate.py | 698 +++++++++++++++++---------------- wfexs_backend/wfexs_backend.py | 2 +- wfexs_backend/workflow.py | 11 +- 4 files changed, 383 insertions(+), 375 deletions(-) diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index 89afb33c..b447db92 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -1102,7 +1102,7 @@ def _add_wfexs_to_crate( ) readme_file = self._add_file_to_crate( - readme_md_path, + pathlib.Path(readme_md_path), the_uri=None, the_name=cast("RelPath", "README.md"), the_mime="text/markdown", @@ -1271,16 +1271,16 @@ def _add_containers( if crate_cont is None: # Now, add container metadata, which is going to be # consumed by WfExS or third parties - metadataLocalPath: "Optional[str]" = None + metadataLocalPath: "Optional[pathlib.Path]" = None if container.metadataLocalPath is not None: metadataLocalPath = container.metadataLocalPath # This code is needed for old working directories if metadataLocalPath is None and container.localPath is not None: - metadataLocalPath = container.localPath + META_JSON_POSTFIX + metadataLocalPath = container.localPath.with_name( + container.localPath.name + META_JSON_POSTFIX + ) - if metadataLocalPath is not None and os.path.exists( - metadataLocalPath - ): + if metadataLocalPath is not None and metadataLocalPath.exists(): meta_file = self._add_file_to_crate( the_path=metadataLocalPath, the_uri=None, @@ -1420,7 +1420,9 @@ def addWorkflowInputs( ): # TODO: embed metadata_array in some way assert isinstance(itemInValues, MaterializedContent) - itemInLocalSource = itemInValues.local # local source + itemInLocalSource = pathlib.Path( + itemInValues.local + ) # local source itemInURISource = itemInValues.licensed_uri.uri # uri source itemInURILicences: "Optional[MutableSequence[LicenceDescription]]" = ( @@ -1442,7 +1444,7 @@ def addWorkflowInputs( if matched_licence is not None: itemInURILicences.append(matched_licence) - if os.path.isfile(itemInLocalSource): + if itemInLocalSource.is_file(): the_signature: "Optional[Fingerprint]" = None if itemInValues.fingerprint is not None: digest, algo = extract_digest(itemInValues.fingerprint) @@ -1456,7 +1458,9 @@ def addWorkflowInputs( 
the_uri=itemInURISource, the_name=cast( "RelPath", - os.path.relpath(itemInLocalSource, self.work_dir), + itemInLocalSource.relative_to( + self.work_dir + ).as_posix(), ), the_signature=the_signature, the_licences=itemInURILicences, @@ -1466,7 +1470,9 @@ def addWorkflowInputs( # An extrapolated input, which needs special handling if itemInValues.extrapolated_local is not None: crate_extrapolated_file = self._add_file_to_crate( - the_path=itemInValues.extrapolated_local, + the_path=pathlib.Path( + itemInValues.extrapolated_local + ), the_uri=None, the_name=cast( "RelPath", @@ -1522,13 +1528,15 @@ def addWorkflowInputs( else: crate_coll = crate_extrapolated_file - elif os.path.isdir(itemInLocalSource): + elif itemInLocalSource.is_dir(): crate_dataset, _ = self._add_directory_as_dataset( itemInLocalSource, itemInURISource, the_name=cast( "RelPath", - os.path.relpath(itemInLocalSource, self.work_dir) + itemInLocalSource.relative_to( + self.work_dir + ).as_posix() + "/", ), do_attach=do_attach, @@ -1741,7 +1749,7 @@ def addWorkflowInputs( def _add_file_to_crate( self, - the_path: "str", + the_path: "pathlib.Path", the_uri: "Optional[URIType]", the_id: "Optional[str]" = None, the_name: "Optional[RelPath]" = None, @@ -1817,7 +1825,7 @@ def _add_collection_to_crate( def _add_directory_as_dataset( self, - the_path: "str", + the_path: "pathlib.Path", the_uri: "URIType", the_id: "Optional[str]" = None, the_name: "Optional[RelPath]" = None, @@ -1870,7 +1878,7 @@ def _add_directory_as_dataset( ) if the_file.is_file(): the_file_crate = self._add_file_to_crate( - the_path=the_file.path, + the_path=pathlib.Path(the_file.path), the_uri=the_item_uri, the_size=the_file.stat().st_size, do_attach=do_attach, @@ -1885,7 +1893,7 @@ def _add_directory_as_dataset( the_dir_crate, the_subfiles_crates, ) = self._add_directory_as_dataset( - the_path=the_file.path, + the_path=pathlib.Path(the_file.path), the_uri=the_item_uri, do_attach=do_attach, ) @@ -2163,7 +2171,7 @@ def _add_workflow_to_crate( self.staged_setup.workflow_dir, ) the_entity = self._add_file_to_crate( - the_path=os.path.join(the_workflow.dir, rel_file), + the_path=pathlib.Path(the_workflow.dir) / rel_file, the_name=the_s_name, the_alternate_name=cast("RelPath", the_alternate_name), the_uri=cast("URIType", rocrate_file_id), @@ -2429,6 +2437,7 @@ def addWorkflowExecution( # Processing the log files if len(stagedExec.logfile) > 0: + work_dir = pathlib.Path(self.work_dir) crate_coll: "Union[Collection, FixedFile, None]" if len(stagedExec.logfile) > 1: crate_coll = self._add_collection_to_crate() @@ -2437,7 +2446,7 @@ def addWorkflowExecution( for logfile in stagedExec.logfile: the_log_file = self._add_file_to_crate( - os.path.join(self.work_dir, logfile), the_uri=None, the_name=logfile + work_dir / logfile, the_uri=None, the_name=logfile ) if crate_coll is None: crate_coll = the_log_file @@ -2675,7 +2684,7 @@ def _add_GeneratedContent_to_crate( the_content_uri = None crate_file = self._add_file_to_crate( - the_path=the_content.local, + the_path=pathlib.Path(the_content.local), the_uri=the_content_uri, the_name=cast("RelPath", dest_path), the_alternate_name=cast("RelPath", alternateName), diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 83535c48..6a58626d 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -30,8 +30,8 @@ import zipfile # Older versions of Python do not have zipfile.Path -if sys.version_info[:2] < (3, 8): - from .zipfile_path import Path as ZipfilePath +# and newer do not 
inherit from pathlib.Path +from .zipfile_path import ZipfilePath from typing import ( cast, @@ -56,16 +56,21 @@ from typing_extensions import ( Final, + TypeAlias, ) from ..common import ( AbsPath, + EngineVersion, Fingerprint, + PathlibLike, RelPath, RepoURL, RepoTag, SymbolicParamName, URIType, + URIWithMetadata, + WFLangVersion, ) from ..container_factories import ( @@ -90,8 +95,6 @@ WorkflowType, ) - from .zipfile_path import Path as ZipfilePath - # Needed by pyld to detect it import aiohttp import pyld # type: ignore[import, import-untyped] @@ -134,6 +137,76 @@ # import magic +# "New world" declarations + + +class LocalPathWorkflow(NamedTuple): + """ + dir: The path to the directory where the checkout was applied + relPath: Inside the checkout, the relative path to the workflow definition + effectiveCheckout: hex hash of the materialized checkout + langVersion: workflow language version / revision + relPathFiles: files composing the workflow, which can be either local + or remote ones (i.e. CWL) + """ + + dir: "PathlibLike" + relPath: "Optional[RelPath]" + effectiveCheckout: "Optional[RepoTag]" + langVersion: "Optional[Union[EngineVersion, WFLangVersion]]" = None + relPathFiles: "Optional[Sequence[RelPath]]" = None + + +class MaterializedPathContent(NamedTuple): + """ + local: Local absolute path of the content which was materialized. It + can be either a path in the cached inputs directory, or an absolute + path in the inputs directory of the execution + licensed_uri: Either an URL or a CURIE of the content which was materialized, + needed for the provenance + prettyFilename: The preferred filename to use in the inputs directory + of the execution environment + fingerprint: If it is available, propagate the computed fingerprint + from the cache. + """ + + local_path: "PathlibLike" + licensed_uri: "LicensedURI" + prettyFilename: "RelPath" + kind: "ContentKind" = ContentKind.File + metadata_array: "Optional[Sequence[URIWithMetadata]]" = None + extrapolated_local: "Optional[AbsPath]" = None + fingerprint: "Optional[Fingerprint]" = None + + @classmethod + def _key_fixes(cls) -> "Mapping[str, str]": + return {"uri": "licensed_uri"} + + +if TYPE_CHECKING: + MaterializedPathInputValues: TypeAlias = Union[ + Sequence[bool], + Sequence[str], + Sequence[int], + Sequence[float], + Sequence[MaterializedPathContent], + ] + + +class MaterializedPathInput(NamedTuple): + """ + name: Name of the input + values: list of associated values, which can be literal ones or + instances from MaterializedContent + """ + + name: "SymbolicParamName" + values: "MaterializedPathInputValues" + secondaryInputs: "Optional[Sequence[MaterializedPathContent]]" = None + autoFilled: "bool" = False + implicit: "bool" = False + + class ReproducibilityLevel(enum.IntEnum): Minimal = enum.auto() # Minimal / no reproducibility is requested Metadata = enum.auto() # Metadata reproducibility is requested @@ -222,10 +295,10 @@ class ROCrateToolboxException(Exception): def ReadROCrateMetadata( workflowROCrateFilename: "str", public_name: "str" -) -> "Tuple[Any, Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]]": +) -> "Tuple[Any, Optional[pathlib.Path]]": # Is it a bare file or an archive? 
jsonld_filename: "Optional[str]" = None - payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None + payload_dir: "Optional[pathlib.Path]" = None if os.path.isdir(workflowROCrateFilename): possible_jsonld_filename = os.path.join( workflowROCrateFilename, ROCRATE_JSONLD_FILENAME @@ -277,7 +350,7 @@ def ReadROCrateMetadata( raise ROCrateToolboxException( f"{ROCRATE_JSONLD_FILENAME} from within {public_name} has unmanagable MIME {putative_mime_ld}" ) - payload_dir = zipfile.Path(workflowROCrateFilename) + payload_dir = ZipfilePath(workflowROCrateFilename) else: raise ROCrateToolboxException( f"The RO-Crate parsing code does not know how to parse {public_name} with MIME {putative_mime}" @@ -498,7 +571,7 @@ def identifyROCrate( return (resrow, g) OBTAIN_WORKFLOW_PID_SPARQL: "Final[str]" = """\ -SELECT ?identifier ?workflow_repository ?workflow_version ?workflow_url ?workflow_alternate_name ?programminglanguage_identifier ?programminglanguage_url ?programminglanguage_version +SELECT ?origmainentity ?identifier ?workflow_repository ?workflow_version ?workflow_url ?workflow_alternate_name ?programminglanguage_identifier ?programminglanguage_url ?programminglanguage_version ?file_size ?file_sha256 WHERE { ?mainentity s:programmingLanguage ?programminglanguage . ?programminglanguage @@ -513,55 +586,81 @@ def identifyROCrate( s:identifier ?programminglanguage_identifier . } { - { - FILTER NOT EXISTS { - ?mainentity s:isBasedOn ?origmainentity . - ?origmainentity - a bs:ComputationalWorkflow ; - dcterms:conformsTo ?bsworkflowprofile . - FILTER ( - STRSTARTS(str(?bsworkflowprofile), str(bswfprofile:)) - ) . - } - OPTIONAL { - ?mainentity s:codeRepository ?workflow_repository . - } - OPTIONAL { - ?mainentity s:version ?workflow_version . - } - OPTIONAL { - ?mainentity s:url ?workflow_url . - } - OPTIONAL { - ?mainentity s:identifier ?identifier . - } - OPTIONAL { - ?mainentity s:alternateName ?workflow_alternate_name . - } - } UNION { + FILTER NOT EXISTS { ?mainentity s:isBasedOn ?origmainentity . ?origmainentity a bs:ComputationalWorkflow ; dcterms:conformsTo ?bsworkflowprofile . - OPTIONAL { - ?origmainentity s:codeRepository ?workflow_repository . - } - OPTIONAL { - ?origmainentity s:version ?workflow_version . - } - OPTIONAL { - ?origmainentity s:url ?workflow_url . - } FILTER ( STRSTARTS(str(?bsworkflowprofile), str(bswfprofile:)) ) . - OPTIONAL { - ?origmainentity s:identifier ?identifier . - } - OPTIONAL { - ?origmainentity s:alternateName ?workflow_alternate_name . - } } + BIND (?mainentity AS ?origmainentity) + } UNION { + ?mainentity s:isBasedOn ?origmainentity . + ?origmainentity + a bs:ComputationalWorkflow ; + dcterms:conformsTo ?bsworkflowprofile . + FILTER ( + STRSTARTS(str(?bsworkflowprofile), str(bswfprofile:)) + ) + } + OPTIONAL { + ?origmainentity s:codeRepository ?workflow_repository . + } + OPTIONAL { + ?origmainentity s:version ?workflow_version . + } + OPTIONAL { + ?origmainentity s:url ?workflow_url . + } + OPTIONAL { + ?origmainentity s:identifier ?identifier . + } + OPTIONAL { + ?origmainentity s:alternateName ?workflow_alternate_name . + } + OPTIONAL { + ?origmainentity + wrterm:sha256 ?file_sha256 . + } + OPTIONAL { + ?origmainentity + s:contentSize ?file_size . + } +} +""" + + LIST_PARTS_SPARQL: "Final[str]" = """\ +SELECT ?part_entity ?identifier ?part_repository ?part_version ?part_url ?part_name ?part_alternate_name ?file_size ?file_sha256 +WHERE { + ?entity s:hasPart+ $part_entity . + ?part_entity a s:MediaObject . 
+ OPTIONAL { + ?part_entity s:codeRepository ?part_repository . + } + OPTIONAL { + ?part_entity s:version ?part_version . + } + OPTIONAL { + ?part_entity s:url ?part_url . + } + OPTIONAL { + ?part_entity s:identifier ?identifier . + } + OPTIONAL { + ?part_entity s:name ?part_name . + } + OPTIONAL { + ?part_entity s:alternateName ?part_alternate_name . + } + OPTIONAL { + ?part_entity + wrterm:sha256 ?file_sha256 . + } + OPTIONAL { + ?part_entity + s:contentSize ?file_size . } } """ @@ -1120,7 +1219,7 @@ def _parseContainersFromWorkflow( self, g: "rdflib.graph.Graph", main_entity: "rdflib.term.Identifier", - payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + payload_dir: "Optional[pathlib.Path]" = None, ) -> "Optional[Tuple[ContainerType, Sequence[Container]]]": # Get the list of containers qcontainers = rdflib.plugins.sparql.prepareQuery( @@ -1144,7 +1243,7 @@ def _parseContainersFromExecution( g: "rdflib.graph.Graph", execution: "rdflib.term.Identifier", main_entity: "rdflib.term.Identifier", - payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + payload_dir: "Optional[pathlib.Path]" = None, ) -> "Optional[Tuple[ContainerType, Sequence[Container]]]": # Get the list of containers qcontainers = rdflib.plugins.sparql.prepareQuery( @@ -1167,7 +1266,7 @@ def __parseContainersResults( self, qcontainersres: "rdflib.query.Result", main_entity: "rdflib.term.Identifier", - payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + payload_dir: "Optional[pathlib.Path]" = None, ) -> "Optional[Tuple[ContainerType, Sequence[Container]]]": container_type: "Optional[ContainerType]" = None source_container_type: "Optional[ContainerType]" = None @@ -1341,7 +1440,6 @@ def __parseContainersResults( fingerprint = None origTaggedName = "" taggedName = "" - image_signature = None if source_container_type == ContainerType.Docker: the_registry = ( str(containerrow.source_container_registry) @@ -1367,200 +1465,45 @@ def __parseContainersResults( fingerprint = origTaggedName container_image_path: "Optional[str]" = None + located_snapshot: "Optional[pathlib.Path]" = None metadata_container_image_path: "Optional[str]" = None + located_metadata: "Optional[pathlib.Path]" = None + image_signature: "Optional[Fingerprint]" = None if payload_dir is not None: if containerrow.container != containerrow.source_container: - container_image_uri = str(containerrow.container) - container_image_parsed_uri = urllib.parse.urlparse( - container_image_uri + include_container_image = self.__processPayloadEntity( + the_entity=containerrow.container, + payload_dir=payload_dir, + kindobj=ContentKind.File, + entity_type="container image", + entity_name=origTaggedName, + the_file_size=containerrow.container_snapshot_size, + the_file_sha256=containerrow.container_snapshot_sha256, ) - if ( - container_image_parsed_uri.scheme - == self.RELATIVE_ROCRATE_SCHEME - ): - container_image_path = container_image_parsed_uri.path - if container_image_path.startswith("/"): - container_image_path = container_image_path[1:] - - located_snapshot = payload_dir / container_image_path - include_container_image = located_snapshot.exists() - if include_container_image: - include_container_image = located_snapshot.is_file() - if not include_container_image: - self.logger.warning( - f"Discarding container image payload {container_image_path} for {origTaggedName} (is not a file)" - ) - else: - self.logger.warning( - f"Discarding container image payload {container_image_path} for 
{origTaggedName} (not found)" - ) - - if ( - include_container_image - and containerrow.container_snapshot_size is not None - ): - if hasattr(located_snapshot, "stat"): - the_size = located_snapshot.stat().st_size - else: - the_size = located_snapshot.root.getinfo( - container_image_path - ).file_size - if isinstance( - containerrow.container_snapshot_size, - rdflib.term.Literal, - ): - container_snapshot_size = int( - containerrow.container_snapshot_size.value - ) - else: - container_snapshot_size = int( - str(containerrow.container_snapshot_size) - ) - include_container_image = ( - the_size == container_snapshot_size - ) - if not include_container_image: - self.logger.warning( - f"Discarding container image payload {container_image_path} for {origTaggedName} (mismatching file size)" - ) - - if include_container_image: - with located_snapshot.open(mode="rb") as lS: - computed_image_signature = ( - ComputeDigestFromFileLike( - cast("IO[bytes]", lS), - digestAlgorithm="sha256", - ) - ) - if ( - containerrow.container_snapshot_sha256 - is not None - ): - image_signature = stringifyDigest( - "sha256", - bytes.fromhex( - str( - containerrow.container_snapshot_sha256 - ) - ), - ) - - include_container_image = ( - image_signature == computed_image_signature - ) - if not include_container_image: - self.logger.warning( - f"Discarding payload {container_image_path} for {origTaggedName} (mismatching digest)" - ) - else: - image_signature = computed_image_signature - - if not include_container_image: - container_image_path = None + if include_container_image is not None: + ( + container_image_path, + located_snapshot, + image_signature, + ) = include_container_image if containerrow.source_container_metadata is not None: - container_metadata_uri = str( - containerrow.source_container_metadata - ) - container_metadata_parsed_uri = urllib.parse.urlparse( - container_metadata_uri + include_metadata_container_image = self.__processPayloadEntity( + the_entity=containerrow.source_container_metadata, + payload_dir=payload_dir, + kindobj=ContentKind.File, + entity_type="container metadata", + entity_name=origTaggedName, + the_file_size=containerrow.source_container_metadata_size, + the_file_sha256=containerrow.source_container_metadata_sha256, ) - if ( - container_metadata_parsed_uri.scheme - == self.RELATIVE_ROCRATE_SCHEME - ): - metadata_container_image_path = ( - container_metadata_parsed_uri.path - ) - if metadata_container_image_path.startswith("/"): - metadata_container_image_path = ( - metadata_container_image_path[1:] - ) - - located_metadata = ( - payload_dir / metadata_container_image_path - ) - include_metadata_container_image = ( - located_metadata.exists() - ) - if include_metadata_container_image: - include_metadata_container_image = ( - located_metadata.is_file() - ) - if not include_metadata_container_image: - self.logger.warning( - f"Discarding container metadata payload {metadata_container_image_path} for {origTaggedName} (is not a file)" - ) - else: - self.logger.warning( - f"Discarding container metadata payload {metadata_container_image_path} for {origTaggedName} (not found)" - ) - - if ( - include_metadata_container_image - and containerrow.source_container_metadata_size - is not None - ): - if hasattr(located_metadata, "stat"): - the_size = located_metadata.stat().st_size - else: - the_size = located_metadata.root.getinfo( - metadata_container_image_path - ).file_size - if isinstance( - containerrow.source_container_metadata_size, - rdflib.term.Literal, - ): - 
source_container_metadata_size = int( - containerrow.source_container_metadata_size.value - ) - else: - source_container_metadata_size = int( - str( - containerrow.source_container_metadata_size - ) - ) - - include_metadata_container_image = ( - the_size == source_container_metadata_size - ) - if not include_metadata_container_image: - self.logger.warning( - f"Discarding container metadata payload {container_image_path} for {origTaggedName} (mismatching file size)" - ) - - if include_metadata_container_image: - with located_metadata.open(mode="rb") as lM: - computed_source_container_metadata_signature = ( - ComputeDigestFromFileLike( - cast("IO[bytes]", lM), - digestAlgorithm="sha256", - ) - ) - if ( - containerrow.source_container_metadata_sha256 - is not None - ): - source_container_metadata_signature = stringifyDigest( - "sha256", - bytes.fromhex( - str( - containerrow.source_container_metadata_sha256 - ) - ), - ) - - include_metadata_container_image = ( - source_container_metadata_signature - == computed_source_container_metadata_signature - ) - if not include_metadata_container_image: - self.logger.warning( - f"Discarding container metadata payload {metadata_container_image_path} for {origTaggedName} (mismatching digest)" - ) - - if not include_metadata_container_image: - metadata_container_image_path = None + + if include_metadata_container_image is not None: + ( + metadata_container_image_path, + located_metadata, + computed_source_container_metadata_signature, + ) = include_metadata_container_image the_containers.append( Container( @@ -1568,10 +1511,8 @@ def __parseContainersResults( type=container_type, registries=registries, taggedName=cast("URIType", taggedName), - localPath=cast("AbsPath", container_image_path), - metadataLocalPath=cast( - "AbsPath", metadata_container_image_path - ), + localPath=located_snapshot, + metadataLocalPath=located_metadata, architecture=None if containerrow.source_container_arch is None else cast( @@ -1743,7 +1684,7 @@ def _parseInputsFromExecution( main_entity: "rdflib.term.Identifier", default_licences: "Sequence[str]", public_name: "str", - payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + payload_dir: "Optional[pathlib.Path]" = None, ) -> "Tuple[ParamsBlock, Optional[Sequence[MaterializedInput]]]": # Get the list of inputs qinputs = rdflib.plugins.sparql.prepareQuery( @@ -1767,7 +1708,7 @@ def _parseInputsFromMainEntity( main_entity: "rdflib.term.Identifier", default_licences: "Sequence[str]", public_name: "str", - payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + payload_dir: "Optional[pathlib.Path]" = None, ) -> "Tuple[ParamsBlock, Optional[Sequence[MaterializedInput]]]": # Get the list of inputs qwinputs = rdflib.plugins.sparql.prepareQuery( @@ -1788,110 +1729,55 @@ def _parseInputsFromMainEntity( def __processPayloadInput( self, inputrow: "rdflib.query.ResultRow", - payload_dir: "Union[pathlib.Path, ZipfilePath, zipfile.Path]", + payload_dir: "pathlib.Path", the_uri: "str", licences: "Sequence[str]", input_type: "str", kindobj: "ContentKind", cached_inputs_hash: "MutableMapping[str, MaterializedInput]", ) -> "MutableMapping[str, MaterializedInput]": - input_uri = str(inputrow.fileid) input_name = str(inputrow.name) - input_parsed_uri = urllib.parse.urlparse(input_uri) - if input_parsed_uri.scheme == self.RELATIVE_ROCRATE_SCHEME: - input_path = input_parsed_uri.path - if input_path.startswith("/"): - input_path = input_path[1:] - - located_input = payload_dir / input_path - 
include_input = located_input.exists() - if include_input: - # Is it what it was claimed? - include_input = ( - kindobj == ContentKind.File and located_input.is_file() - ) or (kindobj == ContentKind.Directory and located_input.is_dir()) - if not include_input: - self.logger.warning( - f"Discarding payload {input_path} for {input_type} {input_name} (not is a {kindobj.value})" - ) - else: - self.logger.warning( - f"Discarding payload {input_path} for {input_type} {input_name} (not found)" - ) - - if ( - include_input - and kindobj == ContentKind.File - and inputrow.file_size is not None - ): - # Does the recorded file size match? - if hasattr(located_input, "stat"): - the_size = located_input.stat().st_size - else: - the_size = located_input.root.getinfo(input_path).file_size - if isinstance( - inputrow.file_size, - rdflib.term.Literal, - ): - file_size = int(inputrow.file_size.value) - else: - file_size = int(str(inputrow.file_size)) - - include_input = the_size == file_size - if not include_input: - self.logger.warning( - f"Discarding payload {input_path} for {input_type} {input_name} (mismatching file size)" - ) - - if include_input and kindobj == ContentKind.File: - with located_input.open(mode="rb") as lI: - the_signature = ComputeDigestFromFileLike( - cast("IO[bytes]", lI), - digestAlgorithm="sha256", - ) - if inputrow.file_sha256 is not None: - file_signature = stringifyDigest( - "sha256", - bytes.fromhex(str(inputrow.file_sha256)), - ) - - include_input = file_signature == the_signature - if not include_input: - self.logger.warning( - f"Discarding payload {input_path} for {input_type} {input_name} (mismatching digest)" - ) - else: - file_signature = the_signature + include_input = self.__processPayloadEntity( + the_entity=inputrow.fileid, + payload_dir=payload_dir, + kindobj=kindobj, + entity_type=input_type, + entity_name=input_name, + the_file_size=inputrow.file_size, + the_file_sha256=inputrow.file_sha256, + ) - if include_input: - licences_tuple = ( - cast("Tuple[URIType, ...]", tuple(licences)) - if len(licences) > 0 - else DefaultNoLicenceTuple + if include_input is not None: + input_path, located_input, file_signature = include_input + licences_tuple = ( + cast("Tuple[URIType, ...]", tuple(licences)) + if len(licences) > 0 + else DefaultNoLicenceTuple + ) + mat_content = MaterializedContent( + local=cast("AbsPath", input_path), + licensed_uri=LicensedURI( + uri=cast("URIType", the_uri), + licences=licences_tuple, + ), + # TODO: better inference, as it might have a side effect + prettyFilename=cast("RelPath", located_input.name), + kind=kindobj, + fingerprint=file_signature, + ) + cached_input = cached_inputs_hash.get(input_name) + if cached_input is None: + cached_input = MaterializedInput( + name=cast("SymbolicParamName", input_name), + values=[mat_content], + # implicit=, ) - mat_content = MaterializedContent( - local=cast("AbsPath", input_path), - licensed_uri=LicensedURI( - uri=cast("URIType", the_uri), - licences=licences_tuple, - ), - # TODO: better inference, as it might have a side effect - prettyFilename=cast("RelPath", located_input.name), - kind=kindobj, + else: + cached_input = cached_input._replace( + values=[*cached_input.values, mat_content], ) - cached_input = cached_inputs_hash.get(input_name) - if cached_input is None: - cached_input = MaterializedInput( - name=cast("SymbolicParamName", input_name), - values=[mat_content], - # implicit=, - ) - else: - cached_input = cached_input._replace( - values=[*cached_input.values, mat_content], - ) - 
cached_inputs_hash[input_name] = cached_input + cached_inputs_hash[input_name] = cached_input return cached_inputs_hash @@ -1901,7 +1787,7 @@ def __parseInputsResults( g: "rdflib.graph.Graph", default_licences: "Sequence[str]", public_name: "str", - payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + payload_dir: "Optional[pathlib.Path]" = None, ) -> "Tuple[ParamsBlock, Optional[Sequence[MaterializedInput]]]": # TODO: implement this params: "MutableParamsBlock" = {} @@ -2045,7 +1931,7 @@ def _parseEnvFromExecution( main_entity: "rdflib.term.Identifier", default_licences: "Sequence[str]", public_name: "str", - payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + payload_dir: "Optional[pathlib.Path]" = None, ) -> "Tuple[EnvironmentBlock, Optional[Sequence[MaterializedInput]]]": # Get the list of inputs qenv = rdflib.plugins.sparql.prepareQuery( @@ -2069,7 +1955,7 @@ def _parseEnvFromMainEntity( main_entity: "rdflib.term.Identifier", default_licences: "Sequence[str]", public_name: "str", - payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + payload_dir: "Optional[pathlib.Path]" = None, ) -> "Tuple[EnvironmentBlock, Optional[Sequence[MaterializedInput]]]": # Get the list of inputs qwenv = rdflib.plugins.sparql.prepareQuery( @@ -2093,7 +1979,7 @@ def __parseEnvResults( g: "rdflib.graph.Graph", default_licences: "Sequence[str]", public_name: "str", - payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + payload_dir: "Optional[pathlib.Path]" = None, ) -> "Tuple[EnvironmentBlock, Optional[Sequence[MaterializedInput]]]": """ This method is (almost) identical to __parseInputsResults @@ -2261,15 +2147,104 @@ def _getLicences( return licences + def __processPayloadEntity( + self, + the_entity: "rdflib.term.Identifier", + payload_dir: "pathlib.Path", + kindobj: "ContentKind", + entity_type: "str", + entity_name: "str", + the_file_size: "rdflib.term.Node", + the_file_sha256: "rdflib.term.Node", + ) -> "Optional[Tuple[str, pathlib.Path, Optional[Fingerprint]]]": + entity_uri = str(the_entity) + entity_parsed_uri = urllib.parse.urlparse(entity_uri) + include_entity = entity_parsed_uri.scheme == self.RELATIVE_ROCRATE_SCHEME + + entity_path: "Optional[str]" = None + located_entity: "Optional[pathlib.Path]" = None + if include_entity: + entity_path = entity_parsed_uri.path + if entity_path.startswith("/"): + entity_path = entity_path[1:] + + located_entity = payload_dir / entity_path + include_entity = located_entity.exists() + if not include_entity: + self.logger.warning( + f"Discarding payload {entity_path} for {entity_type} {entity_name} (not found)" + ) + + if include_entity: + assert located_entity is not None + include_entity = ( + kindobj == ContentKind.File and located_entity.is_file() + ) or (kindobj == ContentKind.Directory and located_entity.is_dir()) + + if not include_entity: + self.logger.warning( + f"Discarding payload {entity_path} for {entity_type} {entity_name} (not is a {kindobj.value})" + ) + + if include_entity and kindobj == ContentKind.File and the_file_size is not None: + assert entity_path is not None + assert located_entity is not None + # Does the recorded file size match? 
+ if isinstance(located_entity, ZipfilePath): + the_size = located_entity.zip_root.getinfo(entity_path).file_size + else: + the_size = located_entity.stat().st_size + if isinstance(the_file_size, rdflib.term.Literal): + file_size = int(the_file_size.value) + else: + file_size = int(str(the_file_size)) + + include_entity = the_size == file_size + if not include_entity: + self.logger.warning( + f"Discarding payload {entity_path} for {entity_type} {entity_name} (mismatching file size {the_size} vs {file_size})" + ) + + file_signature: "Optional[Fingerprint]" = None + if include_entity and kindobj == ContentKind.File: + assert located_entity is not None + with located_entity.open(mode="rb") as lE: + the_signature = ComputeDigestFromFileLike( + lE, + digestAlgorithm="sha256", + ) + if the_file_sha256 is not None: + file_signature = stringifyDigest( + "sha256", + bytes.fromhex(str(the_file_sha256)), + ) + + include_entity = file_signature == the_signature + if not include_entity: + self.logger.warning( + f"Discarding payload {entity_path} for {entity_type} {entity_name} (mismatching digest)" + ) + else: + file_signature = the_signature + + if include_entity: + assert entity_path is not None + assert located_entity is not None + return (entity_path, located_entity, file_signature) + else: + return None + def extractWorkflowMetadata( self, g: "rdflib.graph.Graph", main_entity: "rdflib.term.Identifier", default_repo: "Optional[str]", public_name: "str", - ) -> "Tuple[RemoteRepo, WorkflowType]": + payload_dir: "Optional[pathlib.Path]" = None, + ) -> "Tuple[RemoteRepo, WorkflowType, Optional[LocalWorkflow]]": # This query will tell us where the original workflow was located, # its language and version + cached_workflow: "Optional[LocalWorkflow]" = None qlang = rdflib.plugins.sparql.prepareQuery( self.OBTAIN_WORKFLOW_PID_SPARQL, initNs=self.SPARQL_NS, @@ -2359,7 +2334,32 @@ def extractWorkflowMetadata( programminglanguage_url, programminglanguage_identifier ) - return repo, workflow_type + if payload_dir is not None: + include_main_entity = self.__processPayloadEntity( + the_entity=langrow.origmainentity, + payload_dir=payload_dir, + kindobj=ContentKind.File, + entity_type="main workflow component", + entity_name="PEPE", # FIXME + the_file_size=langrow.file_size, + the_file_sha256=langrow.file_sha256, + ) + + if include_main_entity is not None: + main_entity_path = include_main_entity[0] + + # TODO + # workflow_parts = self.__list_entity_parts(langrow.origmainentity, payload_dir) + + # cached_workflow = LocalWorkflow( + # dir=cast("AbsPath", ""), + # relPath=cast("RelPath", main_entity_path), + # effectiveCheckout=, + # # langVersion= + # relPathFiles=rel_path_files, + # ) + + return repo, workflow_type, cached_workflow def generateWorkflowMetaFromJSONLD( self, @@ -2368,7 +2368,7 @@ def generateWorkflowMetaFromJSONLD( retrospective_first: "bool" = True, reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, strict_reproducibility_level: "bool" = False, - payload_dir: "Optional[Union[pathlib.Path, ZipfilePath, zipfile.Path]]" = None, + payload_dir: "Optional[pathlib.Path]" = None, ) -> "Tuple[RemoteRepo, WorkflowType, ContainerType, ParamsBlock, EnvironmentBlock, OutputsBlock, Optional[LocalWorkflow], Sequence[Container], Optional[Sequence[MaterializedInput]], Optional[Sequence[MaterializedInput]]]": matched_crate, g = self.identifyROCrate(jsonld_obj, public_name) # Is it an RO-Crate? 
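[Editor's note between hunks] The large rewrite above collapses several near-identical blocks (container images, container metadata, cached inputs) into the single __processPayloadEntity helper, which only accepts a payload if it exists inside the crate, is of the declared kind, and matches the recorded size and sha256. Below is a simplified, standalone sketch of that validation; the real method additionally resolves RO-Crate relative URIs, handles directories, and works on rdflib terms, all of which are omitted here.

import hashlib
import pathlib
from typing import Optional, Tuple


def validate_payload_entity(
    entity_path: str,
    payload_dir: pathlib.Path,
    expected_size: Optional[int] = None,
    expected_sha256: Optional[str] = None,
) -> Optional[Tuple[str, pathlib.Path, Optional[str]]]:
    located = payload_dir / entity_path
    if not located.is_file():
        # The real helper logs a warning and the caller falls back to
        # re-fetching the content instead of reusing the payload.
        return None
    if expected_size is not None and located.stat().st_size != expected_size:
        return None  # mismatching recorded file size
    digest = hashlib.sha256(located.read_bytes()).hexdigest()
    if expected_sha256 is not None and digest != expected_sha256:
        return None  # mismatching recorded digest
    return (entity_path, located, digest)
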
@@ -2392,18 +2392,20 @@ def generateWorkflowMetaFromJSONLD( f"JSON-LD from {public_name} is not a WRROC Workflow" ) - cached_workflow: "Optional[LocalWorkflow]" = None cached_inputs: "Optional[Sequence[MaterializedInput]]" = None cached_environment: "Optional[Sequence[MaterializedInput]]" = None # The default crate licences crate_licences = self._getLicences(g, matched_crate.mainentity, public_name) - repo, workflow_type = self.extractWorkflowMetadata( + repo, workflow_type, cached_workflow = self.extractWorkflowMetadata( g, matched_crate.mainentity, default_repo=str(matched_crate.wfhrepourl), public_name=public_name, + payload_dir=payload_dir + if reproducibility_level >= ReproducibilityLevel.Full + else None, ) # At this point we know WfExS supports the workflow engine. diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index b0168b1e..5e07b521 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -2621,7 +2621,7 @@ def getWorkflowRepoFromROCrateFile( # fetched content (needed by Nextflow) # Some RO-Crates might have this value missing or ill-built - repo, workflow_type = self.rocrate_toolbox.extractWorkflowMetadata( + repo, workflow_type, _ = self.rocrate_toolbox.extractWorkflowMetadata( g, matched_crate.mainentity, default_repo=str(matched_crate.wfhrepourl), diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 86337f9e..8b1b06b3 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -225,8 +225,6 @@ WorkflowMetaConfigBlock: TypeAlias = Mapping[str, Any] WritableWorkflowMetaConfigBlock: TypeAlias = MutableMapping[str, Any] - from .utils.zipfile_path import Path as ZipfilePath - import urllib.parse @@ -280,10 +278,6 @@ except ImportError: from yaml import Loader as YAMLLoader, Dumper as YAMLDumper -# This is needed to keep backward compatibility -# with ancient working directories -Container.RegisterYAMLConstructor(YAMLLoader) - from .common import ( AbstractWfExSException, Attribution, @@ -1452,7 +1446,7 @@ def FromPreviousInstanceDeclaration( @staticmethod def _transferInputs( - payload_dir: "Union[pathlib.Path, ZipfilePath, zipfile.Path]", + payload_dir: "pathlib.Path", inputs_dir: "pathlib.Path", cached_inputs: "Sequence[MaterializedInput]", ) -> "Sequence[MaterializedInput]": @@ -1543,6 +1537,9 @@ def FromPreviousROCrate( payload_dir=payload_dir, ) + # logging.error(f"Containers {the_containers}") + # logging.error(f"Inputs {cached_inputs}") + # sys.exit(1) # Now, some postprocessing... 
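# A minimal sketch of why the `reproducibility_level >= ReproducibilityLevel.Full`
# guards used here work: enum.IntEnum members defined with enum.auto() compare as
# ordered integers, so the payload directory is only forwarded at the strictest
# level. The "Sketch" suffix marks this as an illustration, not the real class.
import enum
from typing import Optional


class ReproducibilityLevelSketch(enum.IntEnum):
    Minimal = enum.auto()
    Metadata = enum.auto()
    Full = enum.auto()


def payload_dir_to_forward(
    level: ReproducibilityLevelSketch, payload_dir: "Optional[str]"
) -> "Optional[str]":
    # Same shape as the conditional passed to extractWorkflowMetadata()
    return payload_dir if level >= ReproducibilityLevelSketch.Full else None


assert payload_dir_to_forward(ReproducibilityLevelSketch.Metadata, "/tmp/payload") is None
assert payload_dir_to_forward(ReproducibilityLevelSketch.Full, "/tmp/payload") == "/tmp/payload"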
if ( reproducibility_level >= ReproducibilityLevel.Full From 63f4ea2da999790b3a58a984186e90ae74c787d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sun, 16 Jun 2024 02:03:14 +0200 Subject: [PATCH 37/62] Added marshalling handling of pathlib.Path and fixed handling of named tuples --- wfexs_backend/utils/marshalling_handling.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/wfexs_backend/utils/marshalling_handling.py b/wfexs_backend/utils/marshalling_handling.py index 1fde7ec9..ec6d64c8 100644 --- a/wfexs_backend/utils/marshalling_handling.py +++ b/wfexs_backend/utils/marshalling_handling.py @@ -24,6 +24,7 @@ import copy import enum import logging +import pathlib from typing import ( TYPE_CHECKING, cast, @@ -61,11 +62,14 @@ def recurse_m(x: "Iterable[Any]") -> "Iterable[Any]": "_enum": obj.__class__.__name__, "value": obj.value, } + elif obj_is(pathlib.Path): + # Store the path, not the instance + return obj.as_posix() elif obj_is(tuple) and hasattr(obj, "_fields"): # namedtuple fields = zip(obj._fields, recurse_m(obj)) class_name = obj.__class__.__name__ return dict(fields, **{"_type": class_name}) - elif obj_is(abc.ABC) and hasattr(obj, "__dataclass_fields__"): # dataclass + elif obj_is(object) and hasattr(obj, "__dataclass_fields__"): # dataclass fields_m = map( lambda field: (field, marshall_namedtuple(getattr(obj, field))), obj.__dataclass_fields__.keys(), From 5cf15530364fd40aeb2405e29fd36f482e6fee22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sun, 16 Jun 2024 02:04:54 +0200 Subject: [PATCH 38/62] Added unmarshalling fixes for Container, now pathlib.Path is used instead of bare strings --- wfexs_backend/container_factories/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index 0221bcaa..0cf4e3b6 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -144,11 +144,18 @@ class Container(ContainerTaggedName): image_signature: "Optional[Fingerprint]" = None def _value_defaults_fixes(self) -> None: + if isinstance(self.localPath, str): + # Properly casting the path + self.localPath = pathlib.Path(self.localPath) + # This code is needed for old working directories if self.metadataLocalPath is None and self.localPath is not None: self.metadataLocalPath = self.localPath.with_name( self.localPath.name + META_JSON_POSTFIX ) + elif isinstance(self.metadataLocalPath, str): + # Properly casting the path + self.metadataLocalPath = pathlib.Path(self.metadataLocalPath) # And this is to tell the kind of source container type if self.source_type is None: From 45e686b8d13b918f0c642e6bd6833b5f4df70abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sun, 16 Jun 2024 16:37:26 +0200 Subject: [PATCH 39/62] Minor tweaks to allow better stacked unmarshalling --- wfexs_backend/utils/marshalling_handling.py | 36 +++++++++++++-------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/wfexs_backend/utils/marshalling_handling.py b/wfexs_backend/utils/marshalling_handling.py index ec6d64c8..5bf43af8 100644 --- a/wfexs_backend/utils/marshalling_handling.py +++ b/wfexs_backend/utils/marshalling_handling.py @@ -108,6 +108,7 @@ def recurse_u( return map(lambda l: unmarshall_namedtuple(l, myglobals), x) # recurse_orig = lambda x, myglobals: map(lambda l: unmarshall_namedtuple(l, myglobals), x) + 
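# A minimal sketch of the marshalling convention added in this patch: pathlib.Path
# values are stored as POSIX strings and namedtuples as plain dicts carrying a
# "_type" discriminator. `marshall_sketch` mirrors the idea of marshall_namedtuple
# but is an illustration, not the real function.
import pathlib
from typing import Any, NamedTuple


class ExampleEntry(NamedTuple):
    local: pathlib.Path
    name: str


def marshall_sketch(obj: "Any") -> "Any":
    if isinstance(obj, pathlib.Path):
        # Store the path, not the instance
        return obj.as_posix()
    if isinstance(obj, tuple) and hasattr(obj, "_fields"):
        # namedtuple -> plain dict plus a "_type" discriminator
        fields = zip(obj._fields, (marshall_sketch(value) for value in obj))
        return dict(fields, **{"_type": obj.__class__.__name__})
    return obj


assert marshall_sketch(ExampleEntry(pathlib.Path("/data/in.txt"), "in")) == {
    "local": "/data/in.txt",
    "name": "in",
    "_type": "ExampleEntry",
}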
objres = obj obj_is = partial(isinstance, obj) if obj_is((collections.abc.Mapping, dict)): if "_enum" in obj: # originally an enum @@ -154,10 +155,12 @@ def recurse_u( # theTypeName = clazz.__name__ # Fixes where some key was added or removed along the development + c_objn = objn + v_fixes_m = getattr(clazz, "_value_fixes", None) if callable(v_fixes_m): - v_fixes = v_fixes_m() - c_objn = copy.copy(objn) + v_fixes = cast("Callable[[], Mapping[str, str]]", v_fixes_m)() + c_objn = copy.copy(c_objn) for dest_key, source_key in v_fixes.items(): if source_key is None: # Removal if it is there @@ -167,8 +170,15 @@ def recurse_u( # Addition if it is there if source_key in c_objn: c_objn[dest_key] = c_objn[source_key] - else: - c_objn = objn + + # Complex fixes, like type change + # this is needed for namedtuples, where their values are immutable + # once the object is built + m_fixes_m = getattr(clazz, "_mapping_fixes", None) + if callable(m_fixes_m): + c_objn = cast( + "Callable[[Mapping[str, Any]], Mapping[str, Any]]", m_fixes_m + )(c_objn) # Fixes where some key was renamed along the development fixes_m = getattr(clazz, "_key_fixes", None) @@ -190,17 +200,17 @@ def recurse_u( # return clazz(**fields) try: - return clazz(**fields) + objres = clazz(**fields) except: - logger.error(f"Unmarshalling Error instantiating {clazz.__name__}") + logger.exception(f"Unmarshalling Error instantiating {clazz.__name__}") raise elif obj_is(collections.abc.Iterable) and not obj_is(str): # print(type(obj)) return type(obj)(recurse_u(obj, myglobals)) - else: - if isinstance(obj, object): - if hasattr(obj, "_value_defaults_fixes") and callable( - getattr(obj, "_value_defaults_fixes") - ): - obj._value_defaults_fixes() - return obj + + if isinstance(objres, object): + if hasattr(objres, "_value_defaults_fixes") and callable( + getattr(objres, "_value_defaults_fixes") + ): + objres._value_defaults_fixes() + return objres From 817659456d9267821b51e9bde45b4dfa16922a10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sun, 16 Jun 2024 16:44:19 +0200 Subject: [PATCH 40/62] `scantree` method has been moved to wfexs_backend.utils.digests Also, minor adjustments in method type annotations. --- wfexs_backend/utils/digests.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/wfexs_backend/utils/digests.py b/wfexs_backend/utils/digests.py index 6626693e..0d47cab6 100644 --- a/wfexs_backend/utils/digests.py +++ b/wfexs_backend/utils/digests.py @@ -49,6 +49,7 @@ ) from ..common import ( + AbsPath, AbstractGeneratedContent, AnyPath, Fingerprint, @@ -63,7 +64,6 @@ def hex(self) -> "str": from ..common import ( - scantree, GeneratedContent, ) @@ -166,7 +166,7 @@ def ComputeDigestFromFileLike( @functools.lru_cache(maxsize=32) def ComputeDigestFromFile( - filename: "AnyPath", + filename: "Union[AnyPath, os.PathLike[str]]", digestAlgorithm: "str" = DEFAULT_DIGEST_ALGORITHM, bufferSize: "int" = DEFAULT_DIGEST_BUFFER_SIZE, repMethod: "Union[FingerprintMethod, RawFingerprintMethod]" = stringifyDigest, @@ -293,8 +293,30 @@ def compute_sha1_git_from_any(path: "str") -> "Tuple[str, str]": raise FileNotFoundError(f"Unable to process path {path}") +# Next method has been borrowed from FlowMaps +def scantree(path: "Union[AnyPath, os.PathLike[str]]") -> "Iterator[os.DirEntry[str]]": + """Recursively yield DirEntry objects for given directory.""" + + hasDirs = False + for entry in os.scandir(path): + # We are avoiding to enter in loops around '.' and '..' 
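# A minimal sketch of the `_value_fixes` pass shown above in unmarshall_namedtuple:
# each (dest_key, source_key) pair either drops a stale key (source_key is None) or
# seeds a newly added field from an existing one, so older serialized mappings keep
# loading. The example keys below are illustrative only.
import copy
from typing import Any, Mapping, MutableMapping, Optional, cast


def apply_value_fixes(
    objn: "Mapping[str, Any]", v_fixes: "Mapping[str, Optional[str]]"
) -> "Mapping[str, Any]":
    c_objn = cast("MutableMapping[str, Any]", copy.copy(objn))
    for dest_key, source_key in v_fixes.items():
        if source_key is None:
            # Removal if it is there
            if dest_key in c_objn:
                del c_objn[dest_key]
        elif source_key in c_objn:
            # Addition if it is there
            c_objn[dest_key] = c_objn[source_key]
    return c_objn


old_mapping = {"localPath": "/cache/img.sif", "obsolete_field": 42}
fixed = apply_value_fixes(
    old_mapping, {"metadataLocalPath": "localPath", "obsolete_field": None}
)
assert fixed == {"localPath": "/cache/img.sif", "metadataLocalPath": "/cache/img.sif"}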
+ if entry.is_dir(follow_symlinks=False): + if entry.name[0] != ".": + hasDirs = True + else: + yield entry + + # We are leaving the dirs to the end + if hasDirs: + for entry in os.scandir(path): + # We are avoiding to enter in loops around '.' and '..' + if entry.is_dir(follow_symlinks=False) and entry.name[0] != ".": + yield entry + yield from scantree(cast("AbsPath", entry.path)) + + def ComputeDigestFromDirectory( - dirname: "AnyPath", + dirname: "Union[AnyPath, os.PathLike[str]]", digestAlgorithm: "str" = DEFAULT_DIGEST_ALGORITHM, bufferSize: "int" = DEFAULT_DIGEST_BUFFER_SIZE, repMethod: "FingerprintMethod" = stringifyDigest, @@ -330,7 +352,7 @@ def ComputeDigestFromDirectory( def ComputeDigestFromGeneratedContentList( - dirname: "AnyPath", + dirname: "Union[AnyPath, os.PathLike[str]]", theValues: "Sequence[AbstractGeneratedContent]", digestAlgorithm: "str" = DEFAULT_DIGEST_ALGORITHM, bufferSize: "int" = DEFAULT_DIGEST_BUFFER_SIZE, From 19ca1d3215da4a3590486ad16a49b125e16d3eb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Sun, 16 Jun 2024 16:49:52 +0200 Subject: [PATCH 41/62] Transition to pathlib.Path in MaterializedContent, AbstractGeneratedContent and LocalWorkflow. This transition has caused major code changes in path handling, and also indirectly in workflow engines and container factories. --- tests/pushers/test_cache.py | 3 +- tests/pushers/test_nextcloud.py | 7 +- wfexs_backend/common.py | 62 +++--- wfexs_backend/container_factories/__init__.py | 1 + .../pushers/abstract_token_export.py | 2 +- wfexs_backend/pushers/cache_export.py | 13 +- wfexs_backend/pushers/nextcloud_export.py | 3 +- wfexs_backend/ro_crate.py | 10 +- wfexs_backend/utils/contents.py | 12 +- wfexs_backend/utils/rocrate.py | 72 +------ wfexs_backend/wfexs_backend.py | 3 +- wfexs_backend/workflow.py | 122 +++++------ wfexs_backend/workflow_engines/__init__.py | 92 ++++----- wfexs_backend/workflow_engines/cwl_engine.py | 60 +++--- .../workflow_engines/nextflow_engine.py | 194 +++++++++--------- 15 files changed, 283 insertions(+), 373 deletions(-) diff --git a/tests/pushers/test_cache.py b/tests/pushers/test_cache.py index 118cc243..9f126a15 100644 --- a/tests/pushers/test_cache.py +++ b/tests/pushers/test_cache.py @@ -22,6 +22,7 @@ from pathlib import Path import os +import pathlib import sys import urllib.error @@ -147,7 +148,7 @@ def test_cache_push(tmpdir) -> "None": # type: ignore[no-untyped-def] # Fourth, push file into the cache pushed_contents = cep.push( items=[ - GeneratedContent(local=cast("AbsPath", naive_path)), + GeneratedContent(local=pathlib.Path(naive_path)), ], preferred_id=booked_entry.draft_id, licences=[ diff --git a/tests/pushers/test_nextcloud.py b/tests/pushers/test_nextcloud.py index b6040875..333c8bda 100644 --- a/tests/pushers/test_nextcloud.py +++ b/tests/pushers/test_nextcloud.py @@ -21,9 +21,8 @@ import datetime import logging -from pathlib import Path - import os +import pathlib import sys import urllib.error @@ -728,10 +727,10 @@ def test_nextcloud_push(file_params: "ParamTestData") -> "None": preferred_id=booked_entry.draft_id, items=[ GeneratedContent( - local=cast("AbsPath", naive_path), + local=pathlib.Path(naive_path), ), GeneratedContent( - local=cast("AbsPath", naive_path), + local=pathlib.Path(naive_path), preferredFilename=cast("RelPath", STREAM_FILENAME), ), ], diff --git a/wfexs_backend/common.py b/wfexs_backend/common.py index f71c941e..13417850 100644 --- a/wfexs_backend/common.py +++ b/wfexs_backend/common.py @@ -19,10 +19,12 
@@ from __future__ import absolute_import import abc +import copy from dataclasses import dataclass import datetime import enum import os +import pathlib from typing import ( cast, NamedTuple, @@ -92,11 +94,6 @@ def create_augmented_context( # This is either a relative or an absolute path AnyPath: TypeAlias = Union[RelPath, AbsPath] - # These declarations are for "new world" - # MaterializedPathContent, LocalPathWorkflow - # and indirectly MaterializedPathInput - import pathlib - PathlibLike: TypeAlias = pathlib.Path @@ -370,14 +367,25 @@ class MaterializedContent(NamedTuple): from the cache. """ - local: "AbsPath" + local: "PathlibLike" licensed_uri: "LicensedURI" prettyFilename: "RelPath" kind: "ContentKind" = ContentKind.File metadata_array: "Optional[Sequence[URIWithMetadata]]" = None - extrapolated_local: "Optional[AbsPath]" = None + extrapolated_local: "Optional[PathlibLike]" = None fingerprint: "Optional[Fingerprint]" = None + @classmethod + def _mapping_fixes(cls, orig: "Mapping[str, Any]") -> "Mapping[str, Any]": + dest = cast("MutableMapping[str, Any]", copy.copy(orig)) + dest["local"] = pathlib.Path(dest["local"]) + + extra = dest.get("extrapolated_local") + if extra is not None: + dest["extrapolated_local"] = pathlib.Path(extra) + + return dest + @classmethod def _key_fixes(cls) -> "Mapping[str, str]": return {"uri": "licensed_uri"} @@ -480,11 +488,18 @@ class AbstractGeneratedContent(abc.ABC): uploaded from the computational environment """ - local: "AnyPath" + local: "PathlibLike" signature: "Optional[Fingerprint]" = None uri: "Optional[LicensedURI]" = None preferredFilename: "Optional[RelPath]" = None + @classmethod + def _mapping_fixes(cls, orig: "Mapping[str, Any]") -> "Mapping[str, Any]": + dest = cast("MutableMapping[str, Any]", copy.copy(orig)) + dest["local"] = pathlib.Path(dest["local"]) + + return dest + @dataclass class GeneratedContent(AbstractGeneratedContent): @@ -554,12 +569,19 @@ class LocalWorkflow(NamedTuple): or remote ones (i.e. CWL) """ - dir: "AbsPath" + dir: "PathlibLike" relPath: "Optional[RelPath]" effectiveCheckout: "Optional[RepoTag]" langVersion: "Optional[Union[EngineVersion, WFLangVersion]]" = None relPathFiles: "Optional[Sequence[Union[RelPath, URIType]]]" = None + @classmethod + def _mapping_fixes(cls, orig: "Mapping[str, Any]") -> "Mapping[str, Any]": + dest = cast("MutableMapping[str, Any]", copy.copy(orig)) + dest["dir"] = pathlib.Path(dest["dir"]) + + return dest + if TYPE_CHECKING: TRS_Workflow_Descriptor: TypeAlias = str @@ -729,25 +751,3 @@ class StagedExecution(NamedTuple): outputMetaDir: "Optional[RelPath]" = None diagram: "Optional[RelPath]" = None logfile: "Sequence[RelPath]" = [] - - -# Next method has been borrowed from FlowMaps -def scantree(path: "AnyPath") -> "Iterator[os.DirEntry[str]]": - """Recursively yield DirEntry objects for given directory.""" - - hasDirs = False - for entry in os.scandir(path): - # We are avoiding to enter in loops around '.' and '..' - if entry.is_dir(follow_symlinks=False): - if entry.name[0] != ".": - hasDirs = True - else: - yield entry - - # We are leaving the dirs to the end - if hasDirs: - for entry in os.scandir(path): - # We are avoiding to enter in loops around '.' and '..' 
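# A minimal sketch of the `_mapping_fixes` hook consumed by unmarshall_namedtuple:
# because namedtuple values are immutable once built, stored string paths are
# coerced back into pathlib.Path on the mapping before the class is instantiated.
# The class below is an illustration, not the real LocalWorkflow.
import copy
import pathlib
from typing import Any, Mapping, MutableMapping, NamedTuple, Optional, cast


class LocalWorkflowSketch(NamedTuple):
    dir: pathlib.Path
    relPath: "Optional[str]" = None

    @classmethod
    def _mapping_fixes(cls, orig: "Mapping[str, Any]") -> "Mapping[str, Any]":
        dest = cast("MutableMapping[str, Any]", copy.copy(orig))
        dest["dir"] = pathlib.Path(dest["dir"])
        return dest


stored = {"dir": "/workspace/workflow", "relPath": "main.nf"}
restored = LocalWorkflowSketch(**LocalWorkflowSketch._mapping_fixes(stored))
assert isinstance(restored.dir, pathlib.Path) and restored.relPath == "main.nf"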
- if entry.is_dir(follow_symlinks=False) and entry.name[0] != ".": - yield entry - yield from scantree(cast("AbsPath", entry.path)) diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index 0cf4e3b6..54d5869a 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -147,6 +147,7 @@ def _value_defaults_fixes(self) -> None: if isinstance(self.localPath, str): # Properly casting the path self.localPath = pathlib.Path(self.localPath) + print(f"localPath {self.localPath}") # This code is needed for old working directories if self.metadataLocalPath is None and self.localPath is not None: diff --git a/wfexs_backend/pushers/abstract_token_export.py b/wfexs_backend/pushers/abstract_token_export.py index 0ab947b1..ce4e7b6b 100644 --- a/wfexs_backend/pushers/abstract_token_export.py +++ b/wfexs_backend/pushers/abstract_token_export.py @@ -201,7 +201,7 @@ def push( try: upload_response = self.upload_file_to_draft( - booked_entry, item.local, prefname + booked_entry, str(item.local), prefname ) except urllib.error.HTTPError as he: failed = True diff --git a/wfexs_backend/pushers/cache_export.py b/wfexs_backend/pushers/cache_export.py index 2ec94f0f..42dab17f 100644 --- a/wfexs_backend/pushers/cache_export.py +++ b/wfexs_backend/pushers/cache_export.py @@ -19,6 +19,7 @@ from __future__ import absolute_import import os +import pathlib import shutil import tempfile from typing import ( @@ -157,12 +158,12 @@ def push( # Create temporary destination directory (if needed) tmpdir = None - source = None + source: "Optional[pathlib.Path]" = None metadata = None try: if len(items) > 1: tmpdir = tempfile.mkdtemp(dir=self.tempdir, suffix="export") - source = tmpdir + source = pathlib.Path(tmpdir) # Now, transfer all of them for i_item, item in enumerate(items): @@ -182,7 +183,11 @@ def push( dest = cast("AbsPath", os.path.join(tmpdir, relitem)) link_or_copy(item.local, dest) else: - source = items[0].local + source = ( + items[0].local + if isinstance(items[0].local, pathlib.Path) + else pathlib.Path(items[0].local) + ) # Generated file URI injecting the preferred id an scheme uri_to_fetch = LicensedURI( @@ -192,7 +197,7 @@ def push( urllib.parse.ParseResult( scheme="file", netloc="", - path=source, + path=source.as_posix(), params="", query=urllib.parse.urlencode( {"inject_as": preferred_id}, diff --git a/wfexs_backend/pushers/nextcloud_export.py b/wfexs_backend/pushers/nextcloud_export.py index 243c7dd6..78b88490 100644 --- a/wfexs_backend/pushers/nextcloud_export.py +++ b/wfexs_backend/pushers/nextcloud_export.py @@ -21,6 +21,7 @@ import datetime import logging import os +import pathlib import shutil import tempfile from typing import ( @@ -661,7 +662,7 @@ def upload_file_to_draft( try: mappings = [ MaterializedContent( - local=cast("AbsPath", local_filename), + local=pathlib.Path(local_filename), licensed_uri=LicensedURI(uri=cast("URIType", "")), prettyFilename=cast("RelPath", remote_filename), kind=ContentKind.File, diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index b447db92..863acdcc 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -748,7 +748,7 @@ def __init__( self.licence_matcher = licence_matcher if localWorkflow.relPath is not None: - wf_local_path = os.path.join(localWorkflow.dir, localWorkflow.relPath) + wf_local_path = localWorkflow.dir / localWorkflow.relPath else: wf_local_path = localWorkflow.dir @@ -1923,12 +1923,12 @@ def 
_add_workflow_to_crate( was_workflow_run: "bool" = True, ) -> "FixedWorkflow": # Determining the absolute path of the workflow - the_path: "str" + the_path: "pathlib.Path" if the_workflow.relPath is not None: if os.path.isabs(the_workflow.relPath): - the_path = the_workflow.relPath + the_path = pathlib.Path(the_workflow.relPath) else: - the_path = os.path.join(the_workflow.dir, the_workflow.relPath) + the_path = the_workflow.dir / the_workflow.relPath else: the_path = the_workflow.dir @@ -1943,7 +1943,7 @@ def _add_workflow_to_crate( if remote_repo.tag is not None: wf_url += "tree/" + remote_repo.tag if the_workflow.relPath is not None: - wf_url += the_workflow.dir.rsplit("workflow")[1] + wf_url += the_workflow.dir.as_posix().rsplit("workflow")[1] parsed_repo_url = urllib.parse.urlparse(remote_repo.repo_url) if parsed_repo_url.netloc == "github.com": diff --git a/wfexs_backend/utils/contents.py b/wfexs_backend/utils/contents.py index f03f1463..06f84954 100644 --- a/wfexs_backend/utils/contents.py +++ b/wfexs_backend/utils/contents.py @@ -74,7 +74,7 @@ def GetGeneratedDirectoryContent( - thePath: "AbsPath", + thePath: "Union[AbsPath, os.PathLike[str]]", uri: "Optional[LicensedURI]" = None, preferredFilename: "Optional[RelPath]" = None, signatureMethod: "Optional[FingerprintMethod]" = None, @@ -90,7 +90,7 @@ def GetGeneratedDirectoryContent( if not entry.name.startswith("."): theValue: "Optional[AbstractGeneratedContent]" = None if entry.is_file(): - entry_path = cast("AbsPath", entry.path) + entry_path = pathlib.Path(entry.path) theValue = GeneratedContent( local=entry_path, # uri=None, @@ -102,7 +102,7 @@ def GetGeneratedDirectoryContent( ), ) elif entry.is_dir(): - entry_path = cast("AbsPath", entry.path) + entry_path = pathlib.Path(entry.path) theValue = GetGeneratedDirectoryContent( entry_path, signatureMethod=signatureMethod ) @@ -118,7 +118,7 @@ def GetGeneratedDirectoryContent( signature = None return GeneratedDirectoryContent( - local=thePath, + local=thePath if isinstance(thePath, pathlib.Path) else pathlib.Path(thePath), uri=uri, preferredFilename=preferredFilename, values=theValues, @@ -127,7 +127,7 @@ def GetGeneratedDirectoryContent( def GetGeneratedDirectoryContentFromList( - thePath: "AbsPath", + thePath: "Union[AbsPath, os.PathLike[str]]", theValues: "Sequence[AbstractGeneratedContent]", uri: "Optional[LicensedURI]" = None, preferredFilename: "Optional[RelPath]" = None, @@ -149,7 +149,7 @@ def GetGeneratedDirectoryContentFromList( signature = None return GeneratedDirectoryContent( - local=thePath, + local=thePath if isinstance(thePath, pathlib.Path) else pathlib.Path(thePath), uri=uri, preferredFilename=preferredFilename, values=theValues, diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 6a58626d..f8a313c3 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -137,76 +137,6 @@ # import magic -# "New world" declarations - - -class LocalPathWorkflow(NamedTuple): - """ - dir: The path to the directory where the checkout was applied - relPath: Inside the checkout, the relative path to the workflow definition - effectiveCheckout: hex hash of the materialized checkout - langVersion: workflow language version / revision - relPathFiles: files composing the workflow, which can be either local - or remote ones (i.e. 
CWL) - """ - - dir: "PathlibLike" - relPath: "Optional[RelPath]" - effectiveCheckout: "Optional[RepoTag]" - langVersion: "Optional[Union[EngineVersion, WFLangVersion]]" = None - relPathFiles: "Optional[Sequence[RelPath]]" = None - - -class MaterializedPathContent(NamedTuple): - """ - local: Local absolute path of the content which was materialized. It - can be either a path in the cached inputs directory, or an absolute - path in the inputs directory of the execution - licensed_uri: Either an URL or a CURIE of the content which was materialized, - needed for the provenance - prettyFilename: The preferred filename to use in the inputs directory - of the execution environment - fingerprint: If it is available, propagate the computed fingerprint - from the cache. - """ - - local_path: "PathlibLike" - licensed_uri: "LicensedURI" - prettyFilename: "RelPath" - kind: "ContentKind" = ContentKind.File - metadata_array: "Optional[Sequence[URIWithMetadata]]" = None - extrapolated_local: "Optional[AbsPath]" = None - fingerprint: "Optional[Fingerprint]" = None - - @classmethod - def _key_fixes(cls) -> "Mapping[str, str]": - return {"uri": "licensed_uri"} - - -if TYPE_CHECKING: - MaterializedPathInputValues: TypeAlias = Union[ - Sequence[bool], - Sequence[str], - Sequence[int], - Sequence[float], - Sequence[MaterializedPathContent], - ] - - -class MaterializedPathInput(NamedTuple): - """ - name: Name of the input - values: list of associated values, which can be literal ones or - instances from MaterializedContent - """ - - name: "SymbolicParamName" - values: "MaterializedPathInputValues" - secondaryInputs: "Optional[Sequence[MaterializedPathContent]]" = None - autoFilled: "bool" = False - implicit: "bool" = False - - class ReproducibilityLevel(enum.IntEnum): Minimal = enum.auto() # Minimal / no reproducibility is requested Metadata = enum.auto() # Metadata reproducibility is requested @@ -1755,7 +1685,7 @@ def __processPayloadInput( else DefaultNoLicenceTuple ) mat_content = MaterializedContent( - local=cast("AbsPath", input_path), + local=located_input, licensed_uri=LicensedURI( uri=cast("URIType", the_uri), licences=licences_tuple, diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 5e07b521..107c3100 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -27,6 +27,7 @@ import json import logging import os +import pathlib import re import shutil import stat @@ -2817,7 +2818,7 @@ def downloadContent( prettyFilename = cast("RelPath", firstParsedURI.path.split("/")[-1]) return MaterializedContent( - local=cached_content.path, + local=pathlib.Path(cached_content.path), licensed_uri=firstLicensedURI, prettyFilename=prettyFilename, kind=cached_content.kind, diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 8b1b06b3..77a0ca9c 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -325,6 +325,7 @@ from .utils.contents import ( bin2dataurl, link_or_copy, + link_or_copy_pathlib, ) from .utils.marshalling_handling import marshall_namedtuple, unmarshall_namedtuple from .utils.misc import config_validate @@ -1463,7 +1464,7 @@ def _transferInputs( source_file = payload_dir / value.local dest_file = inputs_dir / path_relative_to(source_file, payload_dir) new_value = value._replace( - local=cast("AbsPath", dest_file.as_posix()) + local=dest_file, ) new_values.append(new_value) @@ -1479,7 +1480,7 @@ def _transferInputs( source_file = payload_dir / secondaryInput.local dest_file = inputs_dir / 
path_relative_to(source_file, payload_dir) new_secondaryInput = secondaryInput._replace( - local=cast("AbsPath", dest_file.as_posix()) + local=dest_file, ) new_secondaryInputs.append(new_secondaryInput) @@ -1537,9 +1538,9 @@ def FromPreviousROCrate( payload_dir=payload_dir, ) - # logging.error(f"Containers {the_containers}") - # logging.error(f"Inputs {cached_inputs}") - # sys.exit(1) + logging.error(f"Containers {the_containers}") + logging.error(f"Inputs {cached_inputs}") + sys.exit(1) # Now, some postprocessing... if ( reproducibility_level >= ReproducibilityLevel.Full @@ -1862,7 +1863,7 @@ def fetchWorkflow( # We cannot know yet the dependencies localWorkflow = LocalWorkflow( - dir=self.workflowDir, + dir=pathlib.Path(self.workflowDir), relPath=self.repoRelPath, effectiveCheckout=self.repoEffectiveCheckout, ) @@ -2203,7 +2204,7 @@ def _fetchRemoteFile( offline: "bool", storeDir: "Union[AbsPath, CacheType]", cacheable: "bool", - inputDestDir: "AbsPath", + inputDestDir: "pathlib.Path", globExplode: "Optional[str]", prefix: "str" = "", hardenPrettyLocal: "bool" = False, @@ -2230,38 +2231,36 @@ def _fetchRemoteFile( if prettyRelname is None: prettyRelname = matContent.prettyFilename - prettyLocal = cast("AbsPath", os.path.join(inputDestDir, prettyRelname)) + prettyLocal = inputDestDir / prettyRelname # Protection against misbehaviours which could hijack the # execution environment - realPrettyLocal = os.path.realpath(prettyLocal) - realInputDestDir = os.path.realpath(inputDestDir) - if not realPrettyLocal.startswith(realInputDestDir): - prettyRelname = cast("RelPath", os.path.basename(realPrettyLocal)) - prettyLocal = cast("AbsPath", os.path.join(inputDestDir, prettyRelname)) + realPrettyLocal = prettyLocal.resolve() + realInputDestDir = inputDestDir.resolve() + if not realPrettyLocal.is_relative_to(realInputDestDir): + prettyRelname = cast("RelPath", realPrettyLocal.name) + prettyLocal = inputDestDir / prettyRelname # Checking whether local name hardening is needed if not hardenPrettyLocal: - if os.path.islink(prettyLocal): - oldLocal = os.readlink(prettyLocal) + if prettyLocal.is_symlink(): + oldLocal = prettyLocal.readlink() hardenPrettyLocal = oldLocal != matContent.local - elif os.path.exists(prettyLocal): + elif prettyLocal.exists(): hardenPrettyLocal = True if hardenPrettyLocal: # Trying to avoid collisions on input naming - prettyLocal = cast( - "AbsPath", os.path.join(inputDestDir, prefix + prettyRelname) - ) + prettyLocal = inputDestDir / (prefix + prettyRelname) if not os.path.exists(prettyLocal): # We are either hardlinking or copying here - link_or_copy(matContent.local, prettyLocal) + link_or_copy_pathlib(matContent.local, prettyLocal) remote_pairs = [] if globExplode is not None: - prettyLocalPath = pathlib.Path(prettyLocal) + prettyLocalPath = prettyLocal matParse = urllib.parse.urlparse(matContent.licensed_uri.uri) for exp in prettyLocalPath.glob(globExplode): relPath = exp.relative_to(prettyLocalPath) @@ -2290,7 +2289,7 @@ def _fetchRemoteFile( ) remote_pairs.append( MaterializedContent( - local=cast("AbsPath", str(exp)), + local=exp, licensed_uri=lic_expUri, prettyFilename=relName, metadata_array=matContent.metadata_array, @@ -2644,8 +2643,8 @@ def _fetchContentWithURIs( t_split = tabconf["column-sep"].encode("utf-8").decode("unicode-escape") t_uri_cols: "Sequence[int]" = tabconf["uri-columns"] - inputDestDir = workflowInputs_destdir - extrapolatedInputDestDir = workflowExtrapolatedInputs_destdir + inputDestDir = pathlib.Path(workflowInputs_destdir) + 
extrapolatedInputDestDir = pathlib.Path(workflowExtrapolatedInputs_destdir) path_tokens = linearKey.split(".") # Filling in the defaults @@ -2687,15 +2686,12 @@ def _fetchContentWithURIs( relative_dir = None if relative_dir is not None: - newInputDestDir = os.path.realpath(os.path.join(inputDestDir, relative_dir)) - if newInputDestDir.startswith(os.path.realpath(inputDestDir)): - inputDestDir = cast("AbsPath", newInputDestDir) - extrapolatedInputDestDir = cast( - "AbsPath", - os.path.realpath( - os.path.join(extrapolatedInputDestDir, relative_dir) - ), - ) + newInputDestDir = (inputDestDir / relative_dir).resolve() + if newInputDestDir.is_relative_to(inputDestDir): + inputDestDir = newInputDestDir + extrapolatedInputDestDir = ( + extrapolatedInputDestDir / relative_dir + ).resolve() # The storage dir depends on whether it can be cached or not storeDir: "Union[CacheType, AbsPath]" = ( @@ -2775,7 +2771,7 @@ def _fetchContentWithURIs( secondary_remote_pairs: "Optional[MutableSequence[MaterializedContent]]" if len(these_secondary_uris) > 0: - secondary_uri_mapping: "MutableMapping[str, str]" = dict() + secondary_uri_mapping: "MutableMapping[str, pathlib.Path]" = dict() secondary_remote_pairs = [] # Fetch each gathered URI for secondary_remote_file in these_secondary_uris: @@ -2812,18 +2808,15 @@ def _fetchContentWithURIs( # Now, reopen each file to replace URLs by paths for i_remote_pair, remote_pair in enumerate(remote_pairs): - extrapolated_local = os.path.join( - extrapolatedInputDestDir, - os.path.relpath(remote_pair.local, inputDestDir), + extrapolated_local = extrapolatedInputDestDir / os.path.relpath( + remote_pair.local, inputDestDir ) - with open( - remote_pair.local, + with remote_pair.local.open( mode="rt", encoding="utf-8", newline=t_newline, ) as tH: - with open( - extrapolated_local, + with extrapolated_local.open( mode="wt", encoding="utf-8", newline=t_newline, @@ -2850,7 +2843,7 @@ def _fetchContentWithURIs( # Should we check whether it is a URI? cols[t_uri_col] = secondary_uri_mapping[ cols[t_uri_col] - ] + ].as_posix() fixed_row = True if fixed_row: @@ -2861,7 +2854,7 @@ def _fetchContentWithURIs( # Last, fix it remote_pairs[i_remote_pair] = remote_pair._replace( kind=ContentKind.ContentWithURIs, - extrapolated_local=cast("AbsPath", extrapolated_local), + extrapolated_local=extrapolated_local, ) else: secondary_remote_pairs = None @@ -2918,7 +2911,7 @@ def fetchInputs( ContentKind.File.name, ContentKind.Directory.name, ): # input files - inputDestDir = workflowInputs_destdir + inputDestDir = pathlib.Path(workflowInputs_destdir) globExplode = None path_tokens = linearKey.split(".") @@ -3047,13 +3040,11 @@ def fetchInputs( relative_dir = None if relative_dir is not None: - newInputDestDir = os.path.realpath( - os.path.join(inputDestDir, relative_dir) - ) - if newInputDestDir.startswith( - os.path.realpath(inputDestDir) - ): - inputDestDir = cast("AbsPath", newInputDestDir) + newInputDestDir = ( + inputDestDir / relative_dir + ).resolve() + if newInputDestDir.relative_to(inputDestDir): + inputDestDir = newInputDestDir # The storage dir depends on whether it can be cached or not storeDir: "Union[CacheType, AbsPath]" = ( @@ -3165,16 +3156,12 @@ def fetchInputs( else: if inputClass == ContentKind.File.name: # Empty input, i.e. 
empty file - inputDestPath = cast( - "AbsPath", - os.path.join(inputDestDir, *linearKey.split(".")), - ) - os.makedirs( - os.path.dirname(inputDestPath), exist_ok=True + inputDestPath = inputDestDir.joinpath( + *linearKey.split(".") ) + inputDestPath.parent.mkdir(parents=True, exist_ok=True) # Creating the empty file - with open(inputDestPath, mode="wb") as idH: - pass + inputDestPath.touch() contentKind = ContentKind.File else: inputDestPath = inputDestDir @@ -3191,7 +3178,7 @@ def fetchInputs( ), prettyFilename=cast( "RelPath", - os.path.basename(inputDestPath), + inputDestPath.name, ), kind=contentKind, ) @@ -4608,7 +4595,7 @@ def locateExportItems( ) retval.append( MaterializedContent( - local=self.staged_setup.inputs_dir, + local=pathlib.Path(self.staged_setup.inputs_dir), licensed_uri=LicensedURI( uri=cast( "URIType", @@ -4701,12 +4688,8 @@ def locateExportItems( prettyFilename = cast("RelPath", stagedExec.outputsDir) retval.append( MaterializedContent( - local=cast( - "AbsPath", - os.path.join( - self.staged_setup.work_dir, stagedExec.outputsDir - ), - ), + local=pathlib.Path(self.staged_setup.work_dir) + / stagedExec.outputsDir, licensed_uri=LicensedURI( uri=cast( "URIType", @@ -4722,9 +4705,10 @@ def locateExportItems( ) elif item.type == ExportItemType.WorkingDirectory: # The whole working directory + assert self.staged_setup.work_dir is not None retval.append( MaterializedContent( - local=cast("AbsPath", self.staged_setup.work_dir), + local=pathlib.Path(self.staged_setup.work_dir), licensed_uri=LicensedURI( uri=cast( "URIType", "wfexs:" + self.staged_setup.instance_id @@ -4793,7 +4777,7 @@ def locateExportItems( ) retval.append( MaterializedContent( - local=cast("AbsPath", temp_rocrate_file), + local=pathlib.Path(temp_rocrate_file), licensed_uri=LicensedURI( uri=cast( "URIType", diff --git a/wfexs_backend/workflow_engines/__init__.py b/wfexs_backend/workflow_engines/__init__.py index 567ff351..6a7ee54b 100644 --- a/wfexs_backend/workflow_engines/__init__.py +++ b/wfexs_backend/workflow_engines/__init__.py @@ -438,53 +438,53 @@ def __init__( ) # Assuring this temporal directory is removed at the end atexit.register(shutil.rmtree, workDir, True) - self.workDir = workDir + self.workDir = pathlib.Path(workDir) # This directory should hold intermediate workflow steps results if intermediateDir is None: - intermediateDir = cast( - "AbsPath", os.path.join(workDir, WORKDIR_INTERMEDIATE_RELDIR) - ) - os.makedirs(intermediateDir, exist_ok=True) - self.intermediateDir = intermediateDir + self.intermediateDir = self.workDir / WORKDIR_INTERMEDIATE_RELDIR + else: + self.intermediateDir = pathlib.Path(intermediateDir) + self.intermediateDir.mkdir(parents=True, exist_ok=True) # This directory will hold the final workflow results, which could # be either symbolic links to the intermediate results directory # or newly generated content if outputsDir is None: - outputsDir = cast("AbsPath", os.path.join(workDir, WORKDIR_OUTPUTS_RELDIR)) - elif not os.path.isabs(outputsDir): - outputsDir = cast("AbsPath", os.path.abspath(outputsDir)) - os.makedirs(outputsDir, exist_ok=True) - self.outputsDir = cast("AbsPath", outputsDir) + self.outputsDir = self.workDir / WORKDIR_OUTPUTS_RELDIR + else: + self.outputsDir = pathlib.Path(outputsDir) + + if not self.outputsDir.is_absolute(): + self.outputsDir = self.outputsDir.absolute() + + self.outputsDir.mkdir(parents=True, exist_ok=True) # This directory will hold diverse metadata, like execution metadata # or newly generated content if outputMetaDir is None: - 
outputMetaDir = cast( - "AbsPath", - os.path.join(workDir, WORKDIR_META_RELDIR, WORKDIR_OUTPUTS_RELDIR), + self.outputMetaDir = ( + self.workDir / WORKDIR_META_RELDIR / WORKDIR_OUTPUTS_RELDIR ) - os.makedirs(outputMetaDir, exist_ok=True) - self.outputMetaDir = outputMetaDir + else: + self.outputMetaDir = pathlib.Path(outputMetaDir) + + self.outputMetaDir.mkdir(parents=True, exist_ok=True) # This directory will hold stats metadata, as well as the dot representation # of the workflow execution - outputStatsDir = cast( - "AbsPath", os.path.join(outputMetaDir, WORKDIR_STATS_RELDIR) - ) - os.makedirs(outputStatsDir, exist_ok=True) + outputStatsDir = self.outputMetaDir / WORKDIR_STATS_RELDIR + outputStatsDir.mkdir(parents=True, exist_ok=True) self.outputStatsDir = outputStatsDir # This directory is here for those files which are created in order # to tweak or patch workflow executions # engine tweaks directory if engineTweaksDir is None: - engineTweaksDir = cast( - "AbsPath", os.path.join(workDir, WORKDIR_ENGINE_TWEAKS_RELDIR) - ) - os.makedirs(engineTweaksDir, exist_ok=True) - self.engineTweaksDir = engineTweaksDir + self.engineTweaksDir = self.workDir / WORKDIR_ENGINE_TWEAKS_RELDIR + else: + self.engineTweaksDir = pathlib.Path(engineTweaksDir) + self.engineTweaksDir.mkdir(parents=True, exist_ok=True) # This directory is here for temporary files of any program launched from # WfExS or the engine itself. It should be set to TMPDIR on subprocess calls @@ -808,18 +808,16 @@ def deploy_containers( def staged_containers_dir(self) -> "AnyPath": return cast("AbsPath", self.stagedContainersDir.as_posix()) - def create_job_directories(self) -> "Tuple[str, AbsPath, AbsPath, AbsPath]": + def create_job_directories( + self, + ) -> "Tuple[str, pathlib.Path, pathlib.Path, pathlib.Path]": outputDirPostfix = "_" + str(int(time.time())) + "_" + str(os.getpid()) - intermediateDir = cast( - "AbsPath", os.path.join(self.intermediateDir, outputDirPostfix) - ) - os.makedirs(intermediateDir, exist_ok=True) - outputsDir = cast("AbsPath", os.path.join(self.outputsDir, outputDirPostfix)) - os.makedirs(outputsDir, exist_ok=True) - outputMetaDir = cast( - "AbsPath", os.path.join(self.outputMetaDir, outputDirPostfix) - ) - os.makedirs(outputMetaDir, exist_ok=True) + intermediateDir = self.intermediateDir / outputDirPostfix + intermediateDir.mkdir(parents=True, exist_ok=True) + outputsDir = self.outputsDir / outputDirPostfix + outputsDir.mkdir(parents=True, exist_ok=True) + outputMetaDir = self.outputMetaDir / outputDirPostfix + outputMetaDir.mkdir(parents=True, exist_ok=True) return outputDirPostfix, intermediateDir, outputsDir, outputMetaDir @@ -924,7 +922,7 @@ def identifyMaterializedOutputs( self, matInputs: "Sequence[MaterializedInput]", expectedOutputs: "Sequence[ExpectedOutput]", - outputsDir: "AbsPath", + outputsDir: "pathlib.Path", outputsMapping: "Optional[Mapping[SymbolicOutputName, Any]]" = None, ) -> "Sequence[MaterializedOutput]": """ @@ -951,13 +949,14 @@ def identifyMaterializedOutputs( guessedOutputKindDef: "ContentKind" # We are avoiding to enter in loops around '.' and '..' 
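# A minimal sketch of the pathlib-based job directory creation shown above: a
# timestamp/PID postfix keeps repeated executions apart, and mkdir(parents=True,
# exist_ok=True) replaces the previous os.makedirs calls. The subdirectory names
# are illustrative stand-ins for the WORKDIR_* constants.
import os
import pathlib
import time
from typing import Tuple


def create_job_dirs(base: pathlib.Path) -> "Tuple[str, pathlib.Path, pathlib.Path]":
    postfix = "_" + str(int(time.time())) + "_" + str(os.getpid())
    intermediate = base / "intermediate" / postfix
    outputs = base / "outputs" / postfix
    for job_dir in (intermediate, outputs):
        job_dir.mkdir(parents=True, exist_ok=True)
    return postfix, intermediate, outputs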
if entry.is_file(): + entry_path = pathlib.Path(entry.path) matValuesDef = [ GeneratedContent( - local=cast("AbsPath", entry.path), + local=entry_path, signature=cast( "Fingerprint", ComputeDigestFromFile( - cast("AbsPath", entry.path), + entry_path, repMethod=nihDigester, ), ), @@ -967,7 +966,7 @@ def identifyMaterializedOutputs( elif entry.is_dir(follow_symlinks=False): matValuesDef = [ GetGeneratedDirectoryContent( - cast("AbsPath", entry.path), signatureMethod=nihDigester + entry_path, signatureMethod=nihDigester ) ] guessedOutputKindDef = ContentKind.Directory @@ -1025,14 +1024,14 @@ def identifyMaterializedOutputs( # FIXME: Are these elements always paths?????? if isinstance(matchedPath, str): if os.path.isabs(matchedPath): - abs_matched_path = matchedPath + abs_matched_path = pathlib.Path(matchedPath) else: - abs_matched_path = os.path.join(outputsDir, matchedPath) + abs_matched_path = outputsDir / matchedPath try: theContent: "AbstractGeneratedContent" if expectedOutput.kind == ContentKind.Directory: theContent = GetGeneratedDirectoryContent( - thePath=cast("AbsPath", abs_matched_path), + thePath=abs_matched_path, uri=None, # TODO: generate URIs when it is advised preferredFilename=expectedOutput.preferredFilename, signatureMethod=nihDigester, @@ -1040,7 +1039,7 @@ def identifyMaterializedOutputs( expMatContents.append(theContent) elif expectedOutput.kind == ContentKind.File: theContent = GeneratedContent( - local=cast("AbsPath", abs_matched_path), + local=abs_matched_path, uri=None, # TODO: generate URIs when it is advised signature=cast( "Fingerprint", @@ -1056,8 +1055,7 @@ def identifyMaterializedOutputs( ) else: # Reading the value from a file, as the glob is telling that - with open( - matchedPath, + with abs_matched_path.open( mode="r", encoding="utf-8", errors="ignore", @@ -1114,7 +1112,7 @@ def identifyMaterializedOutputs( expMatContents.append(matchedContent) elif expectedOutput.kind == ContentKind.File: matchedContent = GeneratedContent( - local=matchedPath, + local=pathlib.Path(matchedPath), uri=None, # TODO: generate URIs when it is advised signature=cast( "Fingerprint", diff --git a/wfexs_backend/workflow_engines/cwl_engine.py b/wfexs_backend/workflow_engines/cwl_engine.py index ba2e682e..2fd6132f 100644 --- a/wfexs_backend/workflow_engines/cwl_engine.py +++ b/wfexs_backend/workflow_engines/cwl_engine.py @@ -22,6 +22,7 @@ import json import logging import os +import pathlib import re import shutil import stat @@ -47,6 +48,7 @@ ) if TYPE_CHECKING: + import pathlib from typing import ( Any, Mapping, @@ -327,16 +329,16 @@ def identifyWorkflow( """ cwlPath = localWf.dir if localWf.relPath is not None: - cwlPath = cast("AbsPath", os.path.join(cwlPath, localWf.relPath)) + cwlPath = cwlPath / localWf.relPath - if os.path.isdir(cwlPath): + if cwlPath.is_dir(): self.logger.warning("CWL entrypoint cannot be a directory") return None, None # Is this a yaml? 
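# A minimal sketch of the entrypoint sniffing done at this point: the candidate
# CWL file is parsed as YAML and its top-level cwlVersion key decides whether it
# is usable; directories were already rejected above. `sniff_cwl_version` is an
# illustrative helper, not part of the engine.
import pathlib
from typing import Optional

import yaml


def sniff_cwl_version(cwl_path: pathlib.Path) -> "Optional[str]":
    if not cwl_path.is_file():
        return None
    try:
        with cwl_path.open(mode="r", encoding="utf-8") as handle:
            document = yaml.safe_load(handle)
    except yaml.YAMLError:
        return None
    return document.get("cwlVersion") if isinstance(document, dict) else None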
cwlVersion = None try: - with open(cwlPath, mode="r", encoding="utf-8") as pCWL: + with cwlPath.open(mode="r", encoding="utf-8") as pCWL: wf_yaml = yaml.safe_load(pCWL) # parse possible CWL cwlVersion = wf_yaml.get("cwlVersion") except Exception as e: @@ -420,7 +422,7 @@ def _materializeEngineVersionLocal( MaterializedWorkflowEngine( instance=self, workflow=LocalWorkflow( - dir=cast("AbsPath", "/"), + dir=pathlib.Path("/"), relPath=None, effectiveCheckout=None, ), @@ -700,31 +702,30 @@ def materializeWorkflow( """ localWf = matWorkflowEngine.workflow localWorkflowDir = localWf.dir + consolidatedWorkflowPath = pathlib.Path(consolidatedWorkflowDir) assert ( localWf.relPath is not None ), "CWL workflows should have a relative file path" if os.path.isabs(localWf.relPath): - localWorkflowFile = cast("AbsPath", localWf.relPath) + localWorkflowFile = pathlib.Path(localWf.relPath) else: - localWorkflowFile = cast( - "AbsPath", os.path.join(localWorkflowDir, localWf.relPath) - ) + localWorkflowFile = localWorkflowDir / localWf.relPath engineVersion = matWorkflowEngine.version # CWLWorkflowEngine directory is needed cwltool_install_dir = matWorkflowEngine.engine_path - if not os.path.isfile(localWorkflowFile): + if not localWorkflowFile.is_file(): raise WorkflowEngineException( - "CWL workflow {} has not been materialized.".format(localWorkflowFile) + f"CWL workflow {localWorkflowFile} has not been materialized (not a file)." ) # Extract hashes directories from localWorkflow ( localWorkflowUsedHashes_head, localWorkflowUsedHashes_tail, - ) = localWorkflowDir.split("/")[-2:] + ) = localWorkflowDir.parts[-2:] # Setting up workflow packed name localWorkflowPackedName = ( @@ -734,19 +735,18 @@ def materializeWorkflow( # TODO: check whether the repo is newer than the packed file - consolidatedPackedWorkflowFile = cast( - "AbsPath", os.path.join(consolidatedWorkflowDir, localWorkflowPackedName) + consolidatedPackedWorkflowFile = ( + consolidatedWorkflowPath / localWorkflowPackedName ) if ( - not os.path.isfile(consolidatedPackedWorkflowFile) + not consolidatedPackedWorkflowFile.is_file() or os.path.getsize(consolidatedPackedWorkflowFile) == 0 ): - packedLocalWorkflowFile = cast( - "AbsPath", - os.path.join(self.cacheWorkflowPackDir, localWorkflowPackedName), + packedLocalWorkflowFile = ( + pathlib.Path(self.cacheWorkflowPackDir) / localWorkflowPackedName ) if ( - not os.path.isfile(packedLocalWorkflowFile) + not packedLocalWorkflowFile.is_file() or os.path.getsize(packedLocalWorkflowFile) == 0 ): if offline: @@ -755,7 +755,7 @@ def materializeWorkflow( ) # Execute cwltool --pack - with open(packedLocalWorkflowFile, mode="wb") as packedH: + with packedLocalWorkflowFile.open(mode="wb") as packedH: with tempfile.NamedTemporaryFile() as cwltool_pack_stderr: # Writing straight to the file retVal = subprocess.Popen( @@ -763,7 +763,7 @@ def materializeWorkflow( f"{cwltool_install_dir}/bin/cwltool", "--no-doc-cache", "--pack", - localWorkflowFile, + localWorkflowFile.as_posix(), ], stdout=packedH, stderr=cwltool_pack_stderr, @@ -789,7 +789,7 @@ def materializeWorkflow( # Getting the identifiers cwlVersion = None # TODO: collect conda hints - with open(consolidatedPackedWorkflowFile, encoding="utf-8") as pLWH: + with consolidatedPackedWorkflowFile.open(mode="r", encoding="utf-8") as pLWH: wf_yaml = yaml.safe_load(pLWH) # parse packed CWL cwlVersion = wf_yaml.get("cwlVersion", "v1.0") dockerExprParser = jsonpath_ng.ext.parse( @@ -813,7 +813,7 @@ def materializeWorkflow( containerTags.add(dockerPullId) newLocalWf = 
LocalWorkflow( - dir=consolidatedWorkflowDir, + dir=consolidatedWorkflowPath, relPath=cast("RelPath", localWorkflowPackedName), effectiveCheckout=localWf.effectiveCheckout, langVersion=cwlVersion, @@ -1041,8 +1041,8 @@ def launchWorkflow( # the workflow with open(stdoutFilename, mode="wb+") as cwl_yaml_stdout: with open(stderrFilename, mode="ab+") as cwl_yaml_stderr: - jobIntermediateDir = intermediateDir + "/" - outputDir = outputsDir + "/" + jobIntermediateDir = intermediateDir.as_posix() + "/" + outputDir = outputsDir.as_posix() + "/" # This is needed to isolate execution environment # and teach cwltool where to find the cached images @@ -1151,7 +1151,7 @@ def launchWorkflow( ) # Now, the environment variables to include - bindable_paths = [] + bindable_paths: "MutableSequence[pathlib.Path]" = [] for mat_env in matEnvironment: if len(mat_env.values) > 0: cmd_arr.append( @@ -1166,7 +1166,7 @@ def launchWorkflow( else mat_val.extrapolated_local ) bindable_paths.append(the_local) - env_vals.append(the_local) + env_vals.append(the_local.as_posix()) else: env_vals.append(str(mat_val)) # Now, assign it @@ -1287,7 +1287,7 @@ def createYAMLFile( execInputs = self.executionInputs(matInputs, cwlInputs) if len(execInputs) != 0: with open(filename, mode="w+", encoding="utf-8") as yaml_file: - yaml.dump( + yaml.safe_dump( execInputs, yaml_file, allow_unicode=True, @@ -1407,7 +1407,7 @@ def executionInputs( ContentKind.File, ContentKind.ContentWithURIs, ): - if not os.path.exists(value.local): + if not value.local.exists(): self.logger.warning( "Input {} is not materialized".format(name) ) @@ -1415,7 +1415,7 @@ def executionInputs( value.local if value.extrapolated_local is None else value.extrapolated_local - ) + ).as_posix() eInput: "MutableMapping[str, Any]" = { "class": classType, @@ -1429,7 +1429,7 @@ def executionInputs( secInput.kind ) ], - "location": secInput.local, + "location": secInput.local.as_posix(), } for secInput in matInput.secondaryInputs ] diff --git a/wfexs_backend/workflow_engines/nextflow_engine.py b/wfexs_backend/workflow_engines/nextflow_engine.py index 0cfa194d..70d84671 100644 --- a/wfexs_backend/workflow_engines/nextflow_engine.py +++ b/wfexs_backend/workflow_engines/nextflow_engine.py @@ -24,6 +24,7 @@ import json import logging import os +import pathlib import platform import re import shutil @@ -355,16 +356,16 @@ def identifyWorkflow( nfPath = localWf.dir if localWf.relPath is not None: - nfPath = cast("AbsPath", os.path.join(nfPath, localWf.relPath)) + nfPath = nfPath / localWf.relPath - nfDir: "AbsPath" + nfDir: "pathlib.Path" # If it is a directory, we have to assume there should be a nextflow.config firstPath = None - if os.path.isdir(nfPath): + if nfPath.is_dir(): nfDir = nfPath - elif os.path.isfile(nfPath): + elif nfPath.is_file(): # Does it exist? - nfDir = cast("AbsPath", os.path.dirname(nfPath)) + nfDir = nfPath.parent # We don't know yet which is firstPath = nfPath else: @@ -375,17 +376,13 @@ def identifyWorkflow( # Trying with the defaults if firstPath is None: - firstPath = cast( - "AbsPath", os.path.join(nfDir, self.NEXTFLOW_CONFIG_FILENAME) - ) + firstPath = nfDir / self.NEXTFLOW_CONFIG_FILENAME # Does it exist? 
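# A minimal sketch of the default entrypoint fallback implemented just below,
# assuming the usual file names ("nextflow.config" first, then "main.nf"; both are
# assumed values for NEXTFLOW_CONFIG_FILENAME / DEFAULT_NEXTFLOW_ENTRYPOINT).
import pathlib
from typing import Optional


def find_nextflow_entrypoint(nf_dir: pathlib.Path) -> "Optional[pathlib.Path]":
    for candidate_name in ("nextflow.config", "main.nf"):
        candidate = nf_dir / candidate_name
        if candidate.is_file():
            return candidate
    # Mirrors the case where the engine raises WorkflowEngineException below
    return None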
- if not os.path.isfile(firstPath): - firstPath = cast( - "AbsPath", os.path.join(nfDir, self.DEFAULT_NEXTFLOW_ENTRYPOINT) - ) + if not firstPath.is_file(): + firstPath = nfDir / self.DEFAULT_NEXTFLOW_ENTRYPOINT - if not os.path.isfile(firstPath): + if not firstPath.is_file(): # Giving up raise WorkflowEngineException( f"Could not find neither {self.NEXTFLOW_CONFIG_FILENAME} nor {self.DEFAULT_NEXTFLOW_ENTRYPOINT} in Nextflow workflow directory {nfDir}" @@ -400,12 +397,12 @@ def identifyWorkflow( includeconfigs: "Sequence[NfIncludeConfig]" interesting_assignments: "ContextAssignments" - nfConfig: "Optional[AbsPath]" = None + nfConfig: "Optional[pathlib.Path]" = None candidateNf: "Optional[RelPath]" = None candidateConfig: "Optional[RelPath]" = None - newNxfConfigs: "MutableSequence[AbsPath]" = [] + newNxfConfigs: "MutableSequence[pathlib.Path]" = [] only_names = ["manifest", "nextflow"] - absoluteCandidateNf: "Optional[AbsPath]" = None + absoluteCandidateNf: "Optional[pathlib.Path]" = None # First, are we dealing with a config or a nextflow file? with open(firstPath, mode="rt", encoding="utf-8") as nfH: firstPathContent = nfH.read() @@ -446,11 +443,9 @@ def identifyWorkflow( # Did we loaded a nextflow config file? if nfConfig is None: - possibleNfConfig = cast( - "AbsPath", os.path.join(nfDir, self.NEXTFLOW_CONFIG_FILENAME) - ) + possibleNfConfig = nfDir / self.NEXTFLOW_CONFIG_FILENAME # Only include what it is reachable - if os.path.isfile(possibleNfConfig): + if possibleNfConfig.is_file(): newNxfConfigs.append(possibleNfConfig) else: self.logger.debug( @@ -459,21 +454,23 @@ def identifyWorkflow( # Let's record all the configuration files nxfScripts: "MutableSequence[RelPath]" = [] - absolutePutativeCandidateNf: "Optional[AbsPath]" = None + absolutePutativeCandidateNf: "Optional[pathlib.Path]" = None minimalEngineVer = None kw_20_04_Pat: "Optional[Pattern[str]]" = re.compile( r"\$(?:(?:launchDir|moduleDir|projectDir)|\{(?:launchDir|moduleDir|projectDir)\})" ) while len(newNxfConfigs) > 0: - nextNewNxfConfigs: "MutableSequence[AbsPath]" = [] + nextNewNxfConfigs: "MutableSequence[pathlib.Path]" = [] for newNxfConfig in newNxfConfigs: # Do not read twice - relNewNxfConfig = cast("RelPath", os.path.relpath(newNxfConfig, nfDir)) + relNewNxfConfig = cast( + "RelPath", newNxfConfig.relative_to(nfDir).as_posix() + ) if relNewNxfConfig in nxfScripts: continue nxfScripts.append(relNewNxfConfig) - with open(newNxfConfig, mode="rt", encoding="utf-8") as nfH: + with newNxfConfig.open(mode="rt", encoding="utf-8") as nfH: newNxfConfigContent = nfH.read() try: ( @@ -508,7 +505,7 @@ def identifyWorkflow( kw_20_04_Pat = None # Time to resolve these - nfConfigDir = os.path.dirname(newNxfConfig) + nfConfigDir = newNxfConfig.parent # But first, check the manifest availability # to obtain the entrypoint manifest = interesting_assignments.get("manifest") @@ -517,13 +514,10 @@ def identifyWorkflow( if putativeCandidateNfVals is not None: for putativeCandidateNfVal in putativeCandidateNfVals: putativeCandidateNf = putativeCandidateNfVal[1] - possibleAbsolutePutativeCandidateNf = cast( - "AbsPath", - os.path.normpath( - os.path.join(nfConfigDir, putativeCandidateNf) - ), - ) - if os.path.isfile(possibleAbsolutePutativeCandidateNf): + possibleAbsolutePutativeCandidateNf = ( + nfConfigDir / putativeCandidateNf + ).resolve(strict=False) + if possibleAbsolutePutativeCandidateNf.is_file(): absolutePutativeCandidateNf = ( possibleAbsolutePutativeCandidateNf ) @@ -559,11 +553,10 @@ def identifyWorkflow( # And register all 
the included config files which are reachable for includeconfig in includeconfigs: relIncludePath = includeconfig.path - absIncludePath = cast( - "AbsPath", - os.path.normpath(os.path.join(nfConfigDir, relIncludePath)), + absIncludePath = (nfConfigDir / relIncludePath).resolve( + strict=False ) - if os.path.isfile(absIncludePath): + if absIncludePath.is_file(): nextNewNxfConfigs.append(absIncludePath) else: self.logger.warning( @@ -616,10 +609,10 @@ def identifyWorkflow( engineVer = self.DEFAULT_NEXTFLOW_VERSION_WITH_PODMAN # Subworkflow / submodule include detection - newNxfScripts: "MutableSequence[AbsPath]" = [entrypoint] + newNxfScripts: "MutableSequence[pathlib.Path]" = [entrypoint] only_names = ["nextflow"] while len(newNxfScripts) > 0: - nextNxfScripts: "MutableSequence[AbsPath]" = [] + nextNxfScripts: "MutableSequence[pathlib.Path]" = [] for nxfScript in newNxfScripts: relNxfScript = cast("RelPath", os.path.relpath(nxfScript, nfDir)) # Avoid loops @@ -649,18 +642,15 @@ def identifyWorkflow( raise WorkflowEngineException(errstr) from e # Register all the included files which are reachable - nxfScriptDir = os.path.dirname(nxfScript) + nxfScriptDir = nxfScript.parent for include in includes: relIncludePath = include.path if not relIncludePath.endswith(".nf"): relIncludePath += ".nf" - absIncludePath = cast( - "AbsPath", - os.path.normpath( - os.path.join(nxfScriptDir, relIncludePath) - ), + absIncludePath = (nxfScriptDir / relIncludePath).resolve( + strict=False ) - if os.path.isfile(absIncludePath): + if absIncludePath.is_file(): nextNxfScripts.append(absIncludePath) else: self.logger.warning( @@ -672,19 +662,20 @@ def identifyWorkflow( for processDecl in processes: for relTemplatePath in processDecl.templates: # Now, let's try finding it - local_template = os.path.join( - nxfScriptDir, "templates", relTemplatePath + local_template = ( + nxfScriptDir / "templates" / relTemplatePath ) - if not os.path.isfile(local_template): - local_template = os.path.join( - nfDir, "templates", relTemplatePath - ) + if not local_template.is_file(): + local_template = nfDir / "templates" / relTemplatePath # And now let's save it! 
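# A minimal sketch of the process template lookup happening at this point: a
# template referenced by a process is first searched next to the including
# script, then under the workflow root, mirroring Nextflow's templates/ layout.
# `locate_template` is an illustrative helper, not part of the engine.
import pathlib
from typing import Optional


def locate_template(
    nxf_script_dir: pathlib.Path, nf_dir: pathlib.Path, rel_template_path: str
) -> "Optional[pathlib.Path]":
    for base in (nxf_script_dir, nf_dir):
        candidate = base / "templates" / rel_template_path
        if candidate.is_file():
            # strict=False keeps the path resolvable even on unusual mounts
            return candidate.resolve(strict=False)
    return None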
- if os.path.isfile(local_template): - abs_local_template = os.path.normpath(local_template) + if local_template.is_file(): + abs_local_template = local_template.resolve( + strict=False + ) rel_local_template = cast( - "RelPath", os.path.relpath(local_template, nfDir) + "RelPath", + local_template.relative_to(nfDir).as_posix(), ) nxfScripts.append(rel_local_template) else: @@ -744,8 +735,8 @@ def runNextflowCommand( nextflow_version: "EngineVersion", commandLine: "Sequence[str]", containers_path: "Optional[AnyPath]" = None, - workdir: "Optional[AbsPath]" = None, - intermediateDir: "Optional[AbsPath]" = None, + workdir: "Optional[pathlib.Path]" = None, + intermediateDir: "Optional[pathlib.Path]" = None, nextflow_path: "Optional[EnginePath]" = None, stdoutFilename: "Optional[AbsPath]" = None, stderrFilename: "Optional[AbsPath]" = None, @@ -798,8 +789,8 @@ def runLocalNextflowCommand( nextflow_version: "EngineVersion", commandLine: "Sequence[str]", containers_path: "AnyPath", - workdir: "Optional[AbsPath]" = None, - intermediateDir: "Optional[AbsPath]" = None, + workdir: "Optional[pathlib.Path]" = None, + intermediateDir: "Optional[pathlib.Path]" = None, nextflow_install_dir: "Optional[EnginePath]" = None, stdoutFilename: "Optional[AbsPath]" = None, stderrFilename: "Optional[AbsPath]" = None, @@ -840,10 +831,12 @@ def runLocalNextflowCommand( instEnv.pop("NXF_JAVA_HOME", None) instEnv.pop("JAVA_HOME", None) - jobIntermediateDir: "str" = ( + jobIntermediateDir = ( intermediateDir if intermediateDir is not None else self.intermediateDir ) - instEnv["NXF_WORK"] = workdir if workdir is not None else jobIntermediateDir + instEnv["NXF_WORK"] = ( + workdir if workdir is not None else jobIntermediateDir + ).as_posix() instEnv["NXF_ASSETS"] = self.nxf_assets if self.logger.getEffectiveLevel() <= logging.DEBUG: instEnv["NXF_DEBUG"] = "1" @@ -944,8 +937,8 @@ def runNextflowCommandInDocker( nextflow_version: "EngineVersion", commandLine: "Sequence[str]", containers_path: "AnyPath", - workdir: "Optional[AbsPath]" = None, - intermediateDir: "Optional[AbsPath]" = None, + workdir: "Optional[pathlib.Path]" = None, + intermediateDir: "Optional[pathlib.Path]" = None, stdoutFilename: "Optional[AbsPath]" = None, stderrFilename: "Optional[AbsPath]" = None, runEnv: "Optional[Mapping[str, str]]" = None, @@ -1016,14 +1009,13 @@ def runNextflowCommandInDocker( try: if workdir is None: - workdir = cast( - "AbsPath", - os.path.abspath(self.workDir) - if not os.path.isabs(self.workDir) - else self.workDir, + workdir = ( + self.workDir.absolute() + if not self.workDir.is_absolute() + else self.workDir ) else: - os.makedirs(workdir, exist_ok=True) + workdir.mkdir(parents=True, exist_ok=True) except Exception as error: raise WorkflowEngineException( "ERROR: Unable to create nextflow working directory. 
Error: " @@ -1082,9 +1074,9 @@ def runNextflowCommandInDocker( + " -e TZ=" + tzstring + " -v " - + workdir + + workdir.as_posix() + ":" - + workdir + + workdir.as_posix() + ":rw,rprivate,z", "-v", "/var/run/docker.sock:/var/run/docker.sock:rw,rprivate,z", @@ -1103,7 +1095,7 @@ def runNextflowCommandInDocker( volumes = [ (homedir + "/", "ro,rprivate,z"), # (nxf_assets_dir,"rprivate,z"), - (workdir + "/", "rw,rprivate,z"), + (workdir.as_posix() + "/", "rw,rprivate,z"), # (project_path+'/',"rw,rprivate,z"), # (repo_dir+'/',"ro,rprivate,z") ] @@ -1266,7 +1258,7 @@ def _get_engine_version_str( retval, engine_ver, nxf_version_stderr_v = self.runNextflowCommand( matWfEng.version, ["-v"], - workdir=matWfEng.engine_path, + workdir=pathlib.Path(matWfEng.engine_path), nextflow_path=matWfEng.engine_path, ) @@ -1360,7 +1352,7 @@ def materializeWorkflow( nxf_params.extend(["-profile", ",".join(self.nxf_profile)]) else: nxf_params.extend(["-show-profiles"]) - nxf_params.append(localWf.dir) + nxf_params.append(localWf.dir.as_posix()) flat_retval, flat_stdout, flat_stderr = self.runNextflowCommand( matWorkflowEngine.version, @@ -1514,7 +1506,7 @@ def simpleContainerFileName(self, imageUrl: "URIType") -> "RelPath": return cast("RelPath", name + extension) def structureAsNXFParams( - self, matInputs: "Sequence[MaterializedInput]", outputsDir: "AbsPath" + self, matInputs: "Sequence[MaterializedInput]", outputsDir: "pathlib.Path" ) -> "Mapping[str, Any]": nxpParams: "MutableMapping[str, Any]" = {} @@ -1533,7 +1525,7 @@ def structureAsNXFParams( ContentKind.File, ContentKind.ContentWithURIs, ): - if not os.path.exists(value.local): + if not value.local.exists(): self.logger.warning( "Input {} has values which are not materialized".format( matInput.name @@ -1542,9 +1534,9 @@ def structureAsNXFParams( # Use the extrapolated local file containing paths # instead of the original one containing URLs nxfValues.append( - value.local + value.local.as_posix() if value.extrapolated_local is None - else value.extrapolated_local + else value.extrapolated_local.as_posix() ) else: raise WorkflowEngineException( @@ -1629,13 +1621,13 @@ def launchWorkflow( outputsDir, outputMetaDir, ) = self.create_job_directories() - outputStatsDir = os.path.join(outputMetaDir, WORKDIR_STATS_RELDIR) - os.makedirs(outputStatsDir, exist_ok=True) + outputStatsDir = outputMetaDir / WORKDIR_STATS_RELDIR + outputStatsDir.mkdir(parents=True, exist_ok=True) - timelineFile = os.path.join(outputStatsDir, "timeline.html") - reportFile = os.path.join(outputStatsDir, "report.html") - traceFile = os.path.join(outputStatsDir, "trace.tsv") - dagFile = os.path.join(outputStatsDir, STATS_DAG_DOT_FILE) + timelineFile = outputStatsDir / "timeline.html" + reportFile = outputStatsDir / "report.html" + traceFile = outputStatsDir / "trace.tsv" + dagFile = outputStatsDir / STATS_DAG_DOT_FILE # Custom variables setup runEnv = dict(os.environ) @@ -1689,7 +1681,7 @@ def launchWorkflow( # Environment variables have to be processed before we are reaching next lines # Now, the environment variables to include - bindable_paths = [] + bindable_paths: "MutableSequence[pathlib.Path]" = [] for mat_env in matEnvironment: if len(mat_env.values) > 0: envWhitelist.append(mat_env.name) @@ -1702,7 +1694,7 @@ def launchWorkflow( else mat_val.extrapolated_local ) bindable_paths.append(the_local) - env_vals.append(the_local) + env_vals.append(the_local.as_posix()) else: env_vals.append(str(mat_val)) # Now, assign it @@ -1717,34 +1709,32 @@ def launchWorkflow( ) # Corner cases of single 
file workflows with no nextflow.config file - originalConfFile: "Optional[str]" + originalConfFile: "Optional[pathlib.Path]" if localWf.relPath != localWf.relPathFiles[0]: - originalConfFile = os.path.join(localWf.dir, localWf.relPathFiles[0]) + originalConfFile = localWf.dir / localWf.relPathFiles[0] # Copying the workflow directory, so an additional file # can be included without changing the original one - wDir = os.path.join(outputMetaDir, "nxf_trojan") + wDir = outputMetaDir / "nxf_trojan" shutil.copytree(localWf.dir, wDir, copy_function=copy2_nofollow) - forceParamsConfFile = os.path.join(wDir, self.TROJAN_CONFIG_FILENAME) + forceParamsConfFile = wDir / self.TROJAN_CONFIG_FILENAME else: wDir = localWf.dir # Configuration file generated by WfExS to override what it is needed - forceParamsConfFile = os.path.join( - outputMetaDir, self.TROJAN_CONFIG_FILENAME - ) + forceParamsConfFile = outputMetaDir / self.TROJAN_CONFIG_FILENAME originalConfFile = None # File where all the gathered parameters are going to be stored allParamsFile = os.path.join(outputMetaDir, "all-params.json") - with open(forceParamsConfFile, mode="w", encoding="utf-8") as fPC: + with forceParamsConfFile.open(mode="w", encoding="utf-8") as fPC: # First of all, we have to replicate the contents of the # original nextflow.config, so their original methods are not out # of context if originalConfFile is not None: - with open(originalConfFile, mode="r", encoding="utf-8") as oH: + with originalConfFile.open(mode="r", encoding="utf-8") as oH: shutil.copyfileobj(oH, fPC) print("\n", file=fPC) @@ -1850,13 +1840,13 @@ def wfexs_allParams() file=fPC, ) - inputsFileName = os.path.join(outputMetaDir, self.INPUT_DECLARATIONS_FILENAME) + inputsFileName = outputMetaDir / self.INPUT_DECLARATIONS_FILENAME nxpParams = self.structureAsNXFParams(matInputs, outputsDir) if len(nxpParams) != 0: try: - with open(inputsFileName, mode="w+", encoding="utf-8") as yF: - yaml.dump(nxpParams, yF) + with inputsFileName.open(mode="w+", encoding="utf-8") as yF: + yaml.safe_dump(nxpParams, yF) except IOError as error: raise WorkflowEngineException( "ERROR: cannot create input declarations file {}, {}".format( @@ -1870,17 +1860,17 @@ def wfexs_allParams() nxf_params = [ "-log", - os.path.join(outputStatsDir, "log.txt"), + (outputStatsDir / "log.txt").as_posix(), "-C", - forceParamsConfFile, + forceParamsConfFile.as_posix(), "run", "-name", runName, "-offline", "-w", - intermediateDir, + intermediateDir.as_posix(), "-params-file", - inputsFileName, + inputsFileName.as_posix(), ] profile_input: "Optional[MaterializedInput]" = None @@ -1894,7 +1884,7 @@ def wfexs_allParams() ) # Using the copy of the original workflow - nxf_params.append(wDir) + nxf_params.append(wDir.as_posix()) stdoutFilename = cast( "AbsPath", os.path.join(outputMetaDir, WORKDIR_STDOUT_FILE) From ebbe380ec64a25a028a91ff0da0bddd2730b4a99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 17 Jun 2024 01:58:09 +0200 Subject: [PATCH 42/62] Fixed `relative_to`for `ZipfilePath` --- wfexs_backend/utils/zipfile_path.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/wfexs_backend/utils/zipfile_path.py b/wfexs_backend/utils/zipfile_path.py index 4e0bfd9c..379cce55 100644 --- a/wfexs_backend/utils/zipfile_path.py +++ b/wfexs_backend/utils/zipfile_path.py @@ -407,7 +407,11 @@ def relative_to( # type: ignore[override] *_deprecated: "Union[str, os.PathLike[str]]", walk_up: bool = False, ) -> "pathlib.Path": - return 
pathlib.Path(path_relative_to(self, pathlib.Path(other))) + return pathlib.Path( + path_relative_to( + self, other if isinstance(other, pathlib.Path) else pathlib.Path(other) + ) + ) def with_name(self, name: "Union[str, os.PathLike[str]]") -> "ZipfilePath": return self.parent.joinpath(name) From c56d4296bee61ae34359d0cb91b95372137d033a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 17 Jun 2024 01:59:51 +0200 Subject: [PATCH 43/62] Cached LocalWorkflow representing payload workflow is now properly generated --- wfexs_backend/utils/rocrate.py | 166 ++++++++++++++++++++++++++++----- 1 file changed, 141 insertions(+), 25 deletions(-) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index f8a313c3..5188ee0e 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -149,6 +149,12 @@ class ContainerTypeMetadata(NamedTuple): ct_applicationCategory: "str" +class ROCratePayload(NamedTuple): + rel_path: "str" + path: "pathlib.Path" + signature: "Optional[Fingerprint]" + + ContainerTypeMetadataDetails: "Final[Mapping[ContainerType, ContainerTypeMetadata]]" = { ContainerType.Singularity: ContainerTypeMetadata( sa_id="https://apptainer.org/", @@ -501,7 +507,7 @@ def identifyROCrate( return (resrow, g) OBTAIN_WORKFLOW_PID_SPARQL: "Final[str]" = """\ -SELECT ?origmainentity ?identifier ?workflow_repository ?workflow_version ?workflow_url ?workflow_alternate_name ?programminglanguage_identifier ?programminglanguage_url ?programminglanguage_version ?file_size ?file_sha256 +SELECT ?origmainentity ?identifier ?workflow_repository ?workflow_version ?workflow_url ?workflow_name ?workflow_alternate_name ?programminglanguage_identifier ?programminglanguage_url ?programminglanguage_version ?file_size ?file_sha256 WHERE { ?mainentity s:programmingLanguage ?programminglanguage . ?programminglanguage @@ -547,6 +553,9 @@ def identifyROCrate( OPTIONAL { ?origmainentity s:identifier ?identifier . } + OPTIONAL { + ?origmainentity s:name ?workflow_name . + } OPTIONAL { ?origmainentity s:alternateName ?workflow_alternate_name . 
} @@ -1411,11 +1420,9 @@ def __parseContainersResults( the_file_sha256=containerrow.container_snapshot_sha256, ) if include_container_image is not None: - ( - container_image_path, - located_snapshot, - image_signature, - ) = include_container_image + container_image_path = include_container_image.rel_path + located_snapshot = include_container_image.path + image_signature = include_container_image.signature if containerrow.source_container_metadata is not None: include_metadata_container_image = self.__processPayloadEntity( @@ -1429,11 +1436,13 @@ def __parseContainersResults( ) if include_metadata_container_image is not None: - ( - metadata_container_image_path, - located_metadata, - computed_source_container_metadata_signature, - ) = include_metadata_container_image + metadata_container_image_path = ( + include_metadata_container_image.rel_path + ) + located_metadata = include_metadata_container_image.path + computed_source_container_metadata_signature = ( + include_metadata_container_image.signature + ) the_containers.append( Container( @@ -1678,7 +1687,9 @@ def __processPayloadInput( ) if include_input is not None: - input_path, located_input, file_signature = include_input + input_path = include_input.rel_path + located_input = include_input.path + file_signature = include_input.signature licences_tuple = ( cast("Tuple[URIType, ...]", tuple(licences)) if len(licences) > 0 @@ -2086,7 +2097,7 @@ def __processPayloadEntity( entity_name: "str", the_file_size: "rdflib.term.Node", the_file_sha256: "rdflib.term.Node", - ) -> "Optional[Tuple[str, pathlib.Path, Optional[Fingerprint]]]": + ) -> "Optional[ROCratePayload]": entity_uri = str(the_entity) entity_parsed_uri = urllib.parse.urlparse(entity_uri) include_entity = entity_parsed_uri.scheme == self.RELATIVE_ROCRATE_SCHEME @@ -2160,10 +2171,74 @@ def __processPayloadEntity( if include_entity: assert entity_path is not None assert located_entity is not None - return (entity_path, located_entity, file_signature) + return ROCratePayload( + rel_path=entity_path, + path=located_entity, + signature=file_signature, + ) else: return None + def __list_entity_parts( + self, + g: "rdflib.graph.Graph", + entity: "rdflib.term.Identifier", + public_name: "str", + ) -> "rdflib.query.Result": + qlist = rdflib.plugins.sparql.prepareQuery( + self.LIST_PARTS_SPARQL, + initNs=self.SPARQL_NS, + ) + try: + qlistres = g.query( + qlist, + initBindings={ + "entity": entity, + }, + ) + except Exception as e: + raise ROCrateToolboxException( + f"Unable to perform JSON-LD list entity parts query over {public_name} (see cascading exceptions)" + ) from e + + return qlistres + + def __list_payload_entity_parts( + self, + g: "rdflib.graph.Graph", + entity: "rdflib.term.Identifier", + public_name: "str", + payload_dir: "pathlib.Path", + ) -> "Sequence[ROCratePayload]": + entity_parts = self.__list_entity_parts(g, entity, public_name) + + payload_entity_parts = [] + for part_row in entity_parts: + assert isinstance( + part_row, rdflib.query.ResultRow + ), "Check the SPARQL code, as it should be a SELECT query" + + included_part_entity = self.__processPayloadEntity( + the_entity=part_row.part_entity, + payload_dir=payload_dir, + kindobj=ContentKind.File, + entity_type="secondary workflow component", + entity_name=str(part_row.part_name) + if part_row.part_name is not None + else "PACO", # FIXME + the_file_size=part_row.file_size, + the_file_sha256=part_row.file_sha256, + ) + + if included_part_entity is not None: + payload_entity_parts.append(included_part_entity) + else: + 
self.logger.warning( + f"Entity part {str(part_row.part_entity)} from {str(entity)} in {public_name} did not have a valid payload" + ) + + return payload_entity_parts + def extractWorkflowMetadata( self, g: "rdflib.graph.Graph", @@ -2265,29 +2340,70 @@ def extractWorkflowMetadata( ) if payload_dir is not None: + workflow_name = langrow.workflow_name + if langrow.workflow_name is not None: + workflow_name = langrow.workflow_name + elif langrow.workflow_alternate_name is not None: + workflow_name = langrow.workflow_alternate_name + elif langrow.identifier is not None: + workflow_name = langrow.identifier + else: + workflow_name = None include_main_entity = self.__processPayloadEntity( the_entity=langrow.origmainentity, payload_dir=payload_dir, kindobj=ContentKind.File, entity_type="main workflow component", - entity_name="PEPE", # FIXME + entity_name=str(workflow_name) + if workflow_name is not None + else "PEPE", # FIXME the_file_size=langrow.file_size, the_file_sha256=langrow.file_sha256, ) if include_main_entity is not None: - main_entity_path = include_main_entity[0] + main_entity_relpath = include_main_entity.rel_path + main_entity_path = include_main_entity.path # TODO - # workflow_parts = self.__list_entity_parts(langrow.origmainentity, payload_dir) - - # cached_workflow = LocalWorkflow( - # dir=cast("AbsPath", ""), - # relPath=cast("RelPath", main_entity_path), - # effectiveCheckout=, - # # langVersion= - # relPathFiles=rel_path_files, - # ) + workflow_parts = self.__list_payload_entity_parts( + g, langrow.origmainentity, public_name, payload_dir + ) + if len(workflow_parts) == 0: + base_dir = main_entity_path.parent + main_entity_relpath = main_entity_path.name + rel_path_files = [] + else: + rel_path_files = list( + map(lambda part: cast("RelPath", part.rel_path), workflow_parts) + ) + common_prefix = os.path.commonpath( + [main_entity_relpath, *rel_path_files] + ) + if len(common_prefix) == 0: + base_dir = payload_dir + else: + base_dir = payload_dir / common_prefix + main_entity_relpath = main_entity_path.relative_to( + base_dir + ).as_posix() + rel_path_files = [ + cast("RelPath", part.path.relative_to(base_dir).as_posix()) + for part in workflow_parts + ] + + cached_workflow = LocalWorkflow( + dir=base_dir, + relPath=cast("RelPath", main_entity_relpath), + effectiveCheckout=repo.tag, + langVersion=cast( + "Union[EngineVersion, WFLangVersion]", + str(langrow.programminglanguage_version), + ) + if langrow.programminglanguage_version is not None + else None, + relPathFiles=rel_path_files, + ) return repo, workflow_type, cached_workflow From 7918da0f300c3a21a7ce7891ecf51d27b56550b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 17 Jun 2024 02:11:39 +0200 Subject: [PATCH 44/62] Input invalidation for replicability. Invalidation of cached inputs and environment for replicability scenarios where either a cached input or environment is replaced with a new, unmaterialized one, has been implemented. Also, rescued operational containers for re-staging scenario. 
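A minimal sketch of the invalidation step introduced below (the helper name `invalidate_replaced` is illustrative and not part of the codebase; it assumes each cached entry is a `MaterializedInput` exposing a `name` attribute, and that `replaced_names` holds the keys overridden by the replaced parameters file):

    def invalidate_replaced(cached_inputs, replaced_names):
        # Keep only the cached MaterializedInput entries whose name was
        # not overridden, so the overridden ones are re-materialized from
        # their new declarations instead of being injected from the cache.
        return [m_i for m_i in cached_inputs if m_i.name not in replaced_names]
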
--- wfexs_backend/workflow.py | 236 +++++++++++++++++++------------------- 1 file changed, 121 insertions(+), 115 deletions(-) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 77a0ca9c..59fffd80 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -468,6 +468,7 @@ def __init__( cached_inputs: "Optional[Sequence[MaterializedInput]]" = None, cached_environment: "Optional[Sequence[MaterializedInput]]" = None, preferred_containers: "Sequence[Container]" = [], + preferred_operational_containers: "Sequence[Container]" = [], reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, strict_reproducibility_level: "bool" = False, ): @@ -527,6 +528,9 @@ def __init__( self.cached_inputs = cached_inputs self.cached_environment = cached_environment self.preferred_containers = copy.copy(preferred_containers) + self.preferred_operational_containers = copy.copy( + preferred_operational_containers + ) self.reproducibility_level = reproducibility_level self.strict_reproducibility_level = strict_reproducibility_level @@ -1223,7 +1227,8 @@ def __merge_params_from_file( wfexs: "WfExSBackend", base_workflow_meta: "WorkflowMetaConfigBlock", replaced_parameters_filename: "AnyPath", - ) -> "WritableWorkflowMetaConfigBlock": + ) -> "Tuple[WritableWorkflowMetaConfigBlock, Mapping[str, Set[str]]]": + transferrable_keys = ("params", "environment") new_params_meta = cls.__read_yaml_config(replaced_parameters_filename) if ( @@ -1241,7 +1246,10 @@ def __merge_params_from_file( # Now, trim everything but what it is allowed existing_keys = set(new_params_meta.keys()) - existing_keys.remove("params") + for t_key in transferrable_keys: + if t_key in existing_keys: + existing_keys.remove(t_key) + if len(existing_keys) > 0: for key in existing_keys: del new_params_meta[key] @@ -1256,9 +1264,13 @@ def __merge_params_from_file( # Last, merge workflow_meta = copy.deepcopy(base_workflow_meta) - workflow_meta["params"].update(new_params_meta["params"]) + transferred_items: "MutableMapping[str, Set[str]]" = dict() + for t_key in transferrable_keys: + if t_key in new_params_meta: + workflow_meta.setdefault(t_key, {}).update(new_params_meta[t_key]) + transferred_items[t_key] = set(new_params_meta[t_key].keys()) - return workflow_meta + return workflow_meta, transferred_items @classmethod def FromWorkDir( @@ -1344,6 +1356,7 @@ def FromStagedRecipe( cached_inputs: "Optional[Sequence[MaterializedInput]]" = None, cached_environment: "Optional[Sequence[MaterializedInput]]" = None, preferred_containers: "Sequence[Container]" = [], + preferred_operational_containers: "Sequence[Container]" = [], reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, strict_reproducibility_level: "bool" = False, ) -> "WF": @@ -1381,6 +1394,7 @@ def FromStagedRecipe( cached_inputs=cached_inputs, cached_environment=cached_environment, preferred_containers=preferred_containers, + preferred_operational_containers=preferred_operational_containers, reproducibility_level=reproducibility_level, strict_reproducibility_level=strict_reproducibility_level, ) @@ -1415,9 +1429,67 @@ def FromPreviousInstanceDeclaration( workflow_meta = copy.deepcopy(wfInstance.staging_recipe) if replaced_parameters_filename is not None: - workflow_meta = cls.__merge_params_from_file( + workflow_meta, replaced_items = cls.__merge_params_from_file( wfexs, workflow_meta, replaced_parameters_filename ) + else: + replaced_items = dict() + + # Now, some postprocessing... 
+ cached_inputs: "Optional[Sequence[MaterializedInput]]" = None + cached_environment: "Optional[Sequence[MaterializedInput]]" = None + the_containers: "Sequence[Container]" = [] + the_operational_containers: "Sequence[Container]" = [] + cached_workflow: "Optional[LocalWorkflow]" = None + if reproducibility_level >= ReproducibilityLevel.Full: + if wfInstance.materializedParams is not None: + cached_inputs = copy.copy(wfInstance.materializedParams) + + # Let's invalidate several params + # as several parameters could be replaced + replaced_inputs = replaced_items.get("params") + if ( + replaced_inputs is not None + and isinstance(cached_inputs, list) + and len(cached_inputs) > 0 + ): + new_cached_inputs = list( + filter( + lambda m_i: m_i.name not in replaced_inputs, cached_inputs + ) + ) + if len(new_cached_inputs) < len(cached_inputs): + cached_inputs = new_cached_inputs + + if wfInstance.materializedEnvironment is not None: + cached_environment = copy.copy(wfInstance.materializedEnvironment) + + # Let's invalidate several environment variables + # as several parameters could be replaced + replaced_environment = replaced_items.get("environment") + if ( + replaced_environment is not None + and isinstance(cached_environment, list) + and len(cached_environment) > 0 + ): + new_cached_environment = list( + filter( + lambda m_i: m_i.name not in replaced_environment, + cached_environment, + ) + ) + if len(new_cached_environment) < len(cached_environment): + cached_environment = new_cached_environment + + if wfInstance.materializedEngine is not None: + if wfInstance.materializedEngine.containers is not None: + the_containers = wfInstance.materializedEngine.containers + if wfInstance.materializedEngine.operational_containers is not None: + the_operational_containers = ( + wfInstance.materializedEngine.operational_containers + ) + + cached_workflow = wfInstance.getMaterializedWorkflow() # We have to reset the inherited paranoid mode and nickname for k_name in ("nickname", "paranoid_mode"): @@ -1437,10 +1509,10 @@ def FromPreviousInstanceDeclaration( private_key_filename=private_key_filename, private_key_passphrase=private_key_passphrase, paranoidMode=paranoidMode, - cached_workflow=wfInstance.getMaterializedWorkflow(), - cached_inputs=wfInstance.materializedParams, - cached_environment=wfInstance.materializedEnvironment, - preferred_containers=wfInstance.getMaterializedContainers(), + cached_workflow=cached_workflow, + cached_inputs=cached_inputs, + cached_environment=cached_environment, + preferred_containers=the_containers, reproducibility_level=reproducibility_level, strict_reproducibility_level=strict_reproducibility_level, ) @@ -1538,111 +1610,6 @@ def FromPreviousROCrate( payload_dir=payload_dir, ) - logging.error(f"Containers {the_containers}") - logging.error(f"Inputs {cached_inputs}") - sys.exit(1) - # Now, some postprocessing... - if ( - reproducibility_level >= ReproducibilityLevel.Full - and payload_dir is not None - and not isinstance(payload_dir, pathlib.Path) - ): - # This one is needed when the payload_dir is defined and not a - # local path, like within a zip archive. 
- materialized_payload_dir = pathlib.Path( - tempfile.mkdtemp(prefix="wfexs", suffix="import") - ) - atexit.register(shutil.rmtree, materialized_payload_dir, True) - - # Fix cached workflow - if cached_workflow is not None: - workflow_dir = materialized_payload_dir / WORKDIR_WORKFLOW_RELDIR - workflow_dir.mkdir(parents=True, exist_ok=True) - - # Transfer entrypoint - if cached_workflow.relPath is not None: - dest_entrypoint = workflow_dir / cached_workflow.relPath - dest_entrypoint.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2( - cast("PathLike[str]", payload_dir / cached_workflow.relPath), - dest_entrypoint, - ) - if ( - cached_workflow.relPathFiles is not None - and len(cached_workflow.relPathFiles) > 0 - ): - # And all the elements - for rel_file in cached_workflow.relPathFiles: - if rel_file == cached_workflow.relPath: - continue - p_rel_file = urllib.parse.urlparse(rel_file) - if p_rel_file.scheme != "": - continue - - dest_file = workflow_dir / rel_file - dest_file.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2( - cast("PathLike[str]", payload_dir / rel_file), dest_file - ) - - # Last, the reference - cached_workflow = cached_workflow._replace( - dir=cast("AbsPath", workflow_dir.as_posix()) - ) - - # Fix containers - if len(the_containers) > 0: - containers_dir = materialized_payload_dir / WORKDIR_CONTAINERS_RELDIR - containers_dir.mkdir(parents=True, exist_ok=True) - new_containers = [] - for the_container in the_containers: - new_container = the_container - - if new_container.localPath is not None: - source_image = payload_dir / new_container.localPath - dest_image = containers_dir / path_relative_to( - source_image, payload_dir - ) - dest_image.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(cast("PathLike[str]", source_image), dest_image) - - new_container = dataclasses.replace( - new_container, - localPath=cast("AbsPath", dest_image.as_posix()), - ) - - if new_container.metadataLocalPath is not None: - source_meta = payload_dir / new_container.metadataLocalPath - dest_meta = containers_dir / path_relative_to( - source_meta, payload_dir - ) - dest_meta.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(cast("PathLike[str]", source_meta), dest_meta) - - new_container = dataclasses.replace( - new_container, - metadataLocalPath=cast("AbsPath", dest_meta.as_posix()), - ) - - new_containers.append(new_container) - - the_containers = new_containers - - # Fix inputs - inputs_dir = materialized_payload_dir / WORKDIR_INPUTS_RELDIR - if cached_inputs is not None and len(cached_inputs) > 0: - inputs_dir.mkdir(parents=True, exist_ok=True) - cached_inputs = cls._transferInputs( - payload_dir, inputs_dir, cached_inputs - ) - - # Fix environment - if cached_environment is not None and len(cached_environment) > 0: - inputs_dir.mkdir(parents=True, exist_ok=True) - cached_environment = cls._transferInputs( - payload_dir, inputs_dir, cached_environment - ) - workflow_pid = wfexs.gen_workflow_pid(repo) logging.debug( f"Repo {repo} workflow type {workflow_type} container factory {container_type}" @@ -1663,9 +1630,11 @@ def FromPreviousROCrate( logging.debug(f"{json.dumps(workflow_meta, indent=4)}") if replaced_parameters_filename is not None: - workflow_meta = cls.__merge_params_from_file( + workflow_meta, replaced_items = cls.__merge_params_from_file( wfexs, workflow_meta, replaced_parameters_filename ) + else: + replaced_items = dict() # Last, be sure that what it has been generated is correct if wfexs.validateConfigFiles(workflow_meta, 
securityContextsConfigFilename) > 0: @@ -1673,6 +1642,40 @@ def FromPreviousROCrate( f"Generated WfExS description from {public_name} fails (have a look at the log messages for details)" ) + # Now, some postprocessing... + if ( + reproducibility_level >= ReproducibilityLevel.Full + and payload_dir is not None + ): + # Let's invalidate several params and environment + # as several parameters could be replaced + replaced_inputs = replaced_items.get("params") + if ( + replaced_inputs is not None + and isinstance(cached_inputs, list) + and len(cached_inputs) > 0 + ): + new_cached_inputs = list( + filter(lambda m_i: m_i.name not in replaced_inputs, cached_inputs) + ) + if len(new_cached_inputs) < len(cached_inputs): + cached_inputs = new_cached_inputs + + replaced_environment = replaced_items.get("environment") + if ( + replaced_environment is not None + and isinstance(cached_environment, list) + and len(cached_environment) > 0 + ): + new_cached_environment = list( + filter( + lambda m_i: m_i.name not in replaced_environment, + cached_environment, + ) + ) + if len(new_cached_environment) < len(cached_environment): + cached_environment = new_cached_environment + return cls.FromStagedRecipe( wfexs, workflow_meta, @@ -1687,6 +1690,7 @@ def FromPreviousROCrate( cached_inputs=cached_inputs, cached_environment=cached_environment, preferred_containers=the_containers, + # TODO: preferred_operational_containers are not rescued (yet!) reproducibility_level=reproducibility_level, strict_reproducibility_level=strict_reproducibility_level, ) @@ -1706,6 +1710,7 @@ def FromDescription( cached_inputs: "Optional[Sequence[MaterializedInput]]" = None, cached_environment: "Optional[Sequence[MaterializedInput]]" = None, preferred_containers: "Sequence[Container]" = [], + preferred_operational_containers: "Sequence[Container]" = [], reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, strict_reproducibility_level: "bool" = False, ) -> "WF": @@ -1750,6 +1755,7 @@ def FromDescription( cached_inputs=cached_inputs, cached_environment=cached_environment, preferred_containers=preferred_containers, + preferred_operational_containers=preferred_operational_containers, reproducibility_level=reproducibility_level, strict_reproducibility_level=strict_reproducibility_level, ) From acf787c085042f67e0fd287e1d1ab8b0efc476f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 17 Jun 2024 15:54:32 +0200 Subject: [PATCH 45/62] Minor cleanup fix --- wfexs_backend/__main__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wfexs_backend/__main__.py b/wfexs_backend/__main__.py index d2ae4492..9e0c3be6 100644 --- a/wfexs_backend/__main__.py +++ b/wfexs_backend/__main__.py @@ -1505,6 +1505,9 @@ def main() -> None: strict_reproducibility_level=args.strict_reproducibility_level, ) + # This is needed to be sure the encfs instance is unmounted + atexit.register(wfInstance.cleanup) + wfSetup = wfInstance.getStagedSetup() print("\t- Working directory will be {}".format(wfSetup.work_dir), file=sys.stderr) sys.stderr.flush() From 9c638a6b671ee9544deebb00393cad051603798b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 17 Jun 2024 15:58:35 +0200 Subject: [PATCH 46/62] Several fixes on generated staging file. * Licences always have to be URIs, to properly validate against the staging definition schema. * Staging definition params were wrongly labelled in their `c-l-a-s-s`. 
* Also, licence curation method is now a public one at wfexs_backend.WfExSBackend instance. --- wfexs_backend/utils/rocrate.py | 18 +++++++++--- wfexs_backend/wfexs_backend.py | 54 ++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 5188ee0e..eeee9d7b 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -113,6 +113,7 @@ LocalWorkflow, MaterializedContent, MaterializedInput, + NoLicenceDescription, ) from ..container_factories import ( @@ -1774,7 +1775,7 @@ def __parseInputsResults( valobj = base.setdefault( param_last, { - "c-l-a-s-s": kindobj.value, + "c-l-a-s-s": kindobj.name, }, ) @@ -1799,13 +1800,17 @@ def __parseInputsResults( if len(licences) == 0: licences = default_licences + expanded_licences = self.wfexs.curate_licence_list( + licences, default_licence=NoLicenceDescription + ) + the_url: "Union[str, Mapping[str, Any]]" if len(licences) == 0: the_url = the_uri else: the_url = { "uri": the_uri, - "licences": licences, + "licences": list(map(lambda el: el.uris[0], expanded_licences)), } valurl = valobj.get("url") @@ -1962,7 +1967,7 @@ def __parseEnvResults( valobj = environment.setdefault( env_name, { - "c-l-a-s-s": kindobj.value, + "c-l-a-s-s": kindobj.name, }, ) @@ -1986,13 +1991,18 @@ def __parseEnvResults( licences = self._getLicences(g, envrow.env, public_name) if len(licences) == 0: licences = default_licences + + expanded_licences = self.wfexs.curate_licence_list( + licences, default_licence=NoLicenceDescription + ) + the_url: "Union[str, Mapping[str, Any]]" if len(licences) == 0: the_url = str(envrow.fileuri) else: the_url = { "uri": str(envrow.fileuri), - "licences": licences, + "licences": list(map(lambda el: el.uris[0], expanded_licences)), } valurl = valobj.get("url") diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 107c3100..b81a7b05 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -68,6 +68,7 @@ DEFAULT_PROGS, LicensedURI, MaterializedContent, + NoLicenceDescription, URIWithMetadata, ) @@ -98,6 +99,11 @@ from .security_context import SecurityContextVault +from .utils.licences import ( + AcceptableLicenceSchemes, + LicenceMatcherSingleton, +) + from .utils.marshalling_handling import ( unmarshall_namedtuple, ) @@ -188,6 +194,7 @@ AnyPath, ContainerType, ExitVal, + LicenceDescription, MarshallingStatus, ProgsMapping, RelPath, @@ -211,6 +218,10 @@ StatefulFetcher, ) + from .utils.licences import ( + LicenceMatcher, + ) + from .utils.passphrase_wrapper import ( WfExSPassphraseGenerator, ) @@ -2825,3 +2836,46 @@ def downloadContent( metadata_array=cached_content.metadata_array, fingerprint=cached_content.fingerprint, ) + + _LicenceMatcher: "ClassVar[Optional[LicenceMatcher]]" = None + + @classmethod + def GetLicenceMatcher(cls) -> "LicenceMatcher": + if cls._LicenceMatcher is None: + cls._LicenceMatcher = LicenceMatcherSingleton() + assert cls._LicenceMatcher is not None + + return cls._LicenceMatcher + + def curate_licence_list( + self, + licences: "Sequence[str]", + default_licence: "Optional[LicenceDescription]" = None, + ) -> "Sequence[LicenceDescription]": + # As these licences can be in short format, resolve them to URIs + expanded_licences: "MutableSequence[LicenceDescription]" = [] + if len(licences) == 0: + expanded_licences.append(NoLicenceDescription) + else: + licence_matcher = self.GetLicenceMatcher() + rejected_licences: "MutableSequence[str]" = 
[] + for lic in licences: + matched_licence = licence_matcher.matchLicence(lic) + if matched_licence is None: + rejected_licences.append(lic) + if default_licence is not None: + expanded_licences.append(default_licence) + else: + expanded_licences.append(matched_licence) + + if len(rejected_licences) > 0: + if default_licence is None: + raise WFException( + f"Unsupported license URI scheme(s) or Workflow RO-Crate short license(s): {', '.join(rejected_licences)}" + ) + else: + self.logger.warning( + f"Default license {default_licence} used instead of next unsupported license URI scheme(s) or Workflow RO-Crate short license(s): {', '.join(rejected_licences)}" + ) + + return expanded_licences From 57c7989d2b15f0554789422566119a56c82e370b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 17 Jun 2024 15:59:36 +0200 Subject: [PATCH 47/62] Added input and environment injection codepaths (still untested). Also, remove duplicated code moved to wfexs_backend in a previous commit. --- wfexs_backend/workflow.py | 304 ++++++++++++++++++++++---------------- 1 file changed, 176 insertions(+), 128 deletions(-) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 59fffd80..67ca142b 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -52,7 +52,6 @@ CratableItem, DEFAULT_CONTAINER_TYPE, NoCratableItem, - NoLicenceDescription, ResolvedORCID, StagedExecution, ) @@ -247,10 +246,6 @@ from .ro_crate import ( WorkflowRunROCrate, ) -from .utils.licences import ( - AcceptableLicenceSchemes, - LicenceMatcherSingleton, -) from .utils.rocrate import ( ReadROCrateMetadata, ReproducibilityLevel, @@ -2114,6 +2109,7 @@ def materializeInputs( formatted_params: "Union[ParamsBlock, Sequence[Mapping[str, Any]]]", offline: "bool" = False, ignoreCache: "bool" = False, + injectable_inputs: "Optional[Sequence[MaterializedInput]]" = None, lastInput: "int" = 0, ) -> "Sequence[MaterializedInput]": assert ( @@ -2123,12 +2119,23 @@ def materializeInputs( self.extrapolatedInputsDir is not None ), "The working directory should not be corrupted beyond basic usage" + injectable_inputs_dict: "Mapping[str, MaterializedInput]" + if injectable_inputs is not None and not ignoreCache: + injectable_inputs_dict = { + injectable_input.name: injectable_input + for injectable_input in injectable_inputs + } + else: + # Ignore injected inputs salvaged from elsewhere + injectable_inputs_dict = dict() + theParams, numInputs, the_failed_uris = self.fetchInputs( formatted_params, workflowInputs_destdir=self.inputsDir, workflowExtrapolatedInputs_destdir=self.extrapolatedInputsDir, offline=offline, ignoreCache=ignoreCache, + injectable_inputs_dict=injectable_inputs_dict, lastInput=lastInput, ) @@ -2875,12 +2882,54 @@ def _fetchContentWithURIs( return theNewInputs, lastInput, the_failed_uris + def _injectContent( + self, + injectable_content: "Sequence[MaterializedContent]", + dest_path: "pathlib.Path", + pretty_relname: "str", + last_input: "int" = 1, + ) -> "Tuple[MutableSequence[MaterializedContent], int]": + injected_content: "MutableSequence[MaterializedContent]" = [] + for injectable in injectable_content: + # Detecting naming collisions + pretty_filename = injectable.prettyFilename + pretty_rel = pathlib.Path(pretty_filename) + dest_content = dest_path / pretty_rel + if dest_content.exists(): + dest_content = dest_path / pretty_relname + + # Stay here while collisions happen + while dest_content.exists(): + prefix = str(last_input) + "_" + dest_content = dest_path / 
pretty_rel.with_name( + prefix + pretty_rel.name + ) + last_input += 1 + + # Transfer it + dest_content.parent.mkdir(parents=True, exist_ok=True) + link_or_copy_pathlib(injectable.local, dest_content, force_copy=True) + # Second, record it + injected_content.append( + MaterializedContent( + local=dest_content, + licensed_uri=injectable.licensed_uri, + prettyFilename=pretty_filename, + kind=injectable.kind, + metadata_array=injectable.metadata_array, + fingerprint=injectable.fingerprint, + ) + ) + + return injected_content, last_input + def fetchInputs( self, params: "Union[ParamsBlock, Sequence[ParamsBlock]]", workflowInputs_destdir: "AbsPath", workflowExtrapolatedInputs_destdir: "AbsPath", prefix: "str" = "", + injectable_inputs_dict: "Mapping[str, MaterializedInput]" = {}, lastInput: "int" = 0, offline: "bool" = False, ignoreCache: "bool" = False, @@ -3052,105 +3101,131 @@ def fetchInputs( if newInputDestDir.relative_to(inputDestDir): inputDestDir = newInputDestDir - # The storage dir depends on whether it can be cached or not - storeDir: "Union[CacheType, AbsPath]" = ( - CacheType.Input if cacheable else workflowInputs_destdir + remote_pairs: "MutableSequence[MaterializedContent]" = [] + secondary_remote_pairs: "Optional[MutableSequence[MaterializedContent]]" = ( + None ) - remote_files_f: "Sequence[Sch_InputURI_Fetchable]" - if remote_files is not None: - if isinstance( - remote_files, list - ): # more than one input file - remote_files_f = remote_files - else: - remote_files_f = [ - cast("Sch_InputURI_Fetchable", remote_files) - ] - else: - inline_values_l: "Sequence[str]" - if isinstance(inline_values, list): - # more than one inline content - inline_values_l = inline_values - else: - inline_values_l = [cast("str", inline_values)] - - remote_files_f = [ - # The storage dir is always the input - # Let's use the trick of translating the content into a data URL - bin2dataurl(inline_value.encode("utf-8")) - for inline_value in inline_values_l - ] + injectable_input = injectable_inputs_dict.get(linearKey) + if ( + injectable_input is not None + and len(injectable_input.values) > 0 + ): + # Input being injected + remote_pairs, lastInput = self._injectContent( + cast( + "Sequence[MaterializedContent]", + injectable_input.values, + ), + inputDestDir, + last_input=lastInput, + pretty_relname=pretty_relname, + ) - remote_pairs: "MutableSequence[MaterializedContent]" = [] - for remote_file in remote_files_f: - lastInput += 1 - try: - t_remote_pairs = self._fetchRemoteFile( - remote_file, - contextName, - offline, - storeDir, - cacheable, - inputDestDir, - globExplode, - prefix=str(lastInput) + "_", - prettyRelname=pretty_relname, - ignoreCache=this_ignoreCache, - ) - remote_pairs.extend(t_remote_pairs) - except: - self.logger.exception( - f"Error while fetching primary URI {remote_file}" - ) - the_failed_uris.append(remote_file) + if len(remote_pairs) == 0: + # No injected content + # The storage dir depends on whether it can be cached or not + storeDir: "Union[CacheType, AbsPath]" = ( + CacheType.Input + if cacheable + else workflowInputs_destdir + ) - secondary_remote_pairs: "Optional[MutableSequence[MaterializedContent]]" - if (remote_files is not None) and ( - secondary_remote_files is not None - ): - secondary_remote_files_f: "Sequence[Sch_InputURI_Fetchable]" - if isinstance( - secondary_remote_files, list - ): # more than one input file - secondary_remote_files_f = secondary_remote_files + remote_files_f: "Sequence[Sch_InputURI_Fetchable]" + if remote_files is not None: + if isinstance( 
+ remote_files, list + ): # more than one input file + remote_files_f = remote_files + else: + remote_files_f = [ + cast("Sch_InputURI_Fetchable", remote_files) + ] else: - secondary_remote_files_f = [ - cast( - "Sch_InputURI_Fetchable", - secondary_remote_files, - ) + inline_values_l: "Sequence[str]" + if isinstance(inline_values, list): + # more than one inline content + inline_values_l = inline_values + else: + inline_values_l = [cast("str", inline_values)] + + remote_files_f = [ + # The storage dir is always the input + # Let's use the trick of translating the content into a data URL + bin2dataurl(inline_value.encode("utf-8")) + for inline_value in inline_values_l ] - secondary_remote_pairs = [] - for secondary_remote_file in secondary_remote_files_f: - # The last fetched content prefix is the one used - # for all the secondaries + for remote_file in remote_files_f: + lastInput += 1 try: - t_secondary_remote_pairs = ( - self._fetchRemoteFile( - secondary_remote_file, - contextName, - offline, - storeDir, - cacheable, - inputDestDir, - globExplode, - prefix=str(lastInput) + "_", - ignoreCache=ignoreCache, - ) - ) - secondary_remote_pairs.extend( - t_secondary_remote_pairs + t_remote_pairs = self._fetchRemoteFile( + remote_file, + contextName, + offline, + storeDir, + cacheable, + inputDestDir, + globExplode, + prefix=str(lastInput) + "_", + prettyRelname=pretty_relname, + ignoreCache=this_ignoreCache, ) + remote_pairs.extend(t_remote_pairs) except: self.logger.exception( - f"Error while fetching secondary URI {secondary_remote_file}" + f"Error while fetching primary URI {remote_file}" ) - the_failed_uris.append(secondary_remote_file) + the_failed_uris.append(remote_file) - else: - secondary_remote_pairs = None + if (remote_files is not None) and ( + secondary_remote_files is not None + ): + secondary_remote_files_f: "Sequence[Sch_InputURI_Fetchable]" + if isinstance( + secondary_remote_files, list + ): # more than one input file + secondary_remote_files_f = ( + secondary_remote_files + ) + else: + secondary_remote_files_f = [ + cast( + "Sch_InputURI_Fetchable", + secondary_remote_files, + ) + ] + + secondary_remote_pairs = [] + for ( + secondary_remote_file + ) in secondary_remote_files_f: + # The last fetched content prefix is the one used + # for all the secondaries + try: + t_secondary_remote_pairs = ( + self._fetchRemoteFile( + secondary_remote_file, + contextName, + offline, + storeDir, + cacheable, + inputDestDir, + globExplode, + prefix=str(lastInput) + "_", + ignoreCache=ignoreCache, + ) + ) + secondary_remote_pairs.extend( + t_secondary_remote_pairs + ) + except: + self.logger.exception( + f"Error while fetching secondary URI {secondary_remote_file}" + ) + the_failed_uris.append( + secondary_remote_file + ) theInputs.append( MaterializedInput( @@ -3235,6 +3310,7 @@ def fetchInputs( workflowExtrapolatedInputs_destdir=workflowExtrapolatedInputs_destdir, prefix=linearKey + ".", lastInput=lastInput, + injectable_inputs_dict=injectable_inputs_dict, offline=offline, ignoreCache=ignoreCache, ) @@ -3266,12 +3342,18 @@ def stageWorkDir( assert self.formatted_params is not None self.materializedParams = self.materializeInputs( - self.formatted_params, offline=offline, ignoreCache=ignoreCache + self.formatted_params, + offline=offline, + ignoreCache=ignoreCache, + injectable_inputs=self.cached_inputs, ) assert self.formatted_environment is not None self.materializedEnvironment = self.materializeInputs( - self.formatted_environment, offline=offline, ignoreCache=ignoreCache + 
self.formatted_environment, + offline=offline, + ignoreCache=ignoreCache, + injectable_inputs=self.cached_environment, ) self.marshallStage() @@ -3497,30 +3579,6 @@ def exportResultsFromFiles( return self.exportResults(actions, vault, action_ids, fail_ok=fail_ok) - def _curate_licence_list( - self, licences: "Sequence[str]" - ) -> "Sequence[LicenceDescription]": - # As these licences can be in short format, resolve them to URIs - expanded_licences: "MutableSequence[LicenceDescription]" = [] - if len(licences) == 0: - expanded_licences.append(NoLicenceDescription) - else: - licence_matcher = self.GetLicenceMatcher() - rejected_licences: "MutableSequence[str]" = [] - for lic in licences: - matched_licence = licence_matcher.matchLicence(lic) - if matched_licence is None: - rejected_licences.append(lic) - else: - expanded_licences.append(matched_licence) - - if len(rejected_licences) > 0: - raise WFException( - f"Unsupported license URI scheme(s) or Workflow RO-Crate short license(s): {', '.join(rejected_licences)}" - ) - - return expanded_licences - def _curate_orcid_list( self, orcids: "Sequence[str]", fail_ok: "bool" = True ) -> "Sequence[ResolvedORCID]": @@ -3684,7 +3742,7 @@ def exportResults( else: preferred_id = action.preferred_id - expanded_licences = self._curate_licence_list(the_licences) + expanded_licences = self.wfexs.curate_licence_list(the_licences) curated_orcids = self._curate_orcid_list(the_orcids) export_p = self._instantiate_export_plugin( @@ -4953,13 +5011,3 @@ def createResultsResearchObject( self.logger.info("Execution RO-Crate created: {}".format(filename)) return filename - - _LicenceMatcher: "ClassVar[Optional[LicenceMatcher]]" = None - - @classmethod - def GetLicenceMatcher(cls) -> "LicenceMatcher": - if cls._LicenceMatcher is None: - cls._LicenceMatcher = LicenceMatcherSingleton() - assert cls._LicenceMatcher is not None - - return cls._LicenceMatcher From 12949652321f38ac246448a63f28f6b797093cfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Jun 2024 01:47:55 +0200 Subject: [PATCH 48/62] Added a couple of methods to ZipfilePath in order to improve the experience --- wfexs_backend/utils/zipfile_path.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/wfexs_backend/utils/zipfile_path.py b/wfexs_backend/utils/zipfile_path.py index 379cce55..4f9b28a3 100644 --- a/wfexs_backend/utils/zipfile_path.py +++ b/wfexs_backend/utils/zipfile_path.py @@ -413,6 +413,23 @@ def relative_to( # type: ignore[override] ) ) + def resolve(self, strict: "bool" = False) -> "ZipfilePath": + # TODO: better solution + return self.__class__(self._root, self._at) + + def copy_to(self, dest: "pathlib.Path") -> "None": + if self.is_file(): + self._root.extract(self._at, path=dest) + else: + the_members: "Optional[Sequence[str]]" = None + if self._at != "": + the_members = list( + filter( + lambda name: name.startswith(self._at), self._root.namelist() + ) + ) + self._root.extractall(path=dest, members=the_members) + def with_name(self, name: "Union[str, os.PathLike[str]]") -> "ZipfilePath": return self.parent.joinpath(name) From 62724bc883747cdfd35e8a6e77dee43e7610010f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Jun 2024 01:48:05 +0200 Subject: [PATCH 49/62] Added a couple of tweaks to link_or_copy_pathlib so ZipfilePath instances also work as source. 
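A hedged usage sketch of what these tweaks are meant to enable (the archive name and member path are made up for illustration; it assumes `ZipfilePath` accepts an archive filename as its root, as its docstring states):

    import pathlib

    from wfexs_backend.utils.contents import link_or_copy_pathlib
    from wfexs_backend.utils.zipfile_path import ZipfilePath

    # Copy a single member out of a zipped payload into a staging
    # directory; with a ZipfilePath source, link_or_copy_pathlib is
    # expected to fall back to ZipfilePath.copy_to() instead of shutil,
    # since the source does not live on a real filesystem.
    src = ZipfilePath("example-crate.zip").joinpath("workflow/main.nf")
    link_or_copy_pathlib(src, pathlib.Path("staging/main.nf"))
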
--- wfexs_backend/utils/contents.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/wfexs_backend/utils/contents.py b/wfexs_backend/utils/contents.py index 06f84954..4b0c12d0 100644 --- a/wfexs_backend/utils/contents.py +++ b/wfexs_backend/utils/contents.py @@ -34,6 +34,8 @@ magic = lazy_import("magic") # import magic +from .zipfile_path import ZipfilePath + from ..common import ( ContentKind, GeneratedContent, @@ -246,7 +248,11 @@ def link_or_copy( dest: "Union[AnyPath, os.PathLike[str]]", force_copy: "bool" = False, ) -> None: - link_or_copy_pathlib(pathlib.Path(src), pathlib.Path(dest), force_copy=force_copy) + link_or_copy_pathlib( + src if isinstance(src, pathlib.Path) else pathlib.Path(src), + dest if isinstance(dest, pathlib.Path) else pathlib.Path(dest), + force_copy=force_copy, + ) def link_or_copy_pathlib( @@ -281,7 +287,16 @@ def link_or_copy_pathlib( dest_parent.mkdir(parents=True) # Now, link or copy - if src.lstat().st_dev == dest_st_dev and not force_copy: + link_condition = False + try: + link_condition = ( + not isinstance(src, ZipfilePath) + and src.lstat().st_dev == dest_st_dev + and not force_copy + ) + except: + pass + if link_condition: try: if src.is_file(): if dest_exists: @@ -326,13 +341,19 @@ def link_or_copy_pathlib( # as it is in a separated filesystem if dest_exists: dest.unlink() - shutil.copy2(src, dest) + if isinstance(src, ZipfilePath): + src.copy_to(dest) + else: + shutil.copy2(src, dest) else: # Recursively copying the content # as it is in a separated filesystem if dest_exists: shutil.rmtree(dest) - shutil.copytree(src, dest, copy_function=copy2_nofollow) + if isinstance(src, ZipfilePath): + src.copy_to(dest) + else: + shutil.copytree(src, dest, copy_function=copy2_nofollow) def real_unlink_if_exists( From a14228982d21f97e361fd2e78dd3fe157e153e84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Jun 2024 02:12:27 +0200 Subject: [PATCH 50/62] Added fix for copy_to, as Zipfile.extract and Zipfile.extract_all did not behave as expected --- wfexs_backend/utils/zipfile_path.py | 80 +++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 4 deletions(-) diff --git a/wfexs_backend/utils/zipfile_path.py b/wfexs_backend/utils/zipfile_path.py index 4f9b28a3..b4a941ae 100644 --- a/wfexs_backend/utils/zipfile_path.py +++ b/wfexs_backend/utils/zipfile_path.py @@ -13,15 +13,21 @@ import contextlib import functools +import inspect import io import itertools +import logging +import os import pathlib import posixpath +import shutil +import sys + from typing import ( cast, TYPE_CHECKING, ) -import sys + from zipfile import ( ZipFile, ZipInfo, @@ -299,6 +305,13 @@ def __init__( original type, the caller should either create a separate ZipFile object or pass a filename. """ + # Getting a logger focused on specific classes + self.logger = logging.getLogger( + dict(inspect.getmembers(self))["__module__"] + + "::" + + self.__class__.__name__ + ) + self._root = FastLookup.make(root) self._at = at @@ -417,18 +430,77 @@ def resolve(self, strict: "bool" = False) -> "ZipfilePath": # TODO: better solution return self.__class__(self._root, self._at) + def _extract_member( + self, + member: "Union[ZipInfo, str]", + targetpath: "Union[str, os.PathLike[str]]", + pwd: "Optional[bytes]" = None, + ) -> "str": + """ + Method partially borrowed from python 3.12 + """ + """Extract the ZipInfo object 'member' to a physical + file on the path targetpath. 
+ """ + if not isinstance(member, ZipInfo): + member = self._root.getinfo(member) + + # build the destination pathname, replacing + # forward slashes to platform specific separators. + arcname = member.filename.replace("/", os.path.sep) + + if os.path.altsep: + arcname = arcname.replace(os.path.altsep, os.path.sep) + # interpret absolute pathname as relative, remove drive letter or + # UNC path, redundant separators, "." and ".." components. + arcname = os.path.splitdrive(arcname)[1] + invalid_path_parts = ("", os.path.curdir, os.path.pardir) + arcname = os.path.sep.join( + x for x in arcname.split(os.path.sep) if x not in invalid_path_parts + ) + # if os.path.sep == "\\": + # # filter illegal characters on Windows + # arcname = self._root._sanitize_windows_name(arcname, os.path.sep) + + if not arcname and not member.is_dir(): + raise ValueError("Empty filename.") + + targetpath = os.path.normpath(targetpath) + + # Create all upper directories if necessary. + upperdirs = os.path.dirname(targetpath) + if upperdirs and not os.path.exists(upperdirs): + os.makedirs(upperdirs) + + if member.is_dir(): + if not os.path.isdir(targetpath): + os.mkdir(targetpath) + return targetpath + + with self._root.open(member, pwd=pwd) as source, open( + targetpath, "wb" + ) as target: + shutil.copyfileobj(source, target) + + return targetpath + def copy_to(self, dest: "pathlib.Path") -> "None": + self.logger.error(f"UY {self._root.filename} {self._at} {dest}") if self.is_file(): - self._root.extract(self._at, path=dest) + self._extract_member(self._at, dest) else: - the_members: "Optional[Sequence[str]]" = None + the_members: "Sequence[str]" if self._at != "": the_members = list( filter( lambda name: name.startswith(self._at), self._root.namelist() ) ) - self._root.extractall(path=dest, members=the_members) + else: + the_members = self._root.namelist() + for the_member in the_members: + the_partial_member = the_member[len(self._at) :] + self._extract_member(the_member, dest / the_partial_member) def with_name(self, name: "Union[str, os.PathLike[str]]") -> "ZipfilePath": return self.parent.joinpath(name) From db1364d8b8c997889c98ab44ba94bb926e7fd426 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Jun 2024 02:13:21 +0200 Subject: [PATCH 51/62] Add all the scaffolding to propagate the injectable containers --- wfexs_backend/container_factories/__init__.py | 16 ++++++++++++++-- wfexs_backend/workflow_engines/__init__.py | 15 +++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index 54d5869a..9c7e816b 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -692,6 +692,7 @@ def materializeContainers( containers_dir: "Optional[pathlib.Path]" = None, offline: "bool" = False, force: "bool" = False, + injectable_containers: "Sequence[Container]" = [], ) -> "Sequence[Container]": """ It is assured the containers are materialized @@ -703,14 +704,25 @@ def materializeContainers( containers_dir = self.stagedContainersDir for tag in tagList: if self.AcceptsContainer(tag): + # This one can provide partial or complete information + tag_to_use: "ContainerTaggedName" = tag + for injectable_container in injectable_containers: + if ( + injectable_container.origTaggedName == tag.origTaggedName + and injectable_container.source_type == tag.type + and injectable_container.registries == tag.registries + ): + 
tag_to_use = injectable_container + break + container: "Optional[Container]" try: container, was_redeployed = self.deploySingleContainer( - tag, containers_dir=containers_dir, force=force + tag_to_use, containers_dir=containers_dir, force=force ) except ContainerFactoryException as cfe: container = self.materializeSingleContainer( - tag, + tag_to_use, containers_dir=containers_dir, offline=offline, force=force, diff --git a/wfexs_backend/workflow_engines/__init__.py b/wfexs_backend/workflow_engines/__init__.py index 6a7ee54b..46ec8e76 100644 --- a/wfexs_backend/workflow_engines/__init__.py +++ b/wfexs_backend/workflow_engines/__init__.py @@ -251,6 +251,7 @@ def materialize_containers( containersDir: "AnyPath", offline: "bool" = False, force: "bool" = False, + injectable_containers: "Sequence[Container]" = [], ) -> "Tuple[ContainerEngineVersionStr, Sequence[Container], ContainerOperatingSystem, ProcessorArchitecture]": pass @@ -770,6 +771,7 @@ def materialize_containers( containersDir: "Optional[AnyPath]" = None, offline: "bool" = False, force: "bool" = False, + injectable_containers: "Sequence[Container]" = [], ) -> "Tuple[ContainerEngineVersionStr, Sequence[Container], ContainerOperatingSystem, ProcessorArchitecture]": if containersDir is None: containersDirPath = self.stagedContainersDir @@ -783,6 +785,7 @@ def materialize_containers( containers_dir=containersDirPath, offline=offline, force=force, + injectable_containers=injectable_containers, ), *self.container_factory.architecture, ) @@ -863,6 +866,8 @@ def MaterializeWorkflowAndContainers( containersDir: "AbsPath", consolidatedWorkflowDir: "AbsPath", offline: "bool" = False, + injectable_containers: "Sequence[Container]" = [], + injectable_operational_containers: "Sequence[Container]" = [], ) -> "Tuple[MaterializedWorkflowEngine, ContainerEngineVersionStr, ContainerOperatingSystem, ProcessorArchitecture]": matWfEngV2, listOfContainerTags = matWfEng.instance.materializeWorkflow( matWfEng, consolidatedWorkflowDir, offline=offline @@ -874,7 +879,10 @@ def MaterializeWorkflowAndContainers( containerEngineOs, arch, ) = matWfEngV2.instance.materialize_containers( - listOfContainerTags, containersDir, offline=offline + listOfContainerTags, + containersDir, + offline=offline, + injectable_containers=injectable_containers, ) # Next ones are needed by the workflow engine itself @@ -887,7 +895,10 @@ def MaterializeWorkflowAndContainers( _, _, ) = matWfEngV2.instance.materialize_containers( - listOfOperationalContainerTags, containersDir, offline=offline + listOfOperationalContainerTags, + containersDir, + offline=offline, + injectable_containers=injectable_operational_containers, ) except: logging.debug("FIXME materializing containers") From 90b5bd909927fef4d66d6542ccf3617948f1be35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Jun 2024 02:13:57 +0200 Subject: [PATCH 52/62] Add the treatment of injectable containers --- .../container_factories/docker_container.py | 20 ++++++++- .../container_factories/podman_container.py | 24 ++++++++-- .../singularity_container.py | 44 ++++++++++++++++--- 3 files changed, 79 insertions(+), 9 deletions(-) diff --git a/wfexs_backend/container_factories/docker_container.py b/wfexs_backend/container_factories/docker_container.py index 3739aa35..308656bd 100644 --- a/wfexs_backend/container_factories/docker_container.py +++ b/wfexs_backend/container_factories/docker_container.py @@ -76,7 +76,7 @@ DOCKER_PROTO, ) from ..utils.contents import ( - link_or_copy, + 
link_or_copy_pathlib, real_unlink_if_exists, ) from ..utils.digests import ComputeDigestFromFile @@ -461,11 +461,29 @@ def deploySingleContainer( manifestsImageSignature: "Optional[Fingerprint]" = None manifests = None manifest = None + if ( + not containerPath.is_file() + and isinstance(container, Container) + and container.localPath is not None + ): + # Time to inject the image! + link_or_copy_pathlib(container.localPath, containerPath, force_copy=True) + if not containerPath.is_file(): errmsg = f"Docker saved image {containerPath.name} is not in the staged working dir for {tag_name}" self.logger.warning(errmsg) raise ContainerFactoryException(errmsg) + if ( + not containerPathMeta.is_file() + and isinstance(container, Container) + and container.metadataLocalPath is not None + ): + # Time to inject the metadata! + link_or_copy_pathlib( + container.metadataLocalPath, containerPathMeta, force_copy=True + ) + if not containerPathMeta.is_file(): errmsg = f"Docker saved image metadata {containerPathMeta.name} is not in the staged working dir for {tag_name}" self.logger.warning(errmsg) diff --git a/wfexs_backend/container_factories/podman_container.py b/wfexs_backend/container_factories/podman_container.py index 44090b98..051a5ec5 100644 --- a/wfexs_backend/container_factories/podman_container.py +++ b/wfexs_backend/container_factories/podman_container.py @@ -75,7 +75,7 @@ DOCKER_PROTO, ) from ..utils.contents import ( - link_or_copy, + link_or_copy_pathlib, real_unlink_if_exists, ) from ..utils.digests import ComputeDigestFromFile @@ -468,18 +468,36 @@ def deploySingleContainer( manifestsImageSignature: "Optional[Fingerprint]" = None manifests = None manifest = None + if ( + not containerPath.is_file() + and isinstance(container, Container) + and container.localPath is not None + ): + # Time to inject the image! + link_or_copy_pathlib(container.localPath, containerPath, force_copy=True) + if not containerPath.is_file(): errmsg = f"Podman saved image {containerPath.name} is not in the staged working dir for {tag_name}" self.logger.warning(errmsg) raise ContainerFactoryException(errmsg) - if not os.path.isfile(containerPathMeta): + if ( + not containerPathMeta.is_file() + and isinstance(container, Container) + and container.metadataLocalPath is not None + ): + # Time to inject the metadata! 
+ link_or_copy_pathlib( + container.metadataLocalPath, containerPathMeta, force_copy=True + ) + + if not containerPathMeta.is_file(): errmsg = f"FATAL ERROR: Podman saved image metadata {containerPathMeta.name} is not in the staged working dir for {tag_name}" self.logger.error(errmsg) raise ContainerFactoryException(errmsg) try: - with open(containerPathMeta, mode="r", encoding="utf-8") as mH: + with containerPathMeta.open(mode="r", encoding="utf-8") as mH: signaturesAndManifest = cast("DockerManifestMetadata", json.load(mH)) imageSignature_in_metadata = signaturesAndManifest["image_signature"] manifestsImageSignature = signaturesAndManifest["manifests_signature"] diff --git a/wfexs_backend/container_factories/singularity_container.py b/wfexs_backend/container_factories/singularity_container.py index fb50cabf..5afb9021 100644 --- a/wfexs_backend/container_factories/singularity_container.py +++ b/wfexs_backend/container_factories/singularity_container.py @@ -94,7 +94,9 @@ class SingularityManifest(AbstractImageManifestMetadata): DOCKER_SCHEME, ) -from ..utils.contents import link_or_copy +from ..utils.contents import ( + link_or_copy_pathlib, +) from ..utils.docker import DockerHelper @@ -663,7 +665,7 @@ def _materializeSingleContainerSing( if tmpContainerPath is None: assert localContainerPath is not None tmpContainerPath = self.cc_handler._genTmpContainerPath() - link_or_copy(localContainerPath, tmpContainerPath) + link_or_copy_pathlib(localContainerPath, tmpContainerPath) tmpContainerPathMeta = tmpContainerPath.with_name( tmpContainerPath.name + META_JSON_POSTFIX ) @@ -770,6 +772,7 @@ def materializeContainers( containers_dir: "Optional[pathlib.Path]" = None, offline: "bool" = False, force: "bool" = False, + injectable_containers: "Sequence[Container]" = [], ) -> "Sequence[Container]": """ It is assured the containers are materialized @@ -789,14 +792,24 @@ def materializeContainers( if not self.AcceptsContainer(tag): continue + tag_to_use: "ContainerTaggedName" = tag + for injectable_container in injectable_containers: + if ( + injectable_container.origTaggedName == tag.origTaggedName + and injectable_container.source_type == tag.type + and injectable_container.registries == tag.registries + ): + tag_to_use = injectable_container + break + matched_container: "Union[Container, FailedContainerTag]" try: matched_container, was_redeployed = self.deploySingleContainer( - tag, containers_dir=containers_dir, force=force + tag_to_use, containers_dir=containers_dir, force=force ) except ContainerFactoryException as cfe: matched_container = self._materializeSingleContainerSing( - tag, + tag_to_use, matEnv=matEnv, dhelp=dhelp, containers_dir=containers_dir, @@ -839,11 +852,32 @@ def deploySingleContainer( container, containers_dir ) + was_redeployed = False + if ( + not containerPath.is_file() + and isinstance(container, Container) + and container.localPath is not None + ): + # Time to inject the image! + link_or_copy_pathlib(container.localPath, containerPath, force_copy=True) + was_redeployed = True + if not containerPath.is_file(): errmsg = f"SIF saved image {containerPath.name} is not in the staged working dir for {container.origTaggedName}" self.logger.warning(errmsg) raise ContainerFactoryException(errmsg) + if ( + not containerPathMeta.is_file() + and isinstance(container, Container) + and container.metadataLocalPath is not None + ): + # Time to inject the metadata! 
+ link_or_copy_pathlib( + container.metadataLocalPath, containerPathMeta, force_copy=True + ) + was_redeployed = True + if not containerPathMeta.is_file(): errmsg = f"SIF saved image metadata {containerPathMeta.name} is not in the staged working dir for {container.origTaggedName}" self.logger.warning(errmsg) @@ -912,4 +946,4 @@ def deploySingleContainer( self.logger.exception(errmsg) raise ContainerFactoryException(errmsg) - return rebuilt_container, False + return rebuilt_container, was_redeployed From 7d516ad55f9a28d16dbbffc968ccc5b7d1180ae5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Jun 2024 02:15:02 +0200 Subject: [PATCH 53/62] Added all the code to process the injection of workflow, containers, etc... PS: The code still fails in some points --- wfexs_backend/cache_handler.py | 14 +- wfexs_backend/utils/passphrase_wrapper.py | 6 +- wfexs_backend/utils/rocrate.py | 32 ++--- wfexs_backend/wfexs_backend.py | 32 ++--- wfexs_backend/workflow.py | 163 +++++++++++++++++----- 5 files changed, 173 insertions(+), 74 deletions(-) diff --git a/wfexs_backend/cache_handler.py b/wfexs_backend/cache_handler.py index ad095271..8d3bb52a 100644 --- a/wfexs_backend/cache_handler.py +++ b/wfexs_backend/cache_handler.py @@ -26,6 +26,7 @@ import logging import os import os.path +import pathlib import shutil import traceback import types @@ -140,7 +141,7 @@ class CacheMetadataDict(TypedDict): class CachedContent(NamedTuple): kind: "ContentKind" - path: "AbsPath" + path: "pathlib.Path" metadata_array: "Sequence[URIWithMetadata]" licences: "Tuple[URIType, ...]" fingerprint: "Optional[Fingerprint]" = None @@ -565,7 +566,7 @@ def inject( finalCachedFilename: "Optional[AbsPath]" = None, tempCachedFilename: "Optional[AbsPath]" = None, inputKind: "Optional[ContentKind]" = None, - ) -> "Tuple[Optional[AbsPath], Optional[Fingerprint]]": + ) -> "Tuple[Optional[pathlib.Path], Optional[Fingerprint]]": if destdir is None: destdir = self.cacheDir @@ -600,7 +601,12 @@ def inject( if do_copy: link_or_copy(tempCachedFilename, newFinalCachedFilename) - return newFinalCachedFilename, fingerprint + return ( + pathlib.Path(newFinalCachedFilename) + if newFinalCachedFilename is not None + else None, + fingerprint, + ) def _inject( self, @@ -1205,7 +1211,7 @@ def fetch( return CachedContent( kind=inputKind, - path=finalCachedFilename, + path=pathlib.Path(finalCachedFilename), metadata_array=metadata_array, licences=tuple(licences), fingerprint=final_fingerprint, diff --git a/wfexs_backend/utils/passphrase_wrapper.py b/wfexs_backend/utils/passphrase_wrapper.py index c5c11210..23c2d121 100644 --- a/wfexs_backend/utils/passphrase_wrapper.py +++ b/wfexs_backend/utils/passphrase_wrapper.py @@ -157,7 +157,7 @@ def _materialize_word_sets( # Prepare the compressed index with tempfile.NamedTemporaryFile() as tmp_indexed_filename: CompressedIndexedText.IndexTextFile( - i_cached_content.path, + i_cached_content.path.as_posix(), tmp_indexed_filename.name, substart=remote_wordlist.substart, subend=remote_wordlist.subend, @@ -175,7 +175,9 @@ def _materialize_word_sets( indexed_filenames.append(indexed_filename) - word_sets[wordlist_tag] = CompressedIndexedText(cfiles=indexed_filenames) + word_sets[wordlist_tag] = CompressedIndexedText( + cfiles=list(map(lambda infil: infil.as_posix(), indexed_filenames)) + ) return word_sets diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index eeee9d7b..5424612a 100644 --- a/wfexs_backend/utils/rocrate.py +++ 
b/wfexs_backend/utils/rocrate.py @@ -231,28 +231,26 @@ class ROCrateToolboxException(Exception): def ReadROCrateMetadata( - workflowROCrateFilename: "str", public_name: "str" + workflowROCrateFilename: "pathlib.Path", public_name: "str" ) -> "Tuple[Any, Optional[pathlib.Path]]": # Is it a bare file or an archive? - jsonld_filename: "Optional[str]" = None + jsonld_filename: "Optional[pathlib.Path]" = None payload_dir: "Optional[pathlib.Path]" = None - if os.path.isdir(workflowROCrateFilename): - possible_jsonld_filename = os.path.join( - workflowROCrateFilename, ROCRATE_JSONLD_FILENAME + if workflowROCrateFilename.is_dir(): + possible_jsonld_filename = workflowROCrateFilename / ROCRATE_JSONLD_FILENAME + legacy_jsonld_filename = ( + workflowROCrateFilename / LEGACY_ROCRATE_JSONLD_FILENAME ) - legacy_jsonld_filename = os.path.join( - workflowROCrateFilename, LEGACY_ROCRATE_JSONLD_FILENAME - ) - if os.path.exists(possible_jsonld_filename): + if possible_jsonld_filename.exists(): jsonld_filename = possible_jsonld_filename - elif os.path.exists(legacy_jsonld_filename): + elif legacy_jsonld_filename.exists(): jsonld_filename = legacy_jsonld_filename else: raise ROCrateToolboxException( f"{public_name} does not contain a member {ROCRATE_JSONLD_FILENAME} or {LEGACY_ROCRATE_JSONLD_FILENAME}" ) - payload_dir = pathlib.Path(workflowROCrateFilename) - elif os.path.isfile(workflowROCrateFilename): + payload_dir = workflowROCrateFilename + elif workflowROCrateFilename.is_file(): jsonld_filename = workflowROCrateFilename else: raise ROCrateToolboxException( @@ -264,7 +262,7 @@ def ReadROCrateMetadata( putative_mime = mag.from_file(os.path.realpath(jsonld_filename)) # Bare possible RO-Crate if putative_mime == "application/json": - with open(jsonld_filename, mode="rb") as jdf: + with jsonld_filename.open(mode="rb") as jdf: jsonld_bin = jdf.read() # Archived possible RO-Crate elif putative_mime == "application/zip": @@ -524,12 +522,12 @@ def identifyROCrate( } { FILTER NOT EXISTS { - ?mainentity s:isBasedOn ?origmainentity . - ?origmainentity + ?mainentity s:isBasedOn ?somemainentity . + ?somemainentity a bs:ComputationalWorkflow ; - dcterms:conformsTo ?bsworkflowprofile . + dcterms:conformsTo ?somebsworkflowprofile . FILTER ( - STRSTARTS(str(?bsworkflowprofile), str(bswfprofile:)) + STRSTARTS(str(?somebsworkflowprofile), str(bswfprofile:)) ) . 
} BIND (?mainentity AS ?origmainentity) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index b81a7b05..abde8396 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -1366,7 +1366,7 @@ def fromPreviousROCrate( # or a remote RO-Crate parsedROCrateURI = urllib.parse.urlparse(workflowROCrateFilenameOrURI) if parsedROCrateURI.scheme == "": - workflowROCrateFilename = cast("AnyPath", workflowROCrateFilenameOrURI) + workflowROCrateFilename = pathlib.Path(workflowROCrateFilenameOrURI) else: self.logger.info(f"* Fetching RO-Crate {workflowROCrateFilenameOrURI}") local_content = self.cacheFetch( @@ -1913,7 +1913,7 @@ def cacheFetch( ) return CachedContent( kind=ContentKind.Directory - if os.path.isdir(workflow_dir) + if workflow_dir.is_dir() else ContentKind.File, path=workflow_dir, metadata_array=[], @@ -1964,7 +1964,7 @@ def cacheWorkflow( registerInCache: "bool" = True, offline: "bool" = False, meta_dir: "Optional[AbsPath]" = None, - ) -> "Tuple[AbsPath, RemoteRepo, Optional[WorkflowType], Optional[RepoTag]]": + ) -> "Tuple[pathlib.Path, RemoteRepo, Optional[WorkflowType], Optional[RepoTag]]": """ Fetch the whole workflow description based on the data obtained from the TRS where it is being published. @@ -1998,9 +1998,9 @@ def cacheWorkflow( i_workflow: "Optional[IdentifiedWorkflow]" = None engineDesc: "Optional[WorkflowType]" = None guessedRepo: "Optional[RemoteRepo]" = None - repoDir: "Optional[AbsPath]" = None + repoDir: "Optional[pathlib.Path]" = None putative: "bool" = False - cached_putative_path: "Optional[AbsPath]" = None + cached_putative_path: "Optional[pathlib.Path]" = None if parsedRepoURL.scheme in ("", TRS_SCHEME_PREFIX): # Extracting the TRS endpoint details from the parsedRepoURL if parsedRepoURL.scheme == TRS_SCHEME_PREFIX: @@ -2136,7 +2136,7 @@ def getWorkflowRepoFromTRS( offline: "bool" = False, ignoreCache: "bool" = False, meta_dir: "Optional[AbsPath]" = None, - ) -> "Tuple[IdentifiedWorkflow, Optional[AbsPath]]": + ) -> "Tuple[IdentifiedWorkflow, Optional[pathlib.Path]]": """ :return: @@ -2413,7 +2413,7 @@ def doMaterializeRepo( repo: "RemoteRepo", doUpdate: "bool" = True, registerInCache: "bool" = True, - ) -> "Tuple[AbsPath, RepoTag]": + ) -> "Tuple[pathlib.Path, RepoTag]": if repo.repo_type not in (RepoType.Other, RepoType.SoftwareHeritage): ( remote_url, @@ -2445,7 +2445,7 @@ def doMaterializeRepo( inputKind=kind, ) - return repo_path, repo_effective_checkout + return pathlib.Path(repo_path), repo_effective_checkout def _doMaterializeGitRepo( self, @@ -2537,7 +2537,7 @@ def getWorkflowBundleFromURI( offline: "bool" = False, ignoreCache: "bool" = False, registerInCache: "bool" = True, - ) -> "Tuple[Optional[IdentifiedWorkflow], AbsPath, Sequence[URIWithMetadata]]": + ) -> "Tuple[Optional[IdentifiedWorkflow], pathlib.Path, Sequence[URIWithMetadata]]": try: cached_content = self.cacheFetch( remote_url, @@ -2568,8 +2568,8 @@ def getWorkflowBundleFromURI( ) crate_hashed_id = hashlib.sha1(remote_url.encode("utf-8")).hexdigest() - roCrateFile = os.path.join( - self.cacheROCrateDir, crate_hashed_id + self.DEFAULT_RO_EXTENSION + roCrateFile = pathlib.Path(self.cacheROCrateDir) / ( + crate_hashed_id + self.DEFAULT_RO_EXTENSION ) if not os.path.exists(roCrateFile): if os.path.lexists(roCrateFile): @@ -2580,10 +2580,8 @@ def getWorkflowBundleFromURI( ) return ( - self.getWorkflowRepoFromROCrateFile( - cast("AbsPath", roCrateFile), expectedEngineDesc - ), - cast("AbsPath", roCrateFile), + 
self.getWorkflowRepoFromROCrateFile(roCrateFile, expectedEngineDesc), + roCrateFile, cached_content.metadata_array, ) else: @@ -2595,7 +2593,7 @@ def getWorkflowBundleFromURI( def getWorkflowRepoFromROCrateFile( self, - roCrateFile: "AbsPath", + roCrateFile: "pathlib.Path", expectedEngineDesc: "Optional[WorkflowType]" = None, ) -> "IdentifiedWorkflow": """ @@ -2605,7 +2603,7 @@ def getWorkflowRepoFromROCrateFile( :return: """ - public_name = roCrateFile + public_name = str(roCrateFile) jsonld_obj, payload_dir = ReadROCrateMetadata( roCrateFile, public_name=public_name ) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 67ca142b..020b96b0 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -459,12 +459,13 @@ def __init__( private_key_filename: "Optional[AnyPath]" = None, private_key_passphrase: "Optional[str]" = None, fail_ok: "bool" = False, + cached_repo: "Optional[Tuple[RemoteRepo, WorkflowType]]" = None, cached_workflow: "Optional[LocalWorkflow]" = None, cached_inputs: "Optional[Sequence[MaterializedInput]]" = None, cached_environment: "Optional[Sequence[MaterializedInput]]" = None, preferred_containers: "Sequence[Container]" = [], preferred_operational_containers: "Sequence[Container]" = [], - reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Metadata, + reproducibility_level: "ReproducibilityLevel" = ReproducibilityLevel.Minimal, strict_reproducibility_level: "bool" = False, ): """ @@ -519,6 +520,7 @@ def __init__( # These internal variables are needed for imports. # They are not preserved in the marshalled staging state, so # their effects are only in the initial session + self.cached_repo = cached_repo self.cached_workflow = cached_workflow self.cached_inputs = cached_inputs self.cached_environment = cached_environment @@ -1347,6 +1349,7 @@ def FromStagedRecipe( private_key_filename: "Optional[AnyPath]" = None, private_key_passphrase: "Optional[str]" = None, paranoidMode: "bool" = False, + cached_repo: "Optional[Tuple[RemoteRepo, WorkflowType]]" = None, cached_workflow: "Optional[LocalWorkflow]" = None, cached_inputs: "Optional[Sequence[MaterializedInput]]" = None, cached_environment: "Optional[Sequence[MaterializedInput]]" = None, @@ -1385,6 +1388,7 @@ def FromStagedRecipe( public_key_filenames=public_key_filenames, private_key_filename=private_key_filename, private_key_passphrase=private_key_passphrase, + cached_repo=cached_repo, cached_workflow=cached_workflow, cached_inputs=cached_inputs, cached_environment=cached_environment, @@ -1436,6 +1440,7 @@ def FromPreviousInstanceDeclaration( the_containers: "Sequence[Container]" = [] the_operational_containers: "Sequence[Container]" = [] cached_workflow: "Optional[LocalWorkflow]" = None + cached_repo: "Optional[Tuple[RemoteRepo, WorkflowType]]" = None if reproducibility_level >= ReproducibilityLevel.Full: if wfInstance.materializedParams is not None: cached_inputs = copy.copy(wfInstance.materializedParams) @@ -1484,6 +1489,10 @@ def FromPreviousInstanceDeclaration( wfInstance.materializedEngine.operational_containers ) + if reproducibility_level >= ReproducibilityLevel.Metadata: + if wfInstance.remote_repo is not None and wfInstance.engineDesc is not None: + cached_repo = (wfInstance.remote_repo, wfInstance.engineDesc) + cached_workflow = wfInstance.getMaterializedWorkflow() # We have to reset the inherited paranoid mode and nickname @@ -1504,6 +1513,7 @@ def FromPreviousInstanceDeclaration( private_key_filename=private_key_filename, 
private_key_passphrase=private_key_passphrase, paranoidMode=paranoidMode, + cached_repo=cached_repo, cached_workflow=cached_workflow, cached_inputs=cached_inputs, cached_environment=cached_environment, @@ -1563,7 +1573,7 @@ def _transferInputs( def FromPreviousROCrate( cls, wfexs: "WfExSBackend", - workflowROCrateFilename: "AnyPath", + workflowROCrateFilename: "pathlib.Path", public_name: "str", # Mainly used for provenance and exceptions securityContextsConfigFilename: "Optional[AnyPath]" = None, replaced_parameters_filename: "Optional[AnyPath]" = None, @@ -1681,6 +1691,7 @@ def FromPreviousROCrate( private_key_filename=private_key_filename, private_key_passphrase=private_key_passphrase, paranoidMode=paranoidMode, + cached_repo=(repo, workflow_type), cached_workflow=cached_workflow, cached_inputs=cached_inputs, cached_environment=cached_environment, @@ -1701,6 +1712,7 @@ def FromDescription( private_key_filename: "Optional[AnyPath]" = None, private_key_passphrase: "Optional[str]" = None, paranoidMode: "bool" = False, + cached_repo: "Optional[Tuple[RemoteRepo, WorkflowType]]" = None, cached_workflow: "Optional[LocalWorkflow]" = None, cached_inputs: "Optional[Sequence[MaterializedInput]]" = None, cached_environment: "Optional[Sequence[MaterializedInput]]" = None, @@ -1746,6 +1758,7 @@ def FromDescription( private_key_filename=private_key_filename, private_key_passphrase=private_key_passphrase, paranoid_mode=paranoidMode, + cached_repo=cached_repo, cached_workflow=cached_workflow, cached_inputs=cached_inputs, cached_environment=cached_environment, @@ -1804,6 +1817,8 @@ def fetchWorkflow( descriptor_type: "Optional[TRS_Workflow_Descriptor]", offline: "bool" = False, ignoreCache: "bool" = False, + injectable_repo: "Optional[Tuple[RemoteRepo, WorkflowType]]" = None, + injectable_workflow: "Optional[LocalWorkflow]" = None, ) -> None: """ Fetch the whole workflow description based on the data obtained @@ -1815,24 +1830,51 @@ def fetchWorkflow( """ assert self.metaDir is not None - assert self.workflowDir is not None + assert self.workflowDir is not None, "The workflow directory should be defined" + workflow_dir = pathlib.Path(self.workflowDir) - repoDir: "Optional[AbsPath]" = None + repoDir: "Optional[pathlib.Path]" = None + injected_workflow: "Optional[LocalWorkflow]" = None + rel_path_files: "Optional[Sequence[Union[RelPath, URIType]]]" = None if self.remote_repo is None or ignoreCache: - ( - repoDir, - repo, - self.engineDesc, - repoEffectiveCheckout, - ) = self.wfexs.cacheWorkflow( - workflow_id=workflow_id, - version_id=version_id, - trs_endpoint=trs_endpoint, - descriptor_type=descriptor_type, - ignoreCache=ignoreCache, - offline=offline, - meta_dir=self.metaDir, - ) + repoEffectiveCheckout: "Optional[RepoTag]" + # Injectable repo info is a precondition for injectable local workflow + if injectable_repo is not None: + repo, self.engineDesc = injectable_repo + + parsedRepoURL = urllib.parse.urlparse(repo.repo_url) + assert ( + len(parsedRepoURL.scheme) > 0 + ), f"Repository id {repo.repo_url} should be a parsable URI" + + if not ignoreCache and injectable_workflow is not None: + # Injectable repo info is a precondition for injectable local workflow + repoEffectiveCheckout = repo.checkout + repoDir = injectable_workflow.dir + injected_workflow = injectable_workflow + rel_path_files = injectable_workflow.relPathFiles + else: + repoDir, repoEffectiveCheckout = self.wfexs.doMaterializeRepo( + repo, + doUpdate=ignoreCache, + # registerInCache=True, + ) + else: + ( + repoDir, + repo, + 
self.engineDesc, + repoEffectiveCheckout, + ) = self.wfexs.cacheWorkflow( + workflow_id=workflow_id, + version_id=version_id, + trs_endpoint=trs_endpoint, + descriptor_type=descriptor_type, + ignoreCache=ignoreCache, + offline=offline, + meta_dir=self.metaDir, + ) + self.remote_repo = repo # These are kept for compatibility self.repoURL = repo.repo_url @@ -1842,31 +1884,47 @@ def fetchWorkflow( # Workflow Language version cannot be assumed here yet # A copy of the workflows is kept - assert ( - self.workflowDir is not None - ), "The workflow directory should be defined" - if os.path.isdir(self.workflowDir): - shutil.rmtree(self.workflowDir) + if workflow_dir.is_dir(): + shutil.rmtree(workflow_dir) # force_copy is needed to isolate the copy of the workflow # so local modifications in a working directory does not # poison the cached workflow - if os.path.isdir(repoDir): - link_or_copy(repoDir, self.workflowDir, force_copy=True) + if injected_workflow is not None: + if ( + injected_workflow.relPath is not None + and len(injected_workflow.relPath) > 0 + ): + link_or_copy_pathlib( + injected_workflow.dir / injected_workflow.relPath, + workflow_dir / injected_workflow.relPath, + force_copy=True, + ) + + if rel_path_files is not None: + for inj in rel_path_files: + link_or_copy_pathlib( + injected_workflow.dir / inj, + workflow_dir / inj, + force_copy=True, + ) + elif repoDir.is_dir(): + link_or_copy_pathlib(repoDir, workflow_dir, force_copy=True) else: - os.makedirs(self.workflowDir, exist_ok=True) + workflow_dir.mkdir(parents=True, exist_ok=True) if self.repoRelPath is None: self.repoRelPath = cast("RelPath", "workflow.entrypoint") - link_or_copy( + link_or_copy_pathlib( repoDir, - cast("AbsPath", os.path.join(self.workflowDir, self.repoRelPath)), + workflow_dir / self.repoRelPath, force_copy=True, ) # We cannot know yet the dependencies localWorkflow = LocalWorkflow( - dir=pathlib.Path(self.workflowDir), + dir=workflow_dir, relPath=self.repoRelPath, effectiveCheckout=self.repoEffectiveCheckout, + relPathFiles=rel_path_files, ) self.logger.info( "materialized workflow repository (checkout {}): {}".format( @@ -1929,6 +1987,8 @@ def setupEngine( offline: "bool" = False, ignoreCache: "bool" = False, initial_engine_version: "Optional[EngineVersion]" = None, + injectable_repo: "Optional[Tuple[RemoteRepo, WorkflowType]]" = None, + injectable_workflow: "Optional[LocalWorkflow]" = None, ) -> None: # The engine is populated by self.fetchWorkflow() if self.engine is None: @@ -1940,6 +2000,8 @@ def setupEngine( self.descriptor_type, offline=offline, ignoreCache=ignoreCache, + injectable_repo=injectable_repo, + injectable_workflow=injectable_workflow, ) assert ( @@ -2002,10 +2064,22 @@ def setupEngine( self.materializedEngine = matWfEngV2 def materializeWorkflowAndContainers( - self, offline: "bool" = False, ignoreCache: "bool" = False + self, + offline: "bool" = False, + ignoreCache: "bool" = False, + injectable_repo: "Optional[Tuple[RemoteRepo, WorkflowType]]" = None, + injectable_workflow: "Optional[LocalWorkflow]" = None, + injectable_containers: "Sequence[Container]" = [], + injectable_operational_containers: "Sequence[Container]" = [], ) -> None: if self.materializedEngine is None: - self.setupEngine(offline=offline, ignoreCache=ignoreCache) + # Only inject on first try + self.setupEngine( + offline=offline, + ignoreCache=ignoreCache, + injectable_repo=injectable_repo, + injectable_workflow=injectable_workflow, + ) assert ( self.materializedEngine is not None @@ -2031,6 +2105,8 @@ def 
materializeWorkflowAndContainers( self.containersDir, self.consolidatedWorkflowDir, offline=offline, + injectable_containers=injectable_containers, + injectable_operational_containers=injectable_operational_containers, ) # DEPRECATED? @@ -3338,14 +3414,31 @@ def stageWorkDir( # self.fetchWorkflow(self.id, self.version_id, self.trs_endpoint, self.descriptor_type) # This method is called from within materializeWorkflowAndContainers # self.setupEngine(offline=offline) - self.materializeWorkflowAndContainers(offline=offline, ignoreCache=ignoreCache) + self.materializeWorkflowAndContainers( + offline=offline, + ignoreCache=ignoreCache, + injectable_repo=self.cached_repo + if self.reproducibility_level >= ReproducibilityLevel.Metadata + else None, + injectable_workflow=self.cached_workflow + if self.reproducibility_level >= ReproducibilityLevel.Full + else None, + injectable_containers=self.preferred_containers + if self.reproducibility_level >= ReproducibilityLevel.Metadata + else [], + injectable_operational_containers=self.preferred_operational_containers + if self.reproducibility_level >= ReproducibilityLevel.Metadata + else [], + ) assert self.formatted_params is not None self.materializedParams = self.materializeInputs( self.formatted_params, offline=offline, ignoreCache=ignoreCache, - injectable_inputs=self.cached_inputs, + injectable_inputs=self.cached_inputs + if self.reproducibility_level >= ReproducibilityLevel.Metadata + else None, ) assert self.formatted_environment is not None @@ -3353,7 +3446,9 @@ def stageWorkDir( self.formatted_environment, offline=offline, ignoreCache=ignoreCache, - injectable_inputs=self.cached_environment, + injectable_inputs=self.cached_environment + if self.reproducibility_level >= ReproducibilityLevel.Metadata + else None, ) self.marshallStage() From d774c6ce6c244d09da78931d8d0ebd898e4743cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Jun 2024 12:57:44 +0200 Subject: [PATCH 54/62] Added pyld caching capabilities --- requirements.txt | 4 +- wfexs_backend/utils/pyld_caching.py | 195 ++++++++++++++++++++++++++++ wfexs_backend/utils/rocrate.py | 10 +- 3 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 wfexs_backend/utils/pyld_caching.py diff --git a/requirements.txt b/requirements.txt index 9110abea..760bdca3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ pyyaml jsonpath-ng jsonschema[format_nongpl] >= 3.0.0 -jsonschema[format_nongpl] < 4.18.0 ; python_version <= '3.7' +jsonschema[format_nongpl] < 4.18.0 ; python_version < '3.8' jsonpointer python-magic >= 0.4.27 paramiko[ed25519] @@ -9,6 +9,8 @@ certifi crypt4gh bagit aioftp +aiohttp-client-cache[sqlite] >= 0.11.0 ; python_version >= '3.8' +aiohttp-client-cache[sqlite] < 0.11.0 ; python_version < '3.8' openpyxl xlrd2 dulwich diff --git a/wfexs_backend/utils/pyld_caching.py b/wfexs_backend/utils/pyld_caching.py new file mode 100644 index 00000000..12c77c7a --- /dev/null +++ b/wfexs_backend/utils/pyld_caching.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2020-2024 Barcelona Supercomputing Center (BSC), Spain +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import + +from typing import ( + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from typing import ( + Any, + Callable, + Mapping, + Optional, + ) + +import asyncio +import aiohttp +from aiohttp_client_cache.session import CachedSession +from aiohttp_client_cache.backends.sqlite import SQLiteBackend +import pyld # type: ignore[import, import-untyped] +import re +import string +import urllib.parse + + +def aiohttp_caching_document_loader( + cache_file: "str", + loop: "Optional[asyncio.AbstractEventLoop]" = None, + secure: "bool" = False, + **kwargs: "Any", +) -> "Callable[[str, Mapping[str, Mapping[str, str]]], Mapping[str, Any]]": + """ + This code is based on aiohttp_document_loader from https://raw.githubusercontent.com/digitalbazaar/pyld/2c6b0a65bee700b42c8d0806364f4fc4ebddcc52/lib/pyld/documentloader/aiohttp.py + """ + """ + Create an Asynchronous document loader using aiohttp. + + :param loop: the event loop used for processing HTTP requests. + :param secure: require all requests to use HTTPS (default: False). + :param **kwargs: extra keyword args for the aiohttp request get() call. + + :return: the RemoteDocument loader function. + """ + + if loop is None: + loop = asyncio.get_event_loop() + + async def async_caching_loader( + url: "str", headers: "Mapping[str, str]" + ) -> "Mapping[str, Any]": + """ + Retrieves JSON-LD at the given URL asynchronously. + + :param url: the URL to retrieve. + + :return: the RemoteDocument. 
+ """ + try: + # validate URL + pieces = urllib.parse.urlparse(url) + if ( + not all([pieces.scheme, pieces.netloc]) + or pieces.scheme not in ["http", "https"] + or set(pieces.netloc) + > set(string.ascii_letters + string.digits + "-.:") + ): + raise pyld.jsonld.JsonLdError( + "URL could not be dereferenced; " + 'only "http" and "https" URLs are supported.', + "jsonld.InvalidUrl", + {"url": url}, + code="loading document failed", + ) + if secure and pieces.scheme != "https": + raise pyld.jsonld.JsonLdError( + "URL could not be dereferenced; " + "secure mode enabled and " + 'the URL\'s scheme is not "https".', + "jsonld.InvalidUrl", + {"url": url}, + code="loading document failed", + ) + async with CachedSession( + cache=SQLiteBackend(cache_file), + loop=loop, + ) as session: + async with session.get( + url, + headers=headers, + **kwargs, + ) as response: + # Allow any content_type in trying to parse json + # similar to requests library + json_body = await response.json(content_type=None) + content_type = response.headers.get("content-type") + if not content_type: + content_type = "application/octet-stream" + doc = { + "contentType": content_type, + "contextUrl": None, + "documentUrl": response.url.human_repr(), + "document": json_body, + } + link_header = response.headers.get("link") + if link_header: + linked_context = pyld.jsonld.parse_link_header(link_header).get( + pyld.jsonld.LINK_HEADER_REL + ) + # only 1 related link header permitted + if linked_context and content_type != "application/ld+json": + if isinstance(linked_context, list): + raise pyld.jsonld.JsonLdError( + "URL could not be dereferenced, " + "it has more than one " + "associated HTTP Link Header.", + "jsonld.LoadDocumentError", + {"url": url}, + code="multiple context link headers", + ) + doc["contextUrl"] = linked_context["target"] + linked_alternate = pyld.jsonld.parse_link_header( + link_header + ).get("alternate") + # if not JSON-LD, alternate may point there + if ( + linked_alternate + and linked_alternate.get("type") == "application/ld+json" + and not re.match( + r"^application\/(\w*\+)?json$", content_type + ) + ): + doc["contentType"] = "application/ld+json" + doc["documentUrl"] = pyld.jsonld.prepend_base( + url, linked_alternate["target"] + ) + + return doc + except pyld.jsonld.JsonLdError as e: + raise e + except Exception as cause: + raise pyld.jsonld.JsonLdError( + "Could not retrieve a JSON-LD document from the URL.", + "jsonld.LoadDocumentError", + code="loading document failed", + cause=cause, + ) + + def loader( + url: "str", options: "Mapping[str, Mapping[str, str]]" = {} + ) -> "Mapping[str, Any]": + """ + Retrieves JSON-LD at the given URL. + + :param url: the URL to retrieve. + + :return: the RemoteDocument. 
+ """ + return loop.run_until_complete( + async_caching_loader( + url, + options.get( + "headers", + { + "Accept": "application/ld+json, application/json", + }, + ), + ) + ) + + return loader + + +def hook_pyld_cache(cache_file: "str") -> "None": + pyld.jsonld.set_document_loader( + aiohttp_caching_document_loader( + cache_file=cache_file, + timeout=10, + ) + ) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 5424612a..6f4f144b 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -96,10 +96,10 @@ ) # Needed by pyld to detect it -import aiohttp import pyld # type: ignore[import, import-untyped] import rdflib import rdflib.plugins.sparql +import xdg.BaseDirectory # This code needs exception groups if sys.version_info[:2] < (3, 11): @@ -126,6 +126,10 @@ stringifyDigest, ) +from .pyld_caching import ( + hook_pyld_cache, +) + from ..fetchers import ( RemoteRepo, ) @@ -359,6 +363,10 @@ def __init__(self, wfexs: "WfExSBackend"): self.wfexs = wfexs + # Caching path for the contexts + cache_path = xdg.BaseDirectory.save_cache_path("es.elixir.WfExSJSONLD") + hook_pyld_cache(os.path.join(cache_path, "contexts.db")) + # This is needed for proper behaviour # https://stackoverflow.com/a/6264214 if self.RELATIVE_ROCRATE_SCHEME not in urllib.parse.uses_relative: From affc50b41fe0f772af6234698afd47dbdd7c5726 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Jun 2024 17:49:10 +0200 Subject: [PATCH 55/62] Removed left debugging trace from wfexs_backend.container_factories --- wfexs_backend/container_factories/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index 9c7e816b..b1d72616 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -147,7 +147,6 @@ def _value_defaults_fixes(self) -> None: if isinstance(self.localPath, str): # Properly casting the path self.localPath = pathlib.Path(self.localPath) - print(f"localPath {self.localPath}") # This code is needed for old working directories if self.metadataLocalPath is None and self.localPath is not None: From 8ac8a08ae2fb3c6adb08b1ab3dc9a379d8e8553f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Jun 2024 17:56:19 +0200 Subject: [PATCH 56/62] Removed left debugging trace from wfexs_backend.utils.zipfile_path --- wfexs_backend/utils/zipfile_path.py | 1 - 1 file changed, 1 deletion(-) diff --git a/wfexs_backend/utils/zipfile_path.py b/wfexs_backend/utils/zipfile_path.py index b4a941ae..33381a60 100644 --- a/wfexs_backend/utils/zipfile_path.py +++ b/wfexs_backend/utils/zipfile_path.py @@ -485,7 +485,6 @@ def _extract_member( return targetpath def copy_to(self, dest: "pathlib.Path") -> "None": - self.logger.error(f"UY {self._root.filename} {self._at} {dest}") if self.is_file(): self._extract_member(self._at, dest) else: From cf07faa25a1d1d1671b86fab49b81882fea9e6a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Jun 2024 19:40:32 +0200 Subject: [PATCH 57/62] Added `is_uri` method to wfexs_backend.utils.misc , to be used in a couple of places. 
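A minimal usage sketch of the new helper (illustrative only, not part of the
patch itself; it assumes the function is importable as
wfexs_backend.utils.misc.is_uri, as introduced in the diff below):

    from wfexs_backend.utils.misc import is_uri

    # is_uri() simply reports whether urllib.parse.urlparse() finds a scheme,
    # so absolute URIs pass and bare relative paths do not.
    assert is_uri("https://w3id.org/ro/crate")      # scheme "https" -> True
    assert not is_uri("workflow/entrypoint.cwl")    # no scheme -> False

This mirrors the behaviour relied upon in workflow.py and rocrate.py to tell
remote references apart from payload-relative paths.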
--- wfexs_backend/utils/misc.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/wfexs_backend/utils/misc.py b/wfexs_backend/utils/misc.py index 81d85373..d3447bff 100644 --- a/wfexs_backend/utils/misc.py +++ b/wfexs_backend/utils/misc.py @@ -55,6 +55,7 @@ RelPath, ) +import urllib.parse import urllib.request import jsonschema.validators @@ -329,3 +330,14 @@ def lazy_import(name: "str") -> "ModuleType": raise ModuleNotFoundError(f"No module named '{name}'") return module + + +def is_uri(the_uri: "str") -> "bool": + """ + Inspired in https://stackoverflow.com/a/38020041 + """ + try: + result = urllib.parse.urlparse(the_uri) + return result.scheme != "" + except: + return False From 62965aa5719dbeffd18b21f5b522c80c6add5f32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Jun 2024 19:41:35 +0200 Subject: [PATCH 58/62] Fixed main issues related to reproducibility level 3 (full). It is now working with a simple CWL workflow!!! --- wfexs_backend/utils/rocrate.py | 88 ++++++++++++++++++++-------------- wfexs_backend/workflow.py | 61 +++++++++++++++++++---- 2 files changed, 105 insertions(+), 44 deletions(-) diff --git a/wfexs_backend/utils/rocrate.py b/wfexs_backend/utils/rocrate.py index 6f4f144b..1796110f 100644 --- a/wfexs_backend/utils/rocrate.py +++ b/wfexs_backend/utils/rocrate.py @@ -134,7 +134,8 @@ RemoteRepo, ) -from ..utils.misc import ( +from .misc import ( + is_uri, lazy_import, ) @@ -2225,7 +2226,7 @@ def __list_payload_entity_parts( entity: "rdflib.term.Identifier", public_name: "str", payload_dir: "pathlib.Path", - ) -> "Sequence[ROCratePayload]": + ) -> "Sequence[Union[str, ROCratePayload]]": entity_parts = self.__list_entity_parts(g, entity, public_name) payload_entity_parts = [] @@ -2234,17 +2235,23 @@ def __list_payload_entity_parts( part_row, rdflib.query.ResultRow ), "Check the SPARQL code, as it should be a SELECT query" - included_part_entity = self.__processPayloadEntity( - the_entity=part_row.part_entity, - payload_dir=payload_dir, - kindobj=ContentKind.File, - entity_type="secondary workflow component", - entity_name=str(part_row.part_name) - if part_row.part_name is not None - else "PACO", # FIXME - the_file_size=part_row.file_size, - the_file_sha256=part_row.file_sha256, - ) + included_part_entity: "Optional[Union[str, ROCratePayload]]" = None + if is_uri(str(part_row.part_entity)) and not str( + part_row.part_entity + ).startswith(self.RELATIVE_ROCRATE_SCHEME + ":"): + included_part_entity = str(part_row.part_entity) + else: + included_part_entity = self.__processPayloadEntity( + the_entity=part_row.part_entity, + payload_dir=payload_dir, + kindobj=ContentKind.File, + entity_type="secondary workflow component", + entity_name=str(part_row.part_name) + if part_row.part_name is not None + else "PACO", # FIXME + the_file_size=part_row.file_size, + the_file_sha256=part_row.file_sha256, + ) if included_part_entity is not None: payload_entity_parts.append(included_part_entity) @@ -2385,28 +2392,38 @@ def extractWorkflowMetadata( workflow_parts = self.__list_payload_entity_parts( g, langrow.origmainentity, public_name, payload_dir ) - if len(workflow_parts) == 0: - base_dir = main_entity_path.parent - main_entity_relpath = main_entity_path.name - rel_path_files = [] - else: - rel_path_files = list( - map(lambda part: cast("RelPath", part.rel_path), workflow_parts) - ) - common_prefix = os.path.commonpath( - [main_entity_relpath, *rel_path_files] - ) - if len(common_prefix) == 0: - base_dir = payload_dir - else: - 
base_dir = payload_dir / common_prefix - main_entity_relpath = main_entity_path.relative_to( - base_dir - ).as_posix() - rel_path_files = [ - cast("RelPath", part.path.relative_to(base_dir).as_posix()) - for part in workflow_parts - ] + rel_path_files: "MutableSequence[Union[RelPath, URIType]]" = [] + base_dir = main_entity_path.parent + main_entity_relpath = main_entity_path.name + if len(workflow_parts) > 0: + rel_path_str: "MutableSequence[str]" = [] + rel_path_index: "MutableSequence[int]" = [] + for i_part, part in enumerate(workflow_parts): + rel_path_file = cast( + "RelPath", + part.rel_path if isinstance(part, ROCratePayload) else part, + ) + rel_path_files.append(rel_path_file) + if isinstance(part, ROCratePayload): + rel_path_str.append(rel_path_file) + rel_path_index.append(i_part) + + if len(rel_path_str) > 0: + common_prefix = os.path.commonpath( + [include_main_entity.rel_path, *rel_path_str] + ) + if len(common_prefix) > 0: + base_dir = payload_dir / common_prefix + main_entity_relpath = main_entity_path.relative_to( + base_dir + ).as_posix() + for i_part in rel_path_index: + part = workflow_parts[i_part] + assert isinstance(part, ROCratePayload) + rel_path_files[i_part] = cast( + "RelPath", + part.path.relative_to(base_dir).as_posix(), + ) cached_workflow = LocalWorkflow( dir=base_dir, @@ -2420,6 +2437,7 @@ def extractWorkflowMetadata( else None, relPathFiles=rel_path_files, ) + self.logger.error(f"POZI {cached_workflow}") return repo, workflow_type, cached_workflow diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 020b96b0..4ecd8481 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -323,7 +323,10 @@ link_or_copy_pathlib, ) from .utils.marshalling_handling import marshall_namedtuple, unmarshall_namedtuple -from .utils.misc import config_validate +from .utils.misc import ( + config_validate, + is_uri, +) from .utils.zipfile_path import path_relative_to from .fetchers.trs_files import ( @@ -1852,7 +1855,41 @@ def fetchWorkflow( repoEffectiveCheckout = repo.checkout repoDir = injectable_workflow.dir injected_workflow = injectable_workflow - rel_path_files = injectable_workflow.relPathFiles + issue_warning = False + rel_path_files = injected_workflow.relPathFiles + if repo.rel_path is not None: + if ( + injected_workflow.relPath is not None + and repo.rel_path.endswith(injected_workflow.relPath) + ): + if ( + injected_workflow.relPathFiles is not None + and repo.rel_path != injected_workflow.relPath + ): + repo_rel_prefix = repo.rel_path[ + 0 : -len(injected_workflow.relPath) + ] + rel_path_files = [] + for rel_path_file in injected_workflow.relPathFiles: + # Do not prefix URLs + if is_uri(rel_path_file): + rel_path_files.append(rel_path_file) + else: + rel_path_files.append( + cast( + "RelPath", + repo_rel_prefix + rel_path_file, + ) + ) + elif repo.rel_path != injected_workflow.relPath: + issue_warning = True + elif injected_workflow.relPath is not None: + issue_warning = True + + if issue_warning: + self.logger.warning( + f"Injected workflow has a different relPath from the injected repo" + ) else: repoDir, repoEffectiveCheckout = self.wfexs.doMaterializeRepo( repo, @@ -1894,19 +1931,25 @@ def fetchWorkflow( injected_workflow.relPath is not None and len(injected_workflow.relPath) > 0 ): + assert repo.rel_path is not None link_or_copy_pathlib( injected_workflow.dir / injected_workflow.relPath, - workflow_dir / injected_workflow.relPath, + workflow_dir / repo.rel_path, force_copy=True, ) if rel_path_files is not None: - for inj in 
rel_path_files: - link_or_copy_pathlib( - injected_workflow.dir / inj, - workflow_dir / inj, - force_copy=True, - ) + assert injected_workflow.relPathFiles is not None + for inj, dest_inj in zip( + injected_workflow.relPathFiles, rel_path_files + ): + # Do not try copying URLs + if not is_uri(inj): + link_or_copy_pathlib( + injected_workflow.dir / inj, + workflow_dir / dest_inj, + force_copy=True, + ) elif repoDir.is_dir(): link_or_copy_pathlib(repoDir, workflow_dir, force_copy=True) else: From 996c6969500d097b857fdfcdbfefc41a72240cdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 18 Jun 2024 19:42:19 +0200 Subject: [PATCH 59/62] Version bump to 0.99.9. Next releases will be -alpha, -beta, -rc and 1.0.0. --- CITATION.cff | 2 +- wfexs_backend/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 56c2a433..c37c0fa1 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -28,4 +28,4 @@ message: "If you use this software, please cite it using these metadata." repository-code: "https://github.com/inab/WfExS-backend" type: software title: "WfExS-backend" -version: 0.99.2 +version: 0.99.9 diff --git a/wfexs_backend/__init__.py b/wfexs_backend/__init__.py index e476ca4d..d0443b02 100644 --- a/wfexs_backend/__init__.py +++ b/wfexs_backend/__init__.py @@ -21,7 +21,7 @@ __license__ = "Apache 2.0" # https://www.python.org/dev/peps/pep-0396/ -__version__ = "0.99.2" +__version__ = "0.99.9" __url__ = "https://github.com/inab/WfExS-backend" __official_name__ = "WfExS-backend" From fa006eacc8a4326904092096846d2c0f6d0bd8a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 19 Jun 2024 19:19:49 +0200 Subject: [PATCH 60/62] Many small fixes and changes to keep backward compatibility to Python 3.7 --- wfexs_backend/container_factories/__init__.py | 10 ++- wfexs_backend/utils/contents.py | 7 +- wfexs_backend/utils/docker.py | 33 +++++--- wfexs_backend/utils/zipfile_path.py | 51 +++++------- wfexs_backend/workflow.py | 79 ++++++++++++++----- 5 files changed, 113 insertions(+), 67 deletions(-) diff --git a/wfexs_backend/container_factories/__init__.py b/wfexs_backend/container_factories/__init__.py index b1d72616..fe916881 100644 --- a/wfexs_backend/container_factories/__init__.py +++ b/wfexs_backend/container_factories/__init__.py @@ -361,7 +361,8 @@ def query( if localContainerPath.is_file(): if localContainerPath.is_symlink(): # Some filesystems complain when filenames contain 'equal', 'slash' or 'plus' symbols - unlinkedContainerPath = localContainerPath.readlink() + # Path.readlink was added in Python 3.9 + unlinkedContainerPath = pathlib.Path(os.readlink(localContainerPath)) fsImageSignature = unlinkedContainerPath.name imageSignature = cast( "Fingerprint", @@ -482,8 +483,11 @@ def update( # And ..... transfer!!! 
if do_move: - shutil.move(image_path, canonicalContainerPath) - shutil.move(image_metadata_path, canonicalContainerPathMeta) + # Python 3.7 and lower only accept str as parameters + shutil.move(image_path.as_posix(), canonicalContainerPath.as_posix()) + shutil.move( + image_metadata_path.as_posix(), canonicalContainerPathMeta.as_posix() + ) else: link_or_copy(image_path, canonicalContainerPath, force_copy=True) link_or_copy( diff --git a/wfexs_backend/utils/contents.py b/wfexs_backend/utils/contents.py index 4b0c12d0..120acca8 100644 --- a/wfexs_backend/utils/contents.py +++ b/wfexs_backend/utils/contents.py @@ -260,7 +260,7 @@ def link_or_copy_pathlib( ) -> None: assert ( src.exists() - ), f"File {src.as_posix()} must exist to be linked or copied {src.exists()} {src.exists(follow_symlinks=False)}" + ), f"File {src.as_posix()} must exist to be linked or copied {src.exists()} {src.is_symlink()}" # We should not deal with symlinks src = src.resolve() @@ -301,7 +301,10 @@ def link_or_copy_pathlib( if src.is_file(): if dest_exists: dest.unlink() - dest.hardlink_to(src) + # link_to appeared in Python 3.8 + # hardlink_to appeared in Python 3.10 + # dest.hardlink_to(src) + os.link(src, dest) else: # Recursively hardlinking # as of https://stackoverflow.com/a/10778930 diff --git a/wfexs_backend/utils/docker.py b/wfexs_backend/utils/docker.py index 249a0a14..fa3df408 100644 --- a/wfexs_backend/utils/docker.py +++ b/wfexs_backend/utils/docker.py @@ -17,6 +17,7 @@ # limitations under the License. import abc +import hashlib import json import logging from typing import ( @@ -42,10 +43,14 @@ from dxf import ( DXF, - _verify_manifest, hash_bytes as dxf_hash_bytes, _schema2_mimetype as DockerManifestV2MIMEType, _schema2_list_mimetype as DockerFAT_schema2_mimetype, + split_digest as dxf_split_digest, +) + +from dxf.exceptions import ( + DXFDigestMismatchError, ) import dxf.exceptions @@ -126,14 +131,24 @@ def get_parsed_manifest_and_dcd( # "A schema1 manifest should always produce the same image id but # defining the steps to produce directly from the manifest is not # straight forward." 
- dcd_h = r.headers.get("Docker-Content-Digest") - _, dcd = _verify_manifest( # type: ignore[no-untyped-call] - manifest, - parsed_manifest, - content_digest=dcd_h, - verify=False, - get_content_digest=True, - ) + dcd = r.headers.get("Docker-Content-Digest") + + # Borrowed from https://github.com/davedoesdev/dxf/blob/9e733e98d00ff8c5c5ec579659a431449bc3a322/dxf/__init__.py#L622-L635 + if content is not None: + if dcd is not None: + method_h, expected_dgst_h = dxf_split_digest(dcd) # type: ignore[no-untyped-call] + hasher = hashlib.new(method_h) + hasher.update(content) + dgst_h = hasher.hexdigest() + if dgst_h != expected_dgst_h: + raise DXFDigestMismatchError( # type: ignore[no-untyped-call] + method_h + ":" + dgst_h, method_h + ":" + expected_dgst_h + ) + else: + dcd = dxf_hash_bytes(content) + else: + dcd = dxf_hash_bytes(manifest.encode("utf8")) + assert dcd is not None, f"Empty dcd for {alias}" else: dcd = dxf_hash_bytes(manifest.encode("utf8")) diff --git a/wfexs_backend/utils/zipfile_path.py b/wfexs_backend/utils/zipfile_path.py index 33381a60..4788e1a9 100644 --- a/wfexs_backend/utils/zipfile_path.py +++ b/wfexs_backend/utils/zipfile_path.py @@ -28,10 +28,7 @@ TYPE_CHECKING, ) -from zipfile import ( - ZipFile, - ZipInfo, -) +import zipfile if TYPE_CHECKING: @@ -53,7 +50,9 @@ Union, ) - import zipfile + from typing_extensions import ( + Literal, + ) def _parents(path: "str") -> "Iterator[str]": @@ -111,7 +110,7 @@ def _difference( return itertools.filterfalse(set(subtrahend).__contains__, minuend) -class CompleteDirs(ZipFile): +class CompleteDirs(zipfile.ZipFile): """ A ZipFile subclass that ensures that implied directories are always included in the namelist. @@ -140,7 +139,7 @@ def resolve_dir(self, name: "str") -> "str": dir_match = name not in names and dirname in names return dirname if dir_match else name - def getinfo(self, name: "str") -> "ZipInfo": + def getinfo(self, name: "str") -> "zipfile.ZipInfo": """ Supplement getinfo for implied dirs. """ @@ -149,11 +148,11 @@ def getinfo(self, name: "str") -> "ZipInfo": except KeyError: if not name.endswith("/") or name not in self._name_set(): raise - return ZipInfo(filename=name) + return zipfile.ZipInfo(filename=name) @classmethod def make( - cls, source: "Union[CompleteDirs, ZipFile, str, os.PathLike[str]]" + cls, source: "Union[CompleteDirs, zipfile.ZipFile, str, os.PathLike[str]]" ) -> "CompleteDirs": """ Given a source (filename or zipfile), return an @@ -162,7 +161,7 @@ def make( if isinstance(source, CompleteDirs): return source - if not isinstance(source, ZipFile): + if not isinstance(source, zipfile.ZipFile): return cls(source) # Only allow for FastPath when supplied zipfile is read-only @@ -194,13 +193,6 @@ def _name_set(self) -> "Set[str]": return self.__lookup -def _extract_text_encoding( - encoding: "Optional[str]" = None, *args: "Any", **kwargs: "Any" -) -> "Tuple[str, Tuple[Any], Dict[str, Any]]": - # stacklevel=3 so that the caller of the caller see any warning. 
- return io.text_encoding(encoding, 3), args, kwargs - - def path_relative_to( path: "pathlib.Path", other: "pathlib.Path", *extra: "Union[str, os.PathLike[str]]" ) -> "str": @@ -225,7 +217,7 @@ class ZipfilePath(pathlib.Path): └── e.txt >>> data = io.BytesIO() - >>> zf = ZipFile(data, 'w') + >>> zf = zipfile.ZipFile(data, 'w') >>> zf.writestr('a.txt', 'content of a') >>> zf.writestr('b/c.txt', 'content of c') >>> zf.writestr('b/d/e.txt', 'content of e') @@ -293,7 +285,7 @@ class ZipfilePath(pathlib.Path): def __init__( self, - root: "Union[str, CompleteDirs, os.PathLike[str], ZipFile]", + root: "Union[str, CompleteDirs, os.PathLike[str], zipfile.ZipFile]", at: "str" = "", ): """ @@ -333,14 +325,14 @@ def open( # type: ignore[override] zip_mode = mode[0] if not self.exists() and zip_mode == "r": raise FileNotFoundError(self) - stream = self._root.open(self._at, mode=zip_mode, pwd=pwd) + stream = self._root.open( + self._at, mode=cast("Literal['r', 'w']", zip_mode), pwd=pwd + ) if "b" in mode: # if args or kwargs: # raise ValueError("encoding args invalid for binary operation") return stream # Text mode: - # encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) - encoding = io.text_encoding(encoding, 2) return io.TextIOWrapper( stream, encoding=encoding, @@ -410,13 +402,13 @@ def parent(self) -> "ZipfilePath": return self._next(parent_at) @property - def zip_root(self) -> "ZipFile": + def zip_root(self) -> "zipfile.ZipFile": return self._root def relative_to( # type: ignore[override] self, other: "Union[str, os.PathLike[str]]", - /, + # /, *_deprecated: "Union[str, os.PathLike[str]]", walk_up: bool = False, ) -> "pathlib.Path": @@ -432,7 +424,7 @@ def resolve(self, strict: "bool" = False) -> "ZipfilePath": def _extract_member( self, - member: "Union[ZipInfo, str]", + member: "Union[zipfile.ZipInfo, str]", targetpath: "Union[str, os.PathLike[str]]", pwd: "Optional[bytes]" = None, ) -> "str": @@ -442,7 +434,7 @@ def _extract_member( """Extract the ZipInfo object 'member' to a physical file on the path targetpath. 
""" - if not isinstance(member, ZipInfo): + if not isinstance(member, zipfile.ZipInfo): member = self._root.getinfo(member) # build the destination pathname, replacing @@ -503,10 +495,3 @@ def copy_to(self, dest: "pathlib.Path") -> "None": def with_name(self, name: "Union[str, os.PathLike[str]]") -> "ZipfilePath": return self.parent.joinpath(name) - - -# Older versions of Python do not have zipfile.Path -if sys.version_info[:2] < (3, 8): - import zipfile - - zipfile.Path = ZipfilePath diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 4ecd8481..6aa173cc 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -94,6 +94,7 @@ Literal, TypeAlias, TypedDict, + TypeGuard, Required, NotRequired, ) @@ -1456,13 +1457,20 @@ def FromPreviousInstanceDeclaration( and isinstance(cached_inputs, list) and len(cached_inputs) > 0 ): + # This is overcomplicated to pass checks in python 3.7 mypy + def filter_cached_inputs( + m_i: "MaterializedInput", + ) -> "TypeGuard[bool]": + assert replaced_inputs is not None + return m_i.name not in replaced_inputs + new_cached_inputs = list( - filter( - lambda m_i: m_i.name not in replaced_inputs, cached_inputs - ) + filter(filter_cached_inputs, cached_inputs) ) if len(new_cached_inputs) < len(cached_inputs): - cached_inputs = new_cached_inputs + cached_inputs = cast( + "Sequence[MaterializedInput]", new_cached_inputs + ) if wfInstance.materializedEnvironment is not None: cached_environment = copy.copy(wfInstance.materializedEnvironment) @@ -1475,14 +1483,20 @@ def FromPreviousInstanceDeclaration( and isinstance(cached_environment, list) and len(cached_environment) > 0 ): + # This is overcomplicated to pass checks in python 3.7 mypy + def filter_cached_environment( + m_i: "MaterializedInput", + ) -> "TypeGuard[bool]": + assert replaced_environment is not None + return m_i.name not in replaced_environment + new_cached_environment = list( - filter( - lambda m_i: m_i.name not in replaced_environment, - cached_environment, - ) + filter(filter_cached_environment, cached_environment) ) if len(new_cached_environment) < len(cached_environment): - cached_environment = new_cached_environment + cached_environment = cast( + "Sequence[MaterializedInput]", new_cached_environment + ) if wfInstance.materializedEngine is not None: if wfInstance.materializedEngine.containers is not None: @@ -1663,11 +1677,16 @@ def FromPreviousROCrate( and isinstance(cached_inputs, list) and len(cached_inputs) > 0 ): - new_cached_inputs = list( - filter(lambda m_i: m_i.name not in replaced_inputs, cached_inputs) - ) + # This is overcomplicated to pass checks in python 3.7 mypy + def filter_cached_inputs(m_i: "MaterializedInput") -> "TypeGuard[bool]": + assert replaced_inputs is not None + return m_i.name not in replaced_inputs + + new_cached_inputs = list(filter(filter_cached_inputs, cached_inputs)) if len(new_cached_inputs) < len(cached_inputs): - cached_inputs = new_cached_inputs + cached_inputs = cast( + "Sequence[MaterializedInput]", new_cached_inputs + ) replaced_environment = replaced_items.get("environment") if ( @@ -1675,14 +1694,23 @@ def FromPreviousROCrate( and isinstance(cached_environment, list) and len(cached_environment) > 0 ): + # This is overcomplicated to pass checks in python 3.7 mypy + def filter_cached_environment( + m_i: "MaterializedInput", + ) -> "TypeGuard[bool]": + assert replaced_environment is not None + return m_i.name not in replaced_environment + new_cached_environment = list( filter( - lambda m_i: m_i.name not in replaced_environment, 
- cached_environment, + filter_cached_environment, + cast("Sequence[MaterializedInput]", cached_environment), ) ) if len(new_cached_environment) < len(cached_environment): - cached_environment = new_cached_environment + cached_environment = cast( + "Sequence[MaterializedInput]", new_cached_environment + ) return cls.FromStagedRecipe( wfexs, @@ -2369,14 +2397,20 @@ def _fetchRemoteFile( # execution environment realPrettyLocal = prettyLocal.resolve() realInputDestDir = inputDestDir.resolve() - if not realPrettyLocal.is_relative_to(realInputDestDir): + # Path.is_relative_to was introduced in Python 3.9 + # if not realPrettyLocal.is_relative_to(realInputDestDir): + common_path = pathlib.Path( + os.path.commonpath([realPrettyLocal, realInputDestDir]) + ) + if realInputDestDir != common_path: prettyRelname = cast("RelPath", realPrettyLocal.name) prettyLocal = inputDestDir / prettyRelname # Checking whether local name hardening is needed if not hardenPrettyLocal: if prettyLocal.is_symlink(): - oldLocal = prettyLocal.readlink() + # Path.readlink was added in Python 3.9 + oldLocal = os.readlink(prettyLocal) hardenPrettyLocal = oldLocal != matContent.local elif prettyLocal.exists(): @@ -2386,7 +2420,7 @@ def _fetchRemoteFile( # Trying to avoid collisions on input naming prettyLocal = inputDestDir / (prefix + prettyRelname) - if not os.path.exists(prettyLocal): + if not prettyLocal.exists(): # We are either hardlinking or copying here link_or_copy_pathlib(matContent.local, prettyLocal) @@ -2819,7 +2853,12 @@ def _fetchContentWithURIs( if relative_dir is not None: newInputDestDir = (inputDestDir / relative_dir).resolve() - if newInputDestDir.is_relative_to(inputDestDir): + # Path.is_relative_to was introduced in Python 3.9 + # if newInputDestDir.is_relative_to(inputDestDir): + common_path = pathlib.Path( + os.path.commonpath([newInputDestDir, inputDestDir]) + ) + if common_path == inputDestDir: inputDestDir = newInputDestDir extrapolatedInputDestDir = ( extrapolatedInputDestDir / relative_dir From e752116087f65f13d641e4230796de570c26803a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 19 Jun 2024 19:25:07 +0200 Subject: [PATCH 61/62] Fixed issue which did not appear in Python 3.7 validations. 
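
The hunk below wraps the os.readlink() result back into a pathlib.Path: os.readlink() returns a str even when it is given a Path, so the symlink target was being compared as a plain string against matContent.local. Assuming matContent.local is already a pathlib.Path, as the surrounding pathlib refactor suggests, that inequality always held and local name hardening was requested even when the link already pointed at the expected content. A minimal sketch of the pre-3.9 fallback this patch settles on, with read_symlink_target as a hypothetical helper name:

    import os
    import pathlib

    def read_symlink_target(link: "pathlib.Path") -> "pathlib.Path":
        # pathlib.Path.readlink() only exists on Python >= 3.9, so fall back
        # to os.readlink(), which accepts a Path but returns a str; wrapping
        # the result back into a Path keeps comparisons against other Path
        # objects meaningful.
        return pathlib.Path(os.readlink(link))
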
--- wfexs_backend/workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 6aa173cc..849d1377 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -2410,7 +2410,7 @@ def _fetchRemoteFile( if not hardenPrettyLocal: if prettyLocal.is_symlink(): # Path.readlink was added in Python 3.9 - oldLocal = os.readlink(prettyLocal) + oldLocal = pathlib.Path(os.readlink(prettyLocal)) hardenPrettyLocal = oldLocal != matContent.local elif prettyLocal.exists(): From eb275f98aca59d7577c7e110fdf3f3c2eb860530 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Wed, 19 Jun 2024 19:44:19 +0200 Subject: [PATCH 62/62] Trying to ease the validations --- .github/workflows/pre-commit.yml | 4 +++- .github/workflows/schemas-doc-generator.yml | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index e3396a7d..a974dca7 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -43,7 +43,7 @@ jobs: path: .mypy_cache/${{ matrix.python-version }} key: mypy-${{ matrix.python-version }} - name: 'pre-commit' - uses: pre-commit/action@v3.0.0 + uses: pre-commit/action@v3.0.1 # if: ${{ matrix.python-version != '3.6' }} with: extra_args: --all -c .pre-commit-config.yaml @@ -70,6 +70,8 @@ jobs: path: constraints-${{ matrix.python-version }}.txt pull_request_changes: + # Do this only when it is not a pull request validation + if: github.event_name != 'pull_request' runs-on: ubuntu-latest name: Pull request with the newly generated contents needs: diff --git a/.github/workflows/schemas-doc-generator.yml b/.github/workflows/schemas-doc-generator.yml index b4ec8ebe..956031ab 100644 --- a/.github/workflows/schemas-doc-generator.yml +++ b/.github/workflows/schemas-doc-generator.yml @@ -15,6 +15,8 @@ jobs: extra_args: --all -c .pre-commit-config.yaml jsonschema_dir_validate update_docs: + # Do this only when it is not a pull request validation + if: github.event_name != 'pull_request' runs-on: ubuntu-latest name: Update documentation if all worked properly needs:
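
For the Python 3.8-compatible containment checks introduced in the _fetchRemoteFile and _fetchContentWithURIs hunks of this series, the os.path.commonpath() idiom can be read in isolation. This is a minimal sketch under the assumption that both paths have already been resolved to absolute paths (os.path.commonpath() raises ValueError when absolute and relative paths are mixed); is_inside is a hypothetical helper name:

    import os
    import pathlib

    def is_inside(child: "pathlib.Path", parent: "pathlib.Path") -> "bool":
        # pathlib.Path.is_relative_to() only exists on Python >= 3.9.
        # For two resolved, absolute paths, child lies inside (or equals)
        # parent exactly when their longest common sub-path is parent itself.
        common_path = pathlib.Path(os.path.commonpath([child, parent]))
        return common_path == parent

For example, is_inside(pathlib.Path("/work/inputs/a/b.txt"), pathlib.Path("/work/inputs")) is True, while is_inside(pathlib.Path("/work/outputs/b.txt"), pathlib.Path("/work/inputs")) is False, matching what Path.is_relative_to() would report on Python >= 3.9.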
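
Similarly, the FromPreviousInstanceDeclaration and FromPreviousROCrate hunks replace the filter() lambdas with named, annotated predicates so that the mypy/Python 3.7 combination can see, via an assert, that the closed-over replaced_inputs set is not None. A standalone sketch of that pattern follows; MaterializedInputLike is a simplified stand-in for the real MaterializedInput, and the predicate is annotated with a plain bool rather than the TypeGuard[bool] used in the actual hunks:

    from typing import NamedTuple, Optional, Sequence, Set

    class MaterializedInputLike(NamedTuple):
        # Simplified stand-in: only the name field matters for the filtering.
        name: str

    def drop_replaced(
        cached_inputs: "Sequence[MaterializedInputLike]",
        replaced_inputs: "Optional[Set[str]]",
    ) -> "Sequence[MaterializedInputLike]":
        if replaced_inputs is None or len(cached_inputs) == 0:
            return cached_inputs

        # A named predicate (instead of a lambda) gives the assert a place
        # to live, convincing older mypy releases that replaced_inputs
        # cannot be None inside the filter.
        def keep(m_i: "MaterializedInputLike") -> "bool":
            assert replaced_inputs is not None
            return m_i.name not in replaced_inputs

        new_cached = list(filter(keep, cached_inputs))
        # Only swap in the filtered list when something was actually removed,
        # mirroring the length check used in the hunks above.
        return new_cached if len(new_cached) < len(cached_inputs) else cached_inputs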