Skip to content

Commit

Permalink
Now the RemoteRepo structure has a way to tell the web url to browse …
Browse files Browse the repository at this point in the history
…the repository
  • Loading branch information
jmfernandez committed Jul 29, 2023
1 parent d3b6078 commit 48aeaa7
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 124 deletions.
1 change: 1 addition & 0 deletions wfexs_backend/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,7 @@ class RemoteRepo(NamedTuple):
tag: "Optional[RepoTag]" = None
rel_path: "Optional[RelPath]" = None
repo_type: "Optional[RepoType]" = None
web_url: "Optional[URIType]" = None


class IdentifiedWorkflow(NamedTuple):
Expand Down
17 changes: 17 additions & 0 deletions wfexs_backend/fetchers/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,7 @@ def guess_git_repo_params(
repoTag = None
repoRelPath = None
repoType: "Optional[RepoType]" = None
web_url: "Optional[URIType]" = None

# Deciding which is the input
if isinstance(wf_url, parse.ParseResult):
Expand Down Expand Up @@ -520,11 +521,27 @@ def guess_git_repo_params(
if repoURL is None:
return None

# if repoType == RepoType.GitHub:
# wf_entrypoint_path = [
#
# ]
# web_url = urllib.parse.urlunparse(
# (
# "https",
# "raw.githubusercontent.com",
# "/".join(wf_entrypoint_path),
# "",
# "",
# "",
# )
# )

return RemoteRepo(
repo_url=cast("RepoURL", repoURL),
tag=cast("Optional[RepoTag]", repoTag),
rel_path=cast("Optional[RelPath]", repoRelPath),
repo_type=repoType,
web_url=web_url,
)


Expand Down
106 changes: 55 additions & 51 deletions wfexs_backend/fetchers/swh.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@
import logging

from typing import (
Any,
IO,
Mapping,
MutableSequence,
Optional,
Sequence,
Tuple,
Expand Down Expand Up @@ -125,31 +127,7 @@ def doMaterializeRepo(
# The service works neither with quoted identifiers nor with
# fully unquoted identifiers. Only the semicolons have to be
# percent-encoded
swh_quoted_repoURL = repoURL.replace(";", parse.quote(";"))
resio = io.BytesIO()
# urljoin cannot be used due to working with URIs
resolve_uri = cast(
"URIType", self.SWH_API_REST_RESOLVE + swh_quoted_repoURL + "/"
)
try:
_, metaresio, _ = fetchClassicURL(
resolve_uri,
resio,
secContext={
"headers": {
"Accept": "application/json",
},
},
)
res_doc = json.loads(resio.getvalue().decode("utf-8"))
except Exception as e:
raise FetcherException(f"HTTP REST call {resolve_uri} failed") from e
gathered_meta = {
"fetched": resolve_uri,
"payload": res_doc,
}
metadata_array = [URIWithMetadata(repoURL, gathered_meta)]
metadata_array.extend(metaresio)
res_doc, metadata_array = resolve_swh_id(repoURL)

# Error handling
if "exception" in res_doc:
Expand All @@ -163,32 +141,8 @@ def doMaterializeRepo(
if object_type == "content":
anchor = res_doc.get("metadata", {}).get("anchor")
if anchor is not None:
# urljoin cannot be used due to working with URIs
anchor_resolve_uri = cast(
"URIType", self.SWH_API_REST_RESOLVE + anchor + "/"
)
try:
ancio = io.BytesIO()
_, metaancio, _ = fetchClassicURL(
anchor_resolve_uri,
ancio,
secContext={
"headers": {
"Accept": "application/json",
},
},
)
anc_res_doc = json.loads(ancio.getvalue().decode("utf-8"))
except Exception as e:
raise FetcherException(
f"HTTP REST call {anchor_resolve_uri} failed"
) from e
gathered_meta = {
"fetched": anchor_resolve_uri,
"payload": anc_res_doc,
}
metadata_array = [URIWithMetadata(repoURL, gathered_meta)]
metadata_array.extend(metaancio)
anc_res_doc, anchor_metadata_array = resolve_swh_id(anchor)
metadata_array.extend(anchor_metadata_array)

# Now this is truly the context
object_type = anc_res_doc["object_type"]
Expand Down Expand Up @@ -598,6 +552,52 @@ def fetch(
)


def resolve_swh_id(
    the_id: "URIType",
) -> "Tuple[Mapping[str, Any], MutableSequence[URIWithMetadata]]":
    """Resolve an identifier through the Software Heritage resolve endpoint.

    Resolver documentation:
    https://archive.softwareheritage.org/api/1/resolve/doc/

    Equivalent request from the command line:
    curl -H "Accept: application/json" https://archive.softwareheritage.org/api/1/resolve/swh:1:rev:31348ed533961f84cf348bf1af660ad9de6f870c/

    :param the_id: The identifier to be resolved.
    :return: A tuple with the decoded JSON answer and the list of
        metadata gathered while fetching it. The first element of the
        list describes this very resolution.
    :raise FetcherException: When either the REST call or the decoding
        of its answer fail, and also when the decoded answer is not
        a dictionary.
    """
    # The resolver accepts neither quoted identifiers nor fully
    # unquoted ones: only the semicolons have to be percent-encoded.
    # The URI is built through plain concatenation because urljoin
    # cannot be used when working with URIs.
    resolve_uri = cast(
        "URIType",
        SoftwareHeritageFetcher.SWH_API_REST_RESOLVE
        + the_id.replace(";", parse.quote(";"))
        + "/",
    )

    json_sec_context = {
        "headers": {
            "Accept": "application/json",
        },
    }
    payload_buffer = io.BytesIO()
    try:
        _, fetch_metadata, _ = fetchClassicURL(
            resolve_uri,
            payload_buffer,
            secContext=json_sec_context,
        )
        res_doc = json.loads(payload_buffer.getvalue().decode("utf-8"))
    except Exception as e:
        raise FetcherException(f"HTTP REST call {resolve_uri} failed") from e

    # Anything but a JSON object is considered a failed resolution
    if not isinstance(res_doc, dict):
        raise FetcherException(f"{the_id} is not valid. Message: {res_doc}")

    # The first entry records what was asked and what was answered,
    # followed by whatever metadata the fetch itself gathered
    metadata_array = [
        URIWithMetadata(
            uri=the_id,
            metadata={
                "fetched": resolve_uri,
                "payload": res_doc,
            },
        ),
        *fetch_metadata,
    ]

    return res_doc, metadata_array


def guess_swh_repo_params(
orig_wf_url: "Union[URIType, parse.ParseResult]",
logger: "logging.Logger",
Expand Down Expand Up @@ -645,8 +645,12 @@ def guess_swh_repo_params(
):
return None

# Now we are sure it is known, let's learn the web url to browse it
resolved_payload, _ = resolve_swh_id(wf_url)
web_url = resolved_payload["browse_url"]
return RemoteRepo(
repo_url=wf_url,
tag=cast("RepoTag", putative_core_swhid),
repo_type=RepoType.SoftwareHeritage,
web_url=web_url,
)
136 changes: 74 additions & 62 deletions wfexs_backend/ro_crate.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
MaterializedWorkflowEngine,
ProcessorArchitecture,
RelPath,
RemoteRepo,
RepoTag,
RepoURL,
StagedExecution,
Expand Down Expand Up @@ -268,8 +269,7 @@ class WorkflowRunROCrate:

def __init__(
self,
repoURL: "RepoURL",
repoTag: "RepoTag",
remote_repo: "RemoteRepo",
localWorkflow: "LocalWorkflow",
materializedEngine: "MaterializedWorkflowEngine",
workflowEngineVersion: "Optional[WorkflowEngineVersionStr]",
Expand Down Expand Up @@ -313,10 +313,6 @@ def __init__(

self.wf_wfexs = self._add_wfexs_to_crate()

wf_url = repoURL.replace(".git", "/") + "tree/" + repoTag
if localWorkflow.relPath is not None:
wf_url += localWorkflow.dir.rsplit("workflow")[1]

matWf = materializedEngine.workflow
if matWf.relPath is not None:
if os.path.isabs(matWf.relPath):
Expand All @@ -328,65 +324,81 @@ def __init__(
else:
matWf_local_path = matWf.dir

parsed_repo_url = urllib.parse.urlparse(repoURL)
if parsed_repo_url.netloc == "github.com":
assert (
matWf.effectiveCheckout is not None
), "The effective checkout should be available"

parsed_repo_path = parsed_repo_url.path.split("/")
repo_name = parsed_repo_path[2]
# TODO: should we urldecode repo_name?
if repo_name.endswith(".git"):
repo_name = repo_name[:-4]
wf_entrypoint_path = [
"", # Needed to prepend a slash
parsed_repo_path[1],
# TODO: should we urlencode repo_name?
repo_name,
matWf.effectiveCheckout,
]

wf_url: "str"
wf_entrypoint_url: "str"
if remote_repo.web_url is not None:
wf_url = remote_repo.web_url
wf_entrypoint_url = wf_url
else:
wf_url = remote_repo.repo_url.replace(".git", "/")
if remote_repo.tag is not None:
wf_url += "tree/" + remote_repo.tag
if localWorkflow.relPath is not None:
wf_entrypoint_path.append(localWorkflow.relPath)

wf_entrypoint_url = urllib.parse.urlunparse(
(
"https",
"raw.githubusercontent.com",
"/".join(wf_entrypoint_path),
"",
"",
"",
wf_url += localWorkflow.dir.rsplit("workflow")[1]

parsed_repo_url = urllib.parse.urlparse(remote_repo.repo_url)
if parsed_repo_url.netloc == "github.com":
assert (
matWf.effectiveCheckout is not None
), "The effective checkout should be available"

parsed_repo_path = parsed_repo_url.path.split("/")
repo_name = parsed_repo_path[2]
# TODO: should we urldecode repo_name?
if repo_name.endswith(".git"):
repo_name = repo_name[:-4]
wf_entrypoint_path = [
"", # Needed to prepend a slash
parsed_repo_path[1],
# TODO: should we urlencode repo_name?
repo_name,
matWf.effectiveCheckout,
]

if localWorkflow.relPath is not None:
wf_entrypoint_path.append(localWorkflow.relPath)

wf_entrypoint_url = urllib.parse.urlunparse(
(
"https",
"raw.githubusercontent.com",
"/".join(wf_entrypoint_path),
"",
"",
"",
)
)
)

elif "gitlab" in parsed_repo_url.netloc:
parsed_repo_path = parsed_repo_url.path.split("/")
# FIXME: cover the case of nested groups
repo_name = parsed_repo_path[2]
if repo_name.endswith(".git"):
repo_name = repo_name[:-4]
wf_entrypoint_path = [parsed_repo_path[1], repo_name]
if localWorkflow.relPath is not None:
# TODO: should we urlencode repoTag?
wf_entrypoint_path.extend(["-", "raw", repoTag, localWorkflow.relPath])

wf_entrypoint_url = urllib.parse.urlunparse(
(
parsed_repo_url.scheme,
parsed_repo_url.netloc,
"/".join(wf_entrypoint_path),
"",
"",
"",
elif "gitlab" in parsed_repo_url.netloc:
parsed_repo_path = parsed_repo_url.path.split("/")
# FIXME: cover the case of nested groups
repo_name = parsed_repo_path[2]
if repo_name.endswith(".git"):
repo_name = repo_name[:-4]
wf_entrypoint_path = [parsed_repo_path[1], repo_name]
if remote_repo.tag is not None and localWorkflow.relPath is not None:
# TODO: should we urlencode repoTag?
wf_entrypoint_path.extend(
["-", "raw", remote_repo.tag, localWorkflow.relPath]
)

wf_entrypoint_url = urllib.parse.urlunparse(
(
parsed_repo_url.scheme,
parsed_repo_url.netloc,
"/".join(wf_entrypoint_path),
"",
"",
"",
)
)
)

else:
raise ROCrateGenerationException(
"FIXME: Unsupported http(s) git repository {}".format(repoURL)
)
else:
raise ROCrateGenerationException(
"FIXME: Unsupported http(s) git repository {}".format(
remote_repo.repo_url
)
)

# This is needed to avoid future collisions with other workflows stored in the RO-Crate
rocrate_wf_folder = str(uuid.uuid5(uuid.NAMESPACE_URL, wf_entrypoint_url))
Expand Down Expand Up @@ -480,7 +492,7 @@ def __init__(
lang=self.compLang,
gen_cwl=False,
)
local_wf_file["codeRepository"] = repoURL
local_wf_file["codeRepository"] = remote_repo.repo_url
if materializedEngine.workflow.effectiveCheckout is not None:
local_wf_file["version"] = materializedEngine.workflow.effectiveCheckout
local_wf_file["description"] = "Unconsolidated Workflow Entrypoint"
Expand All @@ -505,7 +517,7 @@ def __init__(
wf_consolidate_action["instrument"] = self.weng_crate
wf_consolidate_action["agent"] = self.wf_wfexs
else:
self.wf_file["codeRepository"] = repoURL
self.wf_file["codeRepository"] = remote_repo.repo_url
if materializedEngine.workflow.effectiveCheckout is not None:
self.wf_file["version"] = materializedEngine.workflow.effectiveCheckout
self.wf_file["description"] = "Workflow Entrypoint"
Expand Down
Loading

0 comments on commit 48aeaa7

Please sign in to comment.