From ae8c44bae314f94e410e392b989ab48c7124b7ad Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 14 Dec 2023 19:43:10 +0100 Subject: [PATCH 1/2] OcrdMets/OcrdFile: access to @CONTENTIDS attribute of physical page mets:div --- ocrd_models/ocrd_models/ocrd_file.py | 13 ++++++++++++- ocrd_models/ocrd_models/ocrd_mets.py | 19 +++++++++++++++++++ tests/model/test_ocrd_file.py | 5 +++++ tests/model/test_ocrd_mets.py | 3 +++ 4 files changed, 39 insertions(+), 1 deletion(-) diff --git a/ocrd_models/ocrd_models/ocrd_file.py b/ocrd_models/ocrd_models/ocrd_file.py index e8205a33d9..20a2e31522 100644 --- a/ocrd_models/ocrd_models/ocrd_file.py +++ b/ocrd_models/ocrd_models/ocrd_file.py @@ -27,6 +27,7 @@ def __init__(self, el, mimetype=None, pageId=None, local_filename=None, mets=Non local_filename (Path): ``@xlink:href`` pointing to the locally cached version of the file in the workspace ID (string): ``@ID`` of this ``mets:file`` loctype (string): DEPRECATED do not use + contentids (string): ``@CONTENTIDS`` of the ``mets:div`` in the ``mets:structMap[@TYPE="PHYSICAL"]`` this file manifests """ if el is None: raise ValueError("Must provide mets:file element this OcrdFile represents") @@ -135,6 +136,15 @@ def pageId(self, pageId): raise Exception("OcrdFile %s has no member 'mets' pointing to parent OcrdMets" % self) self.mets.set_physical_page_for_file(pageId, self) + @property + def contentids(self): + """ + Get the ``@CONTENTIDS`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file``. + """ + if self.mets is None: + raise Exception("OcrdFile %s has no member 'mets' pointing to parent OcrdMets" % self) + return self.mets.get_contentids_for_file(self) + @property def loctypes(self): """ @@ -226,7 +236,7 @@ class ClientSideOcrdFile: this represents the response of the :py:class:`ocrd.mets_server.OcrdMetsServer`. 
""" - def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filename=None, mets=None, url=None, ID=None, fileGrp=None): + def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filename=None, mets=None, url=None, ID=None, fileGrp=None, contentids=None): """ Args: el (): ignored @@ -238,6 +248,7 @@ def __init__(self, el, mimetype=None, pageId=None, loctype='OTHER', local_filena url (string): ignored XXX the remote/original file once we have proper mets:FLocat bookkeeping local_filename (): ``@xlink:href`` of this ``mets:file`` - XXX the local file once we have proper mets:FLocat bookkeeping ID (string): ``@ID`` of this ``mets:file`` + fileGrp (string): ``@USE`` of the ``mets:fileGrp`` this file belongs to """ self.ID = ID self.mimetype = mimetype diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 3319f8f6ff..fa85529515 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -722,6 +722,25 @@ def get_physical_page_for_file(self, ocrd_file): if len(ret): return ret[0] + def get_contentids_for_file(self, ocrd_file): + """ + Get the ``@CONTENTIDS`` attribute of the physical page (``@CONTENTIDS`` of the ``mets:structMap[@TYPE="PHYSICAL"]//mets:div[@TYPE="page"]`` entry) + corresponding to the ``mets:file`` :py:attr:`ocrd_file`. + """ + ret = [] + if self._cache_flag: + for pageId in self._fptr_cache.keys(): + if ocrd_file.ID in self._fptr_cache[pageId].keys(): + ret.append(self._page_cache[pageId].get('CONTENTIDS')) + else: + ret = self._tree.getroot().xpath( + '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@CONTENTIDS' % + ocrd_file.ID, namespaces=NS) + + # To get rid of Python's FutureWarning + if len(ret): + return ret[0] + def remove_physical_page(self, ID): """ Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`. 
diff --git a/tests/model/test_ocrd_file.py b/tests/model/test_ocrd_file.py index 98b9fa4240..c473b64a04 100644 --- a/tests/model/test_ocrd_file.py +++ b/tests/model/test_ocrd_file.py @@ -4,6 +4,7 @@ from tests.base import ( main, + assets, create_ocrd_file, create_ocrd_file_with_defaults ) @@ -121,6 +122,10 @@ def test_fptr_changed_for_change_id(): assert mets.get_physical_pages(for_fileIds=['FOO_1']) == [None] assert mets.get_physical_pages(for_fileIds=['BAZ_1']) == ['p0001'] +def test_get_contentids(): + mets = OcrdMets(filename=assets.url_of('pembroke_werke_1766/data/mets.xml')) + ocrd_file = next(mets.find_files(pageId='PHYS_0009')) + assert ocrd_file.contentids == 'http://resolver.staatsbibliothek-berlin.de/SBB0001CA7900000009' if __name__ == '__main__': main(__file__) diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 64ea1eccfe..68a3fd00c0 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -396,6 +396,9 @@ def test_update_physical_page_attributes(sbb_directory_ocrd_mets): assert b'ORDER' in m.to_xml() assert b'ORDERLABEL' in m.to_xml() +def test_get_contentids(): + mets = OcrdMets(filename=assets.url_of('pembroke_werke_1766/data/mets.xml')) + assert mets.get_contentids_for_file(next(mets.find_files(pageId='PHYS_0009'))) == 'http://resolver.staatsbibliothek-berlin.de/SBB0001CA7900000009' if __name__ == '__main__': main(__file__) From bd2d48fb29d539ac2c87f5a243683395b6728462 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 14 Dec 2023 19:54:44 +0100 Subject: [PATCH 2/2] WorkspaceBagger: Use, in order of preference, f.basename, f.contentids and f.ID for filenames, fix #1154 --- ocrd/ocrd/workspace_bagger.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ocrd/ocrd/workspace_bagger.py b/ocrd/ocrd/workspace_bagger.py index f838a65894..5fd69a9417 100644 --- a/ocrd/ocrd/workspace_bagger.py +++ b/ocrd/ocrd/workspace_bagger.py @@ -18,6 +18,7 @@ unzip_file_to_dir, 
DEFAULT_METS_BASENAME, MIMETYPE_PAGE, + safe_filename, VERSION, ) from ocrd_validators.constants import BAGIT_TXT, TMP_BAGIT_PREFIX, OCRD_BAGIT_PROFILE_URL @@ -80,7 +81,10 @@ def _bag_mets_files( file_grp_dir.mkdir() attr = 'local_filename' if f.local_filename else 'url' - basename = f.basename if f.basename else f"{f.ID}{MIME_TO_EXT.get(f.mimetype, '.xml')}" + if f.local_filename and f.basename: + basename = f.basename + else: + basename = safe_filename(f.contentids if f.contentids else f.ID) + MIME_TO_EXT.get(f.mimetype, '.xml') _relpath = join(f.fileGrp, basename) self.resolver.download_to_directory(file_grp_dir, getattr(f, attr), basename=basename) changed_local_filenames[str(getattr(f, attr))] = _relpath