add fix filemd cmd to cli
RayPlante committed May 6, 2024
1 parent 21c3d6f commit 2090a51
Showing 3 changed files with 562 additions and 1 deletion.
python/nistoar/pdr/publish/cmd/fix/__init__.py  (3 changes: 2 additions & 1 deletion)
@@ -9,7 +9,7 @@
 """
 # from . import topics
 from .... import cli
-from . import topics
+from . import topics, filemd

 default_name = "fix"
 help = "fix special SIP problems via subcommands"
@@ -29,6 +29,7 @@ def load_into(subparser, as_cmd=None):
         as_cmd = default_name
     out = cli.CommandSuite(as_cmd, p)
     out.load_subcommand(topics)
+    out.load_subcommand(filemd)
     return out


python/nistoar/pdr/publish/cmd/fix/filemd.py  (267 changes: 267 additions & 0 deletions)
@@ -0,0 +1,267 @@
"""
CLI command that will scan all the files and ensure that the associated file metadata have correct
size and checksum properties.
"""
import logging, argparse, sys, os, shutil, tempfile, json
from copy import deepcopy
from collections.abc import Mapping

from nistoar.pdr.exceptions import ConfigurationException, PDRException, PDRServerError
from nistoar.pdr.preserv.bagger.prepupd import UpdatePrepService
from nistoar.pdr.preserv.bagger.midas3 import (moddate_of, UNSYNCED_FILE,
                                               MIDASMetadataBagger, _midadid_to_dirname as midas2recno)
from nistoar.pdr.preserv.bagit import NISTBag, BagBuilder
from nistoar.pdr.utils import write_json
from nistoar.pdr.cli import PDRCommandFailure
from nistoar.pdr import def_schema_dir
from nistoar.nerdm.taxonomy import ResearchTopicsTaxonomy
from .. import validate as vald8, define_pub_opts, determine_bag_path

default_name = "filemd"
help = "ensure the size and checksum file metadata is up to date"
description = """
examine files and update the NERDm file metadata accordingly; in particular, this ensures that
the file metadata includes accurate size and checksum data. It will also automatically check that
any accompanying checksum data files contain a consistent checksum hash value, correcting them if
requested.
This command assumes the midas3 bagging and preservation conventions.
"""

def load_into(subparser):
    """
    load this command into a CLI by defining the command's arguments and options.
    :param argparse.ArgumentParser subparser:  the argument parser instance to define this command's
                                               interface into it
    :rtype: None
    """
    p = subparser
    p.description = description
    define_pub_opts(p)   # defines AIPID and --bag-parent-dir

    p.add_argument("filepaths", metavar="FILEPATH", type=str, nargs='*',
                   help="path to a file component to update. If none are listed, all files will "
                        "be checked")
    p.add_argument("-d", "--data-dir", metavar="DIR", type=str, dest="datadir", default=None,
                   help="directory that contains the data files that are part of this submission "
                        "(This is equivalent to reviewdir/record-number.)")
    p.add_argument("-f", "--force", action="store_true", dest="force",
                   help="Force a re-examination of all (requested) files. Without this, files will only "
                        "be examined if it appears they need to be, either because the underlying file is "
                        "newer than the last scan or the file is missing size or checksum data")
    p.add_argument("-C", "--correct-cs-file", action="store_true", dest="correctcsf",
                   help="correct the checksum file if it appears to be in disagreement with the value "
                        "in the NERDm metadata")
    p.add_argument("-n", "--dry-run", action="store_true", dest="dryrun",
                   help="Do not make any actual changes; only print out what files will be examined")
    p.add_argument("-V", "--validate", action="store_true", dest="validate",
                   help="validate the NERDm metadata after update.")

    return None
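
# Note that execute() below will accept unparsed command-line arguments as a list and run
# them through load_into() itself; e.g. (with a hypothetical AIP ID and an empty config):
#
#   execute(["mds2-2909", "--dry-run", "--force"], config={}, log=logging.getLogger("test"))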

def execute(args, config=None, log=None):
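    """
    execute the filemd command: scan the requested data files, update their NERDm size and
    checksum metadata as needed, and check any accompanying .sha256 checksum files for
    consistency with the recorded hashes.
    """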
    if not log:
        log = logging.getLogger(default_name)
    if not config:
        config = {}

    if isinstance(args, list):
        # cmd-line arguments not parsed yet
        p = argparse.ArgumentParser()
        load_into(p)
        args = p.parse_args(args)

    if not args.aipid:
        raise PDRCommandFailure(default_name, "AIP ID not specified", 1)
    args.aipid = args.aipid[0]
    usenm = args.aipid
    if len(usenm) > 11:
        usenm = usenm[:4]+"..."+usenm[-4:]
    log = log.getChild(usenm)

    # create bagger from inputs
    bgr = create_bagger(args, config, log)

    if len(bgr.datafiles) < 1:
        log.info("No data files found (resubmit updated POD if necessary)")
        return

    # determine which files to look at
    if args.filepaths:
        missing = []
        for f in args.filepaths:
            if f not in bgr.datafiles:
                missing.append(f)
        if missing:
            raise PDRCommandFailure(default_name, "%s: requested files not found (resubmit POD?):\n %s"
                                    % (args.aipid, "\n ".join(missing)), 4)

    if not args.filepaths:
        args.filepaths = list(bgr.datafiles.keys())
        if args.dryrun or args.verbose:
            log.info("%s: will examine all available files", args.aipid)

    examine_files = which_files(args, bgr, log)

    # now examine each selected file
    fixed_sha_files = []
    mismatched_sha_files = []
    updated_mdata = []
    for filepath, srcfile in examine_files.items():
        xcs = None
        try:
            if not args.dryrun:
                xcs = examine_file(bgr, filepath, srcfile, log)
            updated_mdata.append(filepath)
        except Exception as ex:
            log.error("%s: Unable to update file metadata: %s", filepath, str(ex))
            log.warning("Skipping checksum file check")
            continue

        if filepath.endswith(".sha256"):
            log.debug("Skipping checksum file check on checksum file")
            continue

        # check for a corresponding checksum file
        chksumfile = srcfile + ".sha256"
        chksumpath = filepath + ".sha256"
        if os.path.exists(chksumfile):
            if not xcs:
                dfmd = bgr.bagbldr.bag.nerd_metadata_for(filepath)
                xcs = dfmd.get("checksum", {}).get("hash")

            if not xcs and (args.dryrun or args.verbose):
                log.debug("%s: NERDm checksum data not set yet", filepath)

            try:
                with open(chksumfile) as fd:
                    fcs = fd.readline().split()[0]
            except Exception as ex:
                log.warning("%s: unable to extract checksum hash value: %s", chksumfile, str(ex))
                if args.correctcsf:
                    log.info("%s: updating corrupted checksum file anyway", chksumpath)
                fcs = "1"

            if xcs != fcs:
                if args.dryrun or not args.correctcsf:
                    log.warning("%s: checksum file has incorrect value in it", chksumpath)
                    mismatched_sha_files.append(filepath)
                elif args.correctcsf:
                    try:
                        with open(chksumfile, 'w') as fd:
                            fd.write(xcs)
                    except Exception as ex:
                        log.error("%s: unable to write out checksum: %s", chksumfile, str(ex))

                    if chksumpath in bgr.datafiles:
                        examine_file(bgr, chksumpath, chksumfile, log)
                    fixed_sha_files.append(filepath)

    if args.dryrun:
        log.info("%s: will examine %d file%s in total",
                 args.aipid, len(updated_mdata),
                 "s" if len(updated_mdata) != 1 else "")
    else:
        log.info("%s: examined %d file%s in total",
                 args.aipid, len(updated_mdata),
                 "s" if len(updated_mdata) != 1 else "")
        if not args.correctcsf:
            log.info("%s: %d checksum file%s do not match metadata hash",
                     args.aipid, len(mismatched_sha_files),
                     "s" if len(mismatched_sha_files) != 1 else "")
        else:
            log.info("%s: %d checksum file%s were fixed",
                     args.aipid, len(fixed_sha_files),
                     "s" if len(fixed_sha_files) != 1 else "")

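# Note: create_bagger() below consults only a couple of configuration keys; a minimal
# config might look like the following (values are illustrative, not defaults):
#
#   {
#       "review_dir": "/var/data/review",   # parent of per-record-number data directories
#       "bagger":     { }                   # optional MIDASMetadataBagger settings
#   }
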
def create_bagger(args, config, log):
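    """
    instantiate and return a MIDASMetadataBagger for the AIP named in the command-line
    arguments, locating the input bag and data (review) directory as needed.
    """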
    # set the input bag
    workdir, bagparent, bagdir = determine_bag_path(args, config)
    if not os.path.isdir(bagdir):
        raise PDRCommandFailure(default_name, "Input bag does not exist (as a dir): "+bagdir, 2)
    log.info("Found input bag at "+bagdir)

    # create a Bagger instance for it
    if config.get('bagger') and isinstance(config['bagger'], Mapping):
        config = deepcopy(config)
        config.update(config['bagger'])
        del config['bagger']

    bgrmdfile = os.path.join(bagdir, "metadata", MIDASMetadataBagger.BGRMD_FILENAME)
    if not args.datadir and os.path.isfile(bgrmdfile):
        bgr = MIDASMetadataBagger.forMetadataBag(bagdir, config, for_pres=True)
    else:
        if not args.datadir:
            if not config.get('review_dir'):
                raise PDRCommandFailure(default_name, "Unable to determine data (review) directory", 3)
            args.datadir = os.path.join(config['review_dir'], midas2recno(args.aipid))
        bgr = MIDASMetadataBagger(args.aipid, bagparent, args.datadir, config.get('bagger',{}))

    bgr.ensure_res_metadata()   # sets self.datafiles
    return bgr

def which_files(args, bagger, log):
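    """
    determine which of the requested file paths actually need to be (re-)examined.  A file
    is selected if --force was given, if its NERDm metadata is missing size or checksum
    data, if the source data file is newer than the last recorded scan, or if it is marked
    as unsynced.
    """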
    lasttime = 0.0
    if not args.force:
        bmd = bagger.baggermd_for("")
        lasttime = bmd.get('last_file_examine', 0.0)

    if not args.filepaths:
        args.filepaths = list(bagger.datafiles.keys())

    examine_files = {}
    for filepath in args.filepaths:
        if filepath not in bagger.datafiles:
            if bagger.bagbldr.bag.comp_exists(filepath):
                msg = "data file for this path not found in data dir"
            else:
                msg = "not a file path registered in metadata bag"
            raise PDRCommandFailure(default_name, "%s: %s" % (filepath, msg), 4)

        srcpath = bagger.datafiles[filepath]
        examine = args.force
        dfmd = None
        if not examine:
            dfmd = bagger.bagbldr.bag.nerd_metadata_for(filepath)
            examine = 'size' not in dfmd or 'checksum' not in dfmd
            if examine and (args.dryrun or args.verbose):
                log.debug("%s: will examine datafile, %s: missing key metadata", args.aipid, filepath)

        if not examine:
            if srcpath and moddate_of(srcpath) > lasttime:
                examine = True
                if args.dryrun or args.verbose:
                    log.debug("%s: will examine datafile, %s: data file appears updated",
                              args.aipid, filepath)

            elif os.path.exists(os.path.join(os.path.dirname(bagger.bagbldr.bag.nerd_file_for(filepath)),
                                             UNSYNCED_FILE)):
                examine = True
                if args.dryrun or args.verbose:
                    log.debug("%s: will examine datafile, %s: marked as unsynced.",
                              args.aipid, filepath)
        if examine:
            examine_files[filepath] = srcpath

    return examine_files

def examine_file(bagger, filepath, location, log):
    """
    examine the referenced file and update its metadata accordingly.
    """
    md = bagger.bagbldr.describe_data_file(location, filepath, True)
    if '__status' in md:
        md['__status'] = "updated"

    md = bagger.bagbldr.update_metadata_for(filepath, md, None,
                                            "cli-driven metadata update for file, "+filepath)
    if '__status' in md:
        del md['__status']
        bagger.bagbldr.replace_metadata_for(filepath, md, '')
    bagger._mark_filepath_synced(filepath)

    out = md.get('checksum', {}).get('hash')
    if not out:
        raise RuntimeError(filepath+": Failed to update checksum for unknown reason")
    return out

