Skip to content

Commit

Permalink
Merge pull request #340 from usnistgov/fix/unicode-support
Browse files Browse the repository at this point in the history
Provide full support for unicode in filenames
  • Loading branch information
RayPlante authored Jul 18, 2024
2 parents 02ed8f5 + e06c187 commit daf23c6
Show file tree
Hide file tree
Showing 20 changed files with 200 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-source.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
OAR_DOCKERHUB_CRED: ${{ secrets.OAR_DOCKERHUB_CRED }}
run: |
bash scripts/dhsetup.sh
cd docker && bash ./dockbuild.sh pytest
cd docker && bash ./dockbuild.sh pdrtest
- name: Run Unit Tests via Docker
run: cd docker && ./testall -D python
Expand Down
8 changes: 6 additions & 2 deletions docker/pdrtest/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@

FROM oar-metadata/ejsonschema-py2

# Run the container under a UTF-8 locale
ENV LANG=C.UTF-8
# Append setdefenc.py (which calls sys.setdefaultencoding('utf8')) to the
# Python 2.7 sitecustomize module, which the interpreter imports at startup
COPY setdefenc.py /tmp/
RUN cat /tmp/setdefenc.py >> /usr/lib/python2.7/sitecustomize.py

RUN apt-get update && apt-get install -y python-yaml nginx curl wget less sudo \
uwsgi uwsgi-plugin-python zip \
p7zip-full ca-certificates git
Expand All @@ -20,9 +24,9 @@ COPY verify-asc.sh /usr/local/bin

# install multibag from source
RUN curl -L -o multibag-py.zip \
https://github.com/usnistgov/multibag-py/archive/master.zip && \
https://github.com/usnistgov/multibag-py/archive/0.6.zip && \
unzip -oq multibag-py.zip && \
cd multibag-py-master && \
cd multibag-py-0.6 && \
python setup.py install

ENV GOSU_VERSION 1.14
Expand Down
4 changes: 4 additions & 0 deletions docker/pdrtest/setdefenc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

# This snippet is appended to Python 2.7's sitecustomize.py by
# docker/pdrtest/Dockerfile; it is not meant to be imported on its own.
# It switches the interpreter-wide default string encoding to UTF-8.
# NOTE: sys.setdefaultencoding() is a Python 2 facility that site.py removes
# from the sys module once start-up customization has finished, so this call
# only works from within sitecustomize.
import sys
sys.setdefaultencoding('utf8')

4 changes: 2 additions & 2 deletions python/nistoar/pdr/preserv/bagger/midas3.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from ....nerdm import utils as nerdutils
from ... import def_merge_etcdir, utils, ARK_NAAN, PDR_PUBLIC_SERVER
from .. import (SIPDirectoryError, SIPDirectoryNotFound, AIPValidationError,
ConfigurationException, StateException, PODError,
ConfigurationException, StateException, PODError, NERDError,
PreservationStateError)
from .... import pdr
from .prepupd import UpdatePrepService
Expand Down Expand Up @@ -324,7 +324,7 @@ def _filepaths_in_pod(self):

pod = self._pod_rec()

return [self._distsvcurl.sub('', urllib.unquote(d['downloadURL']))
return [self._distsvcurl.sub('', urllib.unquote(str(d['downloadURL'])))
for d in pod.get('distribution',[]) if 'downloadURL' in d]


Expand Down
8 changes: 4 additions & 4 deletions python/nistoar/pdr/preserv/bagit/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ def _download_url(self, ediid, destpath):
if ediid.startswith(arkpfx):
# our convention is to omit the "ark:/88434/" prefix
ediid = ediid[len(arkpfx):]
return self._distbase + ediid + '/' + urlencode(path)
return self._distbase + ediid + '/' + urlencode(str(path))

def assign_id(self, id, keep_conv=False):
"""
Expand Down Expand Up @@ -2495,7 +2495,7 @@ def _create_def_datafile_md(self, destpath):
out = OrderedDict([
("_schema", NERD_DEF + "Component"),
("@context", NERDM_CONTEXT),
("@id", "cmps/" + urlencode(destpath)),
("@id", "cmps/" + urlencode(str(destpath))),
("@type", deepcopy(self._comp_types["DataFile"][0]))
])
out["_extensionSchemas"] = deepcopy(self._comp_types["DataFile"][1])
Expand All @@ -2514,7 +2514,7 @@ def _create_def_chksum_md(self, destpath):
out = OrderedDict([
("_schema", NERD_DEF + "Component"),
("@context", NERDM_CONTEXT),
("@id", "cmps/" + urlencode(destpath)),
("@id", "cmps/" + urlencode(str(destpath))),
("@type", deepcopy(self._comp_types["ChecksumFile"][0])),
("filepath", destpath)
])
Expand Down Expand Up @@ -2543,7 +2543,7 @@ def _create_def_subcoll_md(self, destpath):
out = OrderedDict([
("_schema", NERD_DEF + "Component"),
("@context", NERDM_CONTEXT),
("@id", "cmps/" + urlencode(destpath)),
("@id", "cmps/" + urlencode(str(destpath))),
("@type", deepcopy(self._comp_types["Subcollection"][0])),
("_extensionSchemas", deepcopy(self._comp_types["Subcollection"][1])),
("filepath", destpath)
Expand Down
7 changes: 6 additions & 1 deletion python/nistoar/pdr/preserv/bagit/validate/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
This module provides the base validator class
"""
import sys, traceback
from abc import ABCMeta, abstractmethod, abstractproperty
from collections import Sequence, OrderedDict

Expand All @@ -11,6 +12,10 @@
PROB = 3
issuetypes = [ ERROR, WARN, REC ]

def _fmt_exc():
return "".join(traceback.format_exception(*sys.exc_info()))


class Validator(object):
"""
a class for validating a bag encapsulated in a directory.
Expand Down Expand Up @@ -433,7 +438,7 @@ def validate(self, bag, want=ALL, results=None, *kw):
out._err( ValidationIssue(self.profile[0], self.profile[1],
"validator failure", ERROR,
"test method, {0}, raised an exception: {1}"
.format(test, str(ex)), False),
.format(test, _fmt_exc()), False),
False )
return out

Expand Down
9 changes: 6 additions & 3 deletions python/nistoar/pdr/preserv/bagit/validate/multibag.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
"""
This module implements a validator for the Multibag Profile
"""
import os, re
from __future__ import absolute_import
import os, re, io
from collections import OrderedDict
from urlparse import urlparse

from .base import (Validator, ValidatorBase, ALL, ValidationResults,
ERROR, WARN, REC, ALL, PROB)
from ..bag import NISTBag

from multibag.constants import DEF_ENC

class MultibagValidator(ValidatorBase):
"""
A validator that runs tests for compliance with the Multibag Bagit Profile.
Expand Down Expand Up @@ -243,7 +246,7 @@ def test_member_bags(self, bag, want=ALL, results=None):
found = set()
foundme = False
last = None
with open(mbemf) as fd:
with io.open(mbemf, encoding=DEF_ENC) as fd:
i = 0
for line in fd:
i += 1
Expand Down Expand Up @@ -333,7 +336,7 @@ def test_file_lookup(self, bag, want=ALL, results=None, ishead=False):
replicated = []
missing = []
paths = set()
with open(flirf) as fd:
with io.open(flirf, encoding=DEF_ENC) as fd:
i = 0
for line in fd:
i += 1
Expand Down
24 changes: 19 additions & 5 deletions python/nistoar/pdr/publish/midas3/mdwsgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
This web service provides the public access to the metadata and the data files provided
by the author to MIDAS.
"""
import os, sys, logging, json, re
import os, sys, logging, json, re, urllib
from wsgiref.headers import Headers
from cgi import parse_qs, escape as escape_qp
from collections import OrderedDict
Expand Down Expand Up @@ -96,6 +96,7 @@ def __call__(self, env, start_resp):
class Handler(object):

badidre = re.compile(r"[<>\s]")
enc = "ISO-8859-1"

def __init__(self, app, wsgienv, start_resp):
self.app = app
Expand Down Expand Up @@ -129,7 +130,8 @@ def add_header(self, name, value):
# thus, this will raise a UnicodeEncodeError if the input strings
# include Unicode (char code > 255).
e = "ISO-8859-1"
self._hdr.add_header(name.encode(e), value.encode(e))
onerr = "backslashreplace"
self._hdr.add_header(name.encode(e, onerr), value.encode(e, onerr))

def set_response(self, code, message):
self._code = code
Expand Down Expand Up @@ -410,10 +412,22 @@ def send_datafile(self, id, filepath):

self.set_response(200, "Data file found")
self.add_header('Content-Type', mtype)
self.add_header('Content-Disposition',
'inline; filename="%s"' % os.path.basename(filepath))
outname = os.path.basename(filepath)
try:
    # Header values must be representable in ISO-8859-1; when the name
    # is, the plain quoted-string form of the parameter can be used.
    outname.encode("ISO-8859-1")
    self.add_header('Content-Disposition',
                    'inline; filename="%s"' % outname)
except UnicodeError:
    # Otherwise use the RFC 6266/5987 extended parameter:
    #     filename*=UTF-8''<percent-encoded UTF-8 bytes>
    # The two single quotes (empty language tag) are required and the
    # value must NOT be wrapped in double quotes.  Encode explicitly as
    # UTF-8 rather than relying on the interpreter's default encoding.
    if isinstance(outname, type(u"")):
        outname = outname.encode("utf-8")
    outname = urllib.quote(outname, safe="")
    self.add_header('Content-Disposition',
                    "inline; filename*=UTF-8''%s" % outname)
if xsend:
self.add_header('X-Accel-Redirect', xsend)
try:
    # Header values must be representable in ISO-8859-1
    xsend.encode("ISO-8859-1")
except UnicodeEncodeError:
    # Percent-encode the UTF-8 bytes of the path.  The encoding is given
    # explicitly so this does not depend on the interpreter's default
    # encoding (which would raise again here if it were ASCII).
    xsend = urllib.quote(xsend.encode("utf-8"))
self.add_header('X-Accel-Redirect', xsend)
self.end_headers()

if xsend:
Expand Down
Binary file modified python/tests/nistoar/pdr/distrib/data/1491.1_0.mbag0_4-0.zip
Binary file not shown.
14 changes: 7 additions & 7 deletions python/tests/nistoar/pdr/preserv/bagger/test_midas.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ def test_available_files(self):

datafiles = self.bagr.available_files()
self.assertIsInstance(datafiles, dict)
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)
self.assertIn("trial1.json", datafiles)
self.assertIn("trial1.json.sha256", datafiles)
self.assertIn("trial2.json", datafiles)
Expand All @@ -505,7 +505,7 @@ def test_available_files(self):
# copy of trial3a.json in upload overrides
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(uplsip, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)

def test_baggermd_file_for(self):
self.bagr.ensure_base_bag()
Expand Down Expand Up @@ -596,7 +596,7 @@ def test_ensure_data_files(self):

self.bagr.ensure_data_files()
self.assertIsNotNone(self.bagr.datafiles)
self.assertEqual(len(self.bagr.datafiles), 5)
self.assertEqual(len(self.bagr.datafiles), 6)
self.assertEqual(len([d for d in self.bagr.datafiles.keys()
if d.endswith(".sha256")]), 2)

Expand Down Expand Up @@ -643,7 +643,7 @@ def test_registered_files(self):

datafiles = self.bagr.registered_files()
self.assertIsInstance(datafiles, dict)
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)
self.assertIn("trial1.json", datafiles)
self.assertIn("trial1.json.sha256", datafiles)
self.assertIn("trial2.json", datafiles)
Expand All @@ -655,14 +655,14 @@ def test_registered_files(self):
os.path.join(revsip, "trial2.json"))
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(revsip, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)

def test_available_files(self):
revsip = os.path.join(self.revdir, self.midasid[32:])

datafiles = self.bagr.available_files()
self.assertIsInstance(datafiles, dict)
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)
self.assertIn("trial1.json", datafiles)
self.assertIn("trial1.json.sha256", datafiles)
self.assertIn("trial2.json", datafiles)
Expand All @@ -674,7 +674,7 @@ def test_available_files(self):
os.path.join(revsip, "trial2.json"))
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(revsip, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)

def test_fileExaminer(self):
# turn on asyncexamine (but turn off autolaunch so that we can test
Expand Down
16 changes: 8 additions & 8 deletions python/tests/nistoar/pdr/preserv/bagger/test_midas3.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_pod_rec(self):
def test_available_files(self):
datafiles = self.sip.available_files()
self.assertIsInstance(datafiles, dict)
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)
self.assertIn("trial1.json", datafiles)
self.assertIn("trial1.json.sha256", datafiles)
self.assertIn("trial2.json", datafiles)
Expand All @@ -120,7 +120,7 @@ def test_available_files(self):
# copy of trial3a.json in upload overrides
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(self.sip.upldatadir, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)

def test_registered_files(self):
pod = utils.read_json(os.path.join(self.revdir, "1491", "_pod.json"))
Expand All @@ -130,7 +130,7 @@ def test_registered_files(self):
datafiles = self.sip.registered_files()

self.assertIsInstance(datafiles, dict)
self.assertEqual(len(datafiles), 4)
self.assertEqual(len(datafiles), 5)
self.assertIn("trial1.json", datafiles)
self.assertNotIn("trial1.json.sha256", datafiles)
self.assertIn("trial2.json", datafiles)
Expand All @@ -143,7 +143,7 @@ def test_registered_files(self):
os.path.join(self.sip.revdatadir, "trial2.json"))
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(self.sip.revdatadir, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 4)
self.assertEqual(len(datafiles), 5)

def test_fromPOD(self):
podf = os.path.join(self.revdir, "1491", "_pod.json")
Expand Down Expand Up @@ -775,7 +775,7 @@ def test_ensure_data_files(self):
self.bagr.ensure_data_files(examine="sync")

self.assertIsNotNone(self.bagr.datafiles)
self.assertEqual(len(self.bagr.datafiles), 5)
self.assertEqual(len(self.bagr.datafiles), 6)
self.assertEqual(len([d for d in self.bagr.datafiles.keys()
if d.endswith(".sha256")]), 2)

Expand Down Expand Up @@ -836,14 +836,14 @@ def test_registered_files(self):
os.path.join(revsip, "trial2.json"))
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(revsip, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)

def test_available_files(self):
revsip = os.path.join(self.revdir, self.midasid[32:])

datafiles = self.bagr.sip.available_files()
self.assertIsInstance(datafiles, dict)
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)
self.assertIn("trial1.json", datafiles)
self.assertIn("trial1.json.sha256", datafiles)
self.assertIn("trial2.json", datafiles)
Expand All @@ -855,7 +855,7 @@ def test_available_files(self):
os.path.join(revsip, "trial2.json"))
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(revsip, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)

def test_fileExaminer_autolaunch(self):
# show that the async thread does its work with autolaunch
Expand Down
Loading

0 comments on commit daf23c6

Please sign in to comment.