Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
kienerj committed Nov 2, 2023
2 parents af1127e + d64e2cb commit b09630b
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 6 deletions.
14 changes: 12 additions & 2 deletions pycdxml/cdxml_converter/chemdraw_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path
from lxml import etree as ET
import logging
import re

from ..utils.cdxml_io import etree_to_cdxml

Expand Down Expand Up @@ -437,7 +438,7 @@ def _read_attributes(self, element: ET.Element):
chemdraw_type = ChemDrawDocument.CDX_PROPERTIES[tag_id]["type"]
logger.debug(f"Reading property {prop_name} of type {chemdraw_type}.")
klass = globals()[chemdraw_type]
if prop_name == "UTF8Text":
if prop_name in ["UTF8Text", "Keyword", "Content"]:
type_obj = klass.from_bytes(prop_bytes, charset="utf8")
elif chemdraw_type == "CDXString":
type_obj = klass.from_bytes(prop_bytes, fonttable=self.fonttable)
Expand Down Expand Up @@ -494,7 +495,16 @@ def _read_attributes(self, element: ET.Element):
# adds style tags <s></s> to this t element containing styled text
type_obj.to_element(element)
else:
element.attrib[prop_name] = type_obj.to_property_value()
try:
element.attrib[prop_name] = type_obj.to_property_value()
except ValueError as e:
# https://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python
# This error is usually raised when the value contains a control character that is not allowed in XML.
# Because this is rare, invalid characters are stripped only after a failure rather than on every value, for performance reasons.
logger.warning(f"{e}. Replacing invalid chars with ''.")
val = type_obj.to_property_value()
val = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', val)
element.attrib[prop_name] = val

logger.debug('Successfully finished reading attributes.')
# move back 2 positions, finished reading attributes
Expand Down
18 changes: 14 additions & 4 deletions pycdxml/cdxml_converter/chemdraw_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ def from_bytes(property_bytes: bytes, charset='iso-8859-1', fonttable=None) -> '

# get charset from first fontstyle
try:
charset = CDXString.get_charset(fonttable, font_styles)
if fonttable is not None:
charset = CDXString.get_charset(fonttable, font_styles)
text_length = len(property_bytes) - (CDXString.BYTES_PER_STYLE * style_runs) - 2
except pycdxml.cdxml_converter.chemdraw_objects.MissingFontException as ex:
# to deal with issue #30 - no style runs and the uint16 defining number of style runs is completely omitted
Expand All @@ -116,6 +117,15 @@ def from_bytes(property_bytes: bytes, charset='iso-8859-1', fonttable=None) -> '
logger.warning("Found unsupported charset. Retrying with 'utf8'.")
stream.seek(stream.tell() - text_length)
value = stream.read(text_length).decode('utf8')
except UnicodeDecodeError:
stream.seek(stream.tell() - text_length)
if charset == 'utf8':
logger.warning("Found unsupported character for utf8. Retrying with errors=='replace'.")
else:
logger.warning(f"Found unsupported character for charset {charset}. "
f"Retrying with 'utf8' and errors=='replace'.")
value = stream.read(text_length).decode('utf8', errors="replace")

# Normalize to xml spec where all line breaks in attributes are represented by \n
value = value.replace("\r", "\n")
logger.debug(f"Read String '{value}' with {len(font_styles)} different styles.")
Expand Down Expand Up @@ -1967,8 +1977,8 @@ def __init__(self, value: int):

@staticmethod
def from_bytes(property_bytes: bytes) -> 'CDXPositioningType':
if len(property_bytes) != 2:
raise ValueError("CDXPositioningType should consist of exactly 2 bytes.")
if len(property_bytes) != 1:
raise ValueError("CDXPositioningType should consist of exactly 1 bytes.")
value = int.from_bytes(property_bytes, "little", signed=True)
return CDXPositioningType(value)

Expand All @@ -1977,7 +1987,7 @@ def from_string(value: str) -> 'CDXPositioningType':
return CDXPositioningType[value]

def to_bytes(self) -> bytes:
return self.positioning_type.to_bytes(2, byteorder='little', signed=True)
return self.positioning_type.to_bytes(1, byteorder='little', signed=True)

def to_property_value(self) -> str:
val = str(CDXPositioningType(self.positioning_type))
Expand Down
3 changes: 3 additions & 0 deletions tests/cdxml_converter_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

cwd = os.getcwd()
if cwd.endswith("tests"):
os.chdir(Path(cwd).parent)

class CdxmlConverterTest(unittest.TestCase):
"""
Expand Down

0 comments on commit b09630b

Please sign in to comment.