From 4bd612f8d07e632511718d15c07abc3cbb66d09a Mon Sep 17 00:00:00 2001 From: Matej Aleksandrov Date: Thu, 17 Oct 2024 08:08:50 -0700 Subject: [PATCH] Add --pyink-ipynb-unicode-escape option for unicode escaping PiperOrigin-RevId: 686918137 --- patches/pyink.patch | 65 ++++++++++++++++--------- src/pyink/__init__.py | 12 +++++ src/pyink/ink.py | 20 ++++++++ src/pyink/mode.py | 2 + tests/data/pyink_configs/example.ipynb | 4 +- tests/data/pyink_configs/overrides.toml | 1 + tests/test_ipynb.py | 21 +++++--- 7 files changed, 92 insertions(+), 33 deletions(-) diff --git a/patches/pyink.patch b/patches/pyink.patch index 1cca49b3b6f..ed0e9ee5066 100644 --- a/patches/pyink.patch +++ b/patches/pyink.patch @@ -37,7 +37,7 @@ COMPILED = Path(__file__).suffix in (".pyd", ".so") -@@ -338,6 +345,53 @@ def validate_regex( +@@ -338,6 +345,61 @@ def validate_regex( ), ) @click.option( @@ -67,6 +67,14 @@ + ), +) +@click.option( ++ "--pyink-ipynb-unicode-escape", ++ is_flag=True, ++ help=( ++ "Enable serialization of Jupyter notebook content into a JSON form" ++ " where characters <, >, and & are unicode escaped." 
++ ), ++) ++@click.option( + "--pyink-annotation-pragmas", + type=str, + multiple=True, @@ -91,19 +99,20 @@ "--check", is_flag=True, help=( -@@ -530,6 +584,11 @@ def main( # noqa: C901 +@@ -530,6 +592,12 @@ def main( # noqa: C901 preview: bool, unstable: bool, enable_unstable_feature: list[Preview], + pyink: bool, + pyink_indentation: str, + pyink_ipynb_indentation: str, ++ pyink_ipynb_unicode_escape: bool, + pyink_annotation_pragmas: list[str], + pyink_use_majority_quotes: bool, quiet: bool, verbose: bool, required_version: Optional[str], -@@ -636,7 +695,15 @@ def main( # noqa: C901 +@@ -636,7 +704,16 @@ def main( # noqa: C901 preview=preview, unstable=unstable, python_cell_magics=set(python_cell_magics), @@ -111,6 +120,7 @@ + is_pyink=pyink, + pyink_indentation=int(pyink_indentation), + pyink_ipynb_indentation=int(pyink_ipynb_indentation), ++ pyink_ipynb_unicode_escape=pyink_ipynb_unicode_escape, + pyink_annotation_pragmas=( + tuple(pyink_annotation_pragmas) or DEFAULT_ANNOTATION_PRAGMAS + ), @@ -120,7 +130,7 @@ ) lines: list[tuple[int, int]] = [] -@@ -1132,6 +1199,17 @@ def validate_metadata(nb: MutableMapping +@@ -1132,6 +1209,17 @@ def validate_metadata(nb: MutableMapping if language is not None and language != "python": raise NothingChanged from None @@ -138,7 +148,7 @@ def format_ipynb_string(src_contents: str, *, fast: bool, mode: Mode) -> FileContent: """Format Jupyter notebook. 
-@@ -1143,7 +1221,6 @@ def format_ipynb_string(src_contents: st +@@ -1143,7 +1231,6 @@ def format_ipynb_string(src_contents: st raise NothingChanged trailing_newline = src_contents[-1] == "\n" @@ -146,7 +156,7 @@ nb = json.loads(src_contents) validate_metadata(nb) for cell in nb["cells"]: -@@ -1155,14 +1232,15 @@ def format_ipynb_string(src_contents: st +@@ -1155,14 +1242,17 @@ def format_ipynb_string(src_contents: st pass else: cell["source"] = dst.splitlines(keepends=True) @@ -164,13 +174,15 @@ + dst_contents = json.dumps( + nb, indent=mode.pyink_ipynb_indentation, ensure_ascii=False + ) ++ if mode.pyink_ipynb_unicode_escape: ++ dst_contents = ink.unicode_escape_json(dst_contents) + if trailing_newline: + dst_contents = dst_contents + "\n" + return dst_contents def format_str( -@@ -1223,6 +1301,8 @@ def _format_str_once( +@@ -1223,6 +1313,8 @@ def _format_str_once( future_imports = get_future_imports(src_node) versions = detect_target_versions(src_node, future_imports=future_imports) @@ -973,7 +985,7 @@ @dataclass -@@ -237,12 +261,20 @@ class Mode: +@@ -237,12 +261,21 @@ class Mode: target_versions: set[TargetVersion] = field(default_factory=set) line_length: int = DEFAULT_LINE_LENGTH string_normalization: bool = True @@ -990,11 +1002,12 @@ + is_pyink: bool = False + pyink_indentation: Literal[2, 4] = 4 + pyink_ipynb_indentation: Literal[1, 2] = 1 ++ pyink_ipynb_unicode_escape: bool = False + pyink_annotation_pragmas: tuple[str, ...] = DEFAULT_ANNOTATION_PRAGMAS unstable: bool = False enabled_features: set[Preview] = field(default_factory=set) -@@ -254,6 +286,9 @@ class Mode: +@@ -254,6 +287,9 @@ class Mode: except those in UNSTABLE_FEATURES are enabled. Any features in `self.enabled_features` are also enabled. 
""" @@ -1004,7 +1017,7 @@ if self.unstable: return True if feature in self.enabled_features: -@@ -285,12 +320,27 @@ class Mode: +@@ -285,12 +321,28 @@ class Mode: version_str, str(self.line_length), str(int(self.string_normalization)), @@ -1019,6 +1032,7 @@ + str(int(self.is_pyink)), + str(self.pyink_indentation), + str(self.pyink_ipynb_indentation), ++ str(int(self.pyink_ipynb_unicode_escape)), + sha256(str(self.pyink_annotation_pragmas).encode()).hexdigest()[:8], features_and_magics, ] @@ -1439,18 +1453,23 @@ format_cell, format_file_contents, format_file_in_place, -@@ -27,8 +28,10 @@ pytest.importorskip("IPython", reason="I +@@ -27,8 +28,15 @@ pytest.importorskip("IPython", reason="I pytest.importorskip("tokenize_rt", reason="tokenize-rt is an optional dependency") JUPYTER_MODE = Mode(is_ipynb=True) -+PYINK_JUPYTER_MODE = Mode(is_ipynb=True, pyink_indentation=2, pyink_ipynb_indentation=2) ++PYINK_JUPYTER_MODE = Mode( ++ is_ipynb=True, ++ pyink_indentation=2, ++ pyink_ipynb_indentation=2, ++ pyink_ipynb_unicode_escape=True, ++) EMPTY_CONFIG = DATA_DIR / "empty_pyproject.toml" +PYINK_OVERRIDE_CONFIG = DATA_DIR / "pyink_configs" / "overrides.toml" runner = CliRunner() -@@ -174,6 +177,22 @@ def test_cell_magic_with_magic() -> None +@@ -174,6 +182,22 @@ def test_cell_magic_with_magic() -> None @pytest.mark.parametrize( @@ -1473,7 +1492,7 @@ "mode, expected_output, expectation", [ pytest.param( -@@ -224,6 +243,13 @@ def test_cell_magic_with_custom_python_m +@@ -224,6 +248,13 @@ def test_cell_magic_with_custom_python_m format_cell(src, fast=True, mode=JUPYTER_MODE) @@ -1487,7 +1506,7 @@ def test_cell_magic_nested() -> None: src = "%%time\n%%time\n2+2" result = format_cell(src, fast=True, mode=JUPYTER_MODE) -@@ -397,6 +423,45 @@ def test_entire_notebook_no_trailing_new +@@ -397,6 +428,45 @@ def test_entire_notebook_no_trailing_new assert result == expected @@ -1513,8 +1532,8 @@ + ' "%%time\\n",\n' + ' "\\n",\n' + ' "a = 1\\n",\n' -+ ' "if a == 1:\\n",\n' -+ ' " 
print(\\"\\")"\n' ++ ' "if a \\u003c 1 or a \\u003e 1:\\n",\n' ++ ' " print(\\"\\u0026\\u003c\\u003e\\")"\n' + " ]\n" + " }\n" + " ],\n" @@ -1533,7 +1552,7 @@ def test_entire_notebook_without_changes() -> None: content = read_jupyter_notebook("jupyter", "notebook_without_changes") with pytest.raises(NothingChanged): -@@ -448,6 +513,30 @@ def test_ipynb_diff_with_no_change() -> +@@ -448,6 +518,30 @@ def test_ipynb_diff_with_no_change() -> assert expected in result.output @@ -1546,18 +1565,18 @@ + f"--config={PYINK_OVERRIDE_CONFIG}", + ], + ) -+ expected = """00:00:cell_1 ++ expected = """cell_1 +@@ -1,6 +1,5 @@ +- %%time ++%%time + +-a=1 -+-if a ==1: -+- print("") ++-if a <1 or a>1: ++- print("&<>") +- ++a = 1 -++if a == 1: -++ print("")""" +++if a < 1 or a > 1: +++ print("&<>")""" + assert expected in result.output + + diff --git a/src/pyink/__init__.py b/src/pyink/__init__.py index fdad04144f9..99647535b56 100644 --- a/src/pyink/__init__.py +++ b/src/pyink/__init__.py @@ -370,6 +370,14 @@ def validate_regex( " notebook." ), ) +@click.option( + "--pyink-ipynb-unicode-escape", + is_flag=True, + help=( + "Enable serialization of Jupyter notebook content into a JSON form" + " where characters <, >, and & are unicode escaped." 
+ ), +) @click.option( "--pyink-annotation-pragmas", type=str, @@ -587,6 +595,7 @@ def main( # noqa: C901 pyink: bool, pyink_indentation: str, pyink_ipynb_indentation: str, + pyink_ipynb_unicode_escape: bool, pyink_annotation_pragmas: list[str], pyink_use_majority_quotes: bool, quiet: bool, @@ -698,6 +707,7 @@ def main( # noqa: C901 is_pyink=pyink, pyink_indentation=int(pyink_indentation), pyink_ipynb_indentation=int(pyink_ipynb_indentation), + pyink_ipynb_unicode_escape=pyink_ipynb_unicode_escape, pyink_annotation_pragmas=( tuple(pyink_annotation_pragmas) or DEFAULT_ANNOTATION_PRAGMAS ), @@ -1238,6 +1248,8 @@ def format_ipynb_string(src_contents: str, *, fast: bool, mode: Mode) -> FileCon dst_contents = json.dumps( nb, indent=mode.pyink_ipynb_indentation, ensure_ascii=False ) + if mode.pyink_ipynb_unicode_escape: + dst_contents = ink.unicode_escape_json(dst_contents) if trailing_newline: dst_contents = dst_contents + "\n" return dst_contents diff --git a/src/pyink/ink.py b/src/pyink/ink.py index b57a3094018..042c1ed98bf 100644 --- a/src/pyink/ink.py +++ b/src/pyink/ink.py @@ -75,6 +75,26 @@ def get_code_start(src: str) -> str: return "" +def unicode_escape_json(src: str) -> str: + """Escapes problematic unicode characters in JSON string. + + This mimics the implementation in Colab backend and converts characters + <, >, and & to their unicode representations. More info in + go/unicode-escaping-in-colab. + + Args: + src: A serialized JSON string. + + Returns: + A serialized JSON string with unicode escaped characters. + """ + def _match_to_unicode(match: re.Match[str]) -> str: + char = match.group(0) + return f"\\u{hex(ord(char))[2:].zfill(4)}" + + return re.sub(r"[<>&]", _match_to_unicode, src) + + def convert_unchanged_lines(src_node: Node, lines: Collection[tuple[int, int]]): """Converts unchanged lines to STANDALONE_COMMENT. 
diff --git a/src/pyink/mode.py b/src/pyink/mode.py index 19d3b49750c..af74e5a7638 100644 --- a/src/pyink/mode.py +++ b/src/pyink/mode.py @@ -274,6 +274,7 @@ class Mode: is_pyink: bool = False pyink_indentation: Literal[2, 4] = 4 pyink_ipynb_indentation: Literal[1, 2] = 1 + pyink_ipynb_unicode_escape: bool = False pyink_annotation_pragmas: tuple[str, ...] = DEFAULT_ANNOTATION_PRAGMAS unstable: bool = False enabled_features: set[Preview] = field(default_factory=set) @@ -331,6 +332,7 @@ def get_cache_key(self) -> str: str(int(self.is_pyink)), str(self.pyink_indentation), str(self.pyink_ipynb_indentation), + str(int(self.pyink_ipynb_unicode_escape)), sha256(str(self.pyink_annotation_pragmas).encode()).hexdigest()[:8], features_and_magics, ] diff --git a/tests/data/pyink_configs/example.ipynb b/tests/data/pyink_configs/example.ipynb index 2cef7ca7dc6..410394efc3a 100644 --- a/tests/data/pyink_configs/example.ipynb +++ b/tests/data/pyink_configs/example.ipynb @@ -16,8 +16,8 @@ " %%time\n", "\n", "a=1\n", - "if a ==1:\n", - " print(\"\")\n" + "if a <1 or a>1:\n", + " print(\"&<>\")\n" ] } ], diff --git a/tests/data/pyink_configs/overrides.toml b/tests/data/pyink_configs/overrides.toml index 067f882d16b..88fa22556c9 100644 --- a/tests/data/pyink_configs/overrides.toml +++ b/tests/data/pyink_configs/overrides.toml @@ -1,4 +1,5 @@ [tool.pyink] pyink-indentation = 2 pyink-ipynb-indentation = 2 +pyink-ipynb-unicode-escape = true pyink-annotation-pragmas = ["@param", "type: ignore"] diff --git a/tests/test_ipynb.py b/tests/test_ipynb.py index 0382106b048..971d3427e87 100644 --- a/tests/test_ipynb.py +++ b/tests/test_ipynb.py @@ -28,7 +28,12 @@ pytest.importorskip("tokenize_rt", reason="tokenize-rt is an optional dependency") JUPYTER_MODE = Mode(is_ipynb=True) -PYINK_JUPYTER_MODE = Mode(is_ipynb=True, pyink_indentation=2, pyink_ipynb_indentation=2) +PYINK_JUPYTER_MODE = Mode( + is_ipynb=True, + pyink_indentation=2, + pyink_ipynb_indentation=2, + pyink_ipynb_unicode_escape=True, 
+) EMPTY_CONFIG = DATA_DIR / "empty_pyproject.toml" PYINK_OVERRIDE_CONFIG = DATA_DIR / "pyink_configs" / "overrides.toml" @@ -445,8 +450,8 @@ def test_entire_notebook_with_pyink_overrides() -> None: ' "%%time\\n",\n' ' "\\n",\n' ' "a = 1\\n",\n' - ' "if a == 1:\\n",\n' - ' " print(\\"\\")"\n' + ' "if a \\u003c 1 or a \\u003e 1:\\n",\n' + ' " print(\\"\\u0026\\u003c\\u003e\\")"\n' " ]\n" " }\n" " ],\n" @@ -522,18 +527,18 @@ def test_ipynb_diff_with_pyink_overrides() -> None: f"--config={PYINK_OVERRIDE_CONFIG}", ], ) - expected = """00:00:cell_1 + expected = """cell_1 @@ -1,6 +1,5 @@ - %%time +%%time -a=1 --if a ==1: -- print("") +-if a <1 or a>1: +- print("&<>") - +a = 1 -+if a == 1: -+ print("")""" ++if a < 1 or a > 1: ++ print("&<>")""" assert expected in result.output