Skip to content

Commit

Permalink
Improve caching by comparing file hashes as fallback for mtime and size
Browse files Browse the repository at this point in the history
  • Loading branch information
cdce8p committed Jul 29, 2023
1 parent 1a972e3 commit d908b98
Show file tree
Hide file tree
Showing 8 changed files with 229 additions and 132 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
<!-- Changes that improve Black's performance. -->

- Avoid importing `IPython` if notebook cells do not contain magics (#3782)
- Improve caching by comparing file hashes as fallback for mtime and size. (#3821)

### Output

Expand Down
7 changes: 7 additions & 0 deletions docs/contributing/reference/reference_classes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,13 @@ Black Classes
:show-inheritance:
:members:

:class:`Cache`
------------------------

.. autoclass:: black.cache.Cache
:show-inheritance:
   :members:

Enum Classes
~~~~~~~~~~~~~

Expand Down
8 changes: 0 additions & 8 deletions docs/contributing/reference/reference_functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -94,18 +94,10 @@ Split functions
Caching
-------

.. autofunction:: black.cache.filter_cached

.. autofunction:: black.cache.get_cache_dir

.. autofunction:: black.cache.get_cache_file

.. autofunction:: black.cache.get_cache_info

.. autofunction:: black.cache.read_cache

.. autofunction:: black.cache.write_cache

Utilities
---------

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ dependencies = [
"pathspec>=0.9.0",
"platformdirs>=2",
"tomli>=1.1.0; python_version < '3.11'",
"typing_extensions>=3.10.0.0; python_version < '3.10'",
"typing_extensions>=4.0.1; python_version < '3.11'",
]
dynamic = ["readme", "version"]

Expand Down
12 changes: 5 additions & 7 deletions src/black/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from pathspec.patterns.gitwildmatch import GitWildMatchPatternError

from _black_version import version as __version__
from black.cache import Cache, get_cache_info, read_cache, write_cache
from black.cache import Cache
from black.comments import normalize_fmt_off
from black.const import (
DEFAULT_EXCLUDES,
Expand Down Expand Up @@ -775,12 +775,9 @@ def reformat_one(
if format_stdin_to_stdout(fast=fast, write_back=write_back, mode=mode):
changed = Changed.YES
else:
cache: Cache = {}
cache = Cache.read(mode)
if write_back not in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
cache = read_cache(mode)
res_src = src.resolve()
res_src_s = str(res_src)
if res_src_s in cache and cache[res_src_s] == get_cache_info(res_src):
if not cache.is_changed(src):
changed = Changed.CACHED
if changed is not Changed.CACHED and format_file_in_place(
src, fast=fast, write_back=write_back, mode=mode
Expand All @@ -789,7 +786,8 @@ def reformat_one(
if (write_back is WriteBack.YES and changed is not Changed.CACHED) or (
write_back is WriteBack.CHECK and changed is Changed.NO
):
write_cache(cache, [src], mode)
cache.update([src])
cache.write()
report.done(src, changed)
except Exception as exc:
if report.verbose:
Expand Down
167 changes: 108 additions & 59 deletions src/black/cache.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,28 @@
"""Caching of formatted files with feature-based invalidation."""

import hashlib
import os
import pickle
import sys
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Iterable, Set, Tuple
from typing import Dict, Iterable, NamedTuple, Set, Tuple

from platformdirs import user_cache_dir

from _black_version import version as __version__
from black.mode import Mode

# types
Timestamp = float
FileSize = int
CacheInfo = Tuple[Timestamp, FileSize]
Cache = Dict[str, CacheInfo]
if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self


class FileData(NamedTuple):
    """Per-file metadata stored in the cache and pickled to disk.

    Used by Cache.is_changed/filtered_cached to decide whether a file
    needs reformatting: size and (whole-second) mtime are checked first,
    with the content hash as the fallback tie-breaker.
    """

    # NOTE(review): `hash` shadows the builtin, but renaming it would break
    # the pickled cache format and attribute access elsewhere.
    st_mtime: float  # last-modified time from os.stat()
    st_size: int  # file size in bytes from os.stat()
    hash: str  # hex digest of the file contents (see Cache.hash_digest)


def get_cache_dir() -> Path:
Expand All @@ -37,61 +44,103 @@ def get_cache_dir() -> Path:
CACHE_DIR = get_cache_dir()


def read_cache(mode: Mode) -> Cache:
    """Load the pickled cache for *mode*, or an empty cache.

    A missing or malformed cache file simply yields an empty mapping;
    the later call to write_cache will recreate the file.
    """
    path = get_cache_file(mode)
    if not path.exists():
        return {}

    with path.open("rb") as stream:
        try:
            loaded: Cache = pickle.load(stream)
        except (pickle.UnpicklingError, ValueError, IndexError):
            # Corrupt or incompatible cache file: start fresh.
            return {}

    return loaded


def get_cache_file(mode: Mode) -> Path:
    """Return the path of the pickle file that caches results for *mode*."""
    filename = f"cache.{mode.get_cache_key()}.pickle"
    return CACHE_DIR / filename


def get_cache_info(path: Path) -> CacheInfo:
    """Return the (mtime, size) pair used to detect whether *path* changed."""
    st = path.stat()
    return (st.st_mtime, st.st_size)

@dataclass
class Cache:
    """Cache of already-formatted files, keyed by resolved absolute path.

    Entries record mtime, size, and a content hash (see FileData); the
    whole mapping is pickled to `cache_file` by write().
    """

    mode: Mode  # formatting mode this cache is keyed on
    cache_file: Path  # on-disk pickle location for this mode
    # resolved absolute path (str) -> FileData for that file
    file_data: Dict[str, FileData] = field(default_factory=dict)

@classmethod
def read(cls, mode: Mode) -> Self:
    """Load the on-disk cache for *mode*.

    A missing or unreadable cache file produces an empty Cache; the
    subsequent call to write() will then recreate the file.
    """
    path = get_cache_file(mode)
    if not path.exists():
        return cls(mode, path)

    with path.open("rb") as stream:
        try:
            stored: Dict[str, FileData] = pickle.load(stream)
        except (pickle.UnpicklingError, ValueError, IndexError):
            # Corrupt or incompatible pickle: fall back to an empty cache.
            stored = {}

    return cls(mode, path, stored)

@staticmethod
def hash_digest(path: Path) -> str:
"""Return hash digest for path."""

with open(path, "rb") as fp:
data = fp.read()
return hashlib.sha256(data).hexdigest()

@staticmethod
def get_file_data(path: Path) -> FileData:
    """Build the FileData record (mtime, size, content hash) for *path*."""
    st = path.stat()
    return FileData(st.st_mtime, st.st_size, Cache.hash_digest(path))

def is_changed(self, source: Path) -> bool:
    """Report whether *source* differs from its cached version.

    A size change always counts as changed.  An mtime change (compared at
    whole-second granularity) triggers a content-hash comparison as the
    tie-breaker, so a mere `touch` without edits keeps the entry valid.
    """
    resolved = source.resolve()
    cached = self.file_data.get(str(resolved))
    if cached is None:
        # Never formatted before (or cache was discarded).
        return True

    st = resolved.stat()
    if st.st_size != cached.st_size:
        return True
    if int(st.st_mtime) == int(cached.st_mtime):
        return False
    # Same size but newer mtime: fall back to comparing content hashes.
    return Cache.hash_digest(resolved) != cached.hash

def filtered_cached(self, sources: Iterable[Path]) -> Tuple[Set[Path], Set[Path]]:
    """Partition *sources* into two sets.

    The first set holds paths that were modified on disk or are missing
    from the cache; the second holds paths whose cache entry still
    matches (size and mtime agree, or the content hash does).
    """
    changed: Set[Path] = set()
    done: Set[Path] = set()
    for src in sources:
        resolved = src.resolve()
        entry = self.file_data.get(str(resolved))
        st = resolved.stat()
        if entry is None:
            # No cache entry at all.
            changed.add(src)
        elif st.st_size != entry.st_size:
            changed.add(src)
        elif (
            int(st.st_mtime) != int(entry.st_mtime)
            and Cache.hash_digest(resolved) != entry.hash
        ):
            # mtime moved and the contents really differ.
            changed.add(src)
        else:
            done.add(src)
    return changed, done

def filter_cached(cache: Cache, sources: Iterable[Path]) -> Tuple[Set[Path], Set[Path]]:
"""Split an iterable of paths in `sources` into two sets.
def update(self, sources: Iterable[Path]) -> None:
    """Record fresh file data for *sources* in the in-memory cache.

    Entries are keyed by the resolved absolute path; call write() to
    persist the updated data to disk.
    """
    # Pass the mapping positionally: `dict.update(**mapping)` needlessly
    # re-packs every key as a keyword argument.
    self.file_data.update(
        {str(src.resolve()): Cache.get_file_data(src) for src in sources}
    )

The first contains paths of files that modified on disk or are not in the
cache. The other contains paths to non-modified files.
"""
todo, done = set(), set()
for src in sources:
res_src = src.resolve()
if cache.get(str(res_src)) != get_cache_info(res_src):
todo.add(src)
else:
done.add(src)
return todo, done


def write_cache(cache: Cache, sources: Iterable[Path], mode: Mode) -> None:
    """Persist *cache*, refreshed with info for *sources*, for *mode*.

    Writes atomically via a temporary file renamed over the target; any
    OSError is swallowed so a failed cache write never breaks formatting.
    """
    target = get_cache_file(mode)
    try:
        CACHE_DIR.mkdir(parents=True, exist_ok=True)
        # Merge inside the try-block: get_cache_info stats each source and
        # may itself raise OSError, which is deliberately ignored here.
        merged = dict(cache)
        for src in sources:
            merged[str(src.resolve())] = get_cache_info(src)
        with tempfile.NamedTemporaryFile(dir=str(target.parent), delete=False) as tmp:
            pickle.dump(merged, tmp, protocol=4)
        os.replace(tmp.name, target)
    except OSError:
        pass
def write(self) -> None:
    """Atomically persist the in-memory file data to the cache file.

    The mapping is pickled into a sibling temporary file which is then
    renamed over the target; OSErrors are deliberately ignored so a
    failed cache write never aborts formatting.
    """
    try:
        CACHE_DIR.mkdir(parents=True, exist_ok=True)
        parent = str(self.cache_file.parent)
        with tempfile.NamedTemporaryFile(dir=parent, delete=False) as tmp:
            pickle.dump(self.file_data, tmp, protocol=4)
        os.replace(tmp.name, self.cache_file)
    except OSError:
        pass
10 changes: 5 additions & 5 deletions src/black/concurrency.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from mypy_extensions import mypyc_attr

from black import WriteBack, format_file_in_place
from black.cache import Cache, filter_cached, read_cache, write_cache
from black.cache import Cache
from black.mode import Mode
from black.output import err
from black.report import Changed, Report
Expand Down Expand Up @@ -133,10 +133,9 @@ async def schedule_formatting(
`write_back`, `fast`, and `mode` options are passed to
:func:`format_file_in_place`.
"""
cache: Cache = {}
cache = Cache.read(mode)
if write_back not in (WriteBack.DIFF, WriteBack.COLOR_DIFF):
cache = read_cache(mode)
sources, cached = filter_cached(cache, sources)
sources, cached = cache.filtered_cached(sources)
for src in sorted(cached):
report.done(src, Changed.CACHED)
if not sources:
Expand Down Expand Up @@ -185,4 +184,5 @@ async def schedule_formatting(
if cancelled:
await asyncio.gather(*cancelled, return_exceptions=True)
if sources_to_cache:
write_cache(cache, sources_to_cache, mode)
cache.update(sources_to_cache)
cache.write()
Loading

0 comments on commit d908b98

Please sign in to comment.