-
Notifications
You must be signed in to change notification settings - Fork 2.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve caching by comparing file hashes as fallback for mtime and size #3821
Changes from all commits
d908b98
7dd6427
07fe5b1
64bbbce
85b4a91
088ea2f
4278c86
6e1a57b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,28 @@ | ||
"""Caching of formatted files with feature-based invalidation.""" | ||
|
||
import hashlib | ||
import os | ||
import pickle | ||
import sys | ||
import tempfile | ||
from dataclasses import dataclass, field | ||
from pathlib import Path | ||
from typing import Dict, Iterable, Set, Tuple | ||
from typing import Dict, Iterable, NamedTuple, Set, Tuple | ||
|
||
from platformdirs import user_cache_dir | ||
|
||
from _black_version import version as __version__ | ||
from black.mode import Mode | ||
|
||
# types | ||
Timestamp = float | ||
FileSize = int | ||
CacheInfo = Tuple[Timestamp, FileSize] | ||
Cache = Dict[str, CacheInfo] | ||
if sys.version_info >= (3, 11): | ||
from typing import Self | ||
else: | ||
from typing_extensions import Self | ||
|
||
|
||
class FileData(NamedTuple): | ||
st_mtime: float | ||
st_size: int | ||
hash: str | ||
|
||
|
||
def get_cache_dir() -> Path: | ||
|
@@ -37,61 +44,92 @@ def get_cache_dir() -> Path: | |
CACHE_DIR = get_cache_dir() | ||
|
||
|
||
def read_cache(mode: Mode) -> Cache: | ||
"""Read the cache if it exists and is well formed. | ||
|
||
If it is not well formed, the call to write_cache later should resolve the issue. | ||
""" | ||
cache_file = get_cache_file(mode) | ||
if not cache_file.exists(): | ||
return {} | ||
|
||
with cache_file.open("rb") as fobj: | ||
try: | ||
cache: Cache = pickle.load(fobj) | ||
except (pickle.UnpicklingError, ValueError, IndexError): | ||
return {} | ||
|
||
return cache | ||
|
||
|
||
def get_cache_file(mode: Mode) -> Path: | ||
return CACHE_DIR / f"cache.{mode.get_cache_key()}.pickle" | ||
|
||
|
||
def get_cache_info(path: Path) -> CacheInfo: | ||
"""Return the information used to check if a file is already formatted or not.""" | ||
stat = path.stat() | ||
return stat.st_mtime, stat.st_size | ||
|
||
|
||
def filter_cached(cache: Cache, sources: Iterable[Path]) -> Tuple[Set[Path], Set[Path]]: | ||
"""Split an iterable of paths in `sources` into two sets. | ||
|
||
The first contains paths of files that modified on disk or are not in the | ||
cache. The other contains paths to non-modified files. | ||
""" | ||
todo, done = set(), set() | ||
for src in sources: | ||
res_src = src.resolve() | ||
if cache.get(str(res_src)) != get_cache_info(res_src): | ||
todo.add(src) | ||
else: | ||
done.add(src) | ||
return todo, done | ||
|
||
|
||
def write_cache(cache: Cache, sources: Iterable[Path], mode: Mode) -> None: | ||
"""Update the cache file.""" | ||
cache_file = get_cache_file(mode) | ||
try: | ||
CACHE_DIR.mkdir(parents=True, exist_ok=True) | ||
new_cache = { | ||
**cache, | ||
**{str(src.resolve()): get_cache_info(src) for src in sources}, | ||
} | ||
with tempfile.NamedTemporaryFile(dir=str(cache_file.parent), delete=False) as f: | ||
pickle.dump(new_cache, f, protocol=4) | ||
os.replace(f.name, cache_file) | ||
except OSError: | ||
pass | ||
@dataclass | ||
class Cache: | ||
mode: Mode | ||
cache_file: Path | ||
file_data: Dict[str, FileData] = field(default_factory=dict) | ||
|
||
@classmethod | ||
def read(cls, mode: Mode) -> Self: | ||
"""Read the cache if it exists and is well formed. | ||
|
||
If it is not well formed, the call to write later should | ||
resolve the issue. | ||
""" | ||
cache_file = get_cache_file(mode) | ||
if not cache_file.exists(): | ||
return cls(mode, cache_file) | ||
|
||
with cache_file.open("rb") as fobj: | ||
try: | ||
file_data: Dict[str, FileData] = pickle.load(fobj) | ||
except (pickle.UnpicklingError, ValueError, IndexError): | ||
return cls(mode, cache_file) | ||
|
||
return cls(mode, cache_file, file_data) | ||
|
||
@staticmethod | ||
def hash_digest(path: Path) -> str: | ||
"""Return hash digest for path.""" | ||
|
||
data = path.read_bytes() | ||
return hashlib.sha256(data).hexdigest() | ||
|
||
@staticmethod | ||
def get_file_data(path: Path) -> FileData: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not a global function? staticmethods often feel a bit useless. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like it here as it helps to group these methods nicely. Obviously personal preference. Though, if you want me to change it, I can do that too. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's just leave it as is, thanks! |
||
"""Return file data for path.""" | ||
|
||
stat = path.stat() | ||
hash = Cache.hash_digest(path) | ||
return FileData(stat.st_mtime, stat.st_size, hash) | ||
|
||
def is_changed(self, source: Path) -> bool: | ||
"""Check if source has changed compared to cached version.""" | ||
res_src = source.resolve() | ||
old = self.file_data.get(str(res_src)) | ||
if old is None: | ||
return True | ||
|
||
st = res_src.stat() | ||
if st.st_size != old.st_size: | ||
return True | ||
if int(st.st_mtime) != int(old.st_mtime): | ||
new_hash = Cache.hash_digest(res_src) | ||
if new_hash != old.hash: | ||
return True | ||
return False | ||
|
||
def filtered_cached(self, sources: Iterable[Path]) -> Tuple[Set[Path], Set[Path]]: | ||
"""Split an iterable of paths in `sources` into two sets. | ||
|
||
The first contains paths of files that modified on disk or are not in the | ||
cache. The other contains paths to non-modified files. | ||
""" | ||
changed: Set[Path] = set() | ||
done: Set[Path] = set() | ||
for src in sources: | ||
cdce8p marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if self.is_changed(src): | ||
changed.add(src) | ||
else: | ||
done.add(src) | ||
return changed, done | ||
|
||
def write(self, sources: Iterable[Path]) -> None: | ||
"""Update the cache file data and write a new cache file.""" | ||
self.file_data.update( | ||
**{str(src.resolve()): Cache.get_file_data(src) for src in sources} | ||
) | ||
try: | ||
CACHE_DIR.mkdir(parents=True, exist_ok=True) | ||
with tempfile.NamedTemporaryFile( | ||
dir=str(self.cache_file.parent), delete=False | ||
) as f: | ||
pickle.dump(self.file_data, f, protocol=4) | ||
os.replace(f.name, self.cache_file) | ||
except OSError: | ||
pass |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: I think it should be fine to use sha1 here, which is about 1.4x faster for me
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sha256
works well for mypy. I would stick with it in this case.https://github.com/python/mypy/blob/v1.5.1/mypy/util.py#L501-L510