Added gitignore-flavor ignore patterns support; Refactored file scanning logic during backup creation,

resolved #31, closed #33
see also #32
Fallen-Breath committed Jul 17, 2024
1 parent aaeec5a commit be5e21a
Showing 6 changed files with 152 additions and 88 deletions.
24 changes: 20 additions & 4 deletions docs/config.md
@@ -211,8 +211,9 @@ Configs on how the backup is made
"targets": [
"world"
],
"ignored_files": [
"session.lock"
"ignored_files": [],
"ignore_patterns": [
"**/session.lock"
],
"follow_target_symlink": false,
"hash_method": "xxh128",
@@ -259,8 +260,11 @@ For example, for bukkit-like servers that split the world dimensions, you might

#### ignored_files

A list of file names to be ignored during backup. It contains `session.lock` by default
to solve the backup failure problem caused by `session.lock` being occupied by the server in Windows
!!! warning

Deprecated since v1.8.0. Use [ignore_patterns](#ignore_patterns) instead

A list of file / directory names to be ignored during backup

If the name string starts with `*`, files whose names end with the given string will be ignored,
e.g. `*.test` makes all files ending with `.test` ignored, like `a.test`
@@ -270,6 +274,18 @@ e.g. `temp*` makes all files starting with `temp` ignored, like `tempfile`

- Type: `List[str]`
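
For illustration, a minimal standalone sketch of these deprecated name rules; the helper name `is_ignored_by_name` is hypothetical, while the actual check in this commit is `BackupConfig.is_file_ignore_by_deprecated_ignored_files`:

```python
from typing import List


def is_ignored_by_name(file_name: str, ignored_files: List[str]) -> bool:
    # Mirrors the deprecated ignored_files rules: '*suffix', 'prefix*', or exact name
    for item in ignored_files:
        if not item:
            continue
        if item[0] == '*' and file_name.endswith(item[1:]):
            return True
        if item[-1] == '*' and file_name.startswith(item[:-1]):
            return True
        if file_name == item:
            return True
    return False


print(is_ignored_by_name('a.test', ['*.test']))     # True
print(is_ignored_by_name('tempfile', ['temp*']))    # True
print(is_ignored_by_name('level.dat', ['*.test']))  # False
```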

#### ignore_patterns

A list of [gitignore flavor](http://git-scm.com/docs/gitignore) patterns for matching files / directories to be excluded during the backup

The root path for the pattern matching is [source_root](#source_root).
For example, if `source_root` is `server`, then the pattern `world/trash*.obj` will match `server/world/trash1.obj`

It contains a `**/session.lock` pattern by default, which matches files named `session.lock` at any location.
This solves the backup failure problem on Windows caused by `session.lock` being held open by the server

- Type: `List[str]`
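
A minimal sketch of how such patterns are applied, using the `pathspec` library that this commit adds as a dependency; the pattern list below is illustrative, and paths are matched relative to `source_root`:

```python
import pathspec

# Compile gitignore-flavor patterns, as create_backup_action.py does
spec = pathspec.GitIgnoreSpec.from_lines(['**/session.lock', 'world/trash*.obj'])

# Paths are given relative to source_root (e.g. 'server')
print(spec.match_file('world/session.lock'))        # True: matched at any location
print(spec.match_file('world/DIM-1/session.lock'))  # True
print(spec.match_file('world/trash1.obj'))          # True
print(spec.match_file('world/level.dat'))           # False
```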

#### follow_target_symlink

When set to `true`, for [backup targets](#targets) that are symbolic links,
24 changes: 20 additions & 4 deletions docs/config.zh.md
@@ -211,8 +211,9 @@ The operation sequence when Prime Backup creates a backup is as follows:
"targets": [
"world"
],
"ignored_files": [
"session.lock"
"ignored_files": [],
"ignore_patterns": [
"**/session.lock"
],
"follow_target_symlink": false,
"hash_method": "xxh128",
@@ -259,8 +260,11 @@ The operation sequence when Prime Backup creates a backup is as follows:

#### ignored_files

A list of file names to be ignored during backup; it only contains `session.lock` by default,
to solve the backup failure problem on Windows caused by `session.lock` being held open by the server
!!! warning

Deprecated since v1.8.0. Use [ignore_patterns](#ignore_patterns) instead

A list of file names to be ignored during backup

If the name string starts with `*`, files whose names end with the given string will be ignored,
e.g. `*.test` ignores all files ending with `.test`, like `a.test`
@@ -270,6 +274,18 @@ The operation sequence when Prime Backup creates a backup is as follows:

- Type: `List[str]`

#### ignore_patterns

A list of [gitignore-style](http://git-scm.com/docs/gitignore) patterns used to match and ignore the specified files / directories during backup creation

The root path for the pattern matching is [source_root](#source_root).
For example, if `source_root` is `server`, then the pattern `world/trash*.obj` will match `server/world/trash1.obj`

A `**/session.lock` pattern is included by default, which matches files named `session.lock` at any location,
to solve the backup failure problem on Windows caused by `session.lock` being held open by the server

- Type: `List[str]`

#### follow_target_symlink

When set to `true`, for [backup targets](#targets) that are symbolic links,
168 changes: 102 additions & 66 deletions prime_backup/action/create_backup_action.py
@@ -12,6 +12,8 @@
from pathlib import Path
from typing import List, Optional, Tuple, Callable, Any, Dict, NamedTuple, Generator, Union, Set, Deque, ContextManager

import pathspec

from prime_backup.action.create_backup_action_base import CreateBackupActionBase
from prime_backup.compressors import Compressor, CompressMethod
from prime_backup.db import schema
@@ -22,7 +24,7 @@
from prime_backup.types.backup_tags import BackupTags
from prime_backup.types.operator import Operator
from prime_backup.types.units import ByteCount
from prime_backup.utils import hash_utils, misc_utils, blob_utils, file_utils, path_utils
from prime_backup.utils import hash_utils, misc_utils, blob_utils, file_utils
from prime_backup.utils.thread_pool import FailFastThreadPool


@@ -38,10 +40,10 @@ class _BlobCreatePolicy(enum.Enum):
"""
the policy of how to create a blob from a given file path
"""
read_all = enum.auto() # small files: read all in memory, calc hash. read once
hash_once = enum.auto() # files with unique size: compress+hash to temp file, then move. read once
copy_hash = enum.auto() # files that keep changing: copy to temp file, calc hash, compress to blob. read twice and need more spaces
default = enum.auto() # default policy: compress+hash to blob store, check hash again. read twice
read_all = enum.auto() # small files: read all in memory, calc hash | read 1x, write 1x
hash_once = enum.auto() # files with unique size: compress+hash to temp file, then move | read 1x, write 1x, move 1x
copy_hash = enum.auto() # files that keep changing: copy to temp file, calc hash, compress to blob | read 2x, write 2x. need more spaces
default = enum.auto() # default policy: compress+hash to blob store, check hash again | read 2x, write 1x


_BLOB_FILE_CHANGED_RETRY_COUNT = 3
@@ -163,9 +165,25 @@ def flush(self):
self.fetcher_hash.flush()


class _ScanResult(NamedTuple):
all_file_paths: List[Path]
root_targets: List[str] # list of posix path, related to the source_path
@dataclasses.dataclass(frozen=True)
class _ScanResultEntry:
path: Path # full path, including source_root
stat: os.stat_result

def is_file(self) -> bool:
return stat.S_ISREG(self.stat.st_mode)

def is_dir(self) -> bool:
return stat.S_ISDIR(self.stat.st_mode)

def is_symlink(self) -> bool:
return stat.S_ISLNK(self.stat.st_mode)


@dataclasses.dataclass(frozen=True)
class _ScanResult:
all_files: List[_ScanResultEntry] = dataclasses.field(default_factory=list)
root_targets: List[str] = dataclasses.field(default_factory=list) # list of posix path, related to the source_path


@dataclasses.dataclass(frozen=True)
@@ -196,56 +214,73 @@ def __init__(self, creator: Operator, comment: str, *, tags: Optional[BackupTags
self.__source_path: Path = source_path or self.config.source_path

def __scan_files(self) -> _ScanResult:
collected = []

scanned_targets: Dict[str, bool] = {} # use as an ordered set
scan_queue: Deque[Path] = collections.deque() # a queue of paths related to the source_path
for scan_target in self.config.backup.targets:
scan_queue.append(Path(scan_target))

self.logger.debug(f'Scanning files at {list(scan_queue)}')
while len(scan_queue) > 0:
scan_target = scan_queue.popleft()
if (target_posix := scan_target.as_posix()) in scanned_targets:
continue
scanned_targets[target_posix] = True

target_path = self.__source_path / scan_target
if not target_path.exists():
self.logger.info('Skipping not-exist backup target {}'.format(target_path))
continue
if not path_utils.is_relative_to(target_path, self.__source_path):
self.logger.warning("Skipping backup target {} cuz it's not inside the source path {}".format(target_path, self.__source_path))
continue

collected.append(target_path)

if target_path.is_symlink() and self.config.backup.follow_target_symlink:
scan_queue.append(target_path.readlink())
continue

# as-is policy, don't scan into symlink
if not target_path.is_symlink() and target_path.is_dir():
for dir_path, dir_names, file_names in os.walk(target_path):
for name in file_names + dir_names:
file_path = Path(dir_path) / name
if not self.config.backup.is_file_ignore(file_path):
collected.append(file_path)

return _ScanResult(all_file_paths=collected, root_targets=list(scanned_targets.keys()))
ignore_patterns = pathspec.GitIgnoreSpec.from_lines(self.config.backup.ignore_patterns)
result = _ScanResult()
visited_path: Set[Path] = set() # full path
ignored_paths: List[Path] = [] # related path

def __pre_calculate_hash(self, session: DbSession, scan_result: _ScanResult):
def scan(full_path: Path, is_root_target: bool):
try:
rel_path = full_path.relative_to(self.__source_path)
except ValueError:
self.logger.warning("Skipping backup path {} cuz it's not inside the source path {}".format(full_path, self.__source_path))
return

if ignore_patterns.match_file(rel_path) or self.config.backup.is_file_ignore_by_deprecated_ignored_files(rel_path.name):
ignored_paths.append(rel_path)
if is_root_target:
self.logger.warning('Backup target {} is ignored by config'.format(rel_path))
return

if full_path in visited_path:
return
visited_path.add(full_path)

try:
st = full_path.lstat()
except FileNotFoundError:
if is_root_target:
self.logger.info('Backup target {} does not exist, skipped. full_path: {}'.format(rel_path, full_path))
return

entry = _ScanResultEntry(full_path, st)
result.all_files.append(entry)
if is_root_target:
result.root_targets.append(rel_path.as_posix())

if entry.is_dir():
for child in os.listdir(full_path):
scan(full_path / child, False)
elif is_root_target and entry.is_symlink() and self.config.backup.follow_target_symlink:
scan(full_path.readlink(), True)

self.logger.debug(f'Scan file start, targets: {self.config.backup.targets}')
start_time = time.time()

for target in self.config.backup.targets:
scan(self.__source_path / target, True)

self.logger.debug('Scan file done, cost {:.2f}s, count {}, root_targets (len={}): {}, ignored_paths[:100] (len={}): {}'.format(
time.time() - start_time, len(result.all_files),
len(result.root_targets), result.root_targets,
len(ignored_paths), [p.as_posix() for p in ignored_paths][:100],
))
return result

def __pre_calculate_stats(self, scan_result: _ScanResult):
stats = self.__pre_calc_result.stats
hashes = self.__pre_calc_result.hashes
stats.clear()
for file_entry in scan_result.all_files:
stats[file_entry.path] = file_entry.stat

def __pre_calculate_hash(self, session: DbSession, scan_result: _ScanResult):
hashes = self.__pre_calc_result.hashes
hashes.clear()

sizes = set()
for path in scan_result.all_file_paths:
st = path.lstat()
stats[path] = st
if stat.S_ISREG(st.st_mode):
sizes.add(st.st_size)
sizes: Set[int] = set()
for file_entry in scan_result.all_files:
if file_entry.is_file():
sizes.add(file_entry.stat.st_size)

hash_dict_lock = threading.Lock()
existence = session.has_blob_with_size_batched(list(sizes))
@@ -257,12 +292,11 @@ def hash_worker(pth: Path):
hashes[pth] = h

with FailFastThreadPool(name='hasher') as pool:
for path in scan_result.all_file_paths:
st = stats[path]
if stat.S_ISREG(st.st_mode):
if existence[st.st_size]:
for file_entry in scan_result.all_files:
if file_entry.is_file():
if existence[file_entry.stat.st_size]:
# we need to hash the file, sooner or later
pool.submit(hash_worker, path)
pool.submit(hash_worker, file_entry.path)
else:
pass # will use hash_once policy

@@ -280,9 +314,10 @@ def __get_or_create_blob(self, session: DbSession, src_path: Path, st: os.stat_r
def make_temp_file() -> ContextManager[Path]:
temp_file_name = f'blob_{os.getpid()}_{threading.current_thread().ident}_{src_path_md5}.tmp'
temp_file_path = self.__temp_path / temp_file_name
with contextlib.ExitStack() as exit_stack:
exit_stack.callback(functools.partial(self._remove_file, temp_file_path))
yield temp_file_path
try:
yield temp_file_path
finally:
self._remove_file(temp_file_path, what='temp_file')

def attempt_once(last_chance: bool = False) -> Generator[Any, Any, schema.Blob]:
compress_method: CompressMethod = self.config.backup.get_compress_method_from_size(st.st_size)
@@ -364,7 +399,7 @@ def bp_rba(h: str) -> Path:
file_utils.copy_file_fast(src_path, temp_file_path)
blob_hash = hash_utils.calc_file_hash(temp_file_path)

misc_utils.assert_true(last_chance, 'only last_chance=True can use do hash_once without checking uniqueness')
misc_utils.assert_true(last_chance, 'only last_chance=True is allowed for the copy_hash policy')
if (cache := self.__blob_by_hash_cache.get(blob_hash)) is not None:
return cache
yield BlobByHashFetcher.Req(blob_hash)
@@ -416,7 +451,7 @@ def bp_rba(h: str) -> Path:
raw_size, stored_size = cr.read_size, cr.write_size
check_changes(cr.read_size, cr.read_hash)
else:
raise AssertionError()
raise AssertionError('bad policy {!r}'.format(policy))

misc_utils.assert_true(blob_hash is not None, 'blob_hash is None')
misc_utils.assert_true(raw_size is not None, 'raw_size is None')
@@ -511,10 +546,11 @@ def run(self) -> BackupInfo:
tags=self.tags.to_dict(),
)
self.logger.info('Creating backup for {} at path {!r}, file cnt {}, timestamp {!r}, creator {!r}, comment {!r}, tags {!r}'.format(
scan_result.root_targets, self.__source_path.as_posix(), len(scan_result.all_file_paths),
scan_result.root_targets, self.__source_path.as_posix(), len(scan_result.all_files),
backup.timestamp, backup.creator, backup.comment, backup.tags,
))

self.__pre_calculate_stats(scan_result)
if self.config.get_effective_concurrency() > 1:
self.__pre_calculate_hash(session, scan_result)
self.logger.info('Pre-calculate all file hash done')
@@ -526,8 +562,8 @@

files = []
schedule_queue: Deque[Tuple[Generator, Any]] = collections.deque()
for file_path in scan_result.all_file_paths:
schedule_queue.append((self.__create_file(session, file_path), None))
for file_entry in scan_result.all_files:
schedule_queue.append((self.__create_file(session, file_entry.path), None))
while len(schedule_queue) > 0:
gen, value = schedule_queue.popleft()
try:
4 changes: 2 additions & 2 deletions prime_backup/action/create_backup_action_base.py
@@ -17,11 +17,11 @@ def __init__(self):
self.__new_blobs_summary: Optional[BlobListSummary] = None
self.__blobs_rollbackers: List[Callable] = []

def _remove_file(self, file_to_remove: Path):
def _remove_file(self, file_to_remove: Path, *, what: str = 'rollback'):
try:
file_to_remove.unlink(missing_ok=True)
except OSError as e:
self.logger.error('(rollback) remove file {!r} failed: {}'.format(file_to_remove, e))
self.logger.error('({}) remove file {!r} failed: {}'.format(what, file_to_remove, e))

def _add_remove_file_rollbacker(self, file_to_remove: Path):
self.__blobs_rollbackers.append(functools.partial(self._remove_file, file_to_remove=file_to_remove))
19 changes: 7 additions & 12 deletions prime_backup/config/backup_config.py
@@ -1,4 +1,3 @@
from pathlib import Path
from typing import List, Optional

from mcdreforged.api.utils import Serializable
@@ -13,8 +12,9 @@ class BackupConfig(Serializable):
targets: List[str] = [
'world',
]
ignored_files: List[str] = [
'session.lock',
ignored_files: List[str] = [] # deprecated
ignore_patterns: List[str] = [
'**/session.lock',
]
follow_target_symlink: bool = False
hash_method: HashMethod = HashMethod.xxh128
@@ -30,18 +30,13 @@ def get_compress_method_from_size(self, file_size: int, *, compress_method_overr
else:
return self.compress_method

def is_file_ignore(self, full_path: Path) -> bool:
"""
Apply to not only files
"""
# TODO: better rule?
name = full_path.name
def is_file_ignore_by_deprecated_ignored_files(self, file_name: str) -> bool:
for item in self.ignored_files:
if len(item) > 0:
if item[0] == '*' and name.endswith(item[1:]):
if item[0] == '*' and file_name.endswith(item[1:]):
return True
if item[-1] == '*' and name.startswith(item[:-1]):
if item[-1] == '*' and file_name.startswith(item[:-1]):
return True
if name == item:
if file_name == item:
return True
return False
