Added gitignore-flavor ignore patterns support; Refactored file scanning logic during backup creation,

resolved #31, closed #33
see also #32
Fallen-Breath committed Jul 17, 2024
1 parent aaeec5a commit be5e21a
Showing 6 changed files with 152 additions and 88 deletions.
24 changes: 20 additions & 4 deletions docs/config.md
@@ -211,8 +211,9 @@ Configs on how the backup is made
"targets": [
"world"
],
"ignored_files": [
"session.lock"
"ignored_files": [],
"ignore_patterns": [
"**/session.lock"
],
"follow_target_symlink": false,
"hash_method": "xxh128",
@@ -259,8 +260,11 @@ For example, for bukkit-like servers that split the world dimensions, you might

#### ignored_files

A list of file names to be ignored during backup. It contains `session.lock` by default
to solve the backup failure problem caused by `session.lock` being occupied by the server in Windows
!!! warning

Deprecated since v1.8.0. Use [ignore_patterns](#ignore_patterns) instead

A list of file / directory names to be ignored during backup

If the name string starts with `*`, files whose names end with the given string will be ignored,
e.g. `*.test` makes all files ending with `.test` ignored, like `a.test`
@@ -270,6 +274,18 @@ e.g. `temp*` makes all files starting with `temp` ignored, like `tempfile`

- Type: `List[str]`
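
For illustration, a minimal standalone sketch of these deprecated name rules; the helper name `is_ignored_by_name` is hypothetical, while the actual check in this commit is `BackupConfig.is_file_ignore_by_deprecated_ignored_files`:

```python
from typing import List


def is_ignored_by_name(file_name: str, ignored_files: List[str]) -> bool:
    # Mirrors the deprecated ignored_files rules: '*suffix', 'prefix*', or exact name
    for item in ignored_files:
        if not item:
            continue
        if item[0] == '*' and file_name.endswith(item[1:]):
            return True
        if item[-1] == '*' and file_name.startswith(item[:-1]):
            return True
        if file_name == item:
            return True
    return False


print(is_ignored_by_name('a.test', ['*.test']))     # True
print(is_ignored_by_name('tempfile', ['temp*']))    # True
print(is_ignored_by_name('level.dat', ['*.test']))  # False
```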

#### ignore_patterns

A list of [gitignore flavor](http://git-scm.com/docs/gitignore) patterns for matching files / directories to be excluded during the backup

The root path for the pattern matching is [source_root](#source_root).
For example, if `source_root` is `server`, then the pattern `world/trash*.obj` will match `server/world/trash1.obj`

It contains a `**/session.lock` pattern by default, which matches files named `session.lock` at any location.
This solves the backup failure problem on Windows caused by `session.lock` being held open by the server

- Type: `List[str]`
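
A minimal sketch of how such patterns are applied, using the `pathspec` library that this commit adds as a dependency; the pattern list below is illustrative, and paths are matched relative to `source_root`:

```python
import pathspec

# Compile gitignore-flavor patterns, as create_backup_action.py does
spec = pathspec.GitIgnoreSpec.from_lines(['**/session.lock', 'world/trash*.obj'])

# Paths are given relative to source_root (e.g. 'server')
print(spec.match_file('world/session.lock'))        # True: matched at any location
print(spec.match_file('world/DIM-1/session.lock'))  # True
print(spec.match_file('world/trash1.obj'))          # True
print(spec.match_file('world/level.dat'))           # False
```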

#### follow_target_symlink

When set to `true`, for [backup targets](#targets) that are symbolic links,
24 changes: 20 additions & 4 deletions docs/config.zh.md
@@ -211,8 +211,9 @@ The operation sequence when Prime Backup creates a backup is as follows:
"targets": [
"world"
],
"ignored_files": [
"session.lock"
"ignored_files": [],
"ignore_patterns": [
"**/session.lock"
],
"follow_target_symlink": false,
"hash_method": "xxh128",
@@ -259,8 +260,11 @@ The operation sequence when Prime Backup creates a backup is as follows:

#### ignored_files

A list of file names to be ignored during backup; it only contains `session.lock` by default,
to solve the backup failure problem on Windows caused by `session.lock` being held open by the server
!!! warning

Deprecated since v1.8.0. Use [ignore_patterns](#ignore_patterns) instead

A list of file names to be ignored during backup

If the name string starts with `*`, files whose names end with the given string will be ignored,
e.g. `*.test` ignores all files ending with `.test`, like `a.test`
@@ -270,6 +274,18 @@ The operation sequence when Prime Backup creates a backup is as follows:

- Type: `List[str]`

#### ignore_patterns

A list of [gitignore-style](http://git-scm.com/docs/gitignore) patterns used to match and ignore the specified files / directories during backup creation

The root path for the pattern matching is [source_root](#source_root).
For example, if `source_root` is `server`, then the pattern `world/trash*.obj` will match `server/world/trash1.obj`

A `**/session.lock` pattern is included by default, which matches files named `session.lock` at any location,
to solve the backup failure problem on Windows caused by `session.lock` being held open by the server

- Type: `List[str]`

#### follow_target_symlink

When set to `true`, for [backup targets](#targets) that are symbolic links,
168 changes: 102 additions & 66 deletions prime_backup/action/create_backup_action.py
@@ -12,6 +12,8 @@
from pathlib import Path
from typing import List, Optional, Tuple, Callable, Any, Dict, NamedTuple, Generator, Union, Set, Deque, ContextManager

import pathspec

from prime_backup.action.create_backup_action_base import CreateBackupActionBase
from prime_backup.compressors import Compressor, CompressMethod
from prime_backup.db import schema
@@ -22,7 +24,7 @@
from prime_backup.types.backup_tags import BackupTags
from prime_backup.types.operator import Operator
from prime_backup.types.units import ByteCount
from prime_backup.utils import hash_utils, misc_utils, blob_utils, file_utils, path_utils
from prime_backup.utils import hash_utils, misc_utils, blob_utils, file_utils
from prime_backup.utils.thread_pool import FailFastThreadPool


@@ -38,10 +40,10 @@ class _BlobCreatePolicy(enum.Enum):
"""
the policy of how to create a blob from a given file path
"""
read_all = enum.auto() # small files: read all in memory, calc hash. read once
hash_once = enum.auto() # files with unique size: compress+hash to temp file, then move. read once
copy_hash = enum.auto() # files that keep changing: copy to temp file, calc hash, compress to blob. read twice and need more spaces
default = enum.auto() # default policy: compress+hash to blob store, check hash again. read twice
read_all = enum.auto() # small files: read all in memory, calc hash | read 1x, write 1x
hash_once = enum.auto() # files with unique size: compress+hash to temp file, then move | read 1x, write 1x, move 1x
copy_hash = enum.auto() # files that keep changing: copy to temp file, calc hash, compress to blob | read 2x, write 2x. need more spaces
default = enum.auto() # default policy: compress+hash to blob store, check hash again | read 2x, write 1x


_BLOB_FILE_CHANGED_RETRY_COUNT = 3
@@ -163,9 +165,25 @@ def flush(self):
self.fetcher_hash.flush()


class _ScanResult(NamedTuple):
all_file_paths: List[Path]
root_targets: List[str] # list of posix path, related to the source_path
@dataclasses.dataclass(frozen=True)
class _ScanResultEntry:
path: Path # full path, including source_root
stat: os.stat_result

def is_file(self) -> bool:
return stat.S_ISREG(self.stat.st_mode)

def is_dir(self) -> bool:
return stat.S_ISDIR(self.stat.st_mode)

def is_symlink(self) -> bool:
return stat.S_ISLNK(self.stat.st_mode)


@dataclasses.dataclass(frozen=True)
class _ScanResult:
all_files: List[_ScanResultEntry] = dataclasses.field(default_factory=list)
root_targets: List[str] = dataclasses.field(default_factory=list) # list of posix path, related to the source_path


@dataclasses.dataclass(frozen=True)
@@ -196,56 +214,73 @@ def __init__(self, creator: Operator, comment: str, *, tags: Optional[BackupTags
self.__source_path: Path = source_path or self.config.source_path

def __scan_files(self) -> _ScanResult:
collected = []

scanned_targets: Dict[str, bool] = {} # use as an ordered set
scan_queue: Deque[Path] = collections.deque() # a queue of paths related to the source_path
for scan_target in self.config.backup.targets:
scan_queue.append(Path(scan_target))

self.logger.debug(f'Scanning files at {list(scan_queue)}')
while len(scan_queue) > 0:
scan_target = scan_queue.popleft()
if (target_posix := scan_target.as_posix()) in scanned_targets:
continue
scanned_targets[target_posix] = True

target_path = self.__source_path / scan_target
if not target_path.exists():
self.logger.info('Skipping not-exist backup target {}'.format(target_path))
continue
if not path_utils.is_relative_to(target_path, self.__source_path):
self.logger.warning("Skipping backup target {} cuz it's not inside the source path {}".format(target_path, self.__source_path))
continue

collected.append(target_path)

if target_path.is_symlink() and self.config.backup.follow_target_symlink:
scan_queue.append(target_path.readlink())
continue

# as-is policy, don't scan into symlink
if not target_path.is_symlink() and target_path.is_dir():
for dir_path, dir_names, file_names in os.walk(target_path):
for name in file_names + dir_names:
file_path = Path(dir_path) / name
if not self.config.backup.is_file_ignore(file_path):
collected.append(file_path)

return _ScanResult(all_file_paths=collected, root_targets=list(scanned_targets.keys()))
ignore_patterns = pathspec.GitIgnoreSpec.from_lines(self.config.backup.ignore_patterns)
result = _ScanResult()
visited_path: Set[Path] = set() # full path
ignored_paths: List[Path] = [] # related path

def __pre_calculate_hash(self, session: DbSession, scan_result: _ScanResult):
def scan(full_path: Path, is_root_target: bool):
try:
rel_path = full_path.relative_to(self.__source_path)
except ValueError:
self.logger.warning("Skipping backup path {} cuz it's not inside the source path {}".format(full_path, self.__source_path))
return

if ignore_patterns.match_file(rel_path) or self.config.backup.is_file_ignore_by_deprecated_ignored_files(rel_path.name):
ignored_paths.append(rel_path)
if is_root_target:
self.logger.warning('Backup target {} is ignored by config'.format(rel_path))
return

if full_path in visited_path:
return
visited_path.add(full_path)

try:
st = full_path.lstat()
except FileNotFoundError:
if is_root_target:
self.logger.info('Backup target {} does not exist, skipped. full_path: {}'.format(rel_path, full_path))
return

entry = _ScanResultEntry(full_path, st)
result.all_files.append(entry)
if is_root_target:
result.root_targets.append(rel_path.as_posix())

if entry.is_dir():
for child in os.listdir(full_path):
scan(full_path / child, False)
elif is_root_target and entry.is_symlink() and self.config.backup.follow_target_symlink:
scan(full_path.readlink(), True)

self.logger.debug(f'Scan file start, targets: {self.config.backup.targets}')
start_time = time.time()

for target in self.config.backup.targets:
scan(self.__source_path / target, True)

self.logger.debug('Scan file done, cost {:.2f}s, count {}, root_targets (len={}): {}, ignored_paths[:100] (len={}): {}'.format(
time.time() - start_time, len(result.all_files),
len(result.root_targets), result.root_targets,
len(ignored_paths), [p.as_posix() for p in ignored_paths][:100],
))
return result

def __pre_calculate_stats(self, scan_result: _ScanResult):
stats = self.__pre_calc_result.stats
hashes = self.__pre_calc_result.hashes
stats.clear()
for file_entry in scan_result.all_files:
stats[file_entry.path] = file_entry.stat

def __pre_calculate_hash(self, session: DbSession, scan_result: _ScanResult):
hashes = self.__pre_calc_result.hashes
hashes.clear()

sizes = set()
for path in scan_result.all_file_paths:
st = path.lstat()
stats[path] = st
if stat.S_ISREG(st.st_mode):
sizes.add(st.st_size)
sizes: Set[int] = set()
for file_entry in scan_result.all_files:
if file_entry.is_file():
sizes.add(file_entry.stat.st_size)

hash_dict_lock = threading.Lock()
existence = session.has_blob_with_size_batched(list(sizes))
@@ -257,12 +292,11 @@ def hash_worker(pth: Path):
hashes[pth] = h

with FailFastThreadPool(name='hasher') as pool:
for path in scan_result.all_file_paths:
st = stats[path]
if stat.S_ISREG(st.st_mode):
if existence[st.st_size]:
for file_entry in scan_result.all_files:
if file_entry.is_file():
if existence[file_entry.stat.st_size]:
# we need to hash the file, sooner or later
pool.submit(hash_worker, path)
pool.submit(hash_worker, file_entry.path)
else:
pass # will use hash_once policy

@@ -280,9 +314,10 @@ def __get_or_create_blob(self, session: DbSession, src_path: Path, st: os.stat_r
def make_temp_file() -> ContextManager[Path]:
temp_file_name = f'blob_{os.getpid()}_{threading.current_thread().ident}_{src_path_md5}.tmp'
temp_file_path = self.__temp_path / temp_file_name
with contextlib.ExitStack() as exit_stack:
exit_stack.callback(functools.partial(self._remove_file, temp_file_path))
yield temp_file_path
try:
yield temp_file_path
finally:
self._remove_file(temp_file_path, what='temp_file')

def attempt_once(last_chance: bool = False) -> Generator[Any, Any, schema.Blob]:
compress_method: CompressMethod = self.config.backup.get_compress_method_from_size(st.st_size)
@@ -364,7 +399,7 @@ def bp_rba(h: str) -> Path:
file_utils.copy_file_fast(src_path, temp_file_path)
blob_hash = hash_utils.calc_file_hash(temp_file_path)

misc_utils.assert_true(last_chance, 'only last_chance=True can use do hash_once without checking uniqueness')
misc_utils.assert_true(last_chance, 'only last_chance=True is allowed for the copy_hash policy')
if (cache := self.__blob_by_hash_cache.get(blob_hash)) is not None:
return cache
yield BlobByHashFetcher.Req(blob_hash)
@@ -416,7 +451,7 @@ def bp_rba(h: str) -> Path:
raw_size, stored_size = cr.read_size, cr.write_size
check_changes(cr.read_size, cr.read_hash)
else:
raise AssertionError()
raise AssertionError('bad policy {!r}'.format(policy))

misc_utils.assert_true(blob_hash is not None, 'blob_hash is None')
misc_utils.assert_true(raw_size is not None, 'raw_size is None')
@@ -511,10 +546,11 @@ def run(self) -> BackupInfo:
tags=self.tags.to_dict(),
)
self.logger.info('Creating backup for {} at path {!r}, file cnt {}, timestamp {!r}, creator {!r}, comment {!r}, tags {!r}'.format(
scan_result.root_targets, self.__source_path.as_posix(), len(scan_result.all_file_paths),
scan_result.root_targets, self.__source_path.as_posix(), len(scan_result.all_files),
backup.timestamp, backup.creator, backup.comment, backup.tags,
))

self.__pre_calculate_stats(scan_result)
if self.config.get_effective_concurrency() > 1:
self.__pre_calculate_hash(session, scan_result)
self.logger.info('Pre-calculate all file hash done')
@@ -526,8 +562,8 @@

files = []
schedule_queue: Deque[Tuple[Generator, Any]] = collections.deque()
for file_path in scan_result.all_file_paths:
schedule_queue.append((self.__create_file(session, file_path), None))
for file_entry in scan_result.all_files:
schedule_queue.append((self.__create_file(session, file_entry.path), None))
while len(schedule_queue) > 0:
gen, value = schedule_queue.popleft()
try:
4 changes: 2 additions & 2 deletions prime_backup/action/create_backup_action_base.py
@@ -17,11 +17,11 @@ def __init__(self):
self.__new_blobs_summary: Optional[BlobListSummary] = None
self.__blobs_rollbackers: List[Callable] = []

def _remove_file(self, file_to_remove: Path):
def _remove_file(self, file_to_remove: Path, *, what: str = 'rollback'):
try:
file_to_remove.unlink(missing_ok=True)
except OSError as e:
self.logger.error('(rollback) remove file {!r} failed: {}'.format(file_to_remove, e))
self.logger.error('({}) remove file {!r} failed: {}'.format(what, file_to_remove, e))

def _add_remove_file_rollbacker(self, file_to_remove: Path):
self.__blobs_rollbackers.append(functools.partial(self._remove_file, file_to_remove=file_to_remove))
19 changes: 7 additions & 12 deletions prime_backup/config/backup_config.py
@@ -1,4 +1,3 @@
from pathlib import Path
from typing import List, Optional

from mcdreforged.api.utils import Serializable
@@ -13,8 +12,9 @@ class BackupConfig(Serializable):
targets: List[str] = [
'world',
]
ignored_files: List[str] = [
'session.lock',
ignored_files: List[str] = [] # deprecated
ignore_patterns: List[str] = [
'**/session.lock',
]
follow_target_symlink: bool = False
hash_method: HashMethod = HashMethod.xxh128
@@ -30,18 +30,13 @@ def get_compress_method_from_size(self, file_size: int, *, compress_method_overr
else:
return self.compress_method

def is_file_ignore(self, full_path: Path) -> bool:
"""
Apply to not only files
"""
# TODO: better rule?
name = full_path.name
def is_file_ignore_by_deprecated_ignored_files(self, file_name: str) -> bool:
for item in self.ignored_files:
if len(item) > 0:
if item[0] == '*' and name.endswith(item[1:]):
if item[0] == '*' and file_name.endswith(item[1:]):
return True
if item[-1] == '*' and name.startswith(item[:-1]):
if item[-1] == '*' and file_name.startswith(item[:-1]):
return True
if name == item:
if file_name == item:
return True
return False
