From 4e5fb1361665204b1c5e6565bc84ab6e2ed68162 Mon Sep 17 00:00:00 2001
From: dsoselia <36576137+dsoselia@users.noreply.github.com>
Date: Thu, 30 Jun 2022 20:53:39 +0200
Subject: [PATCH 1/3] Create .gitignore

---
 .gitignore | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000..ffb18beb8c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,139 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock

+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# VSCode
+.vscode
+
+# Other
+datasets
+checkpoints
+output_dir
+test.py
+*.pyc

From 543f8da8598b12bfb46980114074889f4486c69a Mon Sep 17 00:00:00 2001
From: Alfred
Date: Thu, 28 Jul 2022 16:49:52 +0200
Subject: [PATCH 2/3] add error if grads nonfinite

---
 util/misc.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/util/misc.py b/util/misc.py
index ad9a7868ec..c2fd7b3089 100644
--- a/util/misc.py
+++ b/util/misc.py
@@ -45,7 +45,8 @@ def synchronize_between_processes(self):
         """
         if not is_dist_avail_and_initialized():
             return
-        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
+        t = torch.tensor([self.count, self.total],
+                         dtype=torch.float64, device='cuda')
         dist.barrier()
         dist.all_reduce(t)
         t = t.tolist()
@@ -218,7 +219,8 @@ def init_distributed_mode(args):
         args.rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
         args.world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
         args.gpu = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
-        args.dist_url = "tcp://%s:%s" % (os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
+        args.dist_url = "tcp://%s:%s" % (
+            os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])
         os.environ['LOCAL_RANK'] = str(args.gpu)
         os.environ['RANK'] = str(args.rank)
         os.environ['WORLD_SIZE'] = str(args.world_size)
@@ -254,13 +256,16 @@ class NativeScalerWithGradNormCount:
     def __init__(self):
         self._scaler = torch.cuda.amp.GradScaler()
 
-    def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True):
+    def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True,
+                 error_if_nonfinite: bool = False):
         self._scaler.scale(loss).backward(create_graph=create_graph)
         if update_grad:
             if clip_grad is not None:
                 assert parameters is not None
-                self._scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params in-place
-                norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
+                # unscale the gradients of optimizer's assigned params in-place
+                self._scaler.unscale_(optimizer)
+                norm = torch.nn.utils.clip_grad_norm_(
+                    parameters, clip_grad, error_if_nonfinite=error_if_nonfinite)
             else:
                 self._scaler.unscale_(optimizer)
                 norm = get_grad_norm_(parameters)
@@ -286,9 +291,11 @@ def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
         return torch.tensor(0.)
     device = parameters[0].grad.device
     if norm_type == inf:
-        total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters)
+        total_norm = max(p.grad.detach().abs().max().to(device)
+                         for p in parameters)
     else:
-        total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
+        total_norm = torch.norm(torch.stack([torch.norm(
+            p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
     return total_norm
 
 
@@ -309,7 +316,8 @@ def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler):
             save_on_master(to_save, checkpoint_path)
     else:
         client_state = {'epoch': epoch}
-        model.save_checkpoint(save_dir=args.output_dir, tag="checkpoint-%s" % epoch_name, client_state=client_state)
+        model.save_checkpoint(save_dir=args.output_dir, tag="checkpoint-%s" %
+                              epoch_name, client_state=client_state)
 
 
 def load_model(args, model_without_ddp, optimizer, loss_scaler):
@@ -337,4 +345,4 @@ def all_reduce_mean(x):
         x_reduce /= world_size
         return x_reduce.item()
     else:
-        return x
\ No newline at end of file
+        return x

From 527ec80540ed0a3180920e2be48e0af998e1f730 Mon Sep 17 00:00:00 2001
From: Alfred
Date: Fri, 7 Oct 2022 12:52:06 +0000
Subject: [PATCH 3/3] test

---
 util/misc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/util/misc.py b/util/misc.py
index c2fd7b3089..f437672ec8 100644
--- a/util/misc.py
+++ b/util/misc.py
@@ -300,6 +300,7 @@ def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
 
 
 def save_model(args, epoch, model, model_without_ddp, optimizer, loss_scaler):
+    #test
     output_dir = Path(args.output_dir)
     epoch_name = str(epoch)
     if loss_scaler is not None:
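
Usage sketch: a minimal, hypothetical example of how the error_if_nonfinite
flag added in PATCH 2/3 might be driven from a training step. Only
NativeScalerWithGradNormCount and its new keyword come from the patch itself;
the model, optimizer, and data below are stand-ins, and a CUDA device is
assumed for the AMP grad scaler.

    import torch
    from util.misc import NativeScalerWithGradNormCount

    # Stand-in model and optimizer; any module/optimizer pair works here.
    model = torch.nn.Linear(16, 1).cuda()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    loss_scaler = NativeScalerWithGradNormCount()

    x = torch.randn(8, 16, device='cuda')
    with torch.cuda.amp.autocast():
        loss = model(x).pow(2).mean()

    # With error_if_nonfinite=True, torch.nn.utils.clip_grad_norm_ raises a
    # RuntimeError when any unscaled gradient is NaN/inf, instead of clipping
    # silently and letting the scaler skip the step.
    loss_scaler(loss, optimizer, clip_grad=1.0,
                parameters=model.parameters(),
                update_grad=True,
                error_if_nonfinite=True)
    optimizer.zero_grad()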