Unified error calculation #560

Merged: 11 commits, Oct 7, 2024
7 changes: 7 additions & 0 deletions .gitignore
@@ -158,6 +158,13 @@ cython_debug/
# JupyterNotebooks
.ipynb_checkpoints
*/.ipynb_checkpoints/*
*.ipynb

# Lightning
lightning_logs/

# wandb
wandb/

# SQLite
*.db
2 changes: 1 addition & 1 deletion docs/source/advanced_usage/hyperparameters.rst
@@ -114,7 +114,7 @@ a physical validation metric such as

.. code-block:: python

parameters.running.after_before_training_metric = "band_energy"
parameters.running.after_training_metric = "band_energy"

Advanced optimization algorithms
********************************
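A note on the change above: ``after_before_training_metric`` is renamed to ``after_training_metric``. A minimal sketch of the two metric settings with the new names (the ``band_energy`` values are illustrative, taken from the documentation snippets in this diff):

.. code-block:: python

    import mala

    parameters = mala.Parameters()

    # Metric evaluated on validation data during training ("ldos" is the default).
    parameters.running.during_training_metric = "band_energy"

    # Metric evaluated before and after training; this attribute was previously
    # called "after_before_training_metric".
    parameters.running.after_training_metric = "band_energy"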
16 changes: 8 additions & 8 deletions docs/source/advanced_usage/trainingmodel.rst
@@ -77,7 +77,7 @@ Specifically, when setting

.. code-block:: python

parameters.running.after_before_training_metric = "band_energy"
parameters.running.after_training_metric = "band_energy"

the error in the band energy between actual and predicted LDOS will be
calculated and printed before and after network training (in meV/atom).
@@ -205,21 +205,21 @@ visualization prior to training via

# 0: No visualization, 1: loss and learning rate, 2: like 1,
# but additionally weights and biases are saved
parameters.running.visualisation = 1
parameters.running.visualisation_dir = "mala_vis"
parameters.running.logging = 1
parameters.running.logging_dir = "mala_vis"

where ``visualisation_dir`` specifies some directory in which to save the
MALA visualization data. Afterwards, you can run the training without any
where ``logging_dir`` specifies some directory in which to save the
MALA logging data. Afterwards, you can run the training without any
other modifications. Once training is finished (or during training, in case
you want to use tensorboard to monitor progress), you can launch tensorboard
via

.. code-block:: bash

tensorboard --logdir path_to_visualization
tensorboard --logdir path_to_log_directory

The full path for ``path_to_visualization`` can be accessed via
``trainer.full_visualization_path``.
The full path for ``path_to_log_directory`` can be accessed via
``trainer.full_logging_path``.


Training in parallel
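The logging options renamed in this file correspond to the new attributes in ``mala/common/parameters.py`` further down. A hedged configuration sketch (the directory name is just an example; the ``logger`` default comes from that file):

.. code-block:: python

    import mala

    parameters = mala.Parameters()

    # Formerly the "visualisation*" options.
    parameters.running.logger = "tensorboard"           # default backend per parameters.py
    parameters.running.logging_dir = "mala_vis"         # any directory name works
    parameters.running.logging_dir_append_date = True   # date-stamped subfolder (default)

    # After training, the resolved directory is available as
    # trainer.full_logging_path and can be passed to "tensorboard --logdir".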
4 changes: 2 additions & 2 deletions docs/source/basic_usage/hyperparameters.rst
@@ -118,9 +118,9 @@ properties of the ``Parameters`` class:
during the optimization.
- ``network.layer_sizes``
- ``"int"``, ``"categorical"``
* - ``"trainingtype"``
* - ``"optimizer"``
- Optimization algorithm used during the NN optimization.
- ``running.trainingtype``
- ``running.optimizer``
- ``"categorical"``
* - ``"mini_batch_size"``
- Size of the mini batches used to calculate the gradient during
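Since the hyperparameter name changes with the attribute, registering it in a search now uses ``"optimizer"`` instead of ``"trainingtype"``. A short sketch; the ``mala.HyperOpt`` constructor name and the bare ``DataHandler`` setup are assumptions here, not taken from this diff:

.. code-block:: python

    import mala

    parameters = mala.Parameters()
    parameters.running.max_number_epochs = 20
    parameters.running.mini_batch_size = 40
    parameters.hyperparameters.n_trials = 20

    # In the full examples, snapshots are added to the data handler first.
    data_handler = mala.DataHandler(parameters)
    hyperoptimizer = mala.HyperOpt(parameters, data_handler)  # constructor name assumed

    # Formerly registered as "trainingtype".
    hyperoptimizer.add_hyperparameter(
        "categorical", "optimizer", choices=["Adam", "SGD"]
    )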
2 changes: 1 addition & 1 deletion docs/source/basic_usage/trainingmodel.rst
@@ -35,7 +35,7 @@ options to train a simple network with example data, namely
parameters.running.max_number_epochs = 100
parameters.running.mini_batch_size = 40
parameters.running.learning_rate = 0.00001
parameters.running.trainingtype = "Adam"
parameters.running.optimizer = "Adam"
parameters.verbosity = 1 # level of output; 1 is standard, 0 is low, 2 is debug.

Here, we can see that the ``Parameters`` object contains multiple
1 change: 1 addition & 0 deletions docs/source/conf.py
@@ -80,6 +80,7 @@
"asap3",
"openpmd_io",
"skspatial",
"tqdm",
]

myst_heading_anchors = 3
2 changes: 1 addition & 1 deletion docs/source/install/installing_lammps.rst
@@ -49,7 +49,7 @@ The MALA team recommends to build LAMMPS with ``cmake``. To do so

cmake ../cmake -D PKG_KOKKOS=yes -D BUILD_MPI=yes -D PKG_ML-SNAP=yes -D Kokkos_ENABLE_CUDA=yes -D Kokkos_ARCH_HSW=yes -D Kokkos_ARCH_VOLTA70=yes -D CMAKE_CXX_COMPILER=/path/to/lammps/lib/kokkos/bin/nvcc_wrapper -D BUILD_SHARED_LIBS=yes

.. note::
.. note::
When using a GPU by setting ``parameters.use_gpu = True``, you *need* to
have a GPU version of ``LAMMPS`` installed. See :ref:`production_gpu` for
details.
2 changes: 1 addition & 1 deletion examples/advanced/ex01_checkpoint_training.py
@@ -26,7 +26,7 @@ def initial_setup():
parameters.running.max_number_epochs = 9
parameters.running.mini_batch_size = 8
parameters.running.learning_rate = 0.00001
parameters.running.trainingtype = "Adam"
parameters.running.optimizer = "Adam"

# We checkpoint the training every 5 epochs and save the results
# as "ex07".
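For context, the checkpointing attributes this example relies on, in a minimal sketch (the interval and name below are illustrative, not project defaults; the attribute names appear in ``mala/common/parameters.py`` in this diff):

.. code-block:: python

    import mala

    parameters = mala.Parameters()
    parameters.running.optimizer = "Adam"

    # Write a checkpoint every 5 epochs under a fixed name, so that an
    # interrupted run can be resumed later.
    parameters.running.checkpoints_each_epoch = 5
    parameters.running.checkpoint_name = "ex01_checkpoint"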
4 changes: 2 additions & 2 deletions examples/advanced/ex03_tensor_board.py
@@ -18,7 +18,7 @@
parameters.running.max_number_epochs = 100
parameters.running.mini_batch_size = 40
parameters.running.learning_rate = 0.001
parameters.running.trainingtype = "Adam"
parameters.running.optimizer = "Adam"

# Turn the visualization on and select a folder to save the visualization
# files into.
@@ -45,6 +45,6 @@
trainer.train_network()
printout(
'Run finished, launch tensorboard with "tensorboard --logdir '
+ trainer.full_visualization_path
+ trainer.full_logging_path
+ '"'
)
(additional changed example file; file name not shown in this view)
@@ -21,7 +21,7 @@ def initial_setup():
parameters.running.max_number_epochs = 10
parameters.running.mini_batch_size = 40
parameters.running.learning_rate = 0.00001
parameters.running.trainingtype = "Adam"
parameters.running.optimizer = "Adam"
parameters.hyperparameters.n_trials = 9
parameters.hyperparameters.checkpoints_each_trial = 5
parameters.hyperparameters.checkpoint_name = "ex05_checkpoint"
(additional changed example file; file name not shown in this view)
@@ -28,7 +28,7 @@
parameters.running.max_number_epochs = 5
parameters.running.mini_batch_size = 40
parameters.running.learning_rate = 0.00001
parameters.running.trainingtype = "Adam"
parameters.running.optimizer = "Adam"
parameters.hyperparameters.n_trials = 10
parameters.hyperparameters.checkpoints_each_trial = -1
parameters.hyperparameters.checkpoint_name = "ex06"
@@ -44,7 +44,7 @@
parameters.targets.ldos_gridspacing_ev = 2.5
parameters.targets.ldos_gridoffset_ev = -5
parameters.hyperparameters.number_training_per_trial = 3
parameters.running.after_before_training_metric = "band_energy"
parameters.running.after_training_metric = "band_energy"

data_handler = mala.DataHandler(parameters)

(additional changed example file; file name not shown in this view)
@@ -21,7 +21,7 @@ def optimize_hyperparameters(hyper_optimizer):
parameters.running.max_number_epochs = 10
parameters.running.mini_batch_size = 40
parameters.running.learning_rate = 0.00001
parameters.running.trainingtype = "Adam"
parameters.running.optimizer = "Adam"
parameters.hyperparameters.n_trials = 8
parameters.hyperparameters.hyper_opt_method = hyper_optimizer

@@ -64,7 +64,7 @@ def optimize_hyperparameters(hyper_optimizer):
data_handler.output_dimension,
]
hyperoptimizer.add_hyperparameter(
"categorical", "trainingtype", choices=["Adam", "SGD"]
"categorical", "optimizer", choices=["Adam", "SGD"]
)
hyperoptimizer.add_hyperparameter(
"categorical", "layer_activation_00", choices=["ReLU", "Sigmoid"]
2 changes: 1 addition & 1 deletion examples/basic/ex01_train_network.py
@@ -28,7 +28,7 @@
parameters.running.max_number_epochs = 100
parameters.running.mini_batch_size = 40
parameters.running.learning_rate = 0.00001
parameters.running.trainingtype = "Adam"
parameters.running.optimizer = "Adam"
# These parameters characterize how the LDOS and bispectrum descriptors
# were calculated. They are _technically_ not needed to train a simple
# network. However, it is useful to define them prior to training. Then,
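The comment above points out that the LDOS parameters are useful to define before training. A small sketch with values taken from other examples in this diff (``ldos_gridsize`` is an illustrative value, not a default):

.. code-block:: python

    import mala

    parameters = mala.Parameters()

    # How the LDOS of the example data was sampled; needed later to compute
    # physical observables such as the band energy.
    parameters.targets.ldos_gridsize = 11          # illustrative value
    parameters.targets.ldos_gridspacing_ev = 2.5   # as in the examples in this diff
    parameters.targets.ldos_gridoffset_ev = -5     # as in the examples in this diff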
6 changes: 3 additions & 3 deletions examples/basic/ex02_test_network.py
@@ -21,15 +21,15 @@
# It is recommended to enable the "lazy-loading" feature, so that
# data is loaded into memory one snapshot at a time during testing - this
# helps keep RAM requirement down. Furthermore, you have to decide which
# observables to test (usual choices are "band_energy", "total_energy" and
# "number_of_electrons") and whether you want the results per snapshot
# observables to test (usual choices are "band_energy", "total_energy")
# and whether you want the results per snapshot
# (output_format="list") or as an averaged value (output_format="mae")
####################

parameters, network, data_handler, tester = mala.Tester.load_run(
run_name=model_name, path=model_path
)
tester.observables_to_test = ["band_energy", "number_of_electrons"]
tester.observables_to_test = ["band_energy", "density"]
tester.output_format = "list"
parameters.data.use_lazy_loading = True

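A condensed sketch of the updated testing setup; ``model_name`` and ``model_path`` are placeholders, as in the example itself:

.. code-block:: python

    import mala

    # Placeholders: point these at a previously trained and saved run.
    model_name = "ex01_model"
    model_path = "./"

    parameters, network, data_handler, tester = mala.Tester.load_run(
        run_name=model_name, path=model_path
    )

    # "number_of_electrons" was dropped from the example; "density" is tested instead.
    tester.observables_to_test = ["band_energy", "density"]
    tester.output_format = "list"  # per-snapshot results; "mae" averages instead
    parameters.data.use_lazy_loading = True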
2 changes: 1 addition & 1 deletion examples/basic/ex04_hyperparameter_optimization.py
@@ -22,7 +22,7 @@
parameters.data.output_rescaling_type = "normal"
parameters.running.max_number_epochs = 20
parameters.running.mini_batch_size = 40
parameters.running.trainingtype = "Adam"
parameters.running.optimizer = "Adam"
parameters.hyperparameters.n_trials = 20

####################
83 changes: 40 additions & 43 deletions mala/common/parameters.py
@@ -265,11 +265,6 @@ class ParametersNetwork(ParametersBase):
Number of hidden layers to be used in lstm or gru or transformer nets
Default: None

dropout: float
Dropout rate for transformer net
0.0 ≤ dropout ≤ 1.0
Default: 0.0

num_heads: int
Number of heads to be used in Multi head attention network
This should be a divisor of input dimension
@@ -452,7 +447,7 @@ class ParametersTargets(ParametersBase):
Number of points in the energy grid that is used to calculate the
(L)DOS.

ldos_gridsize : float
ldos_gridsize : int
Gridsize of the LDOS.

ldos_gridspacing_ev: float
@@ -625,9 +620,8 @@ class ParametersRunning(ParametersBase):

Attributes
----------
trainingtype : string
Training type to be used. Supported options at the moment:

optimizer : string
Optimizer to be used. Supported options at the moment:
- SGD: Stochastic gradient descent.
- Adam: Adam Optimization Algorithm

@@ -640,10 +634,6 @@
mini_batch_size : int
Size of the mini batch for the optimization algorithm. Default: 10.

weight_decay : float
Weight decay for regularization. Always refers to L2 regularization.
Default: 0.

early_stopping_epochs : int
Number of epochs the validation accuracy is allowed to not improve by
at least early_stopping_threshold, before we terminate. If 0, no
@@ -696,19 +686,13 @@
Name used for the checkpoints. Using this, multiple runs
can be performed in the same directory.

visualisation : int
If True then Tensorboard is activated for visualisation
case 0: No tensorboard activated
case 1: tensorboard activated with Loss and learning rate
case 2; additonally weights and biases and gradient
logging_dir : string
Name of the folder that logging files will be saved to.

visualisation_dir : string
Name of the folder that visualization files will be saved to.

visualisation_dir_append_date : bool
If True, then upon creating visualization files, these will be saved
in a subfolder of visualisation_dir labelled with the starting date
of the visualization, to avoid having to change input scripts often.
logging_dir_append_date : bool
If True, then upon creating logging files, these will be saved
in a subfolder of logging_dir labelled with the starting date
of the logging, to avoid having to change input scripts often.

inference_data_grid : list
List holding the grid to be used for inference in the form of
@@ -717,7 +701,7 @@
use_mixed_precision : bool
If True, mixed precision computation (via AMP) will be used.

training_report_frequency : int
training_log_interval : int
Determines how often detailed performance info is printed during
training (only has an effect if the verbosity is high enough).

@@ -729,36 +713,49 @@

def __init__(self):
super(ParametersRunning, self).__init__()
self.trainingtype = "SGD"
self.learning_rate = 0.5
self.optimizer = "Adam"
self.learning_rate = 10 ** (-5)
self.learning_rate_embedding = 10 ** (-4)
self.max_number_epochs = 100
self.verbosity = True
self.mini_batch_size = 10
self.weight_decay = 0
self.snapshots_per_epoch = -1

self.l1_regularization = 0.0
self.l2_regularization = 0.0
self.dropout = 0.0
self.batch_norm = False
self.input_noise = 0.0

self.early_stopping_epochs = 0
self.early_stopping_threshold = 0
self.learning_rate_scheduler = None
self.learning_rate_decay = 0.1
self.learning_rate_patience = 0
self._during_training_metric = "ldos"
self._after_training_metric = "ldos"
self.use_compression = False
self.num_workers = 0
self.use_shuffling_for_samplers = True
self.checkpoints_each_epoch = 0
self.checkpoint_best_so_far = False
self.checkpoint_name = "checkpoint_mala"
self.visualisation = 0
self.visualisation_dir = os.path.join(".", "mala_logging")
self.visualisation_dir_append_date = True
self.during_training_metric = "ldos"
self.after_before_training_metric = "ldos"
self.run_name = ""
self.logging_dir = "./mala_logging"
self.logging_dir_append_date = True
self.logger = "tensorboard"
self.validation_metrics = ["ldos"]
self.validate_on_training_data = False
self.inference_data_grid = [0, 0, 0]
self.use_mixed_precision = False
self.use_graphs = False
self.training_report_frequency = 1000
self.profiler_range = None # [1000, 2000]
self.training_log_interval = 1000
self.profiler_range = [1000, 2000]

def _update_ddp(self, new_ddp):
super(ParametersRunning, self)._update_ddp(new_ddp)
self.during_training_metric = self.during_training_metric
self.after_before_training_metric = self.after_before_training_metric
self.after_training_metric = self.after_training_metric

@property
def during_training_metric(self):
@@ -786,7 +783,7 @@ def during_training_metric(self, value):
self._during_training_metric = value

@property
def after_before_training_metric(self):
def after_training_metric(self):
"""
Get the metric evaluated before and after training.

@@ -798,17 +795,17 @@ def after_before_training_metric(self):
DFT results. Of these, the mean absolute error in eV/atom will be
calculated.
"""
return self._after_before_training_metric
return self._after_training_metric

@after_before_training_metric.setter
def after_before_training_metric(self, value):
@after_training_metric.setter
def after_training_metric(self, value):
if value != "ldos":
if self._configuration["ddp"]:
raise Exception(
"Currently, MALA can only operate with the "
'"ldos" metric for ddp runs.'
)
self._after_before_training_metric = value
self._after_training_metric = value

@during_training_metric.setter
def during_training_metric(self, value):
@@ -1474,7 +1471,7 @@ def save(self, filename, save_format="json"):
if member[0][0] != "_":
if isinstance(member[1], ParametersBase):
# All the subclasses have to provide this function.
member[1]: ParametersBase
member[1]: ParametersBase # type: ignore
json_dict[member[0]] = member[1].to_json()
with open(filename, "w", encoding="utf-8") as f:
json.dump(json_dict, f, ensure_ascii=False, indent=4)
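To summarize the renames in ``ParametersRunning``, a short before/after sketch (old names in comments, new attribute names and defaults as introduced in this file):

.. code-block:: python

    import mala

    parameters = mala.Parameters()
    running = parameters.running

    # trainingtype                  -> optimizer
    running.optimizer = "Adam"
    # after_before_training_metric  -> after_training_metric
    running.after_training_metric = "ldos"
    # visualisation_dir             -> logging_dir
    running.logging_dir = "./mala_logging"
    # training_report_frequency     -> training_log_interval
    running.training_log_interval = 1000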