diff --git a/lightly/models/modules/center.py b/lightly/models/modules/center.py index 55eee220d..21be7b9bf 100644 --- a/lightly/models/modules/center.py +++ b/lightly/models/modules/center.py @@ -31,6 +31,11 @@ def __init__( mode: str = "mean", momentum: float = 0.9, ) -> None: + """Initializes the Center module with the specified parameters. + + Raises: + ValueError: If an unknown mode is provided. + """ super().__init__() center_fn = CENTER_MODE_TO_FUNCTION.get(mode) @@ -49,8 +54,10 @@ def __init__( @property def value(self) -> Tensor: - """The current value of the center. Use this property to do any operations based - on the center.""" + """The current value of the center. + + Use this property to do any operations based on the center. + """ return self.center @torch.no_grad() @@ -75,7 +82,17 @@ def _center_mean(self, x: Tensor) -> Tensor: @torch.no_grad() def center_mean(x: Tensor, dim: Tuple[int, ...]) -> Tensor: - """Returns the center of the input tensor by calculating the mean.""" + """Returns the center of the input tensor by calculating the mean. + + Args: + x: + Input tensor. + dim: + Dimensions along which the mean is calculated. + + Returns: + The center of the input tensor. + """ batch_center = torch.mean(x, dim=dim, keepdim=True) if dist.is_available() and dist.is_initialized(): dist.all_reduce(batch_center) diff --git a/lightly/models/modules/heads.py b/lightly/models/modules/heads.py index d9dcb6989..1bfb7ccbb 100644 --- a/lightly/models/modules/heads.py +++ b/lightly/models/modules/heads.py @@ -29,7 +29,6 @@ class ProjectionHead(nn.Module): >>> (256, 256, nn.BatchNorm1d(256), nn.ReLU()), >>> (256, 128, None, None) >>> ]) - """ def __init__( @@ -41,6 +40,7 @@ def __init__( ], ], ) -> None: + """Initializes the ProjectionHead module with the specified blocks.""" super().__init__() layers: List[nn.Module] = [] @@ -60,7 +60,6 @@ def forward(self, x: Tensor) -> Tensor: Args: x: Input of shape bsz x num_ftrs. - """ projection: Tensor = self.layers(x) return projection @@ -73,13 +72,22 @@ class BarlowTwinsProjectionHead(ProjectionHead): units. The first two layers of the projector are followed by a batch normalization layer and rectified linear units." [0] - [0]: 2021, Barlow Twins, https://arxiv.org/abs/2103.03230 - + - [0]: 2021, Barlow Twins, https://arxiv.org/abs/2103.03230 """ def __init__( self, input_dim: int = 2048, hidden_dim: int = 8192, output_dim: int = 8192 ): + """Initializes the BarlowTwinsProjectionHead with the specified dimensions. + + Args: + input_dim: + Dimensionality of the input features. + hidden_dim: + Dimensionality of the hidden layers. + output_dim: + Dimensionality of the output features. + """ super(BarlowTwinsProjectionHead, self).__init__( [ (input_dim, hidden_dim, nn.BatchNorm1d(hidden_dim), nn.ReLU()), @@ -96,13 +104,13 @@ class BYOLProjectionHead(ProjectionHead): batch normalization, rectified linear units (ReLU), and a final linear layer with output dimension 256." [0] - [0]: BYOL, 2020, https://arxiv.org/abs/2006.07733 - + - [0]: BYOL, 2020, https://arxiv.org/abs/2006.07733 """ def __init__( self, input_dim: int = 2048, hidden_dim: int = 4096, output_dim: int = 256 ): + """Initializes the BYOLProjectionHead with the specified dimensions.""" super(BYOLProjectionHead, self).__init__( [ (input_dim, hidden_dim, nn.BatchNorm1d(hidden_dim), nn.ReLU()), @@ -118,8 +126,7 @@ class BYOLPredictionHead(ProjectionHead): batch normalization, rectified linear units (ReLU), and a final linear layer with output dimension 256."
[0] - [0]: BYOL, 2020, https://arxiv.org/abs/2006.07733 - + - [0]: BYOL, 2020, https://arxiv.org/abs/2006.07733 """ def __init__( @@ -143,9 +150,9 @@ class MoCoProjectionHead(ProjectionHead): hidden layers of both MLPs are 4096-d and are with ReLU; the output layers of both MLPs are 256-d, without ReLU. In MoCo v3, all layers in both MLPs have BN" [2] - [0]: MoCo v1, 2020, https://arxiv.org/abs/1911.05722 - [1]: MoCo v2, 2020, https://arxiv.org/abs/2003.04297 - [2]: MoCo v3, 2021, https://arxiv.org/abs/2104.02057 + - [0]: MoCo v1, 2020, https://arxiv.org/abs/1911.05722 + - [1]: MoCo v2, 2020, https://arxiv.org/abs/2003.04297 + - [2]: MoCo v3, 2021, https://arxiv.org/abs/2104.02057 """ def __init__( @@ -159,12 +166,16 @@ def __init__( """Initialize a new MoCoProjectionHead instance. Args: - input_dim: Number of input dimensions. - hidden_dim: Number of hidden dimensions (2048 for v2, 4096 for v3). - output_dim: Number of output dimensions (128 for v2, 256 for v3). - num_layers: Number of hidden layers (2 for v2, 3 for v3). - batch_norm: Whether or not to use batch norms. - (False for v2, True for v3) + input_dim: + Number of input dimensions. + hidden_dim: + Number of hidden dimensions (2048 for v2, 4096 for v3). + output_dim: + Number of output dimensions (128 for v2, 256 for v3). + num_layers: + Number of hidden layers (2 for v2, 3 for v3). + batch_norm: + Whether or not to use batch norms (False for v2, True for v3). """ layers: List[Tuple[int, int, Optional[nn.Module], Optional[nn.Module]]] = [] layers.append( @@ -204,13 +215,22 @@ class NNCLRProjectionHead(ProjectionHead): layers are followed by batch-normalization [36]. All the batch-norm layers except the last layer are followed by ReLU activation." [0] - [0]: NNCLR, 2021, https://arxiv.org/abs/2104.14548 - + - [0]: NNCLR, 2021, https://arxiv.org/abs/2104.14548 """ def __init__( self, input_dim: int = 2048, hidden_dim: int = 2048, output_dim: int = 256 ): + """Initializes the NNCLRProjectionHead with the specified dimensions. + + Args: + input_dim: + Dimensionality of the input features. + hidden_dim: + Dimensionality of the hidden layers. + output_dim: + Dimensionality of the output features. + """ super(NNCLRProjectionHead, self).__init__( [ (input_dim, hidden_dim, nn.BatchNorm1d(hidden_dim), nn.ReLU()), @@ -227,8 +247,7 @@ class NNCLRPredictionHead(ProjectionHead): of size [4096,d]. The hidden layer of the prediction MLP is followed by batch-norm and ReLU. The last layer has no batch-norm or activation." [0] - [0]: NNCLR, 2021, https://arxiv.org/abs/2104.14548 - + - [0]: NNCLR, 2021, https://arxiv.org/abs/2104.14548 """ def __init__( @@ -265,11 +284,16 @@ def __init__( """Initialize a new SimCLRProjectionHead instance. Args: - input_dim: Number of input dimensions. - hidden_dim: Number of hidden dimensions. - output_dim: Number of output dimensions. - num_layers: Number of hidden layers (2 for v1, 3+ for v2). - batch_norm: Whether or not to use batch norms. + input_dim: + Number of input dimensions. + hidden_dim: + Number of hidden dimensions. + output_dim: + Number of output dimensions. + num_layers: + Number of hidden layers (2 for v1, 3+ for v2). + batch_norm: + Whether or not to use batch norms. """ layers: List[Tuple[int, int, Optional[nn.Module], Optional[nn.Module]]] = [] layers.append( @@ -307,8 +331,7 @@ class SimSiamProjectionHead(ProjectionHead): layer, including its output fc. Its output fc has no ReLU. The hidden fc is 2048-d. This MLP has 3 layers."
[0] - [0]: SimSiam, 2020, https://arxiv.org/abs/2011.10566 - + - [0]: SimSiam, 2020, https://arxiv.org/abs/2011.10566 """ def __init__( @@ -329,13 +352,21 @@ def __init__( class SMoGPrototypes(nn.Module): - """SMoG prototypes module for synchronous momentum grouping.""" + """SMoG prototypes module for synchronous momentum grouping. + + Args: + group_features: + Tensor containing the group features. + beta: + Beta parameter for momentum updating. + """ def __init__( self, group_features: Tensor, beta: float, ): + """Initializes the SMoGPrototypes module with the specified parameters.""" super(SMoGPrototypes, self).__init__() self.group_features = nn.Parameter(group_features, requires_grad=False) self.beta = beta @@ -354,8 +385,7 @@ def forward( Temperature parameter for calculating the logits. Returns: - The logits. - + The computed logits. """ x = torch.nn.functional.normalize(x, dim=1) group_features = torch.nn.functional.normalize(group_features, dim=1) @@ -371,7 +401,6 @@ def get_updated_group_features(self, x: Tensor) -> Tensor: Returns: The updated group features. - """ assignments = self.assign_groups(x) group_features = torch.clone(self.group_features.data) @@ -392,11 +421,11 @@ def assign_groups(self, x: Tensor) -> Tensor: """Assigns each representation in x to a group based on cosine similarity. Args: - Tensor of shape bsz x dim. + x: + Tensor of shape (bsz, dim). Returns: - Tensor of shape bsz indicating group assignments. - + Tensor of shape (bsz,) indicating group assignments. """ return torch.argmax(self.forward(x, self.group_features), dim=-1) @@ -408,13 +437,22 @@ class SMoGProjectionHead(ProjectionHead): followed by a BatchNorm [28] and an activation function. (...) The output layer of projection head also has BN" [0] - [0]: SMoG, 2022, https://arxiv.org/pdf/2207.06167.pdf - + - [0]: SMoG, 2022, https://arxiv.org/pdf/2207.06167.pdf """ def __init__( self, input_dim: int = 2048, hidden_dim: int = 2048, output_dim: int = 128 ): + """Initializes the SMoGProjectionHead with the specified dimensions. + + Args: + input_dim: + Dimensionality of the input features. + hidden_dim: + Dimensionality of the hidden layers. + output_dim: + Dimensionality of the output features. + """ super(SMoGProjectionHead, self).__init__( [ (input_dim, hidden_dim, nn.BatchNorm1d(hidden_dim), nn.ReLU()), @@ -435,13 +473,23 @@ class SMoGPredictionHead(ProjectionHead): followed by a BatchNorm [28] and an activation function. (...) The output layer of projection head also has BN" [0] - [0]: SMoG, 2022, https://arxiv.org/pdf/2207.06167.pdf - + - [0]: SMoG, 2022, https://arxiv.org/pdf/2207.06167.pdf """ def __init__( self, input_dim: int = 128, hidden_dim: int = 2048, output_dim: int = 128 ): + """Initializes the SMoGPredictionHead with the specified dimensions. + + Args: + input_dim: + Dimensionality of the input features. + hidden_dim: + Dimensionality of the hidden layers. + output_dim: + Dimensionality of the output features. + """ + super(SMoGPredictionHead, self).__init__( [ (input_dim, hidden_dim, nn.BatchNorm1d(hidden_dim), nn.ReLU()), @@ -456,13 +504,22 @@ class SimSiamPredictionHead(ProjectionHead): "The prediction MLP (h) has BN applied to its hidden fc layers. Its output fc does not have BN (...) or ReLU. This MLP has 2 layers."
[0] - [0]: SimSiam, 2020, https://arxiv.org/abs/2011.10566 - + - [0]: SimSiam, 2020, https://arxiv.org/abs/2011.10566 """ def __init__( self, input_dim: int = 2048, hidden_dim: int = 512, output_dim: int = 2048 ): + """Initializes the SimSiamPredictionHead with the specified dimensions. + + Args: + input_dim: + Dimensionality of the input features. + hidden_dim: + Dimensionality of the hidden layers. + output_dim: + Dimensionality of the output features. + """ super(SimSiamPredictionHead, self).__init__( [ (input_dim, hidden_dim, nn.BatchNorm1d(hidden_dim), nn.ReLU()), @@ -474,12 +531,13 @@ def __init__( class SwaVProjectionHead(ProjectionHead): """Projection head used for SwaV. - [0]: SwAV, 2020, https://arxiv.org/abs/2006.09882 + - [0]: SwAV, 2020, https://arxiv.org/abs/2006.09882 """ def __init__( self, input_dim: int = 2048, hidden_dim: int = 2048, output_dim: int = 128 ): + """Initializes the SwaVProjectionHead with the specified dimensions.""" super(SwaVProjectionHead, self).__init__( [ (input_dim, hidden_dim, nn.BatchNorm1d(hidden_dim), nn.ReLU()), @@ -513,7 +571,6 @@ class SwaVPrototypes(nn.Module): >>> >>> # logits has shape bsz x 512 >>> logits = prototypes(features) - """ def __init__( @@ -522,7 +579,9 @@ def __init__( n_prototypes: Union[List[int], int] = 3000, n_steps_frozen_prototypes: int = 0, ): + """Initializes the SwaVPrototypes module with the specified parameters.""" super(SwaVPrototypes, self).__init__() + # Wrap n_prototypes in a list if it is given as an int. self.n_prototypes = ( n_prototypes if isinstance(n_prototypes, list) else [n_prototypes] ) @@ -536,6 +595,18 @@ def __init__( def forward( self, x: Tensor, step: Optional[int] = None ) -> Union[Tensor, List[Tensor]]: + """Forward pass of the SwaVPrototypes module. + + Args: + x: + Input tensor. + step: + Current training step. + + Returns: + The logits after passing through the prototype heads. Returns a single tensor + if there is only one prototype head, otherwise returns a list of tensors. + """ self._freeze_prototypes_if_required(step) out = [] for layer in self.heads: @@ -548,6 +619,7 @@ def normalize(self) -> None: utils.normalize_weight(layer.weight) def _freeze_prototypes_if_required(self, step: Optional[int] = None) -> None: + """Freezes the prototypes until the specified number of steps has been reached.""" if self.n_steps_frozen_prototypes > 0: if step is None: raise ValueError( @@ -588,7 +660,6 @@ class DINOProjectionHead(ProjectionHead): Whether or not to weight normalize the last layer of the DINO head. Not normalizing leads to better performance but can make the training unstable. - """ def __init__( @@ -601,6 +672,7 @@ def __init__( freeze_last_layer: int = -1, norm_last_layer: bool = True, ): + """Initializes the DINOProjectionHead with the specified parameters.""" bn = nn.BatchNorm1d(hidden_dim) if batch_norm else None super().__init__( @@ -672,16 +744,24 @@ def __init__( """Initialize a new MMCRProjectionHead instance. Args: - input_dim: Number of input dimensions. - hidden_dim: Number of hidden dimensions. - output_dim: Number of output dimensions. - num_layers: Number of hidden layers. - batch_norm: Whether or not to use batch norms. - use_bias: Whether or not to use bias in the linear layers. + input_dim: + Number of input dimensions. + hidden_dim: + Number of hidden dimensions. + output_dim: + Number of output dimensions. + num_layers: + Number of hidden layers. + batch_norm: + Whether or not to use batch norms. + use_bias: + Whether or not to use bias in the linear layers.
""" layers: List[ Tuple[int, int, Optional[nn.Module], Optional[nn.Module], bool] ] = [] + + # Add the first layer layers.append( ( input_dim, @@ -691,6 +771,8 @@ def __init__( use_bias, ) ) + + # Add the hidden layers for _ in range(num_layers - 1): layers.append( ( @@ -701,6 +783,8 @@ def __init__( use_bias, ) ) + + # Add the output layer layers.append((hidden_dim, output_dim, None, None, use_bias)) super().__init__(layers) @@ -710,6 +794,7 @@ class MSNProjectionHead(ProjectionHead): "We train with a 3-layer projection head with output dimension 256 and batch-normalization at the input and hidden layers.." [0] + Code inspired by [1]. - [0]: Masked Siamese Networks, 2022, https://arxiv.org/abs/2204.07141 @@ -730,6 +815,7 @@ def __init__( hidden_dim: int = 2048, output_dim: int = 256, ): + """Initializes the MSNProjectionHead with the specified dimensions.""" super().__init__( blocks=[ (input_dim, hidden_dim, nn.BatchNorm1d(hidden_dim), nn.GELU()), @@ -746,13 +832,13 @@ class TiCoProjectionHead(ProjectionHead): batch normalization, rectified linear units (ReLU), and a final linear layer with output dimension 256." [0] - [0]: TiCo, 2022, https://arxiv.org/pdf/2206.10698.pdf - + - [0]: TiCo, 2022, https://arxiv.org/pdf/2206.10698.pdf """ def __init__( self, input_dim: int = 2048, hidden_dim: int = 4096, output_dim: int = 256 ): + """Initializes the TiCoProjectionHead with the specified dimensions.""" super(TiCoProjectionHead, self).__init__( [ (input_dim, hidden_dim, nn.BatchNorm1d(hidden_dim), nn.ReLU()), @@ -768,8 +854,7 @@ class VICRegProjectionHead(ProjectionHead): units. The first two layers of the projector are followed by a batch normalization layer and rectified linear units." [0] - [0]: 2022, VICReg, https://arxiv.org/pdf/2105.04906.pdf - + - [0]: 2022, VICReg, https://arxiv.org/pdf/2105.04906.pdf """ def __init__( @@ -779,6 +864,18 @@ def __init__( output_dim: int = 8192, num_layers: int = 3, ): + """Initializes the VICRegProjectionHead with the specified dimensions. + + Args: + input_dim: + Dimensionality of the input features. + hidden_dim: + Dimensionality of the hidden layers. + output_dim: + Dimensionality of the output features. + num_layers: + Number of layers in the projection head. + """ hidden_layers = [ (hidden_dim, hidden_dim, nn.BatchNorm1d(hidden_dim), nn.ReLU()) for _ in range(num_layers - 2) # Exclude first and last layer. @@ -795,16 +892,16 @@ def __init__( class VicRegLLocalProjectionHead(ProjectionHead): """Projection head used for the local head of VICRegL. - The projector network has three linear layers. The first two layers of the projector - are followed by a batch normalization layer and rectified linear units. - - 2022, VICRegL, https://arxiv.org/abs/2210.01571 + "The projector network has three linear layers. The first two layers of the projector + are followed by a batch normalization layer and rectified linear units." 
[0] + - [0]: 2022, VICRegL, https://arxiv.org/abs/2210.01571 """ def __init__( self, input_dim: int = 2048, hidden_dim: int = 8192, output_dim: int = 8192 ): + """Initializes the VicRegLLocalProjectionHead with the specified dimensions.""" super(VicRegLLocalProjectionHead, self).__init__( [ (input_dim, hidden_dim, nn.LayerNorm(hidden_dim), nn.ReLU()), @@ -826,6 +923,7 @@ class DenseCLProjectionHead(ProjectionHead): def __init__( self, input_dim: int = 2048, hidden_dim: int = 2048, output_dim: int = 128 ): + """Initializes the DenseCLProjectionHead with the specified dimensions.""" super().__init__( [ (input_dim, hidden_dim, None, nn.ReLU()), diff --git a/lightly/models/modules/heads_timm.py b/lightly/models/modules/heads_timm.py index 7ea20de21..44ae6a4ca 100644 --- a/lightly/models/modules/heads_timm.py +++ b/lightly/models/modules/heads_timm.py @@ -9,6 +9,22 @@ class AIMPredictionHeadBlock(Module): """Prediction head block for AIM [0]. - [0]: AIM, 2024, https://arxiv.org/abs/2401.08541 + + Args: + input_dim: + Dimensionality of the input features. + output_dim: + Dimensionality of the output features. + mlp_ratio: + Ratio used to determine the hidden layer size in the MLP. + proj_drop: + Dropout rate for the projection layer. + act_layer: + Activation layer to use. + norm_layer: + Normalization layer to use. + mlp_layer: + MLP layer to use. """ def __init__( @@ -21,6 +37,8 @@ def __init__( norm_layer: Type[Module] = LayerNorm, mlp_layer: Type[Module] = Mlp, ) -> None: + """Initializes the AIMPredictionHeadBlock module with the specified parameters.""" + super().__init__() self.norm = norm_layer(input_dim) # type: ignore[call-arg] self.mlp = mlp_layer( # type: ignore[call-arg] def forward(self, x: Tensor) -> Tensor: + """Forward pass of the AIMPredictionHeadBlock. + + Args: + x: + Input tensor. + + Returns: + Output tensor after applying normalization and the MLP, with a residual connection. + """ x = x + self.mlp(self.norm(x)) return x @@ -41,6 +68,28 @@ class AIMPredictionHead(Module): """Prediction head for AIM [0]. - [0]: AIM, 2024, https://arxiv.org/abs/2401.08541 + + Args: + input_dim: + Dimensionality of the input features. + output_dim: + Dimensionality of the output features. + hidden_dim: + Dimensionality of the hidden layer. + num_blocks: + Number of blocks in the prediction head. + mlp_ratio: + Ratio used to determine the hidden layer size in the MLP. + proj_drop: + Dropout rate for the projection layer. + act_layer: + Activation layer to use. + norm_layer: + Normalization layer to use. + mlp_layer: + MLP layer to use. + block_fn: + Block function to use for the prediction head. """ def __init__( @@ -56,6 +105,8 @@ def __init__( mlp_layer: Type[Module] = Mlp, block_fn: Type[Module] = AIMPredictionHeadBlock, ) -> None: + """Initializes the AIMPredictionHead module with the specified parameters.""" + super().__init__() self.blocks = Sequential( # Linear layer to project the input dimension to the hidden dimension. @@ -79,5 +130,14 @@ def __init__( ) def forward(self, x: Tensor) -> Tensor: + """Forward pass of the AIMPredictionHead. + + Args: + x: + Input tensor. + + Returns: + Output tensor after processing through the prediction head blocks. + """ x = self.blocks(x) return x
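Note for reviewers: below is a minimal usage sketch of the heads documented in this diff. It assumes only the public `lightly.models.modules.heads` constructors shown above; the batch size and the explicit `BYOLPredictionHead` dimensions are illustrative choices, not values prescribed by this change.

```python
import torch

from lightly.models.modules.heads import BYOLPredictionHead, BYOLProjectionHead

# Backbone output for a batch of 8 images (batch size is illustrative).
features = torch.randn(8, 2048)

# Projection head as documented above: 2048 -> 4096 -> 256 with batch norm and ReLU.
projection_head = BYOLProjectionHead(input_dim=2048, hidden_dim=4096, output_dim=256)

# Prediction head mapping projections back into the same 256-d embedding space.
prediction_head = BYOLPredictionHead(input_dim=256, hidden_dim=4096, output_dim=256)

projections = projection_head(features)     # Shape: (8, 256).
predictions = prediction_head(projections)  # Shape: (8, 256).
print(predictions.shape)  # torch.Size([8, 256])
```

The same pattern applies to the other `ProjectionHead` subclasses touched in this diff; only the default dimensions and the normalization/activation blocks differ.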