
Merge pull request #19 from statice/eicca/module-logger
Use module level logger
eicca authored May 12, 2023
2 parents d131199 + 0bc2505 commit efae58d
Showing 8 changed files with 59 additions and 15 deletions.
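
The recurring change across these files is the standard module-level logger pattern: each module obtains its own logger with `logging.getLogger(__name__)` at import time and emits messages through it, instead of calling the root-level `logging.warning`/`logging.debug` functions directly. A minimal sketch of the pattern as it appears in this diff (the helper function and its arguments are hypothetical, for illustration only):

```python
import logging

# One logger per module, named after the module's dotted import path
# (e.g. "anonymeter.evaluators.linkability_evaluator"), so that callers
# can raise, lower, or silence all "anonymeter.*" output as a group.
logger = logging.getLogger(__name__)


def clamp_neighbors(n_requested: int, n_available: int) -> int:
    # Hypothetical helper: warn through the module logger rather than the
    # root logger, mirroring the replacements made in this commit.
    if n_requested > n_available:
        logger.warning("Requested %d neighbors but only %d are available.", n_requested, n_available)
        return n_available
    return n_requested
```

Because these module loggers are children of the `anonymeter` logger, the configuration snippets added to the README below (for example `logging.getLogger("anonymeter").setLevel(logging.DEBUG)`) control all of them at once.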
2 changes: 1 addition & 1 deletion LICENSE.md
@@ -29,4 +29,4 @@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
POSSIBILITY OF SUCH DAMAGE.
35 changes: 34 additions & 1 deletion README.md
@@ -34,6 +34,7 @@ description of the framework and the attack algorithms can be found in the paper
[A Unified Framework for Quantifying Privacy Risk in Synthetic Data](https://petsymposium.org/popets/2023/popets-2023-0055.php), accepted at the 23rd Privacy Enhancing Technologies Symposium ([PETS 2023](https://petsymposium.org/cfp23.php)).



## Setup and installation

`Anonymeter` requires Python 3.8.x, 3.9.x or 3.10.x installed. The simplest way to install `Anonymeter` is from `PyPi`. Simply run
@@ -104,12 +105,44 @@ evaluator.evaluate()
risk = evaluator.risk()
```

## Configuring logging

`Anonymeter` uses the standard Python logger named `anonymeter`.
You can configure the logging level and the output destination
using the standard Python logging API (see [here](https://docs.python.org/3/library/logging.html) for more details).

For example, to set the logging level to `DEBUG` you can use the following snippet:

```python
import logging

# set the logging level to DEBUG
logging.getLogger("anonymeter").setLevel(logging.DEBUG)
```

And if you want to log to a file, you can use the following snippet:

```python
import logging

# create a file handler
file_handler = logging.FileHandler("anonymeter.log")

# set the logging level for the file handler
file_handler.setLevel(logging.DEBUG)

# add the file handler to the logger
logger = logging.getLogger("anonymeter")
logger.addHandler(file_handler)
logger.setLevel(logging.DEBUG)
```
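
Not part of the README change, but a natural extension using only the same standard-library API: the file handler can also be given a formatter so that each record carries a timestamp, level, and the originating module logger's name. A small sketch under that assumption:

```python
import logging

# File handler as in the snippet above, plus an explicit format string.
file_handler = logging.FileHandler("anonymeter.log")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(
    logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")
)

logger = logging.getLogger("anonymeter")
logger.addHandler(file_handler)
logger.setLevel(logging.DEBUG)
```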


## Cite this work

If you use anonymeter in your work, we would appreciate citations to the following paper:

"A Unified Framework for Quantifying Privacy Risk in Synthetic Data", M. Giomi *et al*, PoPETS 2023.

This `bibtex` entry can be used to refer to the paper:

```text
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -44,8 +44,7 @@ dev = [
"flake8-broken-line~=0.5",
"flake8-bugbear~=23.2",
"pre-commit==2.20.0",
"mypy==0.961",
"pytest-mypy==0.9.1",
"mypy~=1.2.0",

# Code formatting
"isort~=5.10",
4 changes: 2 additions & 2 deletions src/anonymeter/evaluators/inference_evaluator.py
@@ -20,7 +20,7 @@ def _run_attack(
n_jobs: int,
naive: bool,
regression: Optional[bool],
) -> np.ndarray:
) -> int:
if regression is None:
regression = pd.api.types.is_numeric_dtype(target[secret])

@@ -159,7 +159,7 @@ def __init__(
self._aux_cols = aux_cols
self._evaluated = False

def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int) -> np.ndarray:
def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int) -> int:
return _run_attack(
target=target,
syn=self._syn,
4 changes: 3 additions & 1 deletion src/anonymeter/evaluators/linkability_evaluator.py
@@ -11,6 +11,8 @@
from anonymeter.neighbors.mixed_types_kneighbors import MixedTypeKNeighbors
from anonymeter.stats.confidence import EvaluationResults, PrivacyRisk

logger = logging.getLogger(__name__)


class LinkabilityIndexes:
"""Utility class to store indexes from linkability attack.
@@ -50,7 +52,7 @@ def find_links(self, n_neighbors: int) -> Dict[int, Set[int]]:
"""
if n_neighbors > self._idx_0.shape[0]:
logging.warning(f"Neighbors too large ({n_neighbors}, using {self._idx_0.shape[0]}) instead.")
logger.warning(f"Neighbors too large ({n_neighbors}, using {self._idx_0.shape[0]}) instead.")
n_neighbors = self._idx_0.shape[0]

if n_neighbors < 1:
18 changes: 12 additions & 6 deletions src/anonymeter/evaluators/singling_out_evaluator.py
@@ -13,6 +13,7 @@
from anonymeter.stats.confidence import EvaluationResults, PrivacyRisk

rng = np.random.default_rng()
logger = logging.getLogger(__name__)


def _escape_quotes(string: str) -> str:
@@ -122,7 +123,7 @@ def safe_query_counts(query: str, df: pd.DataFrame) -> Optional[int]:
try:
return len(df.query(query, engine="python"))
except Exception as ex:
logging.debug(f"Query {query} failed with {ex}.")
logger.debug(f"Query {query} failed with {ex}.")
return None


@@ -346,7 +347,7 @@ def _evaluate_queries(df: pd.DataFrame, queries: List[str]) -> List[str]:
counts = np.array([safe_query_counts(query=q, df=df) for q in queries], dtype=float)

if np.any(np.isnan(counts)) > 0:
logging.warning(
logger.warning(
f"Found {np.sum(np.isnan(counts))} failed queries "
f"out of {len(queries)}. Check DEBUG messages for more details."
)
@@ -366,7 +367,7 @@ def _generate_singling_out_queries(df: pd.DataFrame, mode: str, n_attacks: int,
raise RuntimeError(f"Parameter `mode` can be either `univariate` or `multivariate`. Got {mode} instead.")

if len(queries) < n_attacks:
logging.warning(
logger.warning(
f"Attack `{mode}` could generate only {len(queries)} "
f"singling out queries out of the requested {n_attacks}. "
"This can probably lead to an underestimate of the "
@@ -444,12 +445,12 @@ def queries(self, baseline: bool = False) -> List[str]:
"""
return self._random_queries if baseline else self._queries

def evaluate(self, mode: str) -> "SinglingOutEvaluator":
def evaluate(self, mode: str = "multivariate") -> "SinglingOutEvaluator":
"""Run the attack and evaluate the guesses on the original dataset.
Parameters
----------
mode : str
mode : str, default is "multivariate"
Name of the algorithm used to generate the singling out queries.
Could be either `multivariate` or `univariate`.
@@ -459,7 +460,12 @@ def evaluate(self, mode: str) -> "SinglingOutEvaluator":
The evaluated singling out evaluator.
"""
n_cols = 1 if mode == "univariate" else self._n_cols
if mode == "multivariate":
n_cols = self._n_cols
elif mode == "univariate":
n_cols = 1
else:
raise ValueError(f"mode must be either 'multivariate' or 'univariate', got {mode} instead.")

baseline_queries = _random_queries(df=self._syn, n_queries=self._n_attacks, n_cols=n_cols)
self._baseline_queries = _evaluate_queries(df=self._ori, queries=baseline_queries)
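
The visible part of this hunk replaces the one-liner `n_cols = 1 if mode == "univariate" else self._n_cols` with explicit validation, so `evaluate()` now defaults to `"multivariate"` and raises a `ValueError` for any other mode. A hedged usage sketch — the import path and the `ori`/`syn`/`n_attacks` constructor arguments follow the evaluator API shown in the project README, and the toy dataframes are made up:

```python
import pandas as pd
from anonymeter.evaluators import SinglingOutEvaluator

# Toy "original" and "synthetic" dataframes, for illustration only.
ori = pd.DataFrame({"age": [31, 45, 22, 58], "city": ["Berlin", "Rome", "Rome", "Paris"]})
syn = pd.DataFrame({"age": [30, 44, 23, 60], "city": ["Berlin", "Rome", "Paris", "Paris"]})

evaluator = SinglingOutEvaluator(ori=ori, syn=syn, n_attacks=10)

evaluator.evaluate()  # mode now defaults to "multivariate"
risk = evaluator.risk()

# evaluator.evaluate(mode="typo") now fails fast with a ValueError; per the
# hunks above, an unknown mode was previously caught only later, by the
# RuntimeError raised inside _generate_singling_out_queries.
```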
4 changes: 3 additions & 1 deletion src/anonymeter/neighbors/mixed_types_kneighbors.py
@@ -14,6 +14,8 @@
from anonymeter.preprocessing.transformations import mixed_types_transform
from anonymeter.preprocessing.type_detection import detect_consistent_col_types

logger = logging.getLogger(__name__)


@jit(nopython=True, nogil=True)
def gower_distance(r0: np.ndarray, r1: np.ndarray, cat_cols_index: np.ndarray) -> float64:
@@ -199,7 +201,7 @@ def kneighbors(
n_neighbors = self._n_neighbors

if n_neighbors > self._candidates.shape[0]:
logging.warning(
logger.warning(
f"Parameter ``n_neighbors``={n_neighbors} cannot be "
f"larger than the size of the training data {self._candidates.shape[0]}."
)
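
For context on the file above: `gower_distance`, visible here only as a numba-jitted signature, measures the distance between two mixed-type records. A rough NumPy sketch of the general Gower idea — categorical columns contribute a 0/1 mismatch and numerical columns an absolute difference on pre-scaled values — offered as an illustration, not as the repository's actual implementation:

```python
import numpy as np


def gower_distance_sketch(r0: np.ndarray, r1: np.ndarray, cat_cols_index: np.ndarray) -> float:
    """Illustrative Gower-style distance between two encoded records.

    Assumes numerical columns were already scaled to a comparable range and
    categorical columns were label-encoded; per-feature contributions are
    averaged over the number of columns.
    """
    total = 0.0
    for i in range(len(r0)):
        if i in cat_cols_index:
            total += float(r0[i] != r1[i])   # categorical: 0 if equal, 1 otherwise
        else:
            total += abs(r0[i] - r1[i])      # numerical: absolute difference
    return total / len(r0)
```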
4 changes: 3 additions & 1 deletion src/anonymeter/preprocessing/transformations.py
@@ -8,6 +8,8 @@
import pandas as pd
from sklearn.preprocessing import LabelEncoder

logger = logging.getLogger(__name__)


def _encode_categorical(
df1: pd.DataFrame,
@@ -33,7 +35,7 @@ def _scale_numerical(df1: pd.DataFrame, df2: pd.DataFrame) -> Tuple[pd.DataFrame

if any(ranges == 0):
cnames = ", ".join(ranges[ranges == 0].index.values)
logging.debug(
logger.debug(
f"Numerical column(s) {cnames} have a null-range: all elements "
"have the same value. These column(s) won't be scaled."
)
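
The hunk above shows only the guard for zero-range columns. As a rough illustration of why the guard exists — a range-based scaler divides by `max - min`, which is zero when every value in a column is identical — here is a sketch of such a scaler; the exact transformation used by `_scale_numerical` is not visible in this diff:

```python
import pandas as pd


def scale_by_range(df: pd.DataFrame) -> pd.DataFrame:
    """Illustrative range scaling that leaves constant columns untouched."""
    scaled = df.copy()
    ranges = df.max() - df.min()
    for col in df.columns:
        if ranges[col] == 0:
            # Null-range column: every element is identical, so dividing by
            # the range would be undefined. Skip it, as the debug message in
            # the hunk above describes.
            continue
        scaled[col] = (df[col] - df[col].min()) / ranges[col]
    return scaled
```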
