
Merge pull request #19 from statice/eicca/module-logger
Use module level logger
eicca authored May 12, 2023
2 parents d131199 + 0bc2505 commit efae58d
Showing 8 changed files with 59 additions and 15 deletions.
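
The recurring change across these files is the standard module-level logger pattern: each module obtains its own logger with `logging.getLogger(__name__)` at import time and emits messages through it, instead of calling the root-level `logging.warning`/`logging.debug` functions directly. A minimal sketch of the pattern as it appears in this diff (the helper function and its arguments are hypothetical, for illustration only):

```python
import logging

# One logger per module, named after the module's dotted import path
# (e.g. "anonymeter.evaluators.linkability_evaluator"), so that callers
# can raise, lower, or silence all "anonymeter.*" output as a group.
logger = logging.getLogger(__name__)


def clamp_neighbors(n_requested: int, n_available: int) -> int:
    # Hypothetical helper: warn through the module logger rather than the
    # root logger, mirroring the replacements made in this commit.
    if n_requested > n_available:
        logger.warning("Requested %d neighbors but only %d are available.", n_requested, n_available)
        return n_available
    return n_requested
```

Because these module loggers are children of the `anonymeter` logger, the configuration snippets added to the README below (for example `logging.getLogger("anonymeter").setLevel(logging.DEBUG)`) control all of them at once.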
2 changes: 1 addition & 1 deletion LICENSE.md
@@ -29,4 +29,4 @@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
POSSIBILITY OF SUCH DAMAGE.
35 changes: 34 additions & 1 deletion README.md
@@ -34,6 +34,7 @@ description of the framework and the attack algorithms can be found in the paper
[A Unified Framework for Quantifying Privacy Risk in Synthetic Data](https://petsymposium.org/popets/2023/popets-2023-0055.php), accepted at the 23rd Privacy Enhancing Technologies Symposium ([PETS 2023](https://petsymposium.org/cfp23.php)).



## Setup and installation

`Anonymeter` requires Python 3.8.x, 3.9.x or 3.10.x installed. The simplest way to install `Anonymeter` is from `PyPi`. Simply run
@@ -104,12 +105,44 @@ evaluator.evaluate()
risk = evaluator.risk()
```

## Configuring logging

`Anonymeter` uses the standard Python logger named `anonymeter`.
You can configure the logging level and the output destination
using the standard Python logging API (see [here](https://docs.python.org/3/library/logging.html) for more details).

For example, to set the logging level to `DEBUG` you can use the following snippet:

```python
import logging

# set the logging level to DEBUG
logging.getLogger("anonymeter").setLevel(logging.DEBUG)
```

And if you want to log to a file, you can use the following snippet:

```python
import logging

# create a file handler
file_handler = logging.FileHandler("anonymeter.log")

# set the logging level for the file handler
file_handler.setLevel(logging.DEBUG)

# add the file handler to the logger
logger = logging.getLogger("anonymeter")
logger.addHandler(file_handler)
logger.setLevel(logging.DEBUG)
```
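
Not part of the README change, but a natural extension using only the same standard-library API: the file handler can also be given a formatter so that each record carries a timestamp, level, and the originating module logger's name. A small sketch under that assumption:

```python
import logging

# File handler as in the snippet above, plus an explicit format string.
file_handler = logging.FileHandler("anonymeter.log")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(
    logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")
)

logger = logging.getLogger("anonymeter")
logger.addHandler(file_handler)
logger.setLevel(logging.DEBUG)
```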


## Cite this work

If you use anonymeter in your work, we would appreciate citations to the following paper:

"A Unified Framework for Quantifying Privacy Risk in Synthetic Data", M. Giomi *et al*, PoPETS 2023.

This `bibtex` entry can be used to refer to the paper:

```text
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -44,8 +44,7 @@ dev = [
"flake8-broken-line~=0.5",
"flake8-bugbear~=23.2",
"pre-commit==2.20.0",
"mypy==0.961",
"pytest-mypy==0.9.1",
"mypy~=1.2.0",

# Code formatting
"isort~=5.10",
4 changes: 2 additions & 2 deletions src/anonymeter/evaluators/inference_evaluator.py
@@ -20,7 +20,7 @@ def _run_attack(
n_jobs: int,
naive: bool,
regression: Optional[bool],
) -> np.ndarray:
) -> int:
if regression is None:
regression = pd.api.types.is_numeric_dtype(target[secret])

@@ -159,7 +159,7 @@ def __init__(
self._aux_cols = aux_cols
self._evaluated = False

def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int) -> np.ndarray:
def _attack(self, target: pd.DataFrame, naive: bool, n_jobs: int) -> int:
return _run_attack(
target=target,
syn=self._syn,
4 changes: 3 additions & 1 deletion src/anonymeter/evaluators/linkability_evaluator.py
@@ -11,6 +11,8 @@
from anonymeter.neighbors.mixed_types_kneighbors import MixedTypeKNeighbors
from anonymeter.stats.confidence import EvaluationResults, PrivacyRisk

logger = logging.getLogger(__name__)


class LinkabilityIndexes:
"""Utility class to store indexes from linkability attack.
@@ -50,7 +52,7 @@ def find_links(self, n_neighbors: int) -> Dict[int, Set[int]]:
"""
if n_neighbors > self._idx_0.shape[0]:
logging.warning(f"Neighbors too large ({n_neighbors}, using {self._idx_0.shape[0]}) instead.")
logger.warning(f"Neighbors too large ({n_neighbors}, using {self._idx_0.shape[0]}) instead.")
n_neighbors = self._idx_0.shape[0]

if n_neighbors < 1:
18 changes: 12 additions & 6 deletions src/anonymeter/evaluators/singling_out_evaluator.py
@@ -13,6 +13,7 @@
from anonymeter.stats.confidence import EvaluationResults, PrivacyRisk

rng = np.random.default_rng()
logger = logging.getLogger(__name__)


def _escape_quotes(string: str) -> str:
@@ -122,7 +123,7 @@ def safe_query_counts(query: str, df: pd.DataFrame) -> Optional[int]:
try:
return len(df.query(query, engine="python"))
except Exception as ex:
logging.debug(f"Query {query} failed with {ex}.")
logger.debug(f"Query {query} failed with {ex}.")
return None


@@ -346,7 +347,7 @@ def _evaluate_queries(df: pd.DataFrame, queries: List[str]) -> List[str]:
counts = np.array([safe_query_counts(query=q, df=df) for q in queries], dtype=float)

if np.any(np.isnan(counts)) > 0:
logging.warning(
logger.warning(
f"Found {np.sum(np.isnan(counts))} failed queries "
f"out of {len(queries)}. Check DEBUG messages for more details."
)
@@ -366,7 +367,7 @@ def _generate_singling_out_queries(df: pd.DataFrame, mode: str, n_attacks: int,
raise RuntimeError(f"Parameter `mode` can be either `univariate` or `multivariate`. Got {mode} instead.")

if len(queries) < n_attacks:
logging.warning(
logger.warning(
f"Attack `{mode}` could generate only {len(queries)} "
f"singling out queries out of the requested {n_attacks}. "
"This can probably lead to an underestimate of the "
@@ -444,12 +445,12 @@ def queries(self, baseline: bool = False) -> List[str]:
"""
return self._random_queries if baseline else self._queries

def evaluate(self, mode: str) -> "SinglingOutEvaluator":
def evaluate(self, mode: str = "multivariate") -> "SinglingOutEvaluator":
"""Run the attack and evaluate the guesses on the original dataset.
Parameters
----------
mode : str
mode : str, default is "multivariate"
Name of the algorithm used to generate the singling out queries.
Could be either `multivariate` or `univariate`.
@@ -459,7 +460,12 @@ def evaluate(self, mode: str) -> "SinglingOutEvaluator":
The evaluated singling out evaluator.
"""
n_cols = 1 if mode == "univariate" else self._n_cols
if mode == "multivariate":
n_cols = self._n_cols
elif mode == "univariate":
n_cols = 1
else:
raise ValueError(f"mode must be either 'multivariate' or 'univariate', got {mode} instead.")

baseline_queries = _random_queries(df=self._syn, n_queries=self._n_attacks, n_cols=n_cols)
self._baseline_queries = _evaluate_queries(df=self._ori, queries=baseline_queries)
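
The visible part of this hunk replaces the one-liner `n_cols = 1 if mode == "univariate" else self._n_cols` with explicit validation, so `evaluate()` now defaults to `"multivariate"` and raises a `ValueError` for any other mode. A hedged usage sketch — the import path and the `ori`/`syn`/`n_attacks` constructor arguments follow the evaluator API shown in the project README, and the toy dataframes are made up:

```python
import pandas as pd
from anonymeter.evaluators import SinglingOutEvaluator

# Toy "original" and "synthetic" dataframes, for illustration only.
ori = pd.DataFrame({"age": [31, 45, 22, 58], "city": ["Berlin", "Rome", "Rome", "Paris"]})
syn = pd.DataFrame({"age": [30, 44, 23, 60], "city": ["Berlin", "Rome", "Paris", "Paris"]})

evaluator = SinglingOutEvaluator(ori=ori, syn=syn, n_attacks=10)

evaluator.evaluate()  # mode now defaults to "multivariate"
risk = evaluator.risk()

# evaluator.evaluate(mode="typo") now fails fast with a ValueError; per the
# hunks above, an unknown mode was previously caught only later, by the
# RuntimeError raised inside _generate_singling_out_queries.
```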
4 changes: 3 additions & 1 deletion src/anonymeter/neighbors/mixed_types_kneighbors.py
@@ -14,6 +14,8 @@
from anonymeter.preprocessing.transformations import mixed_types_transform
from anonymeter.preprocessing.type_detection import detect_consistent_col_types

logger = logging.getLogger(__name__)


@jit(nopython=True, nogil=True)
def gower_distance(r0: np.ndarray, r1: np.ndarray, cat_cols_index: np.ndarray) -> float64:
@@ -199,7 +201,7 @@ def kneighbors(
n_neighbors = self._n_neighbors

if n_neighbors > self._candidates.shape[0]:
logging.warning(
logger.warning(
f"Parameter ``n_neighbors``={n_neighbors} cannot be "
f"larger than the size of the training data {self._candidates.shape[0]}."
)
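
For context on the file above: `gower_distance`, visible here only as a numba-jitted signature, measures the distance between two mixed-type records. A rough NumPy sketch of the general Gower idea — categorical columns contribute a 0/1 mismatch and numerical columns an absolute difference on pre-scaled values — offered as an illustration, not as the repository's actual implementation:

```python
import numpy as np


def gower_distance_sketch(r0: np.ndarray, r1: np.ndarray, cat_cols_index: np.ndarray) -> float:
    """Illustrative Gower-style distance between two encoded records.

    Assumes numerical columns were already scaled to a comparable range and
    categorical columns were label-encoded; per-feature contributions are
    averaged over the number of columns.
    """
    total = 0.0
    for i in range(len(r0)):
        if i in cat_cols_index:
            total += float(r0[i] != r1[i])   # categorical: 0 if equal, 1 otherwise
        else:
            total += abs(r0[i] - r1[i])      # numerical: absolute difference
    return total / len(r0)
```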
4 changes: 3 additions & 1 deletion src/anonymeter/preprocessing/transformations.py
@@ -8,6 +8,8 @@
import pandas as pd
from sklearn.preprocessing import LabelEncoder

logger = logging.getLogger(__name__)


def _encode_categorical(
df1: pd.DataFrame,
@@ -33,7 +35,7 @@ def _scale_numerical(df1: pd.DataFrame, df2: pd.DataFrame) -> Tuple[pd.DataFrame

if any(ranges == 0):
cnames = ", ".join(ranges[ranges == 0].index.values)
logging.debug(
logger.debug(
f"Numerical column(s) {cnames} have a null-range: all elements "
"have the same value. These column(s) won't be scaled."
)
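
The hunk above shows only the guard for zero-range columns. As a rough illustration of why the guard exists — a range-based scaler divides by `max - min`, which is zero when every value in a column is identical — here is a sketch of such a scaler; the exact transformation used by `_scale_numerical` is not visible in this diff:

```python
import pandas as pd


def scale_by_range(df: pd.DataFrame) -> pd.DataFrame:
    """Illustrative range scaling that leaves constant columns untouched."""
    scaled = df.copy()
    ranges = df.max() - df.min()
    for col in df.columns:
        if ranges[col] == 0:
            # Null-range column: every element is identical, so dividing by
            # the range would be undefined. Skip it, as the debug message in
            # the hunk above describes.
            continue
        scaled[col] = (df[col] - df[col].min()) / ranges[col]
    return scaled
```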
