Release 0.2.0
grananqvist authored Jun 11, 2024
2 parents 6870412 + 2065d11 commit d11afae
Showing 92 changed files with 6,184 additions and 494 deletions.
12 changes: 7 additions & 5 deletions .circleci/config.yml
@@ -207,11 +207,13 @@ jobs:
workflows:
build_and_test:
when:
matches:
# Only on branches approved by Apple CircleCI policy:
# https://app.circleci.com/settings/organization/github/apple/policies/baseline_apple
pattern: "^main|gh-readonly-queue/main/pr-\\d+-[0-9a-f]{40}.*$"
value: << pipeline.git.branch >>
or:
- matches:
# Only on branches approved by Apple CircleCI policy:
# https://app.circleci.com/settings/organization/github/apple/policies/baseline_apple
pattern: "^main|gh-readonly-queue/main/pr-\\d+-[0-9a-f]{40}.*$"
value: << pipeline.git.branch >>
- equal: [ develop, << pipeline.git.branch >> ]
jobs:
- code-quality
- build-documentation-wheel
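The branch filter above can be sanity-checked outside CI. A minimal sketch, assuming CircleCI's `matches` follows standard regular-expression semantics; the 40-hex SHA below is a placeholder:

```python
import re

# Pattern from the config above, with YAML escaping removed.
pattern = r"^main|gh-readonly-queue/main/pr-\d+-[0-9a-f]{40}.*$"

placeholder_sha = "0" * 40  # stands in for a real 40-char commit SHA
for branch in ("main",
               f"gh-readonly-queue/main/pr-42-{placeholder_sha}",
               "develop",
               "feature/foo"):
    print(branch, "->", bool(re.match(pattern, branch)))
```

Note that `develop` does not match the pattern, which is why the new `or:` block adds an explicit `equal` check for it.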
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -45,6 +45,7 @@ repos:
rev: v1.5.0
hooks:
- id: mypy
exclude: ^publications/
# TODO: license header hook


23 changes: 23 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,29 @@
*


## v0.2.0

### Breaking change!

* `EMMGMMHyperParams` is renamed to `EMGMMHyperParams` (#55)

### New features

* Return local metadata from model training to algorithm (#71).

### Tasks completed

* Update FLAIR preprocessing script to download dataset from HuggingFace, available at https://huggingface.co/datasets/apple/flair (#72).
* Update LLM Benchmark Configs (#63).
* New, improved worker scheduling in distributed simulations; speeds up the FLAIR benchmark by 19% (#73).
* Don't pin PyTorch version to 2.0.1 (#69).
* Move `--noise_cohort_size` to `add_mechanism_arguments` (#70).

### Bug fixes

*


## v0.1.0

2024-03-01
8 changes: 8 additions & 0 deletions README.md
@@ -1,5 +1,11 @@
# `pfl`: Python framework for Private Federated Learning simulations

[![GitHub License](https://img.shields.io/github/license/apple/pfl-research)](https://github.com/apple/pfl-research/blob/main/LICENSE)
[![CircleCI](https://dl.circleci.com/status-badge/img/gh/apple/pfl-research/tree/main.svg?style=shield)](https://dl.circleci.com/status-badge/redirect/gh/apple/pfl-research/tree/main)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pfl)](https://github.com/apple/pfl-research/blob/main/pyproject.toml#L18)

**Documentation website:** https://apple.github.io/pfl-research

`pfl` is a Python framework developed at Apple to empower researchers to run efficient simulations with privacy-preserving federated learning (FL) and disseminate the results of their FL research. Our team combines engineering and research expertise, and we encourage researchers to publish their papers, together with this code, with confidence.

The framework is **not** intended for third-party FL deployments, but the results of the simulations can be tremendously useful in actual FL deployments.
@@ -32,6 +38,8 @@ pip install 'pfl[tf,pytorch,trees]'
To try out `pfl` immediately without installation, we provide several colab notebooks for learning the different components in `pfl` hands-on.
`<TODO push notebooks to colab>`

These are also available as Jupyter notebooks [here](https://github.com/apple/pfl-research/tree/develop/tutorials).

## Getting started - benchmarks

`pfl` aims to streamline the benchmarking process of testing hypotheses in the Federated Learning paradigm. The official benchmarks are available in the [benchmarks](./benchmarks) directory, using a variety of realistic dataset-model combinations with and without differential privacy (yes, we do also have CIFAR10).
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.1.0
0.2.0
16 changes: 16 additions & 0 deletions benchmarks/dataset/argument_parsing.py
@@ -132,6 +132,20 @@ def add_dataset_arguments(
default=100,
help='Maximum number of images per user')

parser.add_argument(
'--scheduling_base_weight_multiplier',
type=float,
default=1.0,
help=('Figure 3b in the pfl-research paper '
'https://arxiv.org/abs/2404.06430 shows that adding a '
'base value to each user\'s weight for scheduling '
'in distributed simulations speeds up training. '
'This parameter adds a multiplicative factor of '
'the median user weight as the base value. 0.0 means '
'no base value is added, and ~1.0 is the optimal '
'value for the FLAIR benchmark according to '
'Figure 3b (can be different for other setups).'))
elif known_args.dataset == 'alpaca':
parser = add_artificial_fed_dataset_arguments(parser)

@@ -289,6 +303,8 @@ def get_datasets(
data_path=args.data_path,
use_fine_grained_labels=args.use_fine_grained_labels,
max_num_user_images=args.max_num_user_images,
scheduling_base_weight_multiplier=args.
scheduling_base_weight_multiplier,
numpy_to_tensor=numpy_to_tensor)
elif args.dataset == 'flair_pytorch':
from .flair import make_flair_pytorch_datasets
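The effect of the new argument can be sketched in isolation. The helper below is hypothetical (not part of `pfl`); it only illustrates the arithmetic the help text describes, where a multiple of the median user weight is added to every user's scheduling weight:

```python
from statistics import median

def apply_base_weight(user_weights, base_weight_multiplier=1.0):
    """Add base_weight_multiplier * median(user_weights) to each weight.

    Hypothetical illustration of --scheduling_base_weight_multiplier:
    a base value flattens relative differences between users, so
    workers in a distributed simulation are scheduled more evenly.
    """
    base = base_weight_multiplier * median(user_weights)
    return [w + base for w in user_weights]

# multiplier 0.0 leaves weights unchanged; 1.0 adds the median (2.0 here).
print(apply_base_weight([1.0, 2.0, 9.0], 0.0))  # → [1.0, 2.0, 9.0]
print(apply_base_weight([1.0, 2.0, 9.0], 1.0))  # → [3.0, 4.0, 11.0]
```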
13 changes: 7 additions & 6 deletions benchmarks/dataset/flair/__init__.py
@@ -24,7 +24,9 @@ def get_central_data_and_metadata(data_path: str,


def make_flair_datasets(data_path: str, use_fine_grained_labels: bool,
max_num_user_images: int, numpy_to_tensor: Callable):
max_num_user_images: int,
scheduling_base_weight_multiplier: float,
numpy_to_tensor: Callable):
"""
Create a train and val ``FederatedDataset`` as well as a
central dataset from the FLAIR dataset.
@@ -33,11 +35,10 @@ def make_flair_datasets(data_path: str, use_fine_grained_labels: bool,

training_federated_dataset = make_federated_dataset(
data_path, 'train', use_fine_grained_labels, max_num_user_images,
numpy_to_tensor)
val_federated_dataset = make_federated_dataset(data_path, 'val',
use_fine_grained_labels,
max_num_user_images,
numpy_to_tensor)
scheduling_base_weight_multiplier, numpy_to_tensor)
val_federated_dataset = make_federated_dataset(
data_path, 'val', use_fine_grained_labels, max_num_user_images,
scheduling_base_weight_multiplier, numpy_to_tensor)

central_data, metadata = get_central_data_and_metadata(
data_path, use_fine_grained_labels)
94 changes: 0 additions & 94 deletions benchmarks/dataset/flair/download_dataset.py

This file was deleted.
