Merge pull request #320 from sodascience/develop

Release 1.0.3
sodascience · Sep 4, 2024 · 37b500d · 37b500d
2 parents 4e72bb3 + bb128f9
commit 37b500d
Show file tree

Hide file tree

Showing 11 changed files with 1,852 additions and 180 deletions.
diff --git a/README.md b/README.md
@@ -9,6 +9,7 @@
         <a href="https://metasyn.readthedocs.io/en/latest/index.html"><img src="https://readthedocs.org/projects/metasyn/badge/?version=latest" alt="Readthedocs"></img></a>
         <a href="https://hub.docker.com/r/sodateam/metasyn"><img src="https://img.shields.io/docker/v/sodateam/metasyn?logo=docker&label=docker&color=blue" alt="Docker image version"></img></a>
         <a href="https://zenodo.org/doi/10.5281/zenodo.7696031"><img src="https://zenodo.org/badge/DOI/10.5281/zenodo.7696031.svg" alt="DOI"></a>
+        <a href="https://joss.theoj.org/papers/43fd4234e18bfd94b952aea35db8b883"><img src="https://joss.theoj.org/papers/43fd4234e18bfd94b952aea35db8b883/status.svg" alt="JOSS status"></a>
     </span>
   </p>
 </p>
@@ -24,7 +25,7 @@ With metasyn you can __fit__ a model to an existing dataframe, __export__ it to
 - 🔎 __Transparent__. With metasyn you share not only synthetic data, but also the model and settings used to create it through a traceable, auditable metadata format. Everyone can read and understand what the model does; it is crystal clear which information becomes public.
 - 🔐 __Private__. By default, metasyn does not incorporate multivariate information, meaning less risk of privacy issues such as identity, attribute, or group disclosure. On top of this, we support privacy plugins such as our own [disclosure control plugin](https://github.com/sodascience/metasyn-disclosure-control) to further enhance privacy in critically sensitive situations.
 - 🔗 __Integrated__. We integrate closely with popular, modern tools in the Python ecosystem, building on the wonderful [polars](https://pola.rs/) dataframe library ([pandas](https://pandas.pydata.org/) is supported too), as well as [faker](https://faker.readthedocs.io/en/master/) to generate localized data such as names, emails, phone numbers, and more.
-- 📦 __Extensible__. Are you missing features? Do you have a different definition of privacy? Our plugin system allows you (or your organisation) to create their own extension to adjust metasyn to what you need. Or you can [contribute](https://metasyn.readthedocs.io/en/latest/developer/contributing.html) directly to the project.
+- 📦 __Extensible__. Are you missing features? Do you have a different definition of privacy? Our plugin system allows you (or your organisation) to create your own extension to adjust metasyn to what you need. Or you can [contribute](#contributing) directly to the project.
 
 ## Installation
 Metasyn can be installed directly from PyPI using the following command in the terminal:
@@ -40,11 +41,14 @@ pip install git+https://github.com/sodascience/metasyn
 ```
 
 ## Usage
+
+![demo](https://github.com/user-attachments/assets/f3982077-4a02-4a41-b88c-d5145ef8bdd7)
+
 To generate synthetic data, `metasyn` first needs to fit a `MetaFrame` to the data which can then be used to produce new synthetic rows:
 
 ![Example input and output](https://github.com/sodascience/metasyn/blob/main/docs/source/images/example_input_output_concise.png)
 
-In Python code this happens as follows:
+The above image closely matches the Python code:
 
 ```python
 import polars as pl
@@ -79,9 +83,9 @@ For more information on how to create dataframes with polars, refer to the [Pola
 
 ## Where to go next
 
-- As a next step to learn more about generating synthetic data with metasyn we recommend to check out the [user guide](https://metasyn.readthedocs.io/en/latest/usage/usage.html).
+- To explore more options and try this out online, take a look at our interactive tutorial: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sodascience/metasyn/blob/main/examples/getting_started.ipynb)
+- As a next step to learn more about generating synthetic data with metasyn, we recommend checking out the [user guide](https://metasyn.readthedocs.io/en/latest/usage/usage.html) and other [documentation](https://metasyn.readthedocs.io/en/latest).
 - For even more privacy, have a look at our [disclosure control plugin](https://github.com/sodascience/metasyn-disclosure-control).
-- To learn more about how `metasyn` works, go to detailed overview in our [documentation](https://metasyn.readthedocs.io/en/latest/about/metasyn_in_detail.html). 
 - Want to create programs that build on metasyn? Take a look at our versioned [Docker containers](https://hub.docker.com/r/sodateam/metasyn) and our [CLI](https://metasyn.readthedocs.io/en/latest/usage/cli.html).
 
 ## Contributing

diff --git a/docs/paper/build_command.txt b/docs/paper/build_command.txt
@@ -1,8 +1,13 @@
+# Word count:
 
+wc -w docs/paper/paper.md
+
+# Building the paper:
 On Windows:
 
 docker run --rm --volume %cd%/docs/paper:/data --env JOURNAL=joss openjournals/inara
 
 On Unix:
 
 docker run --rm --volume $PWD/docs/paper:/data --user $(id -u):$(id -g) --env JOURNAL=joss openjournals/inara
+
diff --git a/docs/paper/paper.bib b/docs/paper/paper.bib
@@ -1,7 +1,9 @@
 @misc{bates2019ons,
   title={{ONS} methodology working paper series number 16—Synthetic data pilot},
   author={Bates, AG and Spakulov{\'a}, I and Dove, I and Mealor, A},
-  year={2019}
+  year={2019},
+  url={https://www.ons.gov.uk/methodology/methodologicalpublications/generalmethodology/onsworkingpaperseries/onsmethodologyworkingpaperseriesnumber16syntheticdatapilot},
+  urldate={2024-08-12}
 }
 
 @inproceedings{dwork2006differential,
@@ -10,14 +12,16 @@ @inproceedings{dwork2006differential
   booktitle={International colloquium on automata, languages, and programming},
   pages={1--12},
   year={2006},
-  organization={Springer}
+  organization={Springer},
+  doi={10.1007/11787006_1}
 }
 
-@book{dewolf2012statistical,
+@book{hundepool2012statistical,
   title={Statistical disclosure control},
-  author={de Wolf, Peter-Paul},
+  author={Hundepool, Anco and Domingo-Ferrer, Josep and Franconi, Luisa and Giessing, Sarah and Schulte Nordholt, Eric and Spicer, Keith and de Wolf, Peter-Paul},
   year={2012},
-  publisher={Wiley \& Sons, Chichester}
+  publisher={Wiley \& Sons, Chichester},
+  doi={10.1002/9781118348239}
 }
 
 @article{sweeney2002k,
@@ -28,39 +32,26 @@ @article{sweeney2002k
   number={05},
   pages={557--570},
   year={2002},
-  publisher={World Scientific}
+  publisher={World Scientific},
+  doi={10.1142/S0218488502001648}
 }
 
 @misc{bond2015guidelines,
-  title={Guidelines for Output Checking. Eurostat},
+  title={Guidelines for the checking of output based on microdata research},
+  publisher={Eurostat},
   author={Bond, S and Brandt, M and de Wolf, PP},
-  year={2015}
-}
-
-@article{dwork2010differential,
-  title={Differential privacy for statistics: What we know and what we want to learn},
-  author={Dwork, Cynthia and Smith, Adam},
-  journal={Journal of Privacy and Confidentiality},
-  volume={1},
-  number={2},
-  year={2010}
+  year={2015},
+  url={https://web.archive.org/web/20160408145718/http://dwbproject.org/export/sites/default/about/public_deliveraples/dwb_d11-8_synthetic-data_cta-ecta_output-checking-guidelines_final-reports.zip},
+  urldate={2024-08-12}
 }
 
 @book{hastie2009elements,
   title={The elements of statistical learning: data mining, inference, and prediction},
   author={Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome H},
   volume={2},
   year={2009},
-  publisher={Springer}
-}
-
-@inproceedings{akaike1973information,
-  title={Information theory and an extension of the maximum likelihood principle},
-  author={Akaike, H},
-  booktitle={2nd International Symposium on Information Theory},
-  pages={267--281},
-  year={1973},
-  organization={Akad{\'e}miai Kiad{\'o} Location Budapest, Hungary}
+  publisher={Springer},
+  doi={10.1007/978-0-387-84858-7}
 }
 
 @article{neath2012bayesian,
@@ -71,8 +62,10 @@ @article{neath2012bayesian
   number={2},
   pages={199--203},
   year={2012},
-  publisher={Wiley Online Library}
+  publisher={Wiley Online Library},
+  doi={10.1002/wics.199}
 }
+
 @software{vink2024polars,
   author       = {Ritchie Vink and
                   Stijn de Gooijer and
@@ -131,7 +124,8 @@ @article{nowok2016synthpop
   journal={Journal of statistical software},
   volume={74},
   pages={1--26},
-  year={2016}
+  year={2016},
+  doi={10.18637/jss.v074.i11}
 }
 
 @article{templ2017simulation,
@@ -142,20 +136,23 @@ @article{templ2017simulation
   number={10},
   pages={1--38},
   year={2017},
-  publisher={UCLA, Dept. of Statistics}
+  publisher={UCLA, Dept. of Statistics},
+  doi={10.18637/jss.v079.i10}
 }
 
 @inproceedings{ping2017datasynthesizer,
   title={{DataSynthesizer}: Privacy-preserving synthetic datasets},
   author={Ping, Haoyue and Stoyanovich, Julia and Howe, Bill},
   booktitle={Proceedings of the 29th International Conference on Scientific and Statistical Database Management},
   pages={1--5},
-  year={2017}
+  year={2017},
+  doi={10.1145/3085504.3091117}
 }
 
 @article{vankesteren2024democratize,
   title={To democratize research with sensitive data, we should make synthetic data more accessible},
   author={{van Kesteren}, Erik-Jan},
   journal={arXiv preprint arXiv:2404.17271},
-  year={2024}
+  year={2024},
+  doi={10.48550/arXiv.2404.17271}
 }