From 47d3aeb6e73af051ddba91ed6773816451994b97 Mon Sep 17 00:00:00 2001 From: Fabiana <30911746+fabclmnt@users.noreply.github.com> Date: Thu, 24 Nov 2022 06:28:25 +0000 Subject: [PATCH] chore(examples): dataset compare examples (#1167) * build(deps): bump cycjimmy/semantic-release-action from 2 to 3 (#1154) * chore(actions): disable lint when prs come from dependabot (#1164) * chore(actions): fix push and latest tag configs (#1166) * docs(changelogs): fix changelog format (#1163) * chore: move example files and add new hcc example Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Vasco Ramos --- .github/workflows/merge-dev.yml | 4 +- .github/workflows/merge-master.yml | 14 +- .github/workflows/pull-request.yml | 2 + .../pages/reference/changelog/v3_3_1.rst | 2 +- .../pages/reference/changelog/v3_4_0.rst | 6 +- .../pages/reference/changelog/v3_5_0.md | 8 +- ...on_auto_example.py => correlation_demo.py} | 0 .../eda_dataset_compare.py} | 0 .../hcc/eda-with-feature-comparison.ipynb | 181 ++++++++++++++++++ examples/hcc/eda-with-feature-comparison.py | 34 ++++ examples/report_comparison/comparison.ipynb | 120 ------------ 11 files changed, 234 insertions(+), 137 deletions(-) rename examples/features/{correlation_auto_example.py => correlation_demo.py} (100%) rename examples/{report_comparison/comparison.py => features/eda_dataset_compare.py} (100%) create mode 100644 examples/hcc/eda-with-feature-comparison.ipynb create mode 100644 examples/hcc/eda-with-feature-comparison.py delete mode 100644 examples/report_comparison/comparison.ipynb diff --git a/.github/workflows/merge-dev.yml b/.github/workflows/merge-dev.yml index d6249b423..31fa59600 100644 --- a/.github/workflows/merge-dev.yml +++ b/.github/workflows/merge-dev.yml @@ -25,8 +25,8 @@ jobs: uses: oprypin/find-latest-tag@v1.1.1 with: repository: ${{ github.repository }} - regex: '^\d+\.\d+\.\d+' - releases-only: false + regex: '^v\d+\.\d+\.\d+' + releases-only: true - 
name: Extract semantic version id: semantic diff --git a/.github/workflows/merge-master.yml b/.github/workflows/merge-master.yml index ba605e447..d763a9508 100644 --- a/.github/workflows/merge-master.yml +++ b/.github/workflows/merge-master.yml @@ -24,7 +24,7 @@ jobs: with: token: ${{ secrets.ACCESS_TOKEN }} - - uses: cycjimmy/semantic-release-action@v2 + - uses: cycjimmy/semantic-release-action@v3 id: semantic env: GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }} @@ -50,6 +50,7 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 + token: ${{ secrets.ACCESS_TOKEN }} - name: Calculate changelog path id: path @@ -71,11 +72,11 @@ jobs: - name: Add changelog to docs run: | cd docsrc/source/pages/reference - sed -i -e 's/# \[.*/# Changelog ${{ needs.prepare.outputs.release }}/g' \ - -e 's/## Bug Fixes/## πŸ› Bug fixes/g' \ - -e 's/## Features/## πŸŽ‰ Features/g' \ - -e 's/## BREAKING CHANGES/## 🚨 Breaking changes/g' \ - -e 's/## Documentation/## πŸ“– Documentation/g' \ + sed -i -e 's/## \[.*/### Changelog ${{ needs.prepare.outputs.release }}/g' \ + -e 's/### Bug Fixes/#### πŸ› Bug fixes/g' \ + -e 's/### Features/#### πŸŽ‰ Features/g' \ + -e 's/### BREAKING CHANGES/#### 🚨 Breaking changes/g' \ + -e 's/### Documentation/#### πŸ“– Documentation/g' \ changelog/${{ steps.path.outputs.value }} grep -q ".. 
include:: changelog/${{ steps.path.outputs.value }}" changelog.rst || sed -i "4 a\\ @@ -101,7 +102,6 @@ jobs: with: branch: ${{ github.ref }} github_token: ${{ secrets.ACCESS_TOKEN }} - force: true prerelease-tag: diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 859627713..41b21f3e0 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -6,6 +6,7 @@ on: jobs: commitlint: + if: github.actor != 'dependabot[bot]' name: Lint commit message runs-on: ubuntu-latest @@ -17,6 +18,7 @@ jobs: - uses: wagoid/commitlint-github-action@v5 lint: + if: github.actor != 'dependabot[bot]' name: Lint source code runs-on: ubuntu-latest diff --git a/docsrc/source/pages/reference/changelog/v3_3_1.rst b/docsrc/source/pages/reference/changelog/v3_3_1.rst index 72be6a332..27b40be0f 100644 --- a/docsrc/source/pages/reference/changelog/v3_3_1.rst +++ b/docsrc/source/pages/reference/changelog/v3_3_1.rst @@ -2,7 +2,7 @@ Changelog v3.3.1 ---------------- 🐛 Bug fixes -^^^^^^^^^^^ +^^^^^^^^^^^^ - remove unused imports (`66864c1 `__) diff --git a/docsrc/source/pages/reference/changelog/v3_4_0.rst b/docsrc/source/pages/reference/changelog/v3_4_0.rst index 1861bf59f..0d45c4f8e 100644 --- a/docsrc/source/pages/reference/changelog/v3_4_0.rst +++ b/docsrc/source/pages/reference/changelog/v3_4_0.rst @@ -1,8 +1,8 @@ Changelog v3.4.0 -================ +---------------- 🐛 Bug fixes ------------ +^^^^^^^^^^^^ - correlation passing extra parameters (`#1114 `__) @@ -29,7 +29,7 @@ Changelog v3.4.0 (`985fbd1 `__) 🎉 Features ----------- +^^^^^^^^^^^^ - add support for Pandas 1.5 (`#1076 `__) diff --git a/docsrc/source/pages/reference/changelog/v3_5_0.md b/docsrc/source/pages/reference/changelog/v3_5_0.md index c8d140ab6..f4899e4e4 100644 --- a/docsrc/source/pages/reference/changelog/v3_5_0.md +++ b/docsrc/source/pages/reference/changelog/v3_5_0.md @@ -1,7 +1,7 @@ -# Changelog v3.5.0 +### Changelog v3.5.0 -### 🐛 Bug fixes 
+#### 🐛 Bug fixes * change context managed backend ([#1149](https://github.com/ydataai/pandas-profiling/issues/1149)) ([11e1a8a](https://github.com/ydataai/pandas-profiling/commit/11e1a8a3fa8d13513fe926b731fb907a066af2a1)) * dataset names on comparison report ([#1159](https://github.com/ydataai/pandas-profiling/issues/1159)) ([3c14d43](https://github.com/ydataai/pandas-profiling/commit/3c14d438d9a557ac85f5663cc3446c0fb3081e18)) @@ -13,8 +13,8 @@ * update repository links ([#1141](https://github.com/ydataai/pandas-profiling/issues/1141)) ([c742c5d](https://github.com/ydataai/pandas-profiling/commit/c742c5dbeb18fe2907a4c03792e8802993c46da5)) -### 🎉 Features +#### 🎉 Features * add typechecking to profile report ([#1139](https://github.com/ydataai/pandas-profiling/issues/1139)) ([ec8ece0](https://github.com/ydataai/pandas-profiling/commit/ec8ece0de394eb4c2918bb6a74f0c5e5bb77ca61)) * report comparison example ([#1160](https://github.com/ydataai/pandas-profiling/issues/1160)) ([5e75fd2](https://github.com/ydataai/pandas-profiling/commit/5e75fd275d14c8ce7ba49d0a15ec26810c4c0e73)) -* report comparisons ([#1069](https://github.com/ydataai/pandas-profiling/issues/1069)) ([70ee5c7](https://github.com/ydataai/pandas-profiling/commit/70ee5c776ad0c72d709631690a2df1cde5ca0424)), closes [#1137](https://github.com/ydataai/pandas-profiling/issues/1137) [#1136](https://github.com/ydataai/pandas-profiling/issues/1136) [#1143](https://github.com/ydataai/pandas-profiling/issues/1143) [#1148](https://github.com/ydataai/pandas-profiling/issues/1148) [#1150](https://github.com/ydataai/pandas-profiling/issues/1150) \ No newline at end of file +* report comparisons ([#1069](https://github.com/ydataai/pandas-profiling/issues/1069)) ([70ee5c7](https://github.com/ydataai/pandas-profiling/commit/70ee5c776ad0c72d709631690a2df1cde5ca0424)), closes [#1137](https://github.com/ydataai/pandas-profiling/issues/1137) [#1136](https://github.com/ydataai/pandas-profiling/issues/1136) 
[#1143](https://github.com/ydataai/pandas-profiling/issues/1143) [#1148](https://github.com/ydataai/pandas-profiling/issues/1148) [#1150](https://github.com/ydataai/pandas-profiling/issues/1150) diff --git a/examples/features/correlation_auto_example.py b/examples/features/correlation_demo.py similarity index 100% rename from examples/features/correlation_auto_example.py rename to examples/features/correlation_demo.py diff --git a/examples/report_comparison/comparison.py b/examples/features/eda_dataset_compare.py similarity index 100% rename from examples/report_comparison/comparison.py rename to examples/features/eda_dataset_compare.py diff --git a/examples/hcc/eda-with-feature-comparison.ipynb b/examples/hcc/eda-with-feature-comparison.ipynb new file mode 100644 index 000000000..f77682488 --- /dev/null +++ b/examples/hcc/eda-with-feature-comparison.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pandas Profiling: HCC Dataset\n", + "Source of data: https://www.kaggle.com/datasets/mrsantos/hcc-dataset\n", + "\n", + "As modifications have been introduced for the purpose of this use case, the .csv file is provided (hcc.csv)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from pandas_profiling import ProfileReport" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read the HCC Dataset\n", + "df = pd.read_csv(\"hcc.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Produce and save the profiling report" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "original_report = ProfileReport(df, title=\"Original Data\")\n", + "original_report.to_file(\"original_report.html\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysis of \"Alerts\"\n", + "Pandas Profiling alerts for the presence of 4 potential data quality problems:\n", + "\n", + "- `DUPLICATES`: 4 duplicate rows in data\n", + "- `CONSTANT`: Constant value “999” in ‘O2’\n", + "- `HIGH CORRELATION`: Several features marked as highly correlated\n", + "- `MISSING`: Missing Values in ‘Ferritin’\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Removing Duplicate Rows" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop duplicate rows\n", + "df_transformed = df.copy()\n", + "df_transformed = df_transformed.drop_duplicates()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Removing Irrelevant Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove O2\n", + "df_transformed = df_transformed.drop(columns=\"O2\")" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### Missing Data Imputation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Impute Missing Values\n", + "from sklearn.impute import SimpleImputer\n", + "\n", + "mean_imputer = SimpleImputer(strategy=\"mean\")\n", + "df_transformed[\"Ferritin\"] = mean_imputer.fit_transform(\n", + " df_transformed[\"Ferritin\"].values.reshape(-1, 1)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Produce Comparison Report" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transformed_report = ProfileReport(df_transformed, title=\"Transformed Data\")\n", + "comparison_report = original_report.compare(transformed_report)\n", + "comparison_report.to_file(\"original_vs_transformed.html\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.8 ('feat-comp')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "vscode": { + "interpreter": { + "hash": "13390b9b50dde76c6c011e02183633aae7d8498993a6e6577a16e1b7cb8c7a8c" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/hcc/eda-with-feature-comparison.py b/examples/hcc/eda-with-feature-comparison.py new file mode 100644 index 000000000..9d5ab8edf --- /dev/null +++ b/examples/hcc/eda-with-feature-comparison.py @@ -0,0 +1,34 @@ +""" + Comparison report example for HCC dataset +""" +import pandas as pd +from sklearn.impute import SimpleImputer + +from pandas_profiling import ProfileReport + +if __name__ == "__main__": + + # Load the dataset + df = pd.read_csv("hcc.csv") + + # Produce profile report + 
original_report = ProfileReport(df, title="Original Data") + original_report.to_file("original_report.html") + + # Drop duplicate rows + df_transformed = df.copy() + df_transformed = df_transformed.drop_duplicates() + + # Remove O2 + df_transformed = df_transformed.drop(columns="O2") + + # Impute Missing Values + mean_imputer = SimpleImputer(strategy="mean") + df_transformed["Ferritin"] = mean_imputer.fit_transform( + df_transformed["Ferritin"].values.reshape(-1, 1) + ) + + # Produce comparison report + transformed_report = ProfileReport(df_transformed, title="Transformed Data") + comparison_report = original_report.compare(transformed_report) + comparison_report.to_file("original_vs_transformed.html") diff --git a/examples/report_comparison/comparison.ipynb b/examples/report_comparison/comparison.ipynb deleted file mode 100644 index bada1575f..000000000 --- a/examples/report_comparison/comparison.ipynb +++ /dev/null @@ -1,120 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "54253ce4", - "metadata": { - "cell_style": "center" - }, - "outputs": [], - "source": [ - "# Installed packages\n", - "import pandas as pd\n", - "\n", - "# Our package\n", - "from pandas_profiling import ProfileReport\n", - "from pandas_profiling.utils.cache import cache_file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "134987f5", - "metadata": {}, - "outputs": [], - "source": [ - "# Read the Titanic Dataset\n", - "file_name = cache_file(\n", - " \"titanic.csv\",\n", - " \"https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv\",\n", - ")\n", - "df = pd.read_csv(file_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21dda1c6", - "metadata": {}, - "outputs": [], - "source": [ - "# Generate the Profiling Report from 2 samples from titanic dataset\n", - "profile1 = ProfileReport(df.sample(frac=0.5))\n", - "profile2 = ProfileReport(df.sample(frac=0.5))" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "fe83402f", - "metadata": {}, - "outputs": [], - "source": [ - "# compare the profiles and generate a comparison profile\n", - "comparison = profile1.compare(profile2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ad00f47", - "metadata": {}, - "outputs": [], - "source": [ - "# display the html profile in an iframe\n", - "comparison" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" - }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " - } - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}