From 47d3aeb6e73af051ddba91ed6773816451994b97 Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Thu, 24 Nov 2022 06:28:25 +0000
Subject: [PATCH] chore(examples): dataset compare examples (#1167)

* build(deps): bump cycjimmy/semantic-release-action from 2 to 3 (#1154)

* chore(actions): disable lint when prs come from dependabot (#1164)

* chore(actions): fix push and latest tag configs (#1166)

* docs(changelogs): fix changelog format (#1163)

* chore: move example files and add new hcc example

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Vasco Ramos <vasco.ramos@ydata.ai>
---
 .github/workflows/merge-dev.yml               |   4 +-
 .github/workflows/merge-master.yml            |  14 +-
 .github/workflows/pull-request.yml            |   2 +
 .../pages/reference/changelog/v3_3_1.rst      |   2 +-
 .../pages/reference/changelog/v3_4_0.rst      |   6 +-
 .../pages/reference/changelog/v3_5_0.md       |   8 +-
 ...on_auto_example.py => correlation_demo.py} |   0
 .../eda_dataset_compare.py}                   |   0
 .../hcc/eda-with-feature-comparison.ipynb     | 181 ++++++++++++++++++
 examples/hcc/eda-with-feature-comparison.py   |  34 ++++
 examples/report_comparison/comparison.ipynb   | 120 ------------
 11 files changed, 234 insertions(+), 137 deletions(-)
 rename examples/features/{correlation_auto_example.py => correlation_demo.py} (100%)
 rename examples/{report_comparison/comparison.py => features/eda_dataset_compare.py} (100%)
 create mode 100644 examples/hcc/eda-with-feature-comparison.ipynb
 create mode 100644 examples/hcc/eda-with-feature-comparison.py
 delete mode 100644 examples/report_comparison/comparison.ipynb

diff --git a/.github/workflows/merge-dev.yml b/.github/workflows/merge-dev.yml
index d6249b423..31fa59600 100644
--- a/.github/workflows/merge-dev.yml
+++ b/.github/workflows/merge-dev.yml
@@ -25,8 +25,8 @@ jobs:
       uses: oprypin/find-latest-tag@v1.1.1
       with:
         repository: ${{ github.repository }}
-        regex: '^\d+\.\d+\.\d+'
-        releases-only: false
+        regex: '^v\d+\.\d+\.\d+'
+        releases-only: true
 
     - name: Extract semantic version
       id: semantic
diff --git a/.github/workflows/merge-master.yml b/.github/workflows/merge-master.yml
index ba605e447..d763a9508 100644
--- a/.github/workflows/merge-master.yml
+++ b/.github/workflows/merge-master.yml
@@ -24,7 +24,7 @@ jobs:
       with:
         token: ${{ secrets.ACCESS_TOKEN }}
 
-    - uses: cycjimmy/semantic-release-action@v2
+    - uses: cycjimmy/semantic-release-action@v3
       id: semantic
       env:
         GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }}
@@ -50,6 +50,7 @@ jobs:
     - uses: actions/checkout@v3
       with:
         fetch-depth: 0
+        token: ${{ secrets.ACCESS_TOKEN }}
 
     - name: Calculate changelog path
       id: path
@@ -71,11 +72,11 @@ jobs:
     - name: Add changelog to docs
       run: |
         cd docsrc/source/pages/reference
-        sed -i -e 's/# \[.*/# Changelog ${{ needs.prepare.outputs.release }}/g' \
-            -e 's/## Bug Fixes/## 🐛 Bug fixes/g' \
-            -e 's/## Features/## 🎉 Features/g' \
-            -e 's/## BREAKING CHANGES/## 🚨 Breaking changes/g' \
-            -e 's/## Documentation/## 📖 Documentation/g' \
+        sed -i -e 's/## \[.*/### Changelog ${{ needs.prepare.outputs.release }}/g' \
+            -e 's/### Bug Fixes/#### 🐛 Bug fixes/g' \
+            -e 's/### Features/#### 🎉 Features/g' \
+            -e 's/### BREAKING CHANGES/#### 🚨 Breaking changes/g' \
+            -e 's/### Documentation/#### 📖 Documentation/g' \
             changelog/${{ steps.path.outputs.value }}
 
         grep -q ".. include:: changelog/${{ steps.path.outputs.value }}" changelog.rst || sed -i "4 a\\
@@ -101,7 +102,6 @@ jobs:
       with:
         branch: ${{ github.ref }}
         github_token: ${{ secrets.ACCESS_TOKEN }}
-        force: true
    
 
   prerelease-tag:
diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index 859627713..41b21f3e0 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -6,6 +6,7 @@ on:
 
 jobs:
   commitlint:
+    if: github.actor != 'dependabot[bot]'
     name: Lint commit message
     runs-on: ubuntu-latest
 
@@ -17,6 +18,7 @@ jobs:
     - uses: wagoid/commitlint-github-action@v5
 
   lint:
+    if: github.actor != 'dependabot[bot]'
     name: Lint source code
     runs-on: ubuntu-latest
 
diff --git a/docsrc/source/pages/reference/changelog/v3_3_1.rst b/docsrc/source/pages/reference/changelog/v3_3_1.rst
index 72be6a332..27b40be0f 100644
--- a/docsrc/source/pages/reference/changelog/v3_3_1.rst
+++ b/docsrc/source/pages/reference/changelog/v3_3_1.rst
@@ -2,7 +2,7 @@ Changelog v3.3.1
 ----------------
 
 🐛 Bug fixes
-^^^^^^^^^^^
+^^^^^^^^^^^^
 
 -  remove unused imports
    (`66864c1 <https://github.com/ydataai/pandas-profiling/commit/66864c15cfa9b80cb426957e17410c579425d450>`__)
diff --git a/docsrc/source/pages/reference/changelog/v3_4_0.rst b/docsrc/source/pages/reference/changelog/v3_4_0.rst
index 1861bf59f..0d45c4f8e 100644
--- a/docsrc/source/pages/reference/changelog/v3_4_0.rst
+++ b/docsrc/source/pages/reference/changelog/v3_4_0.rst
@@ -1,8 +1,8 @@
 Changelog v3.4.0
-================
+----------------
 
 🐛 Bug fixes
------------
+^^^^^^^^^^^^
 
 -  correlation passing extra parameters
    (`#1114 <https://github.com/ydataai/pandas-profiling/issues/1114>`__)
@@ -29,7 +29,7 @@ Changelog v3.4.0
    (`985fbd1 <https://github.com/ydataai/pandas-profiling/commit/985fbd1fc0e826bda3ac1b725fa8842013743ab3>`__)
 
 🎉 Features
-----------
+^^^^^^^^^^^^
 
 -  add support for Pandas 1.5
    (`#1076 <https://github.com/ydataai/pandas-profiling/issues/1076>`__)
diff --git a/docsrc/source/pages/reference/changelog/v3_5_0.md b/docsrc/source/pages/reference/changelog/v3_5_0.md
index c8d140ab6..f4899e4e4 100644
--- a/docsrc/source/pages/reference/changelog/v3_5_0.md
+++ b/docsrc/source/pages/reference/changelog/v3_5_0.md
@@ -1,7 +1,7 @@
-# Changelog v3.5.0
+### Changelog v3.5.0
 
 
-### 🐛 Bug fixes
+#### 🐛 Bug fixes
 
 * change context managed backend ([#1149](https://github.com/ydataai/pandas-profiling/issues/1149)) ([11e1a8a](https://github.com/ydataai/pandas-profiling/commit/11e1a8a3fa8d13513fe926b731fb907a066af2a1))
 * dataset names on comparison report ([#1159](https://github.com/ydataai/pandas-profiling/issues/1159)) ([3c14d43](https://github.com/ydataai/pandas-profiling/commit/3c14d438d9a557ac85f5663cc3446c0fb3081e18))
@@ -13,8 +13,8 @@
 * update repository links ([#1141](https://github.com/ydataai/pandas-profiling/issues/1141)) ([c742c5d](https://github.com/ydataai/pandas-profiling/commit/c742c5dbeb18fe2907a4c03792e8802993c46da5))
 
 
-### 🎉 Features
+#### 🎉 Features
 
 * add typechecking to profile report ([#1139](https://github.com/ydataai/pandas-profiling/issues/1139)) ([ec8ece0](https://github.com/ydataai/pandas-profiling/commit/ec8ece0de394eb4c2918bb6a74f0c5e5bb77ca61))
 * report comparison example ([#1160](https://github.com/ydataai/pandas-profiling/issues/1160)) ([5e75fd2](https://github.com/ydataai/pandas-profiling/commit/5e75fd275d14c8ce7ba49d0a15ec26810c4c0e73))
-* report comparisons ([#1069](https://github.com/ydataai/pandas-profiling/issues/1069)) ([70ee5c7](https://github.com/ydataai/pandas-profiling/commit/70ee5c776ad0c72d709631690a2df1cde5ca0424)), closes [#1137](https://github.com/ydataai/pandas-profiling/issues/1137) [#1136](https://github.com/ydataai/pandas-profiling/issues/1136) [#1143](https://github.com/ydataai/pandas-profiling/issues/1143) [#1148](https://github.com/ydataai/pandas-profiling/issues/1148) [#1150](https://github.com/ydataai/pandas-profiling/issues/1150)
\ No newline at end of file
+* report comparisons ([#1069](https://github.com/ydataai/pandas-profiling/issues/1069)) ([70ee5c7](https://github.com/ydataai/pandas-profiling/commit/70ee5c776ad0c72d709631690a2df1cde5ca0424)), closes [#1137](https://github.com/ydataai/pandas-profiling/issues/1137) [#1136](https://github.com/ydataai/pandas-profiling/issues/1136) [#1143](https://github.com/ydataai/pandas-profiling/issues/1143) [#1148](https://github.com/ydataai/pandas-profiling/issues/1148) [#1150](https://github.com/ydataai/pandas-profiling/issues/1150)
diff --git a/examples/features/correlation_auto_example.py b/examples/features/correlation_demo.py
similarity index 100%
rename from examples/features/correlation_auto_example.py
rename to examples/features/correlation_demo.py
diff --git a/examples/report_comparison/comparison.py b/examples/features/eda_dataset_compare.py
similarity index 100%
rename from examples/report_comparison/comparison.py
rename to examples/features/eda_dataset_compare.py
diff --git a/examples/hcc/eda-with-feature-comparison.ipynb b/examples/hcc/eda-with-feature-comparison.ipynb
new file mode 100644
index 000000000..f77682488
--- /dev/null
+++ b/examples/hcc/eda-with-feature-comparison.ipynb
@@ -0,0 +1,181 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Pandas Profiling: HCC Dataset\n",
+    "Source of data: https://www.kaggle.com/datasets/mrsantos/hcc-dataset\n",
+    "\n",
+    "As modifications have been introduced for the purpose of this use case, the .csv file is provided (hcc.csv)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "from pandas_profiling import ProfileReport"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the HCC Dataset\n",
+    "df = pd.read_csv(\"hcc.csv\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Produce and save the profiling report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "original_report = ProfileReport(df, title=\"Original Data\")\n",
+    "original_report.to_file(\"original_report.html\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis of \"Alerts\"\n",
+    "Pandas Profiling alerts for the presence of 4 potential data quality problems:\n",
+    "\n",
+    "- `DUPLICATES`: 4 duplicate rows in data\n",
+    "- `CONSTANT`: Constant value “999” in ‘O2’\n",
+    "- `HIGH CORRELATION`: Several features marked as highly correlated\n",
+    "- `MISSING`: Missing Values in ‘Ferritin’\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Removing Duplicate Rows"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop duplicate rows\n",
+    "df_transformed = df.copy()\n",
+    "df_transformed = df_transformed.drop_duplicates()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Removing Irrelevant Features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove O2\n",
+    "df_transformed = df_transformed.drop(columns=\"O2\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Missing Data Imputation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Impute Missing Values\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "\n",
+    "mean_imputer = SimpleImputer(strategy=\"mean\")\n",
+    "df_transformed[\"Ferritin\"] = mean_imputer.fit_transform(\n",
+    "    df_transformed[\"Ferritin\"].values.reshape(-1, 1)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Produce Comparison Report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "transformed_report = ProfileReport(df_transformed, title=\"Transformed Data\")\n",
+    "comparison_report = original_report.compare(transformed_report)\n",
+    "comparison_report.to_file(\"original_vs_transformed.html\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.10.8 ('feat-comp')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "13390b9b50dde76c6c011e02183633aae7d8498993a6e6577a16e1b7cb8c7a8c"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/hcc/eda-with-feature-comparison.py b/examples/hcc/eda-with-feature-comparison.py
new file mode 100644
index 000000000..9d5ab8edf
--- /dev/null
+++ b/examples/hcc/eda-with-feature-comparison.py
@@ -0,0 +1,34 @@
+"""
+    Comparison report example for HCC dataset: original vs. transformed data
+"""
+import pandas as pd
+from sklearn.impute import SimpleImputer
+
+from pandas_profiling import ProfileReport
+
+if __name__ == "__main__":
+
+    # Load the dataset (hcc.csv is resolved against the working directory)
+    df = pd.read_csv("hcc.csv")
+
+    # Profile the untouched data and save the report as HTML
+    original_report = ProfileReport(df, title="Original Data")
+    original_report.to_file("original_report.html")
+
+    # Drop duplicate rows, working on a copy of the loaded frame
+    df_transformed = df.copy()
+    df_transformed = df_transformed.drop_duplicates()
+
+    # Remove the O2 column
+    df_transformed = df_transformed.drop(columns="O2")
+
+    # Impute missing Ferritin values with the mean of the column
+    mean_imputer = SimpleImputer(strategy="mean")
+    df_transformed["Ferritin"] = mean_imputer.fit_transform(
+        df_transformed["Ferritin"].values.reshape(-1, 1)
+    )
+
+    # Profile the transformed data and save its comparison with the original
+    transformed_report = ProfileReport(df_transformed, title="Transformed Data")
+    comparison_report = original_report.compare(transformed_report)
+    comparison_report.to_file("original_vs_transformed.html")
diff --git a/examples/report_comparison/comparison.ipynb b/examples/report_comparison/comparison.ipynb
deleted file mode 100644
index bada1575f..000000000
--- a/examples/report_comparison/comparison.ipynb
+++ /dev/null
@@ -1,120 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "54253ce4",
-   "metadata": {
-    "cell_style": "center"
-   },
-   "outputs": [],
-   "source": [
-    "# Installed packages\n",
-    "import pandas as pd\n",
-    "\n",
-    "# Our package\n",
-    "from pandas_profiling import ProfileReport\n",
-    "from pandas_profiling.utils.cache import cache_file"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "134987f5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Read the Titanic Dataset\n",
-    "file_name = cache_file(\n",
-    "    \"titanic.csv\",\n",
-    "    \"https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv\",\n",
-    ")\n",
-    "df = pd.read_csv(file_name)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "21dda1c6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Generate the Profiling Report from 2 samples from titanic dataset\n",
-    "profile1 = ProfileReport(df.sample(frac=0.5))\n",
-    "profile2 = ProfileReport(df.sample(frac=0.5))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fe83402f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# compare the profiles and generate a comparison profile\n",
-    "comparison = profile1.compare(profile2)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9ad00f47",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# display the html profile in an iframe\n",
-    "comparison"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.6"
-  },
-  "varInspector": {
-   "cols": {
-    "lenName": 16,
-    "lenType": 16,
-    "lenVar": 40
-   },
-   "kernels_config": {
-    "python": {
-     "delete_cmd_postfix": "",
-     "delete_cmd_prefix": "del ",
-     "library": "var_list.py",
-     "varRefreshCmd": "print(var_dic_list())"
-    },
-    "r": {
-     "delete_cmd_postfix": ") ",
-     "delete_cmd_prefix": "rm(",
-     "library": "var_list.r",
-     "varRefreshCmd": "cat(var_dic_list()) "
-    }
-   },
-   "types_to_exclude": [
-    "module",
-    "function",
-    "builtin_function_or_method",
-    "instance",
-    "_Feature"
-   ],
-   "window_display": false
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}