chore(examples): dataset compare examples (#1167)

* build(deps): bump cycjimmy/semantic-release-action from 2 to 3 (#1154) * chore(actions): disable lint when prs come from dependabot (#1164) * chore(actions): fix push and latest tag configs (#1166) * docs(changelogs): fix changelog format (#1163) * chore: move example files and add new hcc example Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Vasco Ramos <[email protected]>
ydataai · Nov 24, 2022 · 47d3aeb · 47d3aeb
1 parent 40a62b8
commit 47d3aeb
Show file tree

Hide file tree

Showing 11 changed files with 234 additions and 137 deletions.
diff --git a/.github/workflows/merge-dev.yml b/.github/workflows/merge-dev.yml
@@ -25,8 +25,8 @@ jobs:
  uses: oprypin/[email protected]
  with:
  repository: ${{ github.repository }}
- regex: '^\d+\.\d+\.\d+'
- releases-only: false
+ regex: '^v\d+\.\d+\.\d+'
+ releases-only: true
 
  - name: Extract semantic version
  id: semantic

diff --git a/.github/workflows/merge-master.yml b/.github/workflows/merge-master.yml
@@ -24,7 +24,7 @@ jobs:
  with:
  token: ${{ secrets.ACCESS_TOKEN }}
 
- - uses: cycjimmy/semantic-release-action@v2
+ - uses: cycjimmy/semantic-release-action@v3
  id: semantic
  env:
  GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }}
@@ -50,6 +50,7 @@ jobs:
  - uses: actions/checkout@v3
  with:
  fetch-depth: 0
+ token: ${{ secrets.ACCESS_TOKEN }}
 
  - name: Calculate changelog path
  id: path
@@ -71,11 +72,11 @@ jobs:
  - name: Add changelog to docs
  run: |
  cd docsrc/source/pages/reference
- sed -i -e 's/# \[.*/# Changelog ${{ needs.prepare.outputs.release }}/g' \
- -e 's/## Bug Fixes/## 🐛 Bug fixes/g' \
- -e 's/## Features/## 🎉 Features/g' \
- -e 's/## BREAKING CHANGES/## 🚨 Breaking changes/g' \
- -e 's/## Documentation/## 📖 Documentation/g' \
+ sed -i -e 's/## \[.*/### Changelog ${{ needs.prepare.outputs.release }}/g' \
+ -e 's/### Bug Fixes/#### 🐛 Bug fixes/g' \
+ -e 's/### Features/#### 🎉 Features/g' \
+ -e 's/### BREAKING CHANGES/#### 🚨 Breaking changes/g' \
+ -e 's/### Documentation/#### 📖 Documentation/g' \
  changelog/${{ steps.path.outputs.value }}
 
  grep -q ".. include:: changelog/${{ steps.path.outputs.value }}" changelog.rst || sed -i "4 a\\
@@ -101,7 +102,6 @@ jobs:
  with:
  branch: ${{ github.ref }}
  github_token: ${{ secrets.ACCESS_TOKEN }}
- force: true
 
 
  prerelease-tag:

diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
@@ -6,6 +6,7 @@ on:
 
 jobs:
  commitlint:
+ if: github.actor != 'dependabot[bot]'
  name: Lint commit message
  runs-on: ubuntu-latest
 
@@ -17,6 +18,7 @@ jobs:
  - uses: wagoid/commitlint-github-action@v5
 
  lint:
+ if: github.actor != 'dependabot[bot]'
  name: Lint source code
  runs-on: ubuntu-latest
 

diff --git a/docsrc/source/pages/reference/changelog/v3_3_1.rst b/docsrc/source/pages/reference/changelog/v3_3_1.rst
@@ -2,7 +2,7 @@ Changelog v3.3.1
 ----------------
 
 🐛 Bug fixes
-^^^^^^^^^^^
+^^^^^^^^^^^^
 
 - remove unused imports
  (`66864c1 <https://github.com/ydataai/pandas-profiling/commit/66864c15cfa9b80cb426957e17410c579425d450>`__)

diff --git a/docsrc/source/pages/reference/changelog/v3_4_0.rst b/docsrc/source/pages/reference/changelog/v3_4_0.rst
@@ -1,8 +1,8 @@
 Changelog v3.4.0
-================
+----------------
 
 🐛 Bug fixes
------------
+^^^^^^^^^^^^
 
 - correlation passing extra parameters
  (`#1114 <https://github.com/ydataai/pandas-profiling/issues/1114>`__)
@@ -29,7 +29,7 @@ Changelog v3.4.0
  (`985fbd1 <https://github.com/ydataai/pandas-profiling/commit/985fbd1fc0e826bda3ac1b725fa8842013743ab3>`__)
 
 🎉 Features
-----------
+^^^^^^^^^^^^
 
 - add support for Pandas 1.5
  (`#1076 <https://github.com/ydataai/pandas-profiling/issues/1076>`__)

diff --git a/docsrc/source/pages/reference/changelog/v3_5_0.md b/docsrc/source/pages/reference/changelog/v3_5_0.md
@@ -1,7 +1,7 @@
-# Changelog v3.5.0
+### Changelog v3.5.0
 
 
-### 🐛 Bug fixes
+#### 🐛 Bug fixes
 
 * change context managed backend ([#1149](https://github.com/ydataai/pandas-profiling/issues/1149)) ([11e1a8a](https://github.com/ydataai/pandas-profiling/commit/11e1a8a3fa8d13513fe926b731fb907a066af2a1))
 * dataset names on comparison report ([#1159](https://github.com/ydataai/pandas-profiling/issues/1159)) ([3c14d43](https://github.com/ydataai/pandas-profiling/commit/3c14d438d9a557ac85f5663cc3446c0fb3081e18))
@@ -13,8 +13,8 @@
 * update repository links ([#1141](https://github.com/ydataai/pandas-profiling/issues/1141)) ([c742c5d](https://github.com/ydataai/pandas-profiling/commit/c742c5dbeb18fe2907a4c03792e8802993c46da5))
 
 
-### 🎉 Features
+#### 🎉 Features
 
 * add typechecking to profile report ([#1139](https://github.com/ydataai/pandas-profiling/issues/1139)) ([ec8ece0](https://github.com/ydataai/pandas-profiling/commit/ec8ece0de394eb4c2918bb6a74f0c5e5bb77ca61))
 * report comparison example ([#1160](https://github.com/ydataai/pandas-profiling/issues/1160)) ([5e75fd2](https://github.com/ydataai/pandas-profiling/commit/5e75fd275d14c8ce7ba49d0a15ec26810c4c0e73))
-* report comparisons ([#1069](https://github.com/ydataai/pandas-profiling/issues/1069)) ([70ee5c7](https://github.com/ydataai/pandas-profiling/commit/70ee5c776ad0c72d709631690a2df1cde5ca0424)), closes [#1137](https://github.com/ydataai/pandas-profiling/issues/1137) [#1136](https://github.com/ydataai/pandas-profiling/issues/1136) [#1143](https://github.com/ydataai/pandas-profiling/issues/1143) [#1148](https://github.com/ydataai/pandas-profiling/issues/1148) [#1150](https://github.com/ydataai/pandas-profiling/issues/1150)
+* report comparisons ([#1069](https://github.com/ydataai/pandas-profiling/issues/1069)) ([70ee5c7](https://github.com/ydataai/pandas-profiling/commit/70ee5c776ad0c72d709631690a2df1cde5ca0424)), closes [#1137](https://github.com/ydataai/pandas-profiling/issues/1137) [#1136](https://github.com/ydataai/pandas-profiling/issues/1136) [#1143](https://github.com/ydataai/pandas-profiling/issues/1143) [#1148](https://github.com/ydataai/pandas-profiling/issues/1148) [#1150](https://github.com/ydataai/pandas-profiling/issues/1150)
diff --git a/...ples/features/correlation_auto_example.py → examples/features/correlation_demo.py b/...ples/features/correlation_auto_example.py → examples/features/correlation_demo.py
diff --git a/examples/report_comparison/comparison.py → examples/features/eda_dataset_compare.py b/examples/report_comparison/comparison.py → examples/features/eda_dataset_compare.py
diff --git a/examples/hcc/eda-with-feature-comparison.ipynb b/examples/hcc/eda-with-feature-comparison.ipynb
@@ -0,0 +1,181 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Pandas Profiling: HCC Dataset\n",
+ "Source of data: https://www.kaggle.com/datasets/mrsantos/hcc-dataset\n",
+ "\n",
+ "As modifications have been introduced for the purpose of this use case, the .csv file is provided (hcc.csv)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Import libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "from pandas_profiling import ProfileReport"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load the dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read the HCC Dataset\n",
+ "df = pd.read_csv(\"hcc.csv\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Produce and save the profiling report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "original_report = ProfileReport(df, title=\"Original Data\")\n",
+ "original_report.to_file(\"original_report.html\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis of \"Alerts\"\n",
+ "Pandas Profiling alerts for the presence of 4 potential data quality problems:\n",
+ "\n",
+ "- `DUPLICATES`: 4 duplicate rows in data\n",
+ "- `CONSTANT`: Constant value “999” in ‘O2’\n",
+ "- `HIGH CORRELATION`: Several features marked as highly correlated\n",
+ "- `MISSING`: Missing Values in ‘Ferritin’\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Removing Duplicate Rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop duplicate rows\n",
+ "df_transformed = df.copy()\n",
+ "df_transformed = df_transformed.drop_duplicates()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Removing Irrelevant Features"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Remove O2\n",
+ "df_transformed = df_transformed.drop(columns=\"O2\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Missing Data Imputation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Impute Missing Values\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "\n",
+ "mean_imputer = SimpleImputer(strategy=\"mean\")\n",
+ "df_transformed[\"Ferritin\"] = mean_imputer.fit_transform(\n",
+ " df_transformed[\"Ferritin\"].values.reshape(-1, 1)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Produce Comparison Report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transformed_report = ProfileReport(df_transformed, title=\"Transformed Data\")\n",
+ "comparison_report = original_report.compare(transformed_report)\n",
+ "comparison_report.to_file(\"original_vs_transformed.html\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.10.8 ('feat-comp')",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "13390b9b50dde76c6c011e02183633aae7d8498993a6e6577a16e1b7cb8c7a8c"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/hcc/eda-with-feature-comparison.py b/examples/hcc/eda-with-feature-comparison.py
@@ -0,0 +1,34 @@
+"""
+ Comparison report example for HCC dataset
+"""
+import pandas as pd
+from sklearn.impute import SimpleImputer
+
+from pandas_profiling import ProfileReport
+
+if __name__ == "__main__":
+
+ # Load the dataset (hcc.csv is resolved relative to the current working directory)
+ df = pd.read_csv("hcc.csv")
+
+ # Produce profile report for the unmodified data and save it as HTML
+ original_report = ProfileReport(df, title="Original Data")
+ original_report.to_file("original_report.html")
+
+ # Drop duplicate rows, working on a copy so that df itself is left unchanged
+ df_transformed = df.copy()
+ df_transformed = df_transformed.drop_duplicates()
+
+ # Remove the O2 column
+ df_transformed = df_transformed.drop(columns="O2")
+
+ # Impute missing values in Ferritin using the mean strategy (values reshaped to one column for the imputer)
+ mean_imputer = SimpleImputer(strategy="mean")
+ df_transformed["Ferritin"] = mean_imputer.fit_transform(
+ df_transformed["Ferritin"].values.reshape(-1, 1)
+ )
+
+ # Produce comparison report of the original vs. the transformed data and save it as HTML
+ transformed_report = ProfileReport(df_transformed, title="Transformed Data")
+ comparison_report = original_report.compare(transformed_report)
+ comparison_report.to_file("original_vs_transformed.html")