From f7d6f288cf0a7cc59f9767f3b8e0d390a1b53dc1 Mon Sep 17 00:00:00 2001
From: ccl-core <91942859+ccl-core@users.noreply.github.com>
Date: Mon, 18 Sep 2023 14:16:16 +0000
Subject: [PATCH 1/5] Adding notebook testing to ci actions

---
 .github/workflows/ci.yml | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7a37142d8..b01877ab5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -45,6 +45,43 @@ jobs:
     - name: Docstrings are defined
       run: make flake8
 
+  notebook-test:
+    name: Notebook Tests / Python ${{ matrix.python-version }}
+    strategy:
+      fail-fast: false
+      matrix:
+        # Tests currently fail for 3.8 and 3.9.
+        python-version: ['3.10', '3.11']
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: ./python/mlcroissant
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install library
+      run: pip install .[dev]
+
+    - run: pip install ipython
+    
+    # Notebooks are in the recipes/ folder. 
+    - name: Run notebook
+      run: |
+        ipython kernel install --user --name croissant-notebook
+        # Execute every notebook; quote the path so odd names are not split.
+        for notebook in recipes/*.ipynb; do
+          jupyter nbconvert \
+            --ExecutePreprocessor.timeout=600 \
+            --ExecutePreprocessor.kernel_name=croissant-notebook \
+            --to notebook \
+            --execute "$notebook"
+        done
+          
   python-format:
     name: Python format
     runs-on: ubuntu-latest

From b02a9ee6bfa51f4501e8751c9cf6eb66385197bf Mon Sep 17 00:00:00 2001
From: ccl-core <91942859+ccl-core@users.noreply.github.com>
Date: Mon, 18 Sep 2023 15:55:17 +0000
Subject: [PATCH 2/5] Add deps

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b01877ab5..3c34afff9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -67,7 +67,7 @@ jobs:
     - name: Install library
       run: pip install .[dev]
 
-    - run: pip install ipython
+    - run: pip install ipython ipykernel
     
     # Notebooks are in the recipes/ folder. 
     - name: Run notebook

From 2d340c2192b41671ae3c9b8d7141f38517bedf65 Mon Sep 17 00:00:00 2001
From: ccl-core <91942859+ccl-core@users.noreply.github.com>
Date: Mon, 18 Sep 2023 16:00:37 +0000
Subject: [PATCH 3/5] Add other deps

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3c34afff9..62e64d846 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -67,7 +67,7 @@ jobs:
     - name: Install library
       run: pip install .[dev]
 
-    - run: pip install ipython ipykernel
+    - run: pip install ipython ipykernel nbconvert
     
     # Notebooks are in the recipes/ folder. 
     - name: Run notebook

From 622b3f3ad999d7c39cd167e307c5bbf986c175b5 Mon Sep 17 00:00:00 2001
From: ccl-core <91942859+ccl-core@users.noreply.github.com>
Date: Mon, 18 Sep 2023 16:14:48 +0000
Subject: [PATCH 4/5] Update notebook data_type to data_types

---
 python/mlcroissant/recipes/introduction.ipynb | 173 +++++++++---------
 1 file changed, 91 insertions(+), 82 deletions(-)

diff --git a/python/mlcroissant/recipes/introduction.ipynb b/python/mlcroissant/recipes/introduction.ipynb
index 93fa3243f..73b079246 100644
--- a/python/mlcroissant/recipes/introduction.ipynb
+++ b/python/mlcroissant/recipes/introduction.ipynb
@@ -1,30 +1,19 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
   "cells": [
     {
       "cell_type": "markdown",
-      "source": [
-        "# Tutorial for `mlcroissant` 🥐"
-      ],
       "metadata": {
         "id": "AriH9CP6AKhs"
-      }
+      },
+      "source": [
+        "# Tutorial for `mlcroissant` 🥐"
+      ]
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "Hh-0cehIAErA"
+      },
       "source": [
         "## Introduction\n",
         "\n",
@@ -37,10 +26,7 @@
         "- Programmatically write your JSON-LD Croissant files.\n",
         "- Verify your JSON-LD Croissant files.\n",
         "- Load data from Croissant datasets."
-      ],
-      "metadata": {
-        "id": "Hh-0cehIAErA"
-      }
+      ]
     },
     {
       "cell_type": "code",
@@ -57,19 +43,24 @@
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "Xwrol5JR_GTY"
+      },
       "source": [
         "## Example\n",
         "\n",
         "Let's try on a very concrete dataset: OpenAI's [`gpt-3`](https://github.com/openai/gpt-3) dataset for LLMs!\n",
         "\n",
         "In the tutorial, we will generate programmatically the Croissant JSON-LD file describing the dataset. Then we will verify the file and yield data from the dataset."
-      ],
-      "metadata": {
-        "id": "Xwrol5JR_GTY"
-      }
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "id": "7OyQffJv-zso"
+      },
+      "outputs": [],
       "source": [
         "import mlcroissant as mlc\n",
         "\n",
@@ -102,7 +93,7 @@
         "            mlc.nodes.Field(\n",
         "                name=\"context\",\n",
         "                description=\"\",\n",
-        "                data_type=mlc.constants.SCHEMA_ORG_DATA_TYPE_TEXT,\n",
+        "                data_types=mlc.constants.SCHEMA_ORG_DATA_TYPE_TEXT,\n",
         "                source=mlc.nodes.Source(\n",
         "                    uid=\"jsonl-files\",\n",
         "                    node_type=\"distribution\",\n",
@@ -113,7 +104,7 @@
         "            mlc.nodes.Field(\n",
         "                name=\"completion\",\n",
         "                description=\"The expected completion of the promt.\",\n",
-        "                data_type=mlc.constants.SCHEMA_ORG_DATA_TYPE_TEXT,\n",
+        "                data_types=mlc.constants.SCHEMA_ORG_DATA_TYPE_TEXT,\n",
         "                source=mlc.nodes.Source(\n",
         "                    uid=\"jsonl-files\",\n",
         "                    node_type=\"distribution\",\n",
@@ -126,7 +117,7 @@
         "                    \"The machine learning task appearing as the name of the\"\n",
         "                    \" file.\"\n",
         "                ),\n",
-        "                data_type=mlc.constants.SCHEMA_ORG_DATA_TYPE_TEXT,\n",
+        "                data_types=mlc.constants.SCHEMA_ORG_DATA_TYPE_TEXT,\n",
         "                source=mlc.nodes.Source(\n",
         "                    uid=\"jsonl-files\",\n",
         "                    node_type=\"distribution\",\n",
@@ -191,61 +182,61 @@
         "    distribution=distribution,\n",
         "    record_sets=record_sets,\n",
         ")\n"
-      ],
-      "metadata": {
-        "id": "7OyQffJv-zso"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "2RUVgWI-DldZ"
+      },
       "source": [
         "When creating `Metadata`:\n",
         "- We also check for errors in the configuration.\n",
         "- We generate warnings if the configuration doesn't follow guidelines and best practices.\n",
         "\n",
         "For instance, in this case:"
-      ],
-      "metadata": {
-        "id": "2RUVgWI-DldZ"
-      }
+      ]
     },
     {
       "cell_type": "code",
-      "source": [
-        "print(metadata.issues.report())"
-      ],
+      "execution_count": 2,
       "metadata": {
         "id": "AENcJUwMCd1B"
       },
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "print(metadata.issues.report())"
+      ]
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "vES3KHaND4P2"
+      },
       "source": [
         "`Property \"https://schema.org/license\" is recommended`...\n",
         "\n",
         "We can see at a glance that we miss an important metadata to build datasets for responsible AI: the license!"
-      ],
-      "metadata": {
-        "id": "vES3KHaND4P2"
-      }
+      ]
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "S0BEhzqiEjd0"
+      },
       "source": [
         "## Build the Croissant file and yield data\n",
         "\n",
         "Let's write the Croissant JSON-LD to a file on disk!"
-      ],
-      "metadata": {
-        "id": "S0BEhzqiEjd0"
-      }
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "id": "-XCycu81ECVq"
+      },
+      "outputs": [],
       "source": [
         "import json\n",
         "\n",
@@ -254,44 +245,44 @@
         "  content = json.dumps(content, indent=2)\n",
         "  print(content)\n",
         "  f.write(content)"
-      ],
-      "metadata": {
-        "id": "-XCycu81ECVq"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "From this JSON-LD file, we can easily create a dataset..."
-      ],
       "metadata": {
         "id": "Ypb_ll3SE6UU"
-      }
+      },
+      "source": [
+        "From this JSON-LD file, we can easily create a dataset..."
+      ]
     },
     {
       "cell_type": "code",
-      "source": [
-        "dataset = mlc.Dataset(file=\"croissant.json\")"
-      ],
+      "execution_count": 5,
       "metadata": {
         "id": "_JNyQFuAEiIs"
       },
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "dataset = mlc.Dataset(file=\"croissant.json\")"
+      ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "...and yield records from this dataset:"
-      ],
       "metadata": {
         "id": "ldwdIGPoFT_p"
-      }
+      },
+      "source": [
+        "...and yield records from this dataset:"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": 6,
+      "metadata": {
+        "id": "MHdVY4TBEqZ8"
+      },
+      "outputs": [],
       "source": [
         "records = dataset.records(record_set=\"jsonl\")\n",
         "\n",
@@ -299,21 +290,39 @@
         "  print(record)\n",
         "  if i > 10:\n",
         "    break"
-      ],
-      "metadata": {
-        "id": "MHdVY4TBEqZ8"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
     },
     {
       "cell_type": "code",
-      "source": [],
+      "execution_count": null,
       "metadata": {
         "id": "8a2sCy0GFYCQ"
       },
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": []
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.8"
     }
-  ]
-}
\ No newline at end of file
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}

From e0133486b6841a0499df714354d36676114f5de2 Mon Sep 17 00:00:00 2001
From: ccl-core <91942859+ccl-core@users.noreply.github.com>
Date: Mon, 18 Sep 2023 16:19:43 +0000
Subject: [PATCH 5/5] Restore notebook formatting.

---
 python/mlcroissant/recipes/introduction.ipynb | 165 +++++++++---------
 1 file changed, 78 insertions(+), 87 deletions(-)

diff --git a/python/mlcroissant/recipes/introduction.ipynb b/python/mlcroissant/recipes/introduction.ipynb
index 73b079246..9a8164da7 100644
--- a/python/mlcroissant/recipes/introduction.ipynb
+++ b/python/mlcroissant/recipes/introduction.ipynb
@@ -1,19 +1,30 @@
 {
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
   "cells": [
     {
       "cell_type": "markdown",
-      "metadata": {
-        "id": "AriH9CP6AKhs"
-      },
       "source": [
         "# Tutorial for `mlcroissant` 🥐"
-      ]
+      ],
+      "metadata": {
+        "id": "AriH9CP6AKhs"
+      }
     },
     {
       "cell_type": "markdown",
-      "metadata": {
-        "id": "Hh-0cehIAErA"
-      },
       "source": [
         "## Introduction\n",
         "\n",
@@ -26,7 +37,10 @@
         "- Programmatically write your JSON-LD Croissant files.\n",
         "- Verify your JSON-LD Croissant files.\n",
         "- Load data from Croissant datasets."
-      ]
+      ],
+      "metadata": {
+        "id": "Hh-0cehIAErA"
+      }
     },
     {
       "cell_type": "code",
@@ -43,24 +57,19 @@
     },
     {
       "cell_type": "markdown",
-      "metadata": {
-        "id": "Xwrol5JR_GTY"
-      },
       "source": [
         "## Example\n",
         "\n",
         "Let's try on a very concrete dataset: OpenAI's [`gpt-3`](https://github.com/openai/gpt-3) dataset for LLMs!\n",
         "\n",
         "In the tutorial, we will generate programmatically the Croissant JSON-LD file describing the dataset. Then we will verify the file and yield data from the dataset."
-      ]
+      ],
+      "metadata": {
+        "id": "Xwrol5JR_GTY"
+      }
     },
     {
       "cell_type": "code",
-      "execution_count": 1,
-      "metadata": {
-        "id": "7OyQffJv-zso"
-      },
-      "outputs": [],
       "source": [
         "import mlcroissant as mlc\n",
         "\n",
@@ -182,61 +191,61 @@
         "    distribution=distribution,\n",
         "    record_sets=record_sets,\n",
         ")\n"
-      ]
+      ],
+      "metadata": {
+        "id": "7OyQffJv-zso"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
-      "metadata": {
-        "id": "2RUVgWI-DldZ"
-      },
       "source": [
         "When creating `Metadata`:\n",
         "- We also check for errors in the configuration.\n",
         "- We generate warnings if the configuration doesn't follow guidelines and best practices.\n",
         "\n",
         "For instance, in this case:"
-      ]
+      ],
+      "metadata": {
+        "id": "2RUVgWI-DldZ"
+      }
     },
     {
       "cell_type": "code",
-      "execution_count": 2,
+      "source": [
+        "print(metadata.issues.report())"
+      ],
       "metadata": {
         "id": "AENcJUwMCd1B"
       },
-      "outputs": [],
-      "source": [
-        "print(metadata.issues.report())"
-      ]
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
-      "metadata": {
-        "id": "vES3KHaND4P2"
-      },
       "source": [
         "`Property \"https://schema.org/license\" is recommended`...\n",
         "\n",
         "We can see at a glance that we miss an important metadata to build datasets for responsible AI: the license!"
-      ]
+      ],
+      "metadata": {
+        "id": "vES3KHaND4P2"
+      }
     },
     {
       "cell_type": "markdown",
-      "metadata": {
-        "id": "S0BEhzqiEjd0"
-      },
       "source": [
         "## Build the Croissant file and yield data\n",
         "\n",
         "Let's write the Croissant JSON-LD to a file on disk!"
-      ]
+      ],
+      "metadata": {
+        "id": "S0BEhzqiEjd0"
+      }
     },
     {
       "cell_type": "code",
-      "execution_count": 3,
-      "metadata": {
-        "id": "-XCycu81ECVq"
-      },
-      "outputs": [],
       "source": [
         "import json\n",
         "\n",
@@ -245,44 +254,44 @@
         "  content = json.dumps(content, indent=2)\n",
         "  print(content)\n",
         "  f.write(content)"
-      ]
+      ],
+      "metadata": {
+        "id": "-XCycu81ECVq"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
-      "metadata": {
-        "id": "Ypb_ll3SE6UU"
-      },
       "source": [
         "From this JSON-LD file, we can easily create a dataset..."
-      ]
+      ],
+      "metadata": {
+        "id": "Ypb_ll3SE6UU"
+      }
     },
     {
       "cell_type": "code",
-      "execution_count": 5,
+      "source": [
+        "dataset = mlc.Dataset(file=\"croissant.json\")"
+      ],
       "metadata": {
         "id": "_JNyQFuAEiIs"
       },
-      "outputs": [],
-      "source": [
-        "dataset = mlc.Dataset(file=\"croissant.json\")"
-      ]
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
-      "metadata": {
-        "id": "ldwdIGPoFT_p"
-      },
       "source": [
         "...and yield records from this dataset:"
-      ]
+      ],
+      "metadata": {
+        "id": "ldwdIGPoFT_p"
+      }
     },
     {
       "cell_type": "code",
-      "execution_count": 6,
-      "metadata": {
-        "id": "MHdVY4TBEqZ8"
-      },
-      "outputs": [],
       "source": [
         "records = dataset.records(record_set=\"jsonl\")\n",
         "\n",
@@ -290,39 +299,21 @@
         "  print(record)\n",
         "  if i > 10:\n",
         "    break"
-      ]
+      ],
+      "metadata": {
+        "id": "MHdVY4TBEqZ8"
+      },
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "source": [],
       "metadata": {
         "id": "8a2sCy0GFYCQ"
       },
-      "outputs": [],
-      "source": []
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "provenance": []
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.10.8"
+      "execution_count": null,
+      "outputs": []
     }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  ]
 }