From f7d6f288cf0a7cc59f9767f3b8e0d390a1b53dc1 Mon Sep 17 00:00:00 2001 From: ccl-core <91942859+ccl-core@users.noreply.github.com> Date: Mon, 18 Sep 2023 14:16:16 +0000 Subject: [PATCH 1/5] Adding notebook testing to ci actions --- .github/workflows/ci.yml | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7a37142d8..b01877ab5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,6 +45,43 @@ jobs: - name: Docstrings are defined run: make flake8 + notebook-test: + name: Notebook Tests / Python ${{ matrix.python-version }} + strategy: + fail-fast: false + matrix: + # Tests currently fail for 3.8 and 3.9. + python-version: ['3.10', '3.11'] + runs-on: ubuntu-latest + defaults: + run: + working-directory: ./python/mlcroissant + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install library + run: pip install .[dev] + + - run: pip install ipython + + # Notebooks are in the recipes/ folder. + - name: Run notebook + run: | + ipython kernel install --user --name croissant-notebook + for notebook in recipes/*ipynb + do + jupyter nbconvert \ + --ExecutePreprocessor.timeout=600 \ + --ExecutePreprocessor.kernel_name=croissant-notebook \ + --to notebook \ + --execute $notebook + done + python-format: name: Python format runs-on: ubuntu-latest From b02a9ee6bfa51f4501e8751c9cf6eb66385197bf Mon Sep 17 00:00:00 2001 From: ccl-core <91942859+ccl-core@users.noreply.github.com> Date: Mon, 18 Sep 2023 15:55:17 +0000 Subject: [PATCH 2/5] Add deps --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b01877ab5..3c34afff9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -67,7 +67,7 @@ jobs: - name: Install library run: pip install .[dev] - - run: pip install ipython + - run: pip install ipython ipykernel # Notebooks are in the recipes/ folder. - name: Run notebook From 2d340c2192b41671ae3c9b8d7141f38517bedf65 Mon Sep 17 00:00:00 2001 From: ccl-core <91942859+ccl-core@users.noreply.github.com> Date: Mon, 18 Sep 2023 16:00:37 +0000 Subject: [PATCH 3/5] Add other deps --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3c34afff9..62e64d846 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -67,7 +67,7 @@ jobs: - name: Install library run: pip install .[dev] - - run: pip install ipython ipykernel + - run: pip install ipython ipykernel nbconvert # Notebooks are in the recipes/ folder. - name: Run notebook From 622b3f3ad999d7c39cd167e307c5bbf986c175b5 Mon Sep 17 00:00:00 2001 From: ccl-core <91942859+ccl-core@users.noreply.github.com> Date: Mon, 18 Sep 2023 16:14:48 +0000 Subject: [PATCH 4/5] Update notebook data_type to data_types --- python/mlcroissant/recipes/introduction.ipynb | 173 +++++++++--------- 1 file changed, 91 insertions(+), 82 deletions(-) diff --git a/python/mlcroissant/recipes/introduction.ipynb b/python/mlcroissant/recipes/introduction.ipynb index 93fa3243f..73b079246 100644 --- a/python/mlcroissant/recipes/introduction.ipynb +++ b/python/mlcroissant/recipes/introduction.ipynb @@ -1,30 +1,19 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, "cells": [ { "cell_type": "markdown", - "source": [ - "# Tutorial for `mlcroissant` 🥐" - ], "metadata": { "id": "AriH9CP6AKhs" - } + }, + "source": [ + "# Tutorial for `mlcroissant` 🥐" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "Hh-0cehIAErA" + }, "source": [ "## Introduction\n", "\n", @@ -37,10 +26,7 @@ "- Programmatically write your JSON-LD Croissant files.\n", "- Verify your JSON-LD Croissant files.\n", "- Load data from Croissant datasets." - ], - "metadata": { - "id": "Hh-0cehIAErA" - } + ] }, { "cell_type": "code", @@ -57,19 +43,24 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "Xwrol5JR_GTY" + }, "source": [ "## Example\n", "\n", "Let's try on a very concrete dataset: OpenAI's [`gpt-3`](https://github.com/openai/gpt-3) dataset for LLMs!\n", "\n", "In the tutorial, we will generate programmatically the Croissant JSON-LD file describing the dataset. Then we will verify the file and yield data from the dataset." - ], - "metadata": { - "id": "Xwrol5JR_GTY" - } + ] }, { "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "7OyQffJv-zso" + }, + "outputs": [], "source": [ "import mlcroissant as mlc\n", "\n", @@ -102,7 +93,7 @@ " mlc.nodes.Field(\n", " name=\"context\",\n", " description=\"\",\n", - " data_type=mlc.constants.SCHEMA_ORG_DATA_TYPE_TEXT,\n", + " data_types=mlc.constants.SCHEMA_ORG_DATA_TYPE_TEXT,\n", " source=mlc.nodes.Source(\n", " uid=\"jsonl-files\",\n", " node_type=\"distribution\",\n", @@ -113,7 +104,7 @@ " mlc.nodes.Field(\n", " name=\"completion\",\n", " description=\"The expected completion of the promt.\",\n", - " data_type=mlc.constants.SCHEMA_ORG_DATA_TYPE_TEXT,\n", + " data_types=mlc.constants.SCHEMA_ORG_DATA_TYPE_TEXT,\n", " source=mlc.nodes.Source(\n", " uid=\"jsonl-files\",\n", " node_type=\"distribution\",\n", @@ -126,7 +117,7 @@ " \"The machine learning task appearing as the name of the\"\n", " \" file.\"\n", " ),\n", - " data_type=mlc.constants.SCHEMA_ORG_DATA_TYPE_TEXT,\n", + " data_types=mlc.constants.SCHEMA_ORG_DATA_TYPE_TEXT,\n", " source=mlc.nodes.Source(\n", " uid=\"jsonl-files\",\n", " node_type=\"distribution\",\n", @@ -191,61 +182,61 @@ " distribution=distribution,\n", " record_sets=record_sets,\n", ")\n" - ], - "metadata": { - "id": "7OyQffJv-zso" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "2RUVgWI-DldZ" + }, "source": [ "When creating `Metadata`:\n", "- We also check for errors in the configuration.\n", "- We generate warnings if the configuration doesn't follow guidelines and best practices.\n", "\n", "For instance, in this case:" - ], - "metadata": { - "id": "2RUVgWI-DldZ" - } + ] }, { "cell_type": "code", - "source": [ - "print(metadata.issues.report())" - ], + "execution_count": 2, "metadata": { "id": "AENcJUwMCd1B" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "print(metadata.issues.report())" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "vES3KHaND4P2" + }, "source": [ "`Property \"https://schema.org/license\" is recommended`...\n", "\n", "We can see at a glance that we miss an important metadata to build datasets for responsible AI: the license!" - ], - "metadata": { - "id": "vES3KHaND4P2" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "S0BEhzqiEjd0" + }, "source": [ "## Build the Croissant file and yield data\n", "\n", "Let's write the Croissant JSON-LD to a file on disk!" - ], - "metadata": { - "id": "S0BEhzqiEjd0" - } + ] }, { "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "-XCycu81ECVq" + }, + "outputs": [], "source": [ "import json\n", "\n", @@ -254,44 +245,44 @@ " content = json.dumps(content, indent=2)\n", " print(content)\n", " f.write(content)" - ], - "metadata": { - "id": "-XCycu81ECVq" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "From this JSON-LD file, we can easily create a dataset..." - ], "metadata": { "id": "Ypb_ll3SE6UU" - } + }, + "source": [ + "From this JSON-LD file, we can easily create a dataset..." + ] }, { "cell_type": "code", - "source": [ - "dataset = mlc.Dataset(file=\"croissant.json\")" - ], + "execution_count": 5, "metadata": { "id": "_JNyQFuAEiIs" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "dataset = mlc.Dataset(file=\"croissant.json\")" + ] }, { "cell_type": "markdown", - "source": [ - "...and yield records from this dataset:" - ], "metadata": { "id": "ldwdIGPoFT_p" - } + }, + "source": [ + "...and yield records from this dataset:" + ] }, { "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "MHdVY4TBEqZ8" + }, + "outputs": [], "source": [ "records = dataset.records(record_set=\"jsonl\")\n", "\n", @@ -299,21 +290,39 @@ " print(record)\n", " if i > 10:\n", " break" - ], - "metadata": { - "id": "MHdVY4TBEqZ8" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [], + "execution_count": null, "metadata": { "id": "8a2sCy0GFYCQ" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} From e0133486b6841a0499df714354d36676114f5de2 Mon Sep 17 00:00:00 2001 From: ccl-core <91942859+ccl-core@users.noreply.github.com> Date: Mon, 18 Sep 2023 16:19:43 +0000 Subject: [PATCH 5/5] Restore notebook formatting. --- python/mlcroissant/recipes/introduction.ipynb | 165 +++++++++--------- 1 file changed, 78 insertions(+), 87 deletions(-) diff --git a/python/mlcroissant/recipes/introduction.ipynb b/python/mlcroissant/recipes/introduction.ipynb index 73b079246..9a8164da7 100644 --- a/python/mlcroissant/recipes/introduction.ipynb +++ b/python/mlcroissant/recipes/introduction.ipynb @@ -1,19 +1,30 @@ { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, "cells": [ { "cell_type": "markdown", - "metadata": { - "id": "AriH9CP6AKhs" - }, "source": [ "# Tutorial for `mlcroissant` 🥐" - ] + ], + "metadata": { + "id": "AriH9CP6AKhs" + } }, { "cell_type": "markdown", - "metadata": { - "id": "Hh-0cehIAErA" - }, "source": [ "## Introduction\n", "\n", @@ -26,7 +37,10 @@ "- Programmatically write your JSON-LD Croissant files.\n", "- Verify your JSON-LD Croissant files.\n", "- Load data from Croissant datasets." - ] + ], + "metadata": { + "id": "Hh-0cehIAErA" + } }, { "cell_type": "code", @@ -43,24 +57,19 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "Xwrol5JR_GTY" - }, "source": [ "## Example\n", "\n", "Let's try on a very concrete dataset: OpenAI's [`gpt-3`](https://github.com/openai/gpt-3) dataset for LLMs!\n", "\n", "In the tutorial, we will generate programmatically the Croissant JSON-LD file describing the dataset. Then we will verify the file and yield data from the dataset." - ] + ], + "metadata": { + "id": "Xwrol5JR_GTY" + } }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "7OyQffJv-zso" - }, - "outputs": [], "source": [ "import mlcroissant as mlc\n", "\n", @@ -182,61 +191,61 @@ " distribution=distribution,\n", " record_sets=record_sets,\n", ")\n" - ] + ], + "metadata": { + "id": "7OyQffJv-zso" + }, + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", - "metadata": { - "id": "2RUVgWI-DldZ" - }, "source": [ "When creating `Metadata`:\n", "- We also check for errors in the configuration.\n", "- We generate warnings if the configuration doesn't follow guidelines and best practices.\n", "\n", "For instance, in this case:" - ] + ], + "metadata": { + "id": "2RUVgWI-DldZ" + } }, { "cell_type": "code", - "execution_count": 2, + "source": [ + "print(metadata.issues.report())" + ], "metadata": { "id": "AENcJUwMCd1B" }, - "outputs": [], - "source": [ - "print(metadata.issues.report())" - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", - "metadata": { - "id": "vES3KHaND4P2" - }, "source": [ "`Property \"https://schema.org/license\" is recommended`...\n", "\n", "We can see at a glance that we miss an important metadata to build datasets for responsible AI: the license!" - ] + ], + "metadata": { + "id": "vES3KHaND4P2" + } }, { "cell_type": "markdown", - "metadata": { - "id": "S0BEhzqiEjd0" - }, "source": [ "## Build the Croissant file and yield data\n", "\n", "Let's write the Croissant JSON-LD to a file on disk!" - ] + ], + "metadata": { + "id": "S0BEhzqiEjd0" + } }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "-XCycu81ECVq" - }, - "outputs": [], "source": [ "import json\n", "\n", @@ -245,44 +254,44 @@ " content = json.dumps(content, indent=2)\n", " print(content)\n", " f.write(content)" - ] + ], + "metadata": { + "id": "-XCycu81ECVq" + }, + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", - "metadata": { - "id": "Ypb_ll3SE6UU" - }, "source": [ "From this JSON-LD file, we can easily create a dataset..." - ] + ], + "metadata": { + "id": "Ypb_ll3SE6UU" + } }, { "cell_type": "code", - "execution_count": 5, + "source": [ + "dataset = mlc.Dataset(file=\"croissant.json\")" + ], "metadata": { "id": "_JNyQFuAEiIs" }, - "outputs": [], - "source": [ - "dataset = mlc.Dataset(file=\"croissant.json\")" - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", - "metadata": { - "id": "ldwdIGPoFT_p" - }, "source": [ "...and yield records from this dataset:" - ] + ], + "metadata": { + "id": "ldwdIGPoFT_p" + } }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "MHdVY4TBEqZ8" - }, - "outputs": [], "source": [ "records = dataset.records(record_set=\"jsonl\")\n", "\n", @@ -290,39 +299,21 @@ " print(record)\n", " if i > 10:\n", " break" - ] + ], + "metadata": { + "id": "MHdVY4TBEqZ8" + }, + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": null, + "source": [], "metadata": { "id": "8a2sCy0GFYCQ" }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" + "execution_count": null, + "outputs": [] } - }, - "nbformat": 4, - "nbformat_minor": 0 + ] }