From e40df4b4faa6e1e47fae693e871f915c4a4e6dc1 Mon Sep 17 00:00:00 2001 From: HaudinFlorence Date: Wed, 6 Nov 2024 17:56:20 +0100 Subject: [PATCH] Change the logics to get the headers using parsing of the markdown cell content to html and using BeautifulSoup.select. --- Untitled.ipynb | 143 ++++++++++++++++++++++++++ Untitled1.ipynb | 59 +++++++++++ Untitled2.ipynb | 122 ++++++++++++++++++++++ nbconvert/filters/markdown_mistune.py | 58 +++-------- test.ipynb | 83 +++++++++++++++ test1.ipynb | 59 +++++++++++ 6 files changed, 478 insertions(+), 46 deletions(-) create mode 100644 Untitled.ipynb create mode 100644 Untitled1.ipynb create mode 100644 Untitled2.ipynb create mode 100644 test.ipynb create mode 100644 test1.ipynb diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 000000000..e834e2743 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,143 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "id": "8253d8be-f1ee-4e5c-a868-416d78ac4d9f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(1, 'Heading 1'), (2, 'Heading 2')]\n", + ">\n", + ">\n" + ] + } + ], + "source": [ + "import mistune\n", + "import re\n", + "\n", + "class TitleExtractorRenderer(mistune.HTMLRenderer):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.titles = []\n", + "\n", + " # Override the heading method for Markdown headings\n", + " def heading(self, text, level):\n", + " self.titles.append((level, text))\n", + " return '' # return empty since we only want to extract\n", + "\n", + " # Override the html_block method to handle raw HTML\n", + " def html_block(self, html):\n", + " # Regex to find HTML headings

to

\n", + " matches = re.findall(r'(.*?)', html, re.IGNORECASE)\n", + " for level, text in matches:\n", + " self.titles.append((int(level), text))\n", + " print(text);\n", + " return '' # return empty as we're only extracting titles\n", + "\n", + "# Create an instance of the renderer and Markdown parser\n", + "renderer = TitleExtractorRenderer()\n", + "markdown = mistune.create_markdown(renderer=renderer)\n", + "\n", + "# Parse your markdown input\n", + "markdown_text = \"\"\"\n", + "# Heading 1\n", + "\n", + "Some paragraph here.\n", + "\n", + "## Heading 2\n", + "\n", + "

HTML Heading 1

\n", + "

HTML Heading 2

\n", + "\"\"\"\n", + "\n", + "# Process the markdown to extract titles\n", + "markdown(markdown_text)\n", + "\n", + "# Print the extracted titles\n", + "print(renderer.titles)\n", + "print(renderer.heading)\n", + "print(renderer.html_block)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5009b588-7bb0-46b8-8460-c3eb7f63582f", + "metadata": {}, + "outputs": [], + "source": [ + "import mistune\n", + "import re\n", + "\n", + "class TitleExtractorRenderer(mistune.HTMLRenderer):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.titles = []\n", + "\n", + " # Override the heading method for Markdown headings\n", + " def heading(self, text, level):\n", + " self.titles.append((level, text))\n", + " return '' # return empty since we only want to extract\n", + "\n", + " # Override the html_block method to handle raw HTML\n", + " def html_block(self, html):\n", + " # Regex to find HTML headings

to

\n", + " matches = re.findall(r'(.*?)', html, re.IGNORECASE)\n", + " for level, text in matches:\n", + " self.titles.append((int(level), text))\n", + " print(text);\n", + " return '' # return empty as we're only extracting titles\n", + "\n", + "# Create an instance of the renderer and Markdown parser\n", + "renderer = TitleExtractorRenderer()\n", + "markdown = mistune.create_markdown(renderer=renderer)\n", + "\n", + "# Parse your markdown input\n", + "markdown_text = \"\"\"\n", + "# Heading 1\n", + "\n", + "Some paragraph here.\n", + "\n", + "## Heading 2\n", + "\n", + "

HTML Heading 1

\n", + "

HTML Heading 2

\n", + "\"\"\"\n", + "\n", + "# Process the markdown to extract titles\n", + "markdown(markdown_text)\n", + "\n", + "# Print the extracted titles\n", + "print(renderer.titles)\n", + "print(renderer.heading)\n", + "print(renderer.html_block)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Untitled1.ipynb b/Untitled1.ipynb new file mode 100644 index 000000000..294e829f2 --- /dev/null +++ b/Untitled1.ipynb @@ -0,0 +1,59 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0b71a9ab-4b5a-4b48-9f59-0ba7e92b3999", + "metadata": {}, + "source": [ + "# Main title" + ] + }, + { + "cell_type": "markdown", + "id": "4f05b735-2329-444f-bb54-40aa23d3aa81", + "metadata": {}, + "source": [ + "## paragraph 1" + ] + }, + { + "cell_type": "markdown", + "id": "9fd42aae-d388-4aaf-9fb6-8a27f5869dff", + "metadata": {}, + "source": [ + "

paragraph 2

" + ] + }, + { + "cell_type": "markdown", + "id": "828c2b99-a9f7-424a-8241-e9f812cde8a9", + "metadata": {}, + "source": [ + "

\n", + "paragraph3\n", + "

" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Untitled2.ipynb b/Untitled2.ipynb new file mode 100644 index 000000000..5eb27cbb4 --- /dev/null +++ b/Untitled2.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "34b9e91f-b4d9-481e-9b97-fc5766c98ce6", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'extrac' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 68\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;66;03m# print(\"header_level:\", header_level)\u001b[39;00m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;66;03m# print(\"raw_text:\", raw_text)\u001b[39;00m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m titles_array\n\u001b[0;32m---> 68\u001b[0m \u001b[43mextrac\u001b[49m\n", + "\u001b[0;31mNameError\u001b[0m: name 'extrac' is not defined" + ] + } + ], + "source": [ + "import mistune\n", + "import re\n", + "from mistune.renderers.markdown import MarkdownRenderer\n", + "from nbformat import NotebookNode\n", + "\n", + "class HeadingExtractor(MarkdownRenderer):\n", + " \"\"\"A renderer to capture headings\"\"\"\n", + "\n", + " def __init__(self):\n", + " \"\"\"Initialize the class.\"\"\"\n", + " super().__init__()\n", + " self.headings = []\n", + "\n", + " def heading(self, text, level):\n", + " \"\"\"Return an empty string for the headings to avoid outputting them.\"\"\"\n", + " matches = re.findall(r'.*?<\\/h[1-6]>', text)\n", + " print(matches)\n", + " \n", + " for level, text in matches:\n", + " # You can use int() to convert the level to an integer\n", + " self.headings.append((int(level), text.strip())) # .strip() removes any leading/trailing whitespace\n", + " print(f\"Level: {level}, Text: {text.strip()}\")\n", + " # self.headings.append((level, text))\n", + " return \"\"\n", + "\n", + "\n", + "def extract_titles_from_notebook_node(nb: NotebookNode):\n", + " \"\"\"Create a Markdown parser with the HeadingExtractor renderer to collect all the headings of a notebook\n", + " The input argument is the notebooknode from which a single string with all the markdown content concatenated\n", + " The output is an array containing information about the headings such as their level, their text content, an identifier and a href that can be used in case of html converter.s\"\"\"\n", + "\n", + " markdown_collection = \"\"\n", + "\n", + " for cell in nb.cells:\n", + " if cell.cell_type == \"markdown\":\n", + " lines = cell.source.splitlines()\n", + " for line in lines:\n", + " newline= line\n", + " \n", + " if line.startswith('#'):\n", + " newline = mistune.html(newline)\n", + " \n", + " #print(\"line:\", line)\n", + " #print('newline:', newline)\n", + " markdown_collection = markdown_collection + newline.strip() + \"\\n\"\n", + " #print(markdown_collection)\n", + " titles_array = []\n", + " renderer = HeadingExtractor()\n", + " extract_titles = mistune.create_markdown(renderer=renderer)\n", + " extract_titles(markdown_collection)\n", + " headings = renderer.headings\n", + " print(\"Titles:\", headings)\n", + "\n", + " # Iterate on all headings to get the necessary information on the various titles\n", + " for __, title in headings:\n", + " children = title[\"children\"]\n", + " attrs = title[\"attrs\"]\n", + " raw_text = children[0][\"raw\"]\n", + " header_level = attrs[\"level\"]\n", + " id = raw_text.replace(\" \", \"-\")\n", + " href = \"#\" + id\n", + " titles_array.append([header_level, raw_text, id, href])\n", + " # print(\"header_level:\", header_level)\n", + " # print(\"raw_text:\", raw_text)\n", + " return titles_array\n", + "\n", + "\n", + "extract_titles_from_notebook_node()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11d24acf-e8ba-4cf4-9443-c505d5687811", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbconvert/filters/markdown_mistune.py b/nbconvert/filters/markdown_mistune.py index 8c5eb4715..ed5e3b338 100644 --- a/nbconvert/filters/markdown_mistune.py +++ b/nbconvert/filters/markdown_mistune.py @@ -13,7 +13,6 @@ import bs4 import mistune -from mistune.renderers.markdown import MarkdownRenderer from pygments import highlight from pygments.formatters import HtmlFormatter from pygments.lexer import Lexer @@ -492,61 +491,28 @@ def markdown2html_mistune(source: str) -> str: return MarkdownWithMath(renderer=IPythonRenderer(escape=False)).render(source) -class HeadingExtractor(MarkdownRenderer): - """A renderer to capture headings""" - - def __init__(self): - """Initialize the class.""" - super().__init__() - self.headings = [] - - def heading(self, text, level): - """Return an empty string for the headings to avoid outputting them.""" - self.headings.append((level, text)) - return "" - - def extract_titles_from_notebook_node(nb: NotebookNode): """Create a Markdown parser with the HeadingExtractor renderer to collect all the headings of a notebook The input argument is the notebooknode from which a single string with all the markdown content concatenated The output is an array containing information about the headings such as their level, their text content, an identifier and a href that can be used in case of html converter.s""" - markdown_collection = "" + cells_html_collection = "" for cell in nb.cells: if cell.cell_type == "markdown": - lines = cell.source.splitlines() - for line in lines: - newline = line.replace("

", "# ") - newline = newline.replace("

", "## ") - newline = newline.replace("

", "### ") - newline = newline.replace("

", "#### ") - newline = newline.replace("

", "##### ") - newline = newline.replace("
", "###### ") - newline = newline.replace("
", "") - newline = newline.replace("
", "") - newline = newline.replace("
", "") - newline = newline.replace("", "") - newline = newline.replace("", "") - newline = newline.replace("", "") - if newline.startswith('#'): - markdown_collection = markdown_collection + newline.strip() + "\n" + markdown_source = cell.source + html_source = mistune.html(markdown_source) # convert all the markdown sources to html + cells_html_collection = cells_html_collection + html_source + "\n" titles_array = [] - renderer = HeadingExtractor() - extract_titles = mistune.create_markdown(renderer=renderer) - print(markdown_collection) - extract_titles(markdown_collection) - headings = renderer.headings + html_collection = bs4.BeautifulSoup(cells_html_collection, 'html.parser') + headings = html_collection.select('h1, h2, h3, h4, h5, h6') # Iterate on all headings to get the necessary information on the various titles - for __, title in headings: - children = title["children"] - attrs = title["attrs"] - raw_text = children[0]["raw"] - header_level = attrs["level"] - id = raw_text.replace(" ", "-") + for heading in headings: + level = int(heading.name[1]) + text = heading.get_text() + id = text.replace(" ", "-") href = "#" + id - titles_array.append([header_level, raw_text, id, href]) - # print("header_level:", header_level) - # print("raw_text:", raw_text) + titles_array.append([level, text, id, href]) + return titles_array diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 000000000..a462774c7 --- /dev/null +++ b/test.ipynb @@ -0,0 +1,83 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 17, + "id": "f7f0b758-23e8-4ef6-9826-68687a9a1d5d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Level: 1, Text: Welcome to My Website\n", + "Level: 2, Text: About Us\n", + "Level: 3, Text: Our Mission\n", + "Level: 4, Text: Our Team\n", + "Level: 5, Text: Our Goals\n", + "Level: 6, Text: Contact Information\n" + ] + } + ], + "source": [ + "import re\n", + "\n", + "def extract_headers(sample):\n", + " # Regex pattern to capture header level and text content\n", + " matches = re.findall(r']*>(.*?)', sample)\n", + " \n", + " headings = []\n", + " for level, text in matches:\n", + " # You can use int() to convert the level to an integer\n", + " headings.append((int(level), text.strip())) # .strip() removes any leading/trailing whitespace\n", + " print(f\"Level: {level}, Text: {text.strip()}\")\n", + " \n", + " return headings\n", + "\n", + "# Sample HTML content\n", + "html_content = '''\n", + "

Welcome to My Website

\n", + "

This is a paragraph of text.

\n", + "

About Us

\n", + "

More information about the website.

\n", + "

Our Mission

\n", + "

Our Team

\n", + "
Our Goals
\n", + "
Contact Information
\n", + "'''\n", + "\n", + "# Call the function with the sample HTML\n", + "headers = extract_headers(html_content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe90507c-cf50-4b71-9d41-67409d65d1fe", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/test1.ipynb b/test1.ipynb new file mode 100644 index 000000000..294e829f2 --- /dev/null +++ b/test1.ipynb @@ -0,0 +1,59 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0b71a9ab-4b5a-4b48-9f59-0ba7e92b3999", + "metadata": {}, + "source": [ + "# Main title" + ] + }, + { + "cell_type": "markdown", + "id": "4f05b735-2329-444f-bb54-40aa23d3aa81", + "metadata": {}, + "source": [ + "## paragraph 1" + ] + }, + { + "cell_type": "markdown", + "id": "9fd42aae-d388-4aaf-9fb6-8a27f5869dff", + "metadata": {}, + "source": [ + "

paragraph 2

" + ] + }, + { + "cell_type": "markdown", + "id": "828c2b99-a9f7-424a-8241-e9f812cde8a9", + "metadata": {}, + "source": [ + "

\n", + "paragraph3\n", + "

" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}