From e40df4b4faa6e1e47fae693e871f915c4a4e6dc1 Mon Sep 17 00:00:00 2001
From: HaudinFlorence <haudin.florence@gmail.com>
Date: Wed, 6 Nov 2024 17:56:20 +0100
Subject: [PATCH] Extract notebook headings by rendering markdown cells to
 HTML and selecting h1-h6 tags with BeautifulSoup.

---
 Untitled.ipynb                        | 143 ++++++++++++++++++++++++++
 Untitled1.ipynb                       |  59 +++++++++++
 Untitled2.ipynb                       | 122 ++++++++++++++++++++++
 nbconvert/filters/markdown_mistune.py |  58 +++--------
 test.ipynb                            |  83 +++++++++++++++
 test1.ipynb                           |  59 +++++++++++
 6 files changed, 478 insertions(+), 46 deletions(-)
 create mode 100644 Untitled.ipynb
 create mode 100644 Untitled1.ipynb
 create mode 100644 Untitled2.ipynb
 create mode 100644 test.ipynb
 create mode 100644 test1.ipynb
diff --git a/Untitled.ipynb b/Untitled.ipynb
new file mode 100644
index 000000000..e834e2743
--- /dev/null
+++ b/Untitled.ipynb
@@ -0,0 +1,143 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "8253d8be-f1ee-4e5c-a868-416d78ac4d9f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(1, 'Heading 1'), (2, 'Heading 2')]\n",
+      "<bound method TitleExtractorRenderer.heading of <__main__.TitleExtractorRenderer object at 0x77e46cf82120>>\n",
+      "<bound method TitleExtractorRenderer.html_block of <__main__.TitleExtractorRenderer object at 0x77e46cf82120>>\n"
+     ]
+    }
+   ],
+   "source": [
+    "import mistune\n",
+    "import re\n",
+    "\n",
+    "class TitleExtractorRenderer(mistune.HTMLRenderer):\n",
+    "    def __init__(self):\n",
+    "        super().__init__()\n",
+    "        self.titles = []\n",
+    "\n",
+    "    # Override the heading method for Markdown headings\n",
+    "    def heading(self, text, level):\n",
+    "        self.titles.append((level, text))\n",
+    "        return ''  # return empty since we only want to extract\n",
+    "\n",
+    "    # Override the html_block method to handle raw HTML\n",
+    "    def html_block(self, html):\n",
+    "        # Regex to find HTML headings <h1> to <h6>\n",
+    "        matches = re.findall(r'<h([1-6])>(.*?)</h\\1>', html, re.IGNORECASE)\n",
+    "        for level, text in matches:\n",
+    "            self.titles.append((int(level), text))\n",
+    "            print(text);\n",
+    "        return ''  # return empty as we're only extracting titles\n",
+    "\n",
+    "# Create an instance of the renderer and Markdown parser\n",
+    "renderer = TitleExtractorRenderer()\n",
+    "markdown = mistune.create_markdown(renderer=renderer)\n",
+    "\n",
+    "# Parse your markdown input\n",
+    "markdown_text = \"\"\"\n",
+    "# Heading 1\n",
+    "\n",
+    "Some paragraph here.\n",
+    "\n",
+    "## Heading 2\n",
+    "\n",
+    "<h1>HTML Heading 1</h1>\n",
+    "<h2>HTML Heading 2</h2>\n",
+    "\"\"\"\n",
+    "\n",
+    "# Process the markdown to extract titles\n",
+    "markdown(markdown_text)\n",
+    "\n",
+    "# Print the extracted titles\n",
+    "print(renderer.titles)\n",
+    "print(renderer.heading)\n",
+    "print(renderer.html_block)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5009b588-7bb0-46b8-8460-c3eb7f63582f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mistune\n",
+    "import re\n",
+    "\n",
+    "class TitleExtractorRenderer(mistune.HTMLRenderer):\n",
+    "    def __init__(self):\n",
+    "        super().__init__()\n",
+    "        self.titles = []\n",
+    "\n",
+    "    # Override the heading method for Markdown headings\n",
+    "    def heading(self, text, level):\n",
+    "        self.titles.append((level, text))\n",
+    "        return ''  # return empty since we only want to extract\n",
+    "\n",
+    "    # Override the html_block method to handle raw HTML\n",
+    "    def html_block(self, html):\n",
+    "        # Regex to find HTML headings <h1> to <h6>\n",
+    "        matches = re.findall(r'<h([1-6])>(.*?)</h\\1>', html, re.IGNORECASE)\n",
+    "        for level, text in matches:\n",
+    "            self.titles.append((int(level), text))\n",
+    "            print(text);\n",
+    "        return ''  # return empty as we're only extracting titles\n",
+    "\n",
+    "# Create an instance of the renderer and Markdown parser\n",
+    "renderer = TitleExtractorRenderer()\n",
+    "markdown = mistune.create_markdown(renderer=renderer)\n",
+    "\n",
+    "# Parse your markdown input\n",
+    "markdown_text = \"\"\"\n",
+    "# Heading 1\n",
+    "\n",
+    "Some paragraph here.\n",
+    "\n",
+    "## Heading 2\n",
+    "\n",
+    "<h1>HTML Heading 1</h1>\n",
+    "<h2>HTML Heading 2</h2>\n",
+    "\"\"\"\n",
+    "\n",
+    "# Process the markdown to extract titles\n",
+    "markdown(markdown_text)\n",
+    "\n",
+    "# Print the extracted titles\n",
+    "print(renderer.titles)\n",
+    "print(renderer.heading)\n",
+    "print(renderer.html_block)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Untitled1.ipynb b/Untitled1.ipynb
new file mode 100644
index 000000000..294e829f2
--- /dev/null
+++ b/Untitled1.ipynb
@@ -0,0 +1,59 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0b71a9ab-4b5a-4b48-9f59-0ba7e92b3999",
+   "metadata": {},
+   "source": [
+    "# Main title"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4f05b735-2329-444f-bb54-40aa23d3aa81",
+   "metadata": {},
+   "source": [
+    "## paragraph 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9fd42aae-d388-4aaf-9fb6-8a27f5869dff",
+   "metadata": {},
+   "source": [
+    "<h2> paragraph 2 </h2>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "828c2b99-a9f7-424a-8241-e9f812cde8a9",
+   "metadata": {},
+   "source": [
+    "<h2>\n",
+    "paragraph3\n",
+    "</h2>"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Untitled2.ipynb b/Untitled2.ipynb
new file mode 100644
index 000000000..5eb27cbb4
--- /dev/null
+++ b/Untitled2.ipynb
@@ -0,0 +1,122 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "34b9e91f-b4d9-481e-9b97-fc5766c98ce6",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'extrac' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 68\u001b[0m\n\u001b[1;32m     63\u001b[0m         \u001b[38;5;66;03m# print(\"header_level:\", header_level)\u001b[39;00m\n\u001b[1;32m     64\u001b[0m         \u001b[38;5;66;03m# print(\"raw_text:\", raw_text)\u001b[39;00m\n\u001b[1;32m     65\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m titles_array\n\u001b[0;32m---> 68\u001b[0m \u001b[43mextrac\u001b[49m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'extrac' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "import mistune\n",
+    "import re\n",
+    "from mistune.renderers.markdown import MarkdownRenderer\n",
+    "from nbformat import NotebookNode\n",
+    "\n",
+    "class HeadingExtractor(MarkdownRenderer):\n",
+    "    \"\"\"A renderer to capture headings\"\"\"\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        \"\"\"Initialize the class.\"\"\"\n",
+    "        super().__init__()\n",
+    "        self.headings = []\n",
+    "\n",
+    "    def heading(self, text, level):\n",
+    "        \"\"\"Return an empty string for the headings to avoid outputting them.\"\"\"\n",
+    "        matches = re.findall(r'<h[1-6]>.*?<\\/h[1-6]>', text)\n",
+    "        print(matches)\n",
+    "        \n",
+    "        for level, text in matches:\n",
+    "        # You can use int() to convert the level to an integer\n",
+    "            self.headings.append((int(level), text.strip()))  # .strip() removes any leading/trailing whitespace\n",
+    "        print(f\"Level: {level}, Text: {text.strip()}\")\n",
+    "        # self.headings.append((level, text))\n",
+    "        return \"\"\n",
+    "\n",
+    "\n",
+    "def extract_titles_from_notebook_node(nb: NotebookNode):\n",
+    "    \"\"\"Create a Markdown parser with the HeadingExtractor renderer to collect all the headings of a notebook\n",
+    "    The input argument is the notebooknode from which a single string with all the markdown content concatenated\n",
+    "    The output is an array containing information about the headings such as their level, their text content, an identifier and a href that can be used in case of html converter.s\"\"\"\n",
+    "\n",
+    "    markdown_collection = \"\"\n",
+    "\n",
+    "    for cell in nb.cells:\n",
+    "        if cell.cell_type == \"markdown\":\n",
+    "            lines = cell.source.splitlines()\n",
+    "            for line in lines:\n",
+    "                newline= line\n",
+    "             \n",
+    "                if line.startswith('#'):\n",
+    "                    newline = mistune.html(newline)\n",
+    "                  \n",
+    "                #print(\"line:\", line)\n",
+    "                #print('newline:', newline)\n",
+    "            markdown_collection = markdown_collection + newline.strip() + \"\\n\"\n",
+    "    #print(markdown_collection)\n",
+    "    titles_array = []\n",
+    "    renderer = HeadingExtractor()\n",
+    "    extract_titles = mistune.create_markdown(renderer=renderer)\n",
+    "    extract_titles(markdown_collection)\n",
+    "    headings = renderer.headings\n",
+    "    print(\"Titles:\", headings)\n",
+    "\n",
+    "    # Iterate on all headings to get the necessary information on the various titles\n",
+    "    for __, title in headings:\n",
+    "        children = title[\"children\"]\n",
+    "        attrs = title[\"attrs\"]\n",
+    "        raw_text = children[0][\"raw\"]\n",
+    "        header_level = attrs[\"level\"]\n",
+    "        id = raw_text.replace(\" \", \"-\")\n",
+    "        href = \"#\" + id\n",
+    "        titles_array.append([header_level, raw_text, id, href])\n",
+    "        # print(\"header_level:\", header_level)\n",
+    "        # print(\"raw_text:\", raw_text)\n",
+    "    return titles_array\n",
+    "\n",
+    "\n",
+    "extract_titles_from_notebook_node()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11d24acf-e8ba-4cf4-9443-c505d5687811",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/nbconvert/filters/markdown_mistune.py b/nbconvert/filters/markdown_mistune.py
index 8c5eb4715..ed5e3b338 100644
--- a/nbconvert/filters/markdown_mistune.py
+++ b/nbconvert/filters/markdown_mistune.py
@@ -13,7 +13,6 @@
 
 import bs4
 import mistune
-from mistune.renderers.markdown import MarkdownRenderer
 from pygments import highlight
 from pygments.formatters import HtmlFormatter
 from pygments.lexer import Lexer
@@ -492,61 +491,28 @@ def markdown2html_mistune(source: str) -> str:
     return MarkdownWithMath(renderer=IPythonRenderer(escape=False)).render(source)
 
 
-class HeadingExtractor(MarkdownRenderer):
-    """A renderer to capture headings"""
-
-    def __init__(self):
-        """Initialize the class."""
-        super().__init__()
-        self.headings = []
-
-    def heading(self, text, level):
-        """Return an empty string for the headings to avoid outputting them."""
-        self.headings.append((level, text))
-        return ""
-
-
 def extract_titles_from_notebook_node(nb: NotebookNode):
     """Create a Markdown parser with the HeadingExtractor renderer to collect all the headings of a notebook
     The input argument is the notebooknode from which a single string with all the markdown content concatenated
     The output is an array containing information about the headings such as their level, their text content, an identifier and a href that can be used in case of html converter.s"""
 
-    markdown_collection = ""
+    cells_html_collection = ""
     for cell in nb.cells:
         if cell.cell_type == "markdown":
-            lines = cell.source.splitlines()
-            for line in lines:
-                newline = line.replace("<h1>", "# ")
-                newline = newline.replace("<h2>", "## ")
-                newline = newline.replace("<h3>", "### ")
-                newline = newline.replace("<h4>", "#### ")
-                newline = newline.replace("<h5>", "##### ")
-                newline = newline.replace("<h6>", "###### ")
-                newline = newline.replace("</h1>", "")
-                newline = newline.replace("</h2>", "")
-                newline = newline.replace("</h3>", "")
-                newline = newline.replace("</h4>", "")
-                newline = newline.replace("</h5>", "")
-                newline = newline.replace("</h6>", "")
-            if newline.startswith('#'):
-                markdown_collection = markdown_collection + newline.strip() + "\n"
+            # Render each cell to HTML so "#" headings and raw <hN> tags are handled uniformly.
+            html_source = mistune.html(cell.source)
+            cells_html_collection = cells_html_collection + html_source + "\n"
 
     titles_array = []
-    renderer = HeadingExtractor()
-    extract_titles = mistune.create_markdown(renderer=renderer)
-    print(markdown_collection)
-    extract_titles(markdown_collection)
-    headings = renderer.headings
+    html_collection = bs4.BeautifulSoup(cells_html_collection, 'html.parser')
+    headings = html_collection.select('h1, h2, h3, h4, h5, h6')
 
     # Iterate on all headings to get the necessary information on the various titles
-    for __, title in headings:
-        children = title["children"]
-        attrs = title["attrs"]
-        raw_text = children[0]["raw"]
-        header_level = attrs["level"]
-        id = raw_text.replace(" ", "-")
+    for heading in headings:
+        level = int(heading.name[1])  # tag name is "h1".."h6"; the digit is the level
+        text = " ".join(heading.get_text().split())  # trim and collapse spaces/newlines inside the tag
+        id = text.replace(" ", "-")
         href = "#" + id
-        titles_array.append([header_level, raw_text, id, href])
-        # print("header_level:", header_level)
-        # print("raw_text:", raw_text)
+        titles_array.append([level, text, id, href])
+
     return titles_array
diff --git a/test.ipynb b/test.ipynb
new file mode 100644
index 000000000..a462774c7
--- /dev/null
+++ b/test.ipynb
@@ -0,0 +1,83 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "f7f0b758-23e8-4ef6-9826-68687a9a1d5d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Level: 1, Text: Welcome to My Website\n",
+      "Level: 2, Text: About Us\n",
+      "Level: 3, Text: Our Mission\n",
+      "Level: 4, Text: Our Team\n",
+      "Level: 5, Text: Our Goals\n",
+      "Level: 6, Text: Contact Information\n"
+     ]
+    }
+   ],
+   "source": [
+    "import re\n",
+    "\n",
+    "def extract_headers(sample):\n",
+    "    # Regex pattern to capture header level and text content\n",
+    "    matches = re.findall(r'<h([1-6])[^>]*>(.*?)</h[1-6]>', sample)\n",
+    "    \n",
+    "    headings = []\n",
+    "    for level, text in matches:\n",
+    "        # You can use int() to convert the level to an integer\n",
+    "        headings.append((int(level), text.strip()))  # .strip() removes any leading/trailing whitespace\n",
+    "                                        print(f\"Level: {level}, Text: {text.strip()}\")\n",
+    "    \n",
+    "    return headings\n",
+    "\n",
+    "# Sample HTML content\n",
+    "html_content = '''\n",
+    "<h1>Welcome to My Website</h1>\n",
+    "<p>This is a paragraph of text.</p>\n",
+    "<h2>About Us</h2>\n",
+    "<p>More information about the website.</p>\n",
+    "<h3>Our Mission</h3>\n",
+    "<h4>Our Team</h4>\n",
+    "<h5>Our Goals</h5>\n",
+    "<h6>Contact Information</h6>\n",
+    "'''\n",
+    "\n",
+    "# Call the function with the sample HTML\n",
+    "headers = extract_headers(html_content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe90507c-cf50-4b71-9d41-67409d65d1fe",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/test1.ipynb b/test1.ipynb
new file mode 100644
index 000000000..294e829f2
--- /dev/null
+++ b/test1.ipynb
@@ -0,0 +1,59 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0b71a9ab-4b5a-4b48-9f59-0ba7e92b3999",
+   "metadata": {},
+   "source": [
+    "# Main title"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4f05b735-2329-444f-bb54-40aa23d3aa81",
+   "metadata": {},
+   "source": [
+    "## paragraph 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9fd42aae-d388-4aaf-9fb6-8a27f5869dff",
+   "metadata": {},
+   "source": [
+    "<h2> paragraph 2 </h2>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "828c2b99-a9f7-424a-8241-e9f812cde8a9",
+   "metadata": {},
+   "source": [
+    "<h2>\n",
+    "paragraph3\n",
+    "</h2>"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}