Change the logics to get the headers using parsing of the markdown ce…

…ll content to html and using BeautifulSoup.select.
jupyter · Nov 6, 2024 · e40df4b · e40df4b
1 parent e833379
commit e40df4b
Show file tree

Hide file tree

Showing 6 changed files with 478 additions and 46 deletions.
diff --git a/Untitled.ipynb b/Untitled.ipynb
@@ -0,0 +1,143 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "8253d8be-f1ee-4e5c-a868-416d78ac4d9f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(1, 'Heading 1'), (2, 'Heading 2')]\n",
+      "<bound method TitleExtractorRenderer.heading of <__main__.TitleExtractorRenderer object at 0x77e46cf82120>>\n",
+      "<bound method TitleExtractorRenderer.html_block of <__main__.TitleExtractorRenderer object at 0x77e46cf82120>>\n"
+     ]
+    }
+   ],
+   "source": [
+    "import mistune\n",
+    "import re\n",
+    "\n",
+    "class TitleExtractorRenderer(mistune.HTMLRenderer):\n",
+    "    \"\"\"HTML renderer that records headings instead of rendering them.\n",
+    "\n",
+    "    Headings are stored in ``self.titles`` as ``(level, text)`` tuples, in\n",
+    "    document order. Markdown headings and raw HTML headings are both captured.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # <h1>..</h1> to <h6>..</h6>; tolerates attributes on the opening tag and\n",
+    "    # heading text spread over several lines (DOTALL).\n",
+    "    _HTML_HEADING = re.compile(\n",
+    "        r'<h([1-6])\\b[^>]*>(.*?)</h\\1\\s*>', re.IGNORECASE | re.DOTALL\n",
+    "    )\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        super().__init__()\n",
+    "        self.titles = []\n",
+    "\n",
+    "    def heading(self, text, level, **attrs):\n",
+    "        \"\"\"Record a Markdown heading and render nothing.\n",
+    "\n",
+    "        ``attrs`` absorbs any extra heading attribute mistune may pass along.\n",
+    "        \"\"\"\n",
+    "        self.titles.append((level, text))\n",
+    "        return ''\n",
+    "\n",
+    "    def block_html(self, html):\n",
+    "        \"\"\"Record the headings found in a raw HTML block and render nothing.\n",
+    "\n",
+    "        mistune looks renderer methods up by token type and raw HTML blocks have\n",
+    "        the type ``block_html``, so this hook must carry that exact name.\n",
+    "        \"\"\"\n",
+    "        for level, text in self._HTML_HEADING.findall(html):\n",
+    "            # strip() drops the padding of headings such as '<h2> title </h2>'\n",
+    "            self.titles.append((int(level), text.strip()))\n",
+    "        return ''\n",
+    "\n",
+    "    # Former name of the hook, kept so existing references still resolve.\n",
+    "    html_block = block_html\n",
+    "\n",
+    "# Build the extractor and a Markdown parser that renders through it\n",
+    "renderer = TitleExtractorRenderer()\n",
+    "markdown = mistune.create_markdown(renderer=renderer)\n",
+    "\n",
+    "# Sample input mixing Markdown headings with raw HTML headings\n",
+    "markdown_text = \"\"\"\n",
+    "# Heading 1\n",
+    "\n",
+    "Some paragraph here.\n",
+    "\n",
+    "## Heading 2\n",
+    "\n",
+    "<h1>HTML Heading 1</h1>\n",
+    "<h2>HTML Heading 2</h2>\n",
+    "\"\"\"\n",
+    "\n",
+    "# Parsing fills renderer.titles as a side effect; the rendered text is not used\n",
+    "markdown(markdown_text)\n",
+    "\n",
+    "# Show the extracted (level, text) pairs\n",
+    "print(renderer.titles)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5009b588-7bb0-46b8-8460-c3eb7f63582f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mistune\n",
+    "import re\n",
+    "\n",
+    "class TitleExtractorRenderer(mistune.HTMLRenderer):\n",
+    "    \"\"\"HTML renderer that records headings instead of rendering them.\n",
+    "\n",
+    "    Headings are stored in ``self.titles`` as ``(level, text)`` tuples, in\n",
+    "    document order. Markdown headings and raw HTML headings are both captured.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # <h1>..</h1> to <h6>..</h6>; tolerates attributes on the opening tag and\n",
+    "    # heading text spread over several lines (DOTALL).\n",
+    "    _HTML_HEADING = re.compile(\n",
+    "        r'<h([1-6])\\b[^>]*>(.*?)</h\\1\\s*>', re.IGNORECASE | re.DOTALL\n",
+    "    )\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        super().__init__()\n",
+    "        self.titles = []\n",
+    "\n",
+    "    def heading(self, text, level, **attrs):\n",
+    "        \"\"\"Record a Markdown heading and render nothing.\n",
+    "\n",
+    "        ``attrs`` absorbs any extra heading attribute mistune may pass along.\n",
+    "        \"\"\"\n",
+    "        self.titles.append((level, text))\n",
+    "        return ''\n",
+    "\n",
+    "    def block_html(self, html):\n",
+    "        \"\"\"Record the headings found in a raw HTML block and render nothing.\n",
+    "\n",
+    "        mistune looks renderer methods up by token type and raw HTML blocks have\n",
+    "        the type ``block_html``, so this hook must carry that exact name.\n",
+    "        \"\"\"\n",
+    "        for level, text in self._HTML_HEADING.findall(html):\n",
+    "            # strip() drops the padding of headings such as '<h2> title </h2>'\n",
+    "            self.titles.append((int(level), text.strip()))\n",
+    "        return ''\n",
+    "\n",
+    "    # Former name of the hook, kept so existing references still resolve.\n",
+    "    html_block = block_html\n",
+    "\n",
+    "# Build the extractor and a Markdown parser that renders through it\n",
+    "renderer = TitleExtractorRenderer()\n",
+    "markdown = mistune.create_markdown(renderer=renderer)\n",
+    "\n",
+    "# Sample input mixing Markdown headings with raw HTML headings\n",
+    "markdown_text = \"\"\"\n",
+    "# Heading 1\n",
+    "\n",
+    "Some paragraph here.\n",
+    "\n",
+    "## Heading 2\n",
+    "\n",
+    "<h1>HTML Heading 1</h1>\n",
+    "<h2>HTML Heading 2</h2>\n",
+    "\"\"\"\n",
+    "\n",
+    "# Parsing fills renderer.titles as a side effect; the rendered text is not used\n",
+    "markdown(markdown_text)\n",
+    "\n",
+    "# Show the extracted (level, text) pairs\n",
+    "print(renderer.titles)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Untitled1.ipynb b/Untitled1.ipynb
@@ -0,0 +1,59 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0b71a9ab-4b5a-4b48-9f59-0ba7e92b3999",
+   "metadata": {},
+   "source": [
+    "# Main title"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4f05b735-2329-444f-bb54-40aa23d3aa81",
+   "metadata": {},
+   "source": [
+    "## paragraph 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9fd42aae-d388-4aaf-9fb6-8a27f5869dff",
+   "metadata": {},
+   "source": [
+    "<h2> paragraph 2 </h2>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "828c2b99-a9f7-424a-8241-e9f812cde8a9",
+   "metadata": {},
+   "source": [
+    "<h2>\n",
+    "paragraph3\n",
+    "</h2>"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Untitled2.ipynb b/Untitled2.ipynb
@@ -0,0 +1,122 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "34b9e91f-b4d9-481e-9b97-fc5766c98ce6",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'extrac' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 68\u001b[0m\n\u001b[1;32m     63\u001b[0m         \u001b[38;5;66;03m# print(\"header_level:\", header_level)\u001b[39;00m\n\u001b[1;32m     64\u001b[0m         \u001b[38;5;66;03m# print(\"raw_text:\", raw_text)\u001b[39;00m\n\u001b[1;32m     65\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m titles_array\n\u001b[0;32m---> 68\u001b[0m \u001b[43mextrac\u001b[49m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'extrac' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "import mistune\n",
+    "import re\n",
+    "from mistune.renderers.markdown import MarkdownRenderer\n",
+    "from nbformat import NotebookNode\n",
+    "\n",
+    "class HeadingExtractor(MarkdownRenderer):\n",
+    "    \"\"\"A renderer to capture headings.\n",
+    "\n",
+    "    Each heading met while rendering is appended to ``self.headings`` as a\n",
+    "    ``(level, text)`` tuple, in document order. Markdown headings and headings\n",
+    "    written as raw HTML blocks are both captured.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # <h1>..</h1> to <h6>..</h6>; attributes and multi-line text are accepted.\n",
+    "    _HTML_HEADING = re.compile(\n",
+    "        r'<h([1-6])\\b[^>]*>(.*?)</h\\1\\s*>', re.IGNORECASE | re.DOTALL\n",
+    "    )\n",
+    "    # Any tag, used to drop inline markup nested inside a raw HTML heading.\n",
+    "    _HTML_TAG = re.compile(r'<[^>]+>')\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        \"\"\"Initialize the class.\"\"\"\n",
+    "        super().__init__()\n",
+    "        self.headings = []\n",
+    "\n",
+    "    def heading(self, token, state):\n",
+    "        \"\"\"Record a Markdown heading and return an empty string to avoid outputting it.\n",
+    "\n",
+    "        MarkdownRenderer hooks receive the parsed token and the block state (not\n",
+    "        the heading text and level); the level is read from the token attributes.\n",
+    "        \"\"\"\n",
+    "        level = token[\"attrs\"][\"level\"]\n",
+    "        text = self.render_children(token, state).strip()\n",
+    "        self.headings.append((level, text))\n",
+    "        return \"\"\n",
+    "\n",
+    "    def block_html(self, token, state):\n",
+    "        \"\"\"Record the headings written as raw HTML, then render the block as usual.\"\"\"\n",
+    "        for level, inner in self._HTML_HEADING.findall(token[\"raw\"]):\n",
+    "            # Drop nested tags and collapse whitespace/newlines into single spaces\n",
+    "            text = \" \".join(self._HTML_TAG.sub(\"\", inner).split())\n",
+    "            self.headings.append((int(level), text))\n",
+    "        return super().block_html(token, state)\n",
+    "\n",
+    "\n",
+    "def extract_titles_from_notebook_node(nb: NotebookNode):\n",
+    "    \"\"\"Collect all the headings found in the markdown cells of a notebook.\n",
+    "\n",
+    "    Args:\n",
+    "        nb: The notebook node whose markdown cells are scanned.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list with one ``[level, text, id, href]`` entry per heading, in\n",
+    "        document order. ``id`` is the heading text with spaces replaced by\n",
+    "        dashes and ``href`` is ``\"#\" + id``, which can be used as an anchor\n",
+    "        by an html converter.\n",
+    "    \"\"\"\n",
+    "    renderer = HeadingExtractor()\n",
+    "    extract_titles = mistune.create_markdown(renderer=renderer)\n",
+    "\n",
+    "    # Parse cell by cell so the content of one cell never merges into a block\n",
+    "    # of the next one; the renderer accumulates the headings across calls.\n",
+    "    for cell in nb.cells:\n",
+    "        if cell.cell_type == \"markdown\" and cell.source:\n",
+    "            extract_titles(cell.source)\n",
+    "\n",
+    "    titles_array = []\n",
+    "    for header_level, raw_text in renderer.headings:\n",
+    "        heading_id = raw_text.replace(\" \", \"-\")\n",
+    "        titles_array.append([header_level, raw_text, heading_id, \"#\" + heading_id])\n",
+    "    return titles_array\n",
+    "\n",
+    "\n",
+    "# Small in-memory notebook mixing Markdown and raw HTML headings; the function\n",
+    "# requires a notebook node, calling it without one raises a TypeError.\n",
+    "from nbformat.v4 import new_markdown_cell, new_notebook\n",
+    "\n",
+    "sample_nb = new_notebook(\n",
+    "    cells=[\n",
+    "        new_markdown_cell(\"# Main title\"),\n",
+    "        new_markdown_cell(\"## paragraph 1\"),\n",
+    "        new_markdown_cell(\"<h2> paragraph 2 </h2>\"),\n",
+    "        new_markdown_cell(\"<h2>\\nparagraph3\\n</h2>\"),\n",
+    "    ]\n",
+    ")\n",
+    "extract_titles_from_notebook_node(sample_nb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11d24acf-e8ba-4cf4-9443-c505d5687811",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}