Skip to content

Commit

Permalink
Change the logics to get the headers using parsing of the markdown ce…
Browse files Browse the repository at this point in the history
…ll content to html and using BeautifulSoup.select.
  • Loading branch information
HaudinFlorence committed Nov 6, 2024
1 parent e833379 commit 8623ce8
Show file tree
Hide file tree
Showing 6 changed files with 478 additions and 46 deletions.
143 changes: 143 additions & 0 deletions Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"id": "8253d8be-f1ee-4e5c-a868-416d78ac4d9f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(1, 'Heading 1'), (2, 'Heading 2')]\n",
"<bound method TitleExtractorRenderer.heading of <__main__.TitleExtractorRenderer object at 0x77e46cf82120>>\n",
"<bound method TitleExtractorRenderer.html_block of <__main__.TitleExtractorRenderer object at 0x77e46cf82120>>\n"
]
}
],
"source": [
"import mistune\n",
"import re\n",
"\n",
"class TitleExtractorRenderer(mistune.HTMLRenderer):\n",
"    \"\"\"Renderer that records headings as (level, text) tuples instead of emitting them.\n",
"\n",
"    Markdown headings (``# Title``) and raw HTML headings (``<h1>Title</h1>``)\n",
"    are both appended to ``self.titles`` as the renderer visits them.\n",
"    \"\"\"\n",
"\n",
"    # Matches <h1>..</h6> with optional attributes. DOTALL lets the heading\n",
"    # text span several lines, IGNORECASE also accepts <H1>.\n",
"    _HTML_HEADING = re.compile(\n",
"        r'<h([1-6])\\b[^>]*>(.*?)</h\\1\\s*>', re.IGNORECASE | re.DOTALL\n",
"    )\n",
"\n",
"    def __init__(self):\n",
"        super().__init__()\n",
"        self.titles = []  # list of (level: int, text: str)\n",
"\n",
"    def heading(self, text, level, **attrs):\n",
"        \"\"\"Record a Markdown heading; any extra heading attributes are ignored.\"\"\"\n",
"        self.titles.append((level, text))\n",
"        return ''  # render nothing, we only extract\n",
"\n",
"    def html_block(self, html):\n",
"        \"\"\"Record every <h1>-<h6> element found in a raw HTML block.\"\"\"\n",
"        for level, text in self._HTML_HEADING.findall(html):\n",
"            self.titles.append((int(level), text.strip()))\n",
"        return ''  # render nothing, we only extract\n",
"\n",
"    # mistune's HTMLRenderer dispatches raw HTML blocks to `block_html`; without\n",
"    # this alias the hook above is never called and HTML headings are missed.\n",
"    block_html = html_block\n",
"\n",
"# Create an instance of the renderer and a Markdown parser that uses it\n",
"renderer = TitleExtractorRenderer()\n",
"markdown = mistune.create_markdown(renderer=renderer)\n",
"\n",
"# Sample input mixing Markdown headings with raw HTML headings\n",
"markdown_text = \"\"\"\n",
"# Heading 1\n",
"\n",
"Some paragraph here.\n",
"\n",
"## Heading 2\n",
"\n",
"<h1>HTML Heading 1</h1>\n",
"<h2>HTML Heading 2</h2>\n",
"\"\"\"\n",
"\n",
"# Rendering is run only for its side effect: it fills renderer.titles\n",
"markdown(markdown_text)\n",
"\n",
"# Show the extracted (level, text) pairs as the cell result\n",
"renderer.titles"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5009b588-7bb0-46b8-8460-c3eb7f63582f",
"metadata": {},
"outputs": [],
"source": [
"import mistune\n",
"import re\n",
"\n",
"class TitleExtractorRenderer(mistune.HTMLRenderer):\n",
"    \"\"\"Renderer that records headings as (level, text) tuples instead of emitting them.\n",
"\n",
"    Markdown headings (``# Title``) and raw HTML headings (``<h1>Title</h1>``)\n",
"    are both appended to ``self.titles`` as the renderer visits them.\n",
"    \"\"\"\n",
"\n",
"    # Matches <h1>..</h6> with optional attributes. DOTALL lets the heading\n",
"    # text span several lines, IGNORECASE also accepts <H1>.\n",
"    _HTML_HEADING = re.compile(\n",
"        r'<h([1-6])\\b[^>]*>(.*?)</h\\1\\s*>', re.IGNORECASE | re.DOTALL\n",
"    )\n",
"\n",
"    def __init__(self):\n",
"        super().__init__()\n",
"        self.titles = []  # list of (level: int, text: str)\n",
"\n",
"    def heading(self, text, level, **attrs):\n",
"        \"\"\"Record a Markdown heading; any extra heading attributes are ignored.\"\"\"\n",
"        self.titles.append((level, text))\n",
"        return ''  # render nothing, we only extract\n",
"\n",
"    def html_block(self, html):\n",
"        \"\"\"Record every <h1>-<h6> element found in a raw HTML block.\"\"\"\n",
"        for level, text in self._HTML_HEADING.findall(html):\n",
"            self.titles.append((int(level), text.strip()))\n",
"        return ''  # render nothing, we only extract\n",
"\n",
"    # mistune's HTMLRenderer dispatches raw HTML blocks to `block_html`; without\n",
"    # this alias the hook above is never called and HTML headings are missed.\n",
"    block_html = html_block\n",
"\n",
"# Create an instance of the renderer and a Markdown parser that uses it\n",
"renderer = TitleExtractorRenderer()\n",
"markdown = mistune.create_markdown(renderer=renderer)\n",
"\n",
"# Sample input mixing Markdown headings with raw HTML headings\n",
"markdown_text = \"\"\"\n",
"# Heading 1\n",
"\n",
"Some paragraph here.\n",
"\n",
"## Heading 2\n",
"\n",
"<h1>HTML Heading 1</h1>\n",
"<h2>HTML Heading 2</h2>\n",
"\"\"\"\n",
"\n",
"# Rendering is run only for its side effect: it fills renderer.titles\n",
"markdown(markdown_text)\n",
"\n",
"# Show the extracted (level, text) pairs as the cell result\n",
"renderer.titles"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
59 changes: 59 additions & 0 deletions Untitled1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0b71a9ab-4b5a-4b48-9f59-0ba7e92b3999",
"metadata": {},
"source": [
"# Main title"
]
},
{
"cell_type": "markdown",
"id": "4f05b735-2329-444f-bb54-40aa23d3aa81",
"metadata": {},
"source": [
"## paragraph 1"
]
},
{
"cell_type": "markdown",
"id": "9fd42aae-d388-4aaf-9fb6-8a27f5869dff",
"metadata": {},
"source": [
"<h2> paragraph 2 </h2>"
]
},
{
"cell_type": "markdown",
"id": "828c2b99-a9f7-424a-8241-e9f812cde8a9",
"metadata": {},
"source": [
"<h2>\n",
"paragraph3\n",
"</h2>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
122 changes: 122 additions & 0 deletions Untitled2.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "34b9e91f-b4d9-481e-9b97-fc5766c98ce6",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'extrac' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 68\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;66;03m# print(\"header_level:\", header_level)\u001b[39;00m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;66;03m# print(\"raw_text:\", raw_text)\u001b[39;00m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m titles_array\n\u001b[0;32m---> 68\u001b[0m \u001b[43mextrac\u001b[49m\n",
"\u001b[0;31mNameError\u001b[0m: name 'extrac' is not defined"
]
}
],
"source": [
"import mistune\n",
"import re\n",
"from mistune.renderers.markdown import MarkdownRenderer\n",
"from nbformat import NotebookNode\n",
"\n",
"class HeadingExtractor(MarkdownRenderer):\n",
"    \"\"\"A renderer that captures headings instead of outputting them.\n",
"\n",
"    Every heading met while rendering is appended to ``self.headings`` as a\n",
"    ``(level, text)`` tuple, whether it was written in Markdown (``## Title``)\n",
"    or as raw HTML (``<h2>Title</h2>``).\n",
"    \"\"\"\n",
"\n",
"    # Matches <h1>..</h6> with optional attributes. DOTALL lets the heading\n",
"    # text span several lines, IGNORECASE also accepts <H1>.\n",
"    _HTML_HEADING = re.compile(\n",
"        r'<h([1-6])\\b[^>]*>(.*?)</h\\1\\s*>', re.IGNORECASE | re.DOTALL\n",
"    )\n",
"\n",
"    def __init__(self):\n",
"        \"\"\"Initialize the class.\"\"\"\n",
"        super().__init__()\n",
"        self.headings = []  # list of (level: int, text: str)\n",
"\n",
"    def heading(self, token, state):\n",
"        \"\"\"Record a Markdown heading and return an empty string so it is not output.\n",
"\n",
"        MarkdownRenderer calls its hooks with ``(token, state)``: the level is read\n",
"        from ``token['attrs']['level']`` and the text from the rendered children.\n",
"        \"\"\"\n",
"        level = token['attrs']['level']\n",
"        text = self.render_children(token, state)\n",
"        self.headings.append((level, text.strip()))\n",
"        return \"\"\n",
"\n",
"    def block_html(self, token, state):\n",
"        \"\"\"Record each <h1>-<h6> element of a raw HTML block and output nothing.\"\"\"\n",
"        for level, text in self._HTML_HEADING.findall(token['raw']):\n",
"            self.headings.append((int(level), text.strip()))\n",
"        return \"\"\n",
"\n",
"\n",
"def extract_titles_from_notebook_node(nb: NotebookNode):\n",
"    \"\"\"Collect all the headings found in the markdown cells of a notebook.\n",
"\n",
"    The sources of the markdown cells of ``nb`` are concatenated into a single\n",
"    string, which is parsed with a Markdown parser using the HeadingExtractor\n",
"    renderer.\n",
"\n",
"    Returns a list with one ``[level, text, id, href]`` entry per heading, where\n",
"    ``id`` is the text with spaces replaced by dashes and ``href`` is ``'#' + id``\n",
"    (usable as an anchor by HTML converters).\n",
"    \"\"\"\n",
"    # A blank line between cells keeps the last block of one cell from merging\n",
"    # with the first block of the next one.\n",
"    markdown_collection = '\\n\\n'.join(\n",
"        cell.source for cell in nb.cells if cell.cell_type == 'markdown'\n",
"    )\n",
"\n",
"    renderer = HeadingExtractor()\n",
"    extract_titles = mistune.create_markdown(renderer=renderer)\n",
"    extract_titles(markdown_collection)  # run for its side effect on renderer.headings\n",
"\n",
"    titles_array = []\n",
"    for header_level, raw_text in renderer.headings:\n",
"        title_id = raw_text.replace(' ', '-')\n",
"        href = '#' + title_id\n",
"        titles_array.append([header_level, raw_text, title_id, href])\n",
"    return titles_array\n",
"\n",
"\n",
"# Demo on a small in-memory notebook: the function needs a notebook argument.\n",
"demo_nb = NotebookNode(\n",
"    cells=[\n",
"        NotebookNode(cell_type='markdown', source='# Main title'),\n",
"        NotebookNode(cell_type='markdown', source='## paragraph 1'),\n",
"        NotebookNode(cell_type='markdown', source='<h2> paragraph 2 </h2>'),\n",
"        NotebookNode(cell_type='markdown', source='<h2>\\nparagraph3\\n</h2>'),\n",
"        NotebookNode(cell_type='code', source='x = 1  # not a heading'),\n",
"    ]\n",
")\n",
"extract_titles_from_notebook_node(demo_nb)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11d24acf-e8ba-4cf4-9443-c505d5687811",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 8623ce8

Please sign in to comment.