Merge pull request #9 from AnswerDotAI/read

read_docs walks up parent paths if llms.txt is not found.
AnswerDotAI · Sep 12, 2024 · ca13789
2 parents bc6c377 + 5dbcf65
commit ca13789
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 9 deletions.
diff --git a/03_download.ipynb b/03_download.ipynb
@@ -45,7 +45,8 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "from IPython.display import Markdown,HTML"
+ "from IPython.display import Markdown,HTML\n",
+ "from fastcore.test import *"
  ]
  },
  {
@@ -236,7 +237,7 @@
  " url = (base+path+fname).strip('/')\n",
  " if fname=='/llms.txt': return url\n",
  " if Path(fname).suffix in('.md', '.txt', '.rst'): return _tryget(url)\n",
- " if '.' in fname: return _tryget(url+'.md')\n",
+ " if '.' in fname: return _tryget(url+'.md') or find_docs(url[:url.rfind('/')])\n",
  " res = _tryget(url+'/llms.txt')\n",
  " if res: return res\n",
  " res = _tryget(url+'/index.md')\n",
@@ -245,7 +246,9 @@
  " if res: return res\n",
  " res = _tryget(url+'/index-commonmark.md')\n",
  " if res: return res\n",
- " return None"
+ " parsed_url = urlparse(url)\n",
+ " if parsed_url.path == '/' or not parsed_url.path: return None\n",
+ " return find_docs(urljoin(url, '..'))"
  ]
  },
  {
@@ -289,7 +292,7 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "None\n",
+ "https://claudette.answer.ai/index.html.md\n",
  "https://claudette.answer.ai/index.html.md\n",
  "https://claudette.answer.ai/index.html.md\n",
  "https://llmstxt.org/llms.txt\n",
@@ -301,6 +304,30 @@
  "for o in urls: print(find_docs(o))"
  ]
  },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "439546d4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "suffixes = [\"/\", \"/tmp\", \"/tmp/\", \"/tmp/tmp\", \"/tmp/tmp/\"]\n",
+ "for suff in suffixes:\n",
+ " for o in urls: test_eq(find_docs(o), find_docs(o+suff))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "07d1b763",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_eq(find_docs(\"https://github.com\"), \"https://github.com/llms.txt\")\n",
+ "test_eq(find_docs(\"https://github.com/AnswerDotAI\"), \"https://github.com/llms.txt\")\n",
+ "test_eq(find_docs(\"https://github.com/AnswerDotAI/\"), \"https://github.com/llms.txt\")"
+ ]
+ },
  {
  "cell_type": "code",
  "execution_count": null,
@@ -312,7 +339,6 @@
  "def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):\n",
  " \"If available, return LLM-friendly llms.txt context or markdown file response for `url`\"\n",
  " url = find_docs(url)\n",
- " if not url: return\n",
  " if url.endswith('/llms.txt'): res = get_llmstxt(url, optional=optional, n_workers=n_workers)\n",
  " else: res = get(url).text\n",
  " return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)"

diff --git a/toolslm/download.py b/toolslm/download.py
@@ -76,7 +76,7 @@ def find_docs(url):
  url = (base+path+fname).strip('/')
  if fname=='/llms.txt': return url
  if Path(fname).suffix in('.md', '.txt', '.rst'): return _tryget(url)
- if '.' in fname: return _tryget(url+'.md')
+ if '.' in fname: return _tryget(url+'.md') or find_docs(url[:url.rfind('/')])
  res = _tryget(url+'/llms.txt')
  if res: return res
  res = _tryget(url+'/index.md')
@@ -85,13 +85,14 @@ def find_docs(url):
  if res: return res
  res = _tryget(url+'/index-commonmark.md')
  if res: return res
- return None
+ parsed_url = urlparse(url)
+ if parsed_url.path == '/' or not parsed_url.path: return None
+ return find_docs(urljoin(url, '..'))
 
-# %% ../03_download.ipynb 19
+# %% ../03_download.ipynb 21
 def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):
  "If available, return LLM-friendly llms.txt context or markdown file response for `url`"
  url = find_docs(url)
- if not url: return
  if url.endswith('/llms.txt'): res = get_llmstxt(url, optional=optional, n_workers=n_workers)
  else: res = get(url).text
  return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)