Skip to content

Commit

Permalink
Merge pull request #9 from AnswerDotAI/read
Browse files Browse the repository at this point in the history
read_docs walks up parent paths if llms.txt is not found.
  • Loading branch information
jph00 authored Sep 12, 2024
2 parents bc6c377 + 5dbcf65 commit ca13789
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 9 deletions.
36 changes: 31 additions & 5 deletions 03_download.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Markdown,HTML"
"from IPython.display import Markdown,HTML\n",
"from fastcore.test import *"
]
},
{
Expand Down Expand Up @@ -236,7 +237,7 @@
" url = (base+path+fname).strip('/')\n",
" if fname=='/llms.txt': return url\n",
" if Path(fname).suffix in('.md', '.txt', '.rst'): return _tryget(url)\n",
" if '.' in fname: return _tryget(url+'.md')\n",
" if '.' in fname: return _tryget(url+'.md') or find_docs(url[:url.rfind('/')])\n",
" res = _tryget(url+'/llms.txt')\n",
" if res: return res\n",
" res = _tryget(url+'/index.md')\n",
Expand All @@ -245,7 +246,9 @@
" if res: return res\n",
" res = _tryget(url+'/index-commonmark.md')\n",
" if res: return res\n",
" return None"
" parsed_url = urlparse(url)\n",
" if parsed_url.path == '/' or not parsed_url.path: return None\n",
" return find_docs(urljoin(url, '..'))"
]
},
{
Expand Down Expand Up @@ -289,7 +292,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"None\n",
"https://claudette.answer.ai/index.html.md\n",
"https://claudette.answer.ai/index.html.md\n",
"https://claudette.answer.ai/index.html.md\n",
"https://llmstxt.org/llms.txt\n",
Expand All @@ -301,6 +304,30 @@
"for o in urls: print(find_docs(o))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "439546d4",
"metadata": {},
"outputs": [],
"source": [
"suffixes = [\"/\", \"/tmp\", \"/tmp/\", \"/tmp/tmp\", \"/tmp/tmp/\"]\n",
"for suff in suffixes:\n",
" for o in urls: test_eq(find_docs(o), find_docs(o+suff))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07d1b763",
"metadata": {},
"outputs": [],
"source": [
"test_eq(find_docs(\"https://github.com\"), \"https://github.com/llms.txt\")\n",
"test_eq(find_docs(\"https://github.com/AnswerDotAI\"), \"https://github.com/llms.txt\")\n",
"test_eq(find_docs(\"https://github.com/AnswerDotAI/\"), \"https://github.com/llms.txt\")"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -312,7 +339,6 @@
"def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):\n",
" \"If available, return LLM-friendly llms.txt context or markdown file response for `url`\"\n",
" url = find_docs(url)\n",
" if not url: return\n",
" if url.endswith('/llms.txt'): res = get_llmstxt(url, optional=optional, n_workers=n_workers)\n",
" else: res = get(url).text\n",
" return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)"
Expand Down
9 changes: 5 additions & 4 deletions toolslm/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def find_docs(url):
url = (base+path+fname).strip('/')
if fname=='/llms.txt': return url
if Path(fname).suffix in('.md', '.txt', '.rst'): return _tryget(url)
if '.' in fname: return _tryget(url+'.md')
if '.' in fname: return _tryget(url+'.md') or find_docs(url[:url.rfind('/')])
res = _tryget(url+'/llms.txt')
if res: return res
res = _tryget(url+'/index.md')
Expand All @@ -85,13 +85,14 @@ def find_docs(url):
if res: return res
res = _tryget(url+'/index-commonmark.md')
if res: return res
return None
parsed_url = urlparse(url)
if parsed_url.path == '/' or not parsed_url.path: return None
return find_docs(urljoin(url, '..'))

# %% ../03_download.ipynb 19
# %% ../03_download.ipynb 21
def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):
    "Locate docs for `url` via `find_docs` and return the cleaned llms.txt context or markdown text, if any"
    doc_url = find_docs(url)
    # `find_docs` yielded nothing usable: bail out early (implicitly returns None).
    if not doc_url:
        return
    if doc_url.endswith('/llms.txt'):
        # An llms.txt location is handed to `get_llmstxt` along with the caller's options.
        text = get_llmstxt(doc_url, optional=optional, n_workers=n_workers)
    else:
        # Any other location is fetched directly and its response text is used.
        text = get(doc_url).text
    return clean_md(text, rm_comments=rm_comments, rm_details=rm_details)

0 comments on commit ca13789

Please sign in to comment.