Skip to content

Commit

Permalink
Merge pull request #7 from AnswerDotAI/base64
Browse files (browse the repository at this point in the history)
Exclude images from context?
  • Loading branch information
jph00 authored Sep 12, 2024
2 parents bdd7b4e + 97556f3 commit 4aef130
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 2 deletions.
3 changes: 2 additions & 1 deletion llms_txt/core.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -70,7 +70,8 @@ def _doc(kw):
"Create a `Doc` FT object with the text retrieved from `url` as the child, and `kw` as attrs."
url = kw.pop('url')
re_comment = re.compile('^<!--.*-->$', flags=re.MULTILINE)
txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o)]
re_base64_img = re.compile(r'<img[^>]*src="data:image/[^"]*"[^>]*>')
txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o) and not re_base64_img.search(o)]
return Doc('\n'.join(txt), **kw)

# %% ../nbs/01_core.ipynb
Expand Down
3 changes: 2 additions & 1 deletion nbs/01_core.ipynb
Original file line number | Diff line number | Diff line change
Expand Up @@ -684,7 +684,8 @@
" \"Create a `Doc` FT object with the text retrieved from `url` as the child, and `kw` as attrs.\"\n",
" url = kw.pop('url')\n",
" re_comment = re.compile('^<!--.*-->$', flags=re.MULTILINE)\n",
" txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o)]\n",
" re_base64_img = re.compile(r'<img[^>]*src=\"data:image/[^\"]*\"[^>]*>')\n",
" txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o) and not re_base64_img.search(o)]\n",
" return Doc('\\n'.join(txt), **kw)"
]
},
Expand Down

0 comments on commit 4aef130

Please sign in to comment.