Skip to content

Commit

Permalink
Backport PR #712: Handle Single Files and also enable html, pdf file …
Browse files Browse the repository at this point in the history
…formats for /learn (#723)

Co-authored-by: Sanjiv Das <srdas@scu.edu>
  • Loading branch information
meeseeksmachine and srdas authored Apr 10, 2024
1 parent f18be1d commit 011c0ea
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 16 deletions.
54 changes: 38 additions & 16 deletions packages/jupyter-ai/jupyter_ai/document_loaders/directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,25 @@
from typing import List

import dask
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import TextSplitter
from pypdf import PdfReader


# Uses pypdf which is used by PyPDFLoader from langchain
def pdf_to_text(path):
    """Return the text of every page of the PDF at `path` as one string.

    The file is opened with pypdf's ``PdfReader``; the text extracted from
    each page is joined, in page order, with a separator made of a newline,
    a single space, and another newline.
    """
    pdf = PdfReader(path)

    page_texts = []
    for page in pdf.pages:
        page_texts.append(page.extract_text())

    separator = "\n \n"
    return separator.join(page_texts)


def path_to_doc(path):
with open(str(path)) as f:
text = f.read()
if os.path.splitext(path)[1].lower() == ".pdf":
text = pdf_to_text(path)
else:
text = f.read()
m = hashlib.sha256()
m.update(text.encode("utf-8"))
metadata = {"path": str(path), "sha256": m.digest(), "extension": path.suffix}
Expand All @@ -37,6 +49,8 @@ def path_to_doc(path):
".jsx",
".tsx",
".txt",
".html",
".pdf",
}


Expand All @@ -51,21 +65,29 @@ def flatten(*chunk_lists):
def split(path, all_files: bool, splitter):
    """Build a lazy dask task that loads and chunks the supported files at `path`.

    Args:
        path: Either a single file or a directory that is walked recursively.
        all_files: When false, hidden files and directories (names starting
            with ".") and directories listed in ``EXCLUDE_DIRS`` are skipped
            while walking a directory. A path that points directly at a
            single file is always considered, whatever its name.
        splitter: Passed through to ``split_document`` for every document.

    Returns:
        A ``dask.delayed`` object wrapping ``flatten(*chunks)``, where each
        chunk is the delayed result of ``split_document`` for one file whose
        extension (compared case-insensitively) is in ``SUPPORTED_EXTS``.
        Nothing is read from disk until the returned object is computed.
    """
    chunks = []

    # Lower-case the supported extensions once, instead of once per file, so
    # that extension comparisons are not case sensitive.
    supported_exts = {ext.lower() for ext in SUPPORTED_EXTS}

    # Gather candidate files first so that the single-file case and every
    # directory visited by os.walk go through the same per-file logic below.
    candidates = []
    if os.path.isfile(path):
        # The path points to a single file.
        candidates.append(Path(path))
    else:
        for dirpath, subdirs, filenames in os.walk(path):
            # Filter out hidden filenames, hidden directories, and excluded
            # directories, unless "all files" are requested.
            if not all_files:
                # Assign in place so os.walk does not descend into the
                # pruned directories.
                subdirs[:] = [
                    d for d in subdirs if not (d[0] == "." or d in EXCLUDE_DIRS)
                ]
                filenames = [f for f in filenames if not f[0] == "."]

            candidates.extend(
                Path(os.path.join(dirpath, filename)) for filename in filenames
            )

    for filepath in candidates:
        if filepath.suffix.lower() not in supported_exts:
            continue

        document = dask.delayed(path_to_doc)(filepath)
        chunk = dask.delayed(split_document)(document, splitter)
        chunks.append(chunk)

    flattened_chunks = dask.delayed(flatten)(*chunks)
    return flattened_chunks
Expand Down
1 change: 1 addition & 0 deletions packages/jupyter-ai/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ dependencies = [
"typing_extensions>=4.5.0",
"traitlets>=5.0",
"deepmerge>=1.0",
"pypdf==4.1.0",
]

dynamic = ["version", "description", "authors", "urls", "keywords"]
Expand Down

0 comments on commit 011c0ea

Please sign in to comment.