diff --git a/llama_hub/github_repo/base.py b/llama_hub/github_repo/base.py
index 42aa3ad904..e45dcf350b 100644
--- a/llama_hub/github_repo/base.py
+++ b/llama_hub/github_repo/base.py
@@ -247,7 +247,7 @@ def _load_data_from_commit(self, commit_sha: str) -> List[Document]:
         print_if_verbose(self._verbose, f"got {len(blobs_and_paths)} blobs")
 
         return self._loop.run_until_complete(
-            self._generate_documents(blobs_and_paths=blobs_and_paths)
+            self._generate_documents(blobs_and_paths=blobs_and_paths, id=commit_sha)
         )
 
     def _load_data_from_branch(self, branch: str) -> List[Document]:
@@ -270,7 +270,7 @@ def _load_data_from_branch(self, branch: str) -> List[Document]:
         print_if_verbose(self._verbose, f"got {len(blobs_and_paths)} blobs")
 
         return self._loop.run_until_complete(
-            self._generate_documents(blobs_and_paths=blobs_and_paths)
+            self._generate_documents(blobs_and_paths=blobs_and_paths, id=branch)
         )
 
     def load_data(
@@ -383,12 +383,14 @@ async def _recurse_tree(
     async def _generate_documents(
         self,
         blobs_and_paths: List[Tuple[GitTreeResponseModel.GitTreeObject, str]],
+        id: str,
     ) -> List[Document]:
         """
         Generate documents from a list of blobs and their full paths.
 
         :param `blobs_and_paths`: list of tuples of
             (tree object, file's full path in the repo realtive to the root of the repo)
+        :param `id`: the branch name or commit sha used when loading the repo
         :return: list of documents
         """
         buffered_iterator = BufferedGitBlobDataIterator(
@@ -447,12 +449,19 @@ async def _generate_documents(
                     f"got {len(decoded_text)} characters"
                     + f"- adding to documents - {full_path}",
                 )
+                # Build the web URL with explicit "/" separators: os.path.join
+                # uses the platform separator (backslash on Windows) and drops
+                # earlier components when a later one is absolute.
+                url = (
+                    f"https://github.com/{self._owner}/{self._repo}"
+                    f"/blob/{id}/{full_path}"
+                )
                 document = Document(
                     text=decoded_text,
                     doc_id=blob_data.sha,
                     extra_info={
                         "file_path": full_path,
                         "file_name": full_path.split("/")[-1],
+                        "url": url,
                     },
                 )
                 documents.append(document)