Skip to content
This repository was archived by the owner on Mar 1, 2024. It is now read-only.

feat(GenomeAnnotationReader): Reader of NCBI genome annotations from the nuccore database #676

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.gitignore

*.egg-info/
.modules

Expand All @@ -8,4 +10,6 @@
.idea/
llama-hub.iml
llamahub/
img_cache/
img_cache/

teste.py
22 changes: 22 additions & 0 deletions llama_hub/genome/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# NCBI genome annotation Loader

This loader fetches genome annotations from NCBI's nuccore database using the Entrez class of Biopython.

## Usage

To use this loader, simply pass a species name and email to `load_data`:

```python
from llama_hub.genome import GenomeAnnotationReader

species = 'Homo sapiens'
email = 'your_email@example.com'

loader = GenomeAnnotationReader()
documents = loader.load_data(
email = email,
species = species
)
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
4 changes: 4 additions & 0 deletions llama_hub/genome/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""Init file."""
from llama_hub.genome.base import GenomeAnnotationReader

__all__ = ["GenomeAnnotationReader"]
58 changes: 58 additions & 0 deletions llama_hub/genome/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Genome reader."""

from typing import List
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class GenomeAnnotationReader(BaseReader):
    """Genome Reader.

    Read genome annotation records from NCBI's nuccore database using
    Biopython's Entrez utilities.
    """

    def load_data(
        self,
        species: str,
        email: str,
        rettype: str = "gb",
        retmode: str = "text",
    ) -> List[Document]:
        """Load GenBank genome annotations from NCBI's nuccore database.

        Args:
            species (str): Organism query, e.g. "Homo sapiens".
            email (str): Contact email required by NCBI Entrez,
                e.g. "your_email@example.com".
            rettype (str): Entrez retrieval type (default "gb" = GenBank flat file).
            retmode (str): Entrez retrieval mode (default "text").

        Returns:
            List[Document]: One Document per fetched annotation record;
            an empty list when no records are found or a fetch fails.
        """
        from Bio import Entrez

        # NCBI requires a contact email on every Entrez request.
        Entrez.email = email

        try:
            # Search nuccore for records matching the organism.
            handle = Entrez.esearch(db="nuccore", term=f"{species} [Organism]")
            record = Entrez.read(handle)
            handle.close()

            id_list = record["IdList"]
            if not id_list:
                print(f"No records found for species: {species}")
                return []

            annotations = []

            # Fetch each matching annotation record individually.
            for record_id in id_list:
                handle = Entrez.efetch(
                    db="nuccore", id=record_id, rettype=rettype, retmode=retmode
                )
                annotation_text = handle.read()
                handle.close()
                annotations.append(Document(text=annotation_text))

            return annotations

        except Exception as e:
            # Best-effort loader: report the failure and return no documents
            # rather than propagating network/parse errors to the caller.
            print(f"An error occurred: {e}")
            return []


if __name__ == "__main__":
    # Example invocation. `species` and `email` are required parameters;
    # replace the placeholder email with a real contact address before running,
    # since NCBI requires it for Entrez requests.
    reader = GenomeAnnotationReader()
    print(
        reader.load_data(
            species="Homo sapiens",
            email="your_email@example.com",
        )
    )
1 change: 1 addition & 0 deletions llama_hub/genome/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
biopython==1.81
11 changes: 10 additions & 1 deletion llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -1096,6 +1096,15 @@
"web reader"
]
},
"GenomeAnnotationReader": {
"id": "genome",
"author": "acpguedes",
"keywords":[
"ncbi",
"annotation",
"genome"
]
},
"EarningsCallTranscript":{
"id":"earnings_call_transcript",
"author": "Athe-kunal",
Expand All @@ -1109,4 +1118,4 @@
"id": "opensearch",
"author": "chnsagitchen"
}
}
}
12 changes: 8 additions & 4 deletions llama_hub/tools/notebooks/waii.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
" # API Key of Waii (not OpenAI API key)\n",
" api_key=\"3a...\",\n",
" # Which database you want to use, you need add the db connection to Waii first\n",
" database_key=\"snowflake://...\"\n",
" database_key=\"snowflake://...\",\n",
")"
]
},
Expand All @@ -39,10 +39,12 @@
"from llama_index import VectorStoreIndex\n",
"\n",
"# Use as Data Loader, load data to index and query it\n",
"documents = waii_tool.load_data('Get all tables with their number of columns')\n",
"documents = waii_tool.load_data(\"Get all tables with their number of columns\")\n",
"index = VectorStoreIndex.from_documents(documents).as_query_engine()\n",
"\n",
"index.query('Which table contains most columns, tell me top 5 tables with number of columns?').response"
"index.query(\n",
" \"Which table contains most columns, tell me top 5 tables with number of columns?\"\n",
").response"
]
},
{
Expand All @@ -56,7 +58,9 @@
"from llama_index.agent import OpenAIAgent\n",
"from llama_index.llms import OpenAI\n",
"\n",
"agent = OpenAIAgent.from_tools(waii_tool.to_tool_list(), llm=OpenAI(model='gpt-4-1106-preview'), verbose=True)"
"agent = OpenAIAgent.from_tools(\n",
" waii_tool.to_tool_list(), llm=OpenAI(model=\"gpt-4-1106-preview\"), verbose=True\n",
")"
]
},
{
Expand Down
Loading