Skip to content
This repository was archived by the owner on Mar 1, 2024. It is now read-only.

feat(GenomeAnnotationReader): Reader of NCBI genome annotations from the nuccore database #676

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.gitignore

*.egg-info/
.modules

Expand All @@ -8,4 +10,6 @@
.idea/
llama-hub.iml
llamahub/
img_cache/
img_cache/

teste.py
22 changes: 22 additions & 0 deletions llama_hub/genome/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# NCBI genome annotation Loader

This loader fetches genome annotations from NCBI's nuccore database using the Entrez class of Biopython.

## Usage

To use this loader, simply pass a species name and email to `load_data`:

```python
from llama_hub.genome import GenomeAnnotationReader

species = 'Homo sapiens'
email = 'your_email@example.com'

loader = GenomeAnnotationReader()
documents = loader.load_data(
email = email,
species = species
)
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
4 changes: 4 additions & 0 deletions llama_hub/genome/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""Init file."""
from llama_hub.genome.base import GenomeAnnotationReader

__all__ = ["GenomeAnnotationReader"]
58 changes: 58 additions & 0 deletions llama_hub/genome/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Genome reader."""

from typing import List
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class GenomeAnnotationReader(BaseReader):
    """Genome Reader.

    Read genome annotation records from NCBI's nuccore database using
    Biopython's Entrez utilities.
    """

    def load_data(
        self,
        species: str,
        email: str,
        rettype: str = "gb",
        retmode: str = "text",
    ) -> List[Document]:
        """Load GenBank genome annotations from NCBI's nuccore database.

        Args:
            species (str): Organism query, e.g. "Homo sapiens".
            email (str): Contact email required by NCBI Entrez,
                e.g. "your_email@example.com".
            rettype (str): Entrez retrieval type (default "gb" = GenBank flat file).
            retmode (str): Entrez retrieval mode (default "text").

        Returns:
            List[Document]: One Document per fetched annotation record;
            an empty list when no records are found or a fetch fails.
        """
        from Bio import Entrez

        # NCBI requires a contact email on every Entrez request.
        Entrez.email = email

        try:
            # Search nuccore for records matching the organism.
            handle = Entrez.esearch(db="nuccore", term=f"{species} [Organism]")
            record = Entrez.read(handle)
            handle.close()

            id_list = record["IdList"]
            if not id_list:
                print(f"No records found for species: {species}")
                return []

            annotations = []

            # Fetch each matching annotation record individually.
            for record_id in id_list:
                handle = Entrez.efetch(
                    db="nuccore", id=record_id, rettype=rettype, retmode=retmode
                )
                annotation_text = handle.read()
                handle.close()
                annotations.append(Document(text=annotation_text))

            return annotations

        except Exception as e:
            # Best-effort loader: report the failure and return no documents
            # rather than propagating network/parse errors to the caller.
            print(f"An error occurred: {e}")
            return []


if __name__ == "__main__":
    # Example invocation. `species` and `email` are required parameters;
    # replace the placeholder email with a real contact address before running,
    # since NCBI requires it for Entrez requests.
    reader = GenomeAnnotationReader()
    print(
        reader.load_data(
            species="Homo sapiens",
            email="your_email@example.com",
        )
    )
1 change: 1 addition & 0 deletions llama_hub/genome/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
biopython==1.81
11 changes: 10 additions & 1 deletion llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -1096,6 +1096,15 @@
"web reader"
]
},
"GenomeAnnotationReader": {
"id": "genome",
"author": "acpguedes",
"keywords":[
"ncbi",
"annotation",
"genome"
]
},
"EarningsCallTranscript":{
"id":"earnings_call_transcript",
"author": "Athe-kunal",
Expand All @@ -1109,4 +1118,4 @@
"id": "opensearch",
"author": "chnsagitchen"
}
}
}
12 changes: 8 additions & 4 deletions llama_hub/tools/notebooks/waii.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
" # API Key of Waii (not OpenAI API key)\n",
" api_key=\"3a...\",\n",
" # Which database you want to use, you need add the db connection to Waii first\n",
" database_key=\"snowflake://...\"\n",
" database_key=\"snowflake://...\",\n",
")"
]
},
Expand All @@ -39,10 +39,12 @@
"from llama_index import VectorStoreIndex\n",
"\n",
"# Use as Data Loader, load data to index and query it\n",
"documents = waii_tool.load_data('Get all tables with their number of columns')\n",
"documents = waii_tool.load_data(\"Get all tables with their number of columns\")\n",
"index = VectorStoreIndex.from_documents(documents).as_query_engine()\n",
"\n",
"index.query('Which table contains most columns, tell me top 5 tables with number of columns?').response"
"index.query(\n",
" \"Which table contains most columns, tell me top 5 tables with number of columns?\"\n",
").response"
]
},
{
Expand All @@ -56,7 +58,9 @@
"from llama_index.agent import OpenAIAgent\n",
"from llama_index.llms import OpenAI\n",
"\n",
"agent = OpenAIAgent.from_tools(waii_tool.to_tool_list(), llm=OpenAI(model='gpt-4-1106-preview'), verbose=True)"
"agent = OpenAIAgent.from_tools(\n",
" waii_tool.to_tool_list(), llm=OpenAI(model=\"gpt-4-1106-preview\"), verbose=True\n",
")"
]
},
{
Expand Down
Loading