Commit

Merge pull request #3 from Surajrs812/dev
Dev
balajivis authored Sep 27, 2024
2 parents (21fdde3 + abe7426) · commit 7acaba6
Showing 10 changed files with 126 additions and 46 deletions.
6 changes: 3 additions & 3 deletions config/config.yaml
@@ -1,7 +1,7 @@
cloud_provider: "groq" #"ollama"
llm_provider: "groq" #"ollama"
llm_model: "llama-3.1-70b-versatile" #gemma:2b"
embedding_model: "mxbai-embed-large"
llm_provider: "ollama" #"ollama"
llm_model: "gemma2:2b" #"llama-3.1-70b-versatile" #gemma:2b"
embedding_model: "nomic-embed-text:latest"

token_limit: 1000
target_words: 100
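For context, a minimal sketch of how the new keys might be consumed — it mirrors the `ModelManager` changes later in this commit, but the snippet itself is illustrative and not part of the diff (the embedding wiring in particular is an assumption):

```python
import yaml
from langchain_ollama import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings

# Illustrative only: read the keys shown above and pick the local Ollama backends.
with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

if config["llm_provider"] == "ollama":
    llm = ChatOllama(model=config["llm_model"], temperature=0)        # gemma2:2b
    embeddings = OllamaEmbeddings(model=config["embedding_model"])    # nomic-embed-text:latest
```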
Binary file modified reports/final_report.pdf
Binary file not shown.
Binary file modified reports/umap_clusters.png
11 changes: 9 additions & 2 deletions src/doc_loaders/doc_loader.py
@@ -1,4 +1,8 @@
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader, DirectoryLoader
# from .multimedia_loader import MultimediaLoader
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DocumentLoader:
def __init__(self, source: str, type: str, filter: str = None):
@@ -34,8 +38,11 @@ def load(self) -> None:
self._load_webpage()
elif self.type == "directory":
self._load_directory()
else:
raise ValueError("Invalid source. Must be a PDF file or a URL.")
# else:
# loader = MultimediaLoader(self.source)
# loader()



def _load_pdf(self) -> None:
"""
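A rough usage sketch of the dispatch in `load()`; the constructor signature `(source, type, filter)` comes from the diff, while the example source is just a placeholder:

```python
from src.doc_loaders.doc_loader import DocumentLoader

# Hypothetical call: "web" routes to _load_webpage(), while "pdf" and
# "directory" route to their own helpers, per the dispatch shown above.
loader = DocumentLoader(source="https://mitrarobot.com", type="web")
loader.load()
```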
4 changes: 3 additions & 1 deletion src/doc_loaders/multimedia_loader.py
@@ -1,11 +1,13 @@
# We will assume that the models are cached already

import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
import tempfile
from tqdm import tqdm
import logging
from src.chunking.audiochunking import split_audio
import whisper
import fast_whisper
from pytubefix import YouTube

logging.basicConfig(level=logging.INFO)
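The new imports (`pytubefix`, `whisper`, the audio chunker) point at a download-then-transcribe flow. A hedged sketch of that flow — the loader's real method names and the Whisper model size are not shown in the diff, so everything below is illustrative:

```python
import tempfile
import whisper
from pytubefix import YouTube

def transcribe_youtube(url: str) -> str:
    """Illustrative only: fetch the audio-only stream and run Whisper over it."""
    with tempfile.TemporaryDirectory() as tmpdir:
        stream = YouTube(url).streams.filter(only_audio=True).first()
        audio_path = stream.download(output_path=tmpdir)
        model = whisper.load_model("base")  # model size is an assumption
        return model.transcribe(audio_path)["text"]
```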
74 changes: 43 additions & 31 deletions src/models/models.py
@@ -4,8 +4,10 @@
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_community.embeddings import OllamaEmbeddings
import openai
import ollama
# import openai
from langchain_ollama import ChatOllama
# from langchain_openai import OpenAI
from langchain_openai import AzureOpenAI

# Set up logging
logging.basicConfig(level=logging.INFO)
@@ -18,6 +20,7 @@ def __init__(self, config_path):
:param config_path: Path to the YAML configuration file.
"""
load_dotenv()

try:
with open(config_path, 'r') as file:
self.config = yaml.safe_load(file)
@@ -26,81 +29,90 @@ def __init__(self, config_path):
logger.error("Failed to load configuration: %s", e)
raise

self.llm_groq = None
self.llm = None
self.embedding_model = None

if self.config['cloud_provider'] == 'groq':
if self.config['llm_provider'] == 'groq':
self.load_llm_groq()

if self.config['cloud_provider'] == 'ollama':
if self.config['llm_provider'] == 'ollama':
self.load_llm_ollama()

if self.config['cloud_provider'] == 'openai':
if self.config['llm_provider'] == 'openai':
self.load_llm_openai()


def get_llm_response(self, prompt):
if self.config['cloud_provider'] == 'groq':
return self.llm_groq.invoke(prompt).content
if self.config['cloud_provider'] == 'ollama':
return self.llm_ollama(prompt)
if self.config['cloud_provider'] == 'openai':
return self.llm_openai(prompt)
# def get_llm_response(self):
# if self.config['llm_provider'] == 'groq':
# return self.llm_groq.invoke(prompt).content
# if self.config['llm_provider'] == 'ollama':
# return self.llm_ollama(prompt)
# if self.config['llm_provider'] == 'openai':
# return self.llm_openai(prompt)

def load_llm_groq(self):
"""
Lazily loads the LLM Groq model based on the configuration if it hasn't been loaded yet.
:return: The loaded LLM Groq model.
"""
if not self.llm_groq:
if not self.llm:
try:
logger.info("Loading Groq LLM model...")
self.llm_groq = ChatGroq(
self.llm = ChatGroq(
model_name=self.config['llm_model'],
api_key=os.getenv("GROQ_API_KEY")
)
logger.info("Groq LLM model loaded successfully.")

except KeyError as e:
logger.error("Missing required config key for LLM: %s", e)
raise
except Exception as e:
logger.error("Error loading Groq LLM model: %s", e)
raise

def load_llm_openai(self,prompt):
def load_llm_openai(self):
"""
Loads the OpenAI model.
:return: The loaded OpenAI model.
"""
if not self.llm:
try:
logger.info("Loading OpenAI model...")
openai.api_key = os.getenv("OPENAI_API_KEY")

completion = openai.ChatCompletion.create(
model=self.config['llm_model'], # Make sure the model is set in config
messages=prompt
)
# openai.api_key = os.getenv("OPENAI_API_KEY")
# os.environ["OPENAI_API_VERSION"] = "2023-12-01-preview"
# os.environ["AZURE_OPENAI_ENDPOINT"] = "..."
# os.environ["AZURE_OPENAI_API_KEY"] = "..."
print("AZURE_OPENAI_DEPLOYMENT:", os.getenv("AZURE_OPENAI_DEPLOYMENT"))
load_dotenv()
# print("AZURE_OPENAI_DEPLOYMENT:", os.getenv("AZURE_OPENAI_DEPLOYMENT"))
self.llm = AzureOpenAI(
deployment_name=os.getenv("AZURE_OPENAI_DEPLOYMENT"), # The deployment name of the model
)
logger.info("OpenAI model loaded successfully.")
return completion.choices[0].message['content']

except KeyError as e:
logger.error("Missing required config key for OpenAI LLM: %s", e)
raise
except Exception as e:
logger.error("Error loading OpenAI LLM model: %s", e)
raise

def load_llm_ollama(self,prompt):
def load_llm_ollama(self):
"""
Loads the Ollama LLM model.
:return: The loaded Ollama LLM model.
"""
if not self.llm:
try:
logger.info("Loading Ollama LLM model...")
response = ollama.chat(model=self.config['llm_model'], messages=prompt)

self.llm = ChatOllama(
model=self.config['llm_model'],
temperature=0,
)
logger.info("Ollama LLM model loaded successfully.")
return response['message']['content']

except KeyError as e:
logger.error("Missing required config key for Ollama LLM: %s", e)
raise
@@ -132,12 +144,12 @@ def count_tokens(self, text):
:param text: The text to count tokens for.
:return: Number of tokens in the text.
"""
if not self.llm_groq:
if not self.llm:
logger.warning("Groq LLM model is not loaded. Loading the model first...")
self.load_llm_groq()

try:
num_tokens = self.llm_groq.get_num_tokens(text)
num_tokens = self.llm.get_num_tokens(text)
logger.info("Successfully counted %d tokens for the given text.", num_tokens)
return num_tokens
except Exception as e:
@@ -151,9 +163,9 @@ def count_tokens(self, text):
model_manager = ModelManager('config/config.yaml')
model_manager.load_embedding_model()

print("Token count:", model_manager.count_tokens("Hello world."))
print("Groq LLM Response:", model_manager.llm_groq.invoke("Hello world."))
# print("Token count:", model_manager.count_tokens("Hello world."))
print("Groq LLM Response:", model_manager.llm.invoke("Hello world."))
print("Embedding Result:", model_manager.embedding_model.embed_documents(["Hello world."]))
print("LLM content:", model_manager.get_llm_response("Hello world."))
# print("LLM content:", model_manager.get_llm_response("Hello world."))
except Exception as e:
logger.error("An error occurred: %s", e)
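With the refactor, every provider path ends up on a single `self.llm`, so callers use the LangChain interface directly. A hedged usage sketch — the Azure/OpenAI path additionally expects `AZURE_OPENAI_DEPLOYMENT`, `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY` and `OPENAI_API_VERSION` in the environment, as the commented-out lines above suggest:

```python
from src.models.models import ModelManager

manager = ModelManager("config/config.yaml")   # provider chosen from llm_provider in the YAML
reply = manager.llm.invoke("Hello world.")
# For the chat backends (Groq, Ollama) the text is on .content; the Azure
# completion-style class returns a plain string instead.
print(getattr(reply, "content", reply))
print(manager.count_tokens("Hello world."))    # counts against whichever model is loaded
```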
55 changes: 55 additions & 0 deletions src/models/test.py
@@ -0,0 +1,55 @@
# from langchain_text_splitters import TokenTextSplitter

# text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

# texts = text_splitter.split_text("heloo welcone otd jvbk =works")
# print(texts)

from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama

template = """Question: {question}
Answer: Let's think step by step."""

prompt = ChatPromptTemplate.from_template(template)
llm = ChatOllama(
model="gemma2:2b",
temperature=0,
)



print(llm.invoke([
(
"system",
"You are a helpful assistant that translates English to French. Translate the user sentence.",
),
("human", "I love programming."),
]))
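One note on the return type, since the `summarize.py` changes below depend on it: `invoke()` on a chat model hands back a message object, so the generated text has to be read off `.content` — e.g. (illustrative):

```python
# Reuses the `prompt` and `llm` defined above; the question is just an example.
msg = llm.invoke(prompt.format_messages(question="What is 2 + 2?"))
print(msg.content)
```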
12 changes: 6 additions & 6 deletions src/summarize.py
@@ -88,7 +88,7 @@ def __call__(self, source: str, type: str) -> dict:
self.combined_content = " ".join(cluster_content.values())
prompt = self.prompts['create_summary_prompt'].format(combined_content=self.combined_content)

final_summary = self.model_manager.get_llm_response(prompt)
final_summary = self.model_manager.llm.invoke(prompt).content

# Step 7: Perform analysis on the document
chunk_words, total_chunks, total_words, total_tokens, tokens_sent_tokens = self.get_analysis()
@@ -133,7 +133,7 @@ def find_suitable_theme(self, chunk_text):
"""
prompt = self.prompts['find_suitable_theme_prompt'].format(chunk_text=chunk_text)
logger.info("Finding suitable theme for chunk: %s", chunk_text)
return self.model_manager.get_llm_response(prompt)
return self.model_manager.llm.invoke(prompt).content

def find_themes_for_clusters_slow(self, chunks, representatives):
"""
@@ -192,7 +192,7 @@ def find_themes_for_clusters(self, chunks, representatives):
prompt = self.prompts['find_suitable_theme_prompt_multiple'].format(first_representative_chunk=first_representative_chunk)

# Step 3: Call the LLM once for all clusters
response = self.model_manager.get_llm_response(prompt)
response = self.model_manager.llm.invoke(prompt).content


print(response)
@@ -232,9 +232,9 @@ def main():
summarizer = Summarizer(config_path)
print(summarizer.find_suitable_theme("Who is John Galt!"))

#data = summarizer('https://mitrarobot.com',"web")
data = summarizer('https://abc7.com/read-harris-trump-presidential-debate-transcript/15289001/','web')
#data = summarizer('https://www.whitehouse.gov/state-of-the-union-2024/',"web")
data = summarizer('https://medium.com/@balajivis/whats-so-challenging-about-building-chatbots-drawing-lessons-from-the-trenches-1ca7343c6e3d',"web")
# data = summarizer('https://abc7.com/read-harris-trump-presidential-debate-transcript/15289001/','web')
# data = summarizer('https://www.whitehouse.gov/state-of-the-union-2024/',"web")
#data = summarizer('https://d18rn0p25nwr6d.cloudfront.net/CIK-0001921963/77018dae-bae9-4c33-8eaf-fa6685991719.pdf',"pdf")

create_final_report(data,report_path='reports/final_report.pdf')
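The summarizer now calls the LangChain model directly rather than going through the removed `get_llm_response` helper. If the old call sites ever need restoring, a one-line shim on `ModelManager` would do it — a suggestion only, not part of this commit:

```python
def get_llm_response(self, prompt: str) -> str:
    # Hypothetical compatibility shim: the same pattern summarize.py now inlines.
    return self.llm.invoke(prompt).content
```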
4 changes: 4 additions & 0 deletions temp.txt
@@ -1 +1,5 @@
set PYTHONPATH=E:\Invento Projects\BrahmaSumm
clip
kotaemon
detectron
unstructured
6 changes: 3 additions & 3 deletions todo.md
@@ -4,8 +4,8 @@
- [x] Provide a nice output option such as PDF
- [ ] In Config provide option to summarize as small/medium/large
- [ ] Video https://pypi.org/project/audio-extract/
- [X] Youtube, audio
- [ ] Handle pandas dataframe for excel and optionally explore pandas ai
- [x] Youtube, audio
- [x] Handle pandas dataframe for excel and optionally explore pandas ai
- [ ] Use unstructured document loader https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/
- [ ] explore summarization with LIDA https://microsoft.github.io/lida/
- [ ] Table extraction
@@ -14,7 +14,7 @@
- [ ] Handle different loaders -- csv, arxiv
- [ ] Improve clustering with more flexible organization as well as remove outliers
- [ ] Give proper error for various things including not finding GROQ key or not finding the Ollama model
- [ ] Support other models for LLM and embeddings [Openai, Ollama, Anthropic]
- [x] Support other models for LLM and embeddings [Openai, Ollama, Anthropic]
- [ ] Provide smaller summarizer model
- [ ] Multi-document summaries
- [ ] Fix the bug when the target words is large
