Skip to content

Commit

Permalink
feat: beta v2 working with syllabus ocr new extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
kaloslazo committed Oct 11, 2024
1 parent 0984815 commit 204d064
Show file tree
Hide file tree
Showing 25 changed files with 296 additions and 73 deletions.
Binary file modified app/__pycache__/config.cpython-312.pyc
Binary file not shown.
Binary file modified app/__pycache__/main.cpython-312.pyc
Binary file not shown.
2 changes: 1 addition & 1 deletion app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ class Config:
# Configuraciones actualizadas para Qdrant
QDRANT_URL = os.getenv('QDRANT_URL')
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_COLLECTION_NAME = 'syllabus_collection'
QDRANT_COLLECTION_NAME = os.getenv('QDRANT_COLLECTION_NAME')

config = Config()
Binary file added app/data/raw/syllabus_pdfs/AD1003.pdf
Binary file not shown.
Binary file added app/data/raw/syllabus_pdfs/AD1004.pdf
Binary file not shown.
Binary file added app/data/raw/syllabus_pdfs/AD1102.pdf
Binary file not shown.
Binary file added app/data/raw/syllabus_pdfs/AD1103.pdf
Binary file not shown.
Binary file added app/data/raw/syllabus_pdfs/CC1106.pdf
Binary file not shown.
Binary file added app/data/raw/syllabus_pdfs/CS1111.pdf
Binary file not shown.
Binary file added app/data/raw/syllabus_pdfs/HH1102.pdf
Binary file not shown.
Binary file added app/data/raw/syllabus_pdfs/PI2102.pdf
Binary file not shown.
7 changes: 0 additions & 7 deletions app/data/syllabus_data.csv

This file was deleted.

9 changes: 9 additions & 0 deletions app/data/syllabus_extracted.csv

Large diffs are not rendered by default.

98 changes: 98 additions & 0 deletions app/functions/process_pdf_pcr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import os
import re
import csv
import logging
from typing import List, Dict
from pdfminer.high_level import extract_text

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class SilaboExtractor:
    """Extract structured fields from syllabus PDFs and export them to a CSV file.

    Every ``.pdf`` file found directly inside ``pdf_directory`` is converted to
    text, a fixed set of fields is pulled out with regular expressions, and one
    CSV row per successfully processed file is written to ``output_file``.
    """

    # Column order of the generated CSV.
    _FIELDNAMES = [
        'Archivo', 'Carrera', 'Curso', 'Malla', 'Modalidad', 'Creditos',
        'Objetivos', 'Competencias', 'Resultados de Aprendizaje',
        'Temas', 'Sistema de Evaluación', 'Referencias Bibliográficas',
    ]

    # Single-line header fields: (CSV column, pattern whose group 1 is the value).
    _HEADER_PATTERNS = [
        ("Carrera", r"(?:CARRERA|DEPARTAMENTO|DIRECCIÓN)\s*:?\s*(.*?)(?:\n|$)"),
        ("Curso", r"(?:CURSO|ASIGNATURA)\s*:?\s*(.*?)(?:\n|$)"),
        ("Malla", r"(?:MALLA|AÑO)\s*:?\s*(.*?)(?:\n|$)"),
        ("Modalidad", r"(?:MODALIDAD|2\.7\s*Modalidad:)\s*:?\s*(.*?)(?:\n|$)"),
        ("Creditos", r"(?:CREDITOS|CRÉDITOS|2\.2\s*Créditos:)\s*:?\s*(.*?)(?:\n|$)"),
    ]

    # Individual objective/session lines; matched case-sensitively (no flags).
    _OBJECTIVE_LINE = r"(?:Sesión|Objetivo)\s*\d*\s*:?\s*(.*?)(?:\n|$)"

    # Fallback used when no individual objective line is found.
    _OBJECTIVES_SECTION = r"(?:OBJETIVOS|4\.\s*OBJETIVOS)(.*?)(?:\d+\.\s*COMPETENCIAS|\Z)"

    # Multi-line sections: each runs from its heading to the next numbered
    # heading (or to the end of the text).
    _SECTION_PATTERNS = [
        ('Competencias', r"(?:COMPETENCIAS[^:]*:|5\.\s*COMPETENCIAS)(.*?)(?:\d+\.\s*RESULTADOS|\Z)"),
        ('Resultados de Aprendizaje', r"(?:RESULTADOS DE APRENDIZAJE|6\.\s*RESULTADOS)(.*?)(?:\d+\.\s*TEMAS|\Z)"),
        ('Temas', r"(?:TEMAS|7\.\s*TEMAS)(.*?)(?:\d+\.\s*PLAN|\Z)"),
        ('Sistema de Evaluación', r"(?:SISTEMA DE EVALUACIÓN|9\.\s*SISTEMA)(.*?)(?:\d+\.\s*REFERENCIAS|\Z)"),
        ('Referencias Bibliográficas', r"(?:REFERENCIAS BIBLIOGRÁFICAS|10\.\s*REFERENCIAS)(.*?)(?:\Z)"),
    ]

    def __init__(self, pdf_directory: str, output_file: str):
        """Store the input directory and the destination CSV path."""
        self.pdf_directory = pdf_directory
        self.output_file = output_file

    def clean_text(self, text: str) -> str:
        """Return ``text`` collapsed onto a single line.

        Runs of periods and/or newlines become one space, any remaining run of
        whitespace becomes one space, and the result is stripped.
        NOTE: every period is removed, including sentence periods and decimal
        points, not only dot leaders.
        """
        text = re.sub(r'[.\n]+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_field(self, text: str, pattern: str) -> str:
        """Return the cleaned first capture group of ``pattern`` in ``text``.

        The search is case-insensitive and ``.`` also matches newlines. When
        nothing matches, a warning is logged and ``"No encontrado"`` is returned.
        """
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        if match is None:
            logging.warning(f"No se encontró coincidencia para el patrón '{pattern}'")
            return "No encontrado"
        return self.clean_text(match.group(1))

    def extract_info(self, text: str) -> Dict[str, str]:
        """Build the field dictionary for one syllabus from its plain text.

        Returns:
            A dict keyed by CSV column name (every column except ``Archivo``).
        """
        info = {name: self.extract_field(text, pattern)
                for name, pattern in self._HEADER_PATTERNS}

        # Objective lines are normalised like every other field, and blank
        # captures are dropped so the joined value has no empty items.
        objectives = [
            cleaned
            for cleaned in map(self.clean_text, re.findall(self._OBJECTIVE_LINE, text))
            if cleaned
        ]
        if objectives:
            info['Objetivos'] = '; '.join(objectives)
        else:
            info['Objetivos'] = self.extract_field(text, self._OBJECTIVES_SECTION)

        for name, pattern in self._SECTION_PATTERNS:
            info[name] = self.extract_field(text, pattern)

        return info

    def process_pdf(self, pdf_path: str) -> Dict[str, str]:
        """Extract the fields of a single PDF.

        Any failure while reading or parsing is logged and reported as an empty
        dict, so one bad file does not abort a whole directory run.
        """
        try:
            text = extract_text(pdf_path)
            return self.extract_info(text)
        except Exception as e:
            logging.error(f"Error procesando {pdf_path}: {e}")
            return {}

    def process_directory(self) -> List[Dict[str, str]]:
        """Process every PDF in ``pdf_directory`` and return one dict per file.

        Files are visited in sorted name order so the output is reproducible
        (``os.listdir`` makes no ordering promise), and the extension check is
        case-insensitive so ``.PDF`` files are not skipped. Files whose
        extraction yields nothing are left out of the result.
        """
        results = []
        for filename in sorted(os.listdir(self.pdf_directory)):
            if not filename.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(self.pdf_directory, filename)
            logging.info(f"Procesando: {pdf_path}")
            result = self.process_pdf(pdf_path)
            if result:
                result['Archivo'] = filename
                results.append(result)
        return results

    def save_to_csv(self, data: List[Dict[str, str]]):
        """Write ``data`` to ``output_file`` as UTF-8 CSV with a header row.

        Nothing is written (only a warning is logged) when ``data`` is empty.
        Columns missing from a row are written as empty cells.
        """
        if not data:
            logging.warning("No hay datos para guardar en el CSV")
            return

        # newline='' lets the csv module control line endings itself.
        with open(self.output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self._FIELDNAMES)
            writer.writeheader()
            writer.writerows(data)

        logging.info(f"Datos guardados en {self.output_file}")

    def run(self):
        """Process the whole directory and save the result to the CSV file."""
        logging.info("Iniciando procesamiento de sílabos")
        data = self.process_directory()
        self.save_to_csv(data)
        logging.info("Proceso finalizado")

if __name__ == "__main__":
    # Script entry point: read the bundled syllabus PDFs and regenerate the
    # extracted CSV next to the other data files.
    SilaboExtractor(
        "./app/data/raw/syllabus_pdfs",
        "./app/data/syllabus_extracted.csv",
    ).run()
16 changes: 8 additions & 8 deletions app/main.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import logging
from fastapi import FastAPI, Request
from app.services.twilio_service import sendMessage as send_whatsapp_message
from app.services.twilio_service import sendWhatsappMessage
from app.services.chatbot_service import ChatbotService
from openai import OpenAIError

app = FastAPI()
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

chatbot_service = ChatbotService("app/data/syllabus_data.csv")
chatbot_service = ChatbotService("./app/data/syllabus_extracted.csv")

@app.post("/hook")
async def chat(request: Request):
Expand All @@ -19,19 +19,19 @@ async def chat(request: Request):
logger.info(f"Mensaje recibido de {from_phone}: {body_data}")

try:
response = chatbot_service.process_message(from_phone, body_data)
response = chatbot_service.processMessage(from_phone, body_data)
logger.debug(f"Respuesta generada: {response}")
send_result = send_whatsapp_message(from_phone, response)
send_result = sendWhatsappMessage(from_phone, response)
return {"status": "success", "message": send_result}

except OpenAIError as e:
logger.error(f"Error de OpenAI: {str(e)}")
error_message = "Lo siento, estamos experimentando problemas técnicos. Por favor, intenta de nuevo más tarde."
send_whatsapp_message(from_phone, error_message)
error_message = "📢 Lo siento, estamos experimentando problemas técnicos. Por favor, intenta de nuevo más tarde."
sendWhatsappMessage(from_phone, error_message)
return {"status": "error", "message": "Error de OpenAI", "details": str(e)}

except Exception as e:
logger.error(f"Error inesperado al procesar el mensaje: {str(e)}", exc_info=True)
error_message = "Lo siento, ocurrió un error inesperado. Por favor, intenta de nuevo más tarde."
send_whatsapp_message(from_phone, error_message)
error_message = "📢 Lo siento, ocurrió un error inesperado. Por favor, intenta de nuevo más tarde."
sendWhatsappMessage(from_phone, error_message)
return {"status": "error", "message": "Error interno", "details": str(e)}
Binary file modified app/models/__pycache__/qa_model.cpython-312.pyc
Binary file not shown.
152 changes: 130 additions & 22 deletions app/models/qa_model.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,162 @@
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
import logging
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import Qdrant
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from app.config import config
from qdrant_client import QdrantClient

import logging
from qdrant_client.models import Distance, VectorParams, PointStruct

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

PROMPT_TEMPLATE = """
Responde brevemente basándote en esta información del sílabo del curso preguntado por el estudiante:
Eres un asistente virtual para estudiantes de la Universidad de Ingeniería y Tecnología (UTEC). Tu tarea es proporcionar información precisa basada en el contenido de los sílabos de los cursos.
Contexto del sílabo:
{context}
Pregunta: {question}
Pregunta del estudiante: {question}
Instrucciones:
1. Usa la información proporcionada en el contexto anterior para responder.
2. Si se pregunta por referencias bibliográficas, busca específicamente una sección llamada "BIBLIOGRÁFICAS" o similar en el contexto.
3. Si encuentras referencias bibliográficas relevantes, menciónalas directamente.
4. Si la información exacta no está en el contexto, pero hay información parcial o relacionada, proporciona esa información y menciona que es parcial.
5. Si no hay absolutamente ninguna información relevante, di "Lo siento, no tengo información específica sobre eso en el sílabo."
6. No inventes ni inferas información que no esté explícitamente en el contexto.
Respuesta muy concisa:
Respuesta basada en la información del sílabo:
"""

class QAModel:
def __init__(self, texts):
logger.info(f"QAModel inicializado con {len(texts)} documentos")

self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
self.qdrant_client = QdrantClient(url=config.QDRANT_URL, api_key=config.QDRANT_API_KEY)
prompt = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=[
"context", "question"])

self.collection_name = config.QDRANT_COLLECTION_NAME
self.embeddings = HuggingFaceEmbeddings(
model_name="sentence-transformers/all-mpnet-base-v2")
self.qdrant_client = QdrantClient(
url=config.QDRANT_URL, api_key=config.QDRANT_API_KEY)

self.qdrant = Qdrant(
client = self.qdrant_client,
collection_name=config.QDRANT_COLLECTION_NAME,
client=self.qdrant_client,
collection_name=self.collection_name,
embeddings=self.embeddings,
)
self.llm = OpenAI(temperature=0, max_tokens=100)
prompt = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=["context", "question"])

self.llm = OpenAI(temperature=0.3, max_tokens=300)
self.qa_chain = ConversationalRetrievalChain.from_llm(
llm=self.llm,
retriever=self.qdrant.as_retriever(search_kwargs={"k": 1}),
combine_docs_chain_kwargs={"prompt": prompt}
retriever=self.qdrant.as_retriever(search_kwargs={"k": 3}),
combine_docs_chain_kwargs={"prompt": prompt},
return_source_documents=True
)

def get_answer(self, question):
def create_collection_if_not_exists(self):
    """Create the configured Qdrant collection unless one with that name exists.

    The outcome (created or already present) is logged either way.
    """
    existing_names = {
        existing.name
        for existing in self.qdrant_client.get_collections().collections
    }
    if self.collection_name in existing_names:
        logger.info(f"La colección {self.collection_name} ya existe")
        return

    logger.info(f"Creando nueva colección: {self.collection_name}")
    # NOTE(review): size=768 presumably matches the output dimension of the
    # embedding model configured in __init__ — confirm if the model changes.
    vector_settings = VectorParams(size=768, distance=Distance.COSINE)
    self.qdrant_client.create_collection(
        collection_name=self.collection_name,
        vectors_config=vector_settings,
    )

def split_content(self, content, max_length=500):
    """Split ``content`` into line-aligned sections of roughly ``max_length`` chars.

    Lines are accumulated until adding the next one would push the buffer past
    ``max_length``; the buffer is then emitted (stripped) and a new one starts
    with that line. Lines are never cut, so a single line longer than
    ``max_length`` is returned whole as its own section.

    Args:
        content: Text to split on newline characters.
        max_length: Soft upper bound for the length of each section.

    Returns:
        A list of non-empty, stripped sections. Empty or whitespace-only input
        yields an empty list.
    """
    sections = []
    buffer = ""
    for line in content.split('\n'):
        if len(buffer) + len(line) > max_length:
            flushed = buffer.strip()
            # An empty buffer is flushed when the very first line is already
            # too long; skip it instead of emitting an empty section.
            if flushed:
                sections.append(flushed)
            buffer = line
        else:
            buffer += "\n" + line

    tail = buffer.strip()
    if tail:
        sections.append(tail)
    return sections

def load_documents(self, texts):
    """Embed ``texts`` and upsert them as points into the Qdrant collection.

    Args:
        texts: Sized sequence of objects exposing ``page_content`` and
            ``metadata`` attributes (presumably LangChain documents — verify
            against the caller).

    Raises:
        Exception: Whatever the Qdrant client raises during the upsert; the
            error is logged and re-raised unchanged.
    """
    logger.info(f"Cargando {len(texts)} documentos en Qdrant")
    if not texts:
        # Nothing to embed or send; avoid an upsert call with no points.
        logger.warning("No hay documentos para cargar en Qdrant")
        return

    contents = [text.page_content for text in texts]
    # Embed the whole corpus in one call instead of one query-style call per
    # document.
    vectors = self.embeddings.embed_documents(contents)

    # Qdrant point IDs must be unsigned integers or UUIDs; a string such as
    # "0" is neither and is rejected, so the integer position is used as-is.
    points = [
        PointStruct(
            id=i,
            payload={'text': content, 'metadata': text.metadata},
            vector=vector,
        )
        for i, (text, content, vector) in enumerate(zip(texts, contents, vectors))
    ]

    try:
        operation_info = self.qdrant_client.upsert(
            collection_name=self.collection_name,
            points=points
        )
        logger.info(f"Operación de carga completada. Info: {operation_info}")
    except Exception as e:
        logger.error(f"Error al cargar documentos en Qdrant: {e}")
        raise

    logger.info("Documentos cargados exitosamente en Qdrant")

def getAnswer(self, question):
logger.info(f"Procesando pregunta: {question}")
if len(question.split()) < 3:
return "Por favor, haz una pregunta más específica sobre cualquier curso."
return "🤔 Por favor, proporciona más detalles para poder ayudarte mejor."
try:
logger.debug("Iniciando búsqueda en Qdrant")
result = self.qa_chain({"question": question, "chat_history": []})
logger.debug(f"Respuesta generada: {result['answer']}")
return result['answer']

# Realizar la búsqueda directamente en Qdrant
query_vector = self.embeddings.embed_query(question)
search_results = self.qdrant_client.search(
collection_name=self.collection_name,
query_vector=query_vector,
limit=3
)

full_context = []
for i, result in enumerate(search_results):
logger.debug(f"Documento {i+1}:")
logger.debug(f"ID: {result.id}, Score: {result.score}")
logger.debug(f"Contenido: {result.payload['text'][:200]}...")
full_context.append(f"Documento {i+1}:\n{result.payload['text']}")

context = "\n\n".join(full_context)
logger.debug(f"Contexto completo pasado al modelo:\n{context}")

# Usar el contexto completo para generar la respuesta
prompt = self.qa_chain.combine_docs_chain.llm_chain.prompt.format(
context=context,
question=question
)
response = self.qa_chain.combine_docs_chain.llm_chain.llm.predict(prompt)

logger.debug(f"Respuesta generada: {response}")
return response
except Exception as e:
logger.error(f"Error al procesar la pregunta: {str(e)}", exc_info=True)
return "Lo siento, no pude procesar tu pregunta. Por favor, intenta reformularla."
return "🙁 Lo siento, tuve un pequeño problema al procesar tu pregunta. ¿Podrías intentar reformularla?"

def test_retrieval(self, query):
    """Log the raw Qdrant hits and the final model answer for ``query``.

    Diagnostic helper: it embeds the query, logs ID, score and a 200-character
    preview of the top three matches, then runs the query through
    ``getAnswer`` and logs the answer. Nothing is returned.
    """
    logger.info(f"Probando recuperación para la consulta: {query}")
    hits = self.qdrant_client.search(
        collection_name=self.collection_name,
        query_vector=self.embeddings.embed_query(query),
        limit=3,
    )

    logger.info("Resultados de búsqueda directa en Qdrant:")
    for hit in hits:
        logger.info(f"ID: {hit.id}, Score: {hit.score}")
        preview = hit.payload['text'][:200]
        logger.info(f"Contenido: {preview}...")

    # Exercise the full answer path with the same query.
    answer = self.getAnswer(query)
    logger.info(f"Respuesta del modelo: {answer}")
2 changes: 1 addition & 1 deletion app/services/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .twilio_service import sendMessage
from .twilio_service import *
Binary file modified app/services/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
Binary file modified app/services/__pycache__/chatbot_service.cpython-312.pyc
Binary file not shown.
Binary file modified app/services/__pycache__/twilio_service.cpython-312.pyc
Binary file not shown.
16 changes: 8 additions & 8 deletions app/services/chatbot_service.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,28 @@
from app.models.qa_model import QAModel
from app.utils.data_loader import load_and_split_data
from app.utils.data_loader import loadAndSplitData
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

class ChatbotService:
def __init__(self, data_path):
texts = load_and_split_data(data_path)
logger.info("ChatbotService inicializado")
texts = loadAndSplitData(data_path)
self.qa_model = QAModel(texts)
self.qa_model.test_retrieval("¿Me recomiendas alguna referencia bibliografica del curso de tendencias de mercado?")
self.chat_history = {}
logger.info("ChatbotService inicializado")

def process_message(self, from_phone, message):
def processMessage(self, from_phone, message):
logger.info(f"Procesando mensaje de {from_phone}: {message}")
if from_phone not in self.chat_history:
self.chat_history[from_phone] = []
if from_phone not in self.chat_history: self.chat_history[from_phone] = []

try:
answer = self.qa_model.get_answer(message)
answer = self.qa_model.getAnswer(message)
self.chat_history[from_phone].append((message, answer))
logger.info(f"Respuesta generada para {from_phone}: {answer}")
return answer

except Exception as e:
logger.error(f"Error al procesar mensaje: {str(e)}", exc_info=True)
return "Lo siento, ocurrió un error al procesar tu mensaje. Por favor, intenta de nuevo."

2 changes: 1 addition & 1 deletion app/services/twilio_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

client = Client(config.TWILIO_ACCOUNT_SID, config.TWILIO_AUTH_TOKEN)

def sendMessage(to_phone, body_data):
def sendWhatsappMessage(to_phone, body_data):
if to_phone.startswith('whatsapp:'): to_phone = to_phone[9:].strip()
if not to_phone.startswith('+'): to_phone = '+' + to_phone

Expand Down
Binary file modified app/utils/__pycache__/data_loader.cpython-312.pyc
Binary file not shown.
Loading

0 comments on commit 204d064

Please sign in to comment.