feat: beta v2 working with syllabus ocr new extraction
Showing 25 changed files with 296 additions and 73 deletions.
Binary files, one deleted file, and one large diff are not rendered in this view.
@@ -0,0 +1,98 @@
import os
import re
import csv
import logging
from typing import List, Dict
from pdfminer.high_level import extract_text

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class SilaboExtractor:
    def __init__(self, pdf_directory: str, output_file: str):
        self.pdf_directory = pdf_directory
        self.output_file = output_file

    def clean_text(self, text: str) -> str:
        text = re.sub(r'[.\n]+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_field(self, text: str, pattern: str) -> str:
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        if match:
            return self.clean_text(match.group(1))
        else:
            logging.warning(f"No se encontró coincidencia para el patrón '{pattern}'")
            return "No encontrado"

    def extract_info(self, text: str) -> Dict[str, str]:
        info = {}

        campos = [
            ("Carrera", r"(?:CARRERA|DEPARTAMENTO|DIRECCIÓN)\s*:?\s*(.*?)(?:\n|$)"),
            ("Curso", r"(?:CURSO|ASIGNATURA)\s*:?\s*(.*?)(?:\n|$)"),
            ("Malla", r"(?:MALLA|AÑO)\s*:?\s*(.*?)(?:\n|$)"),
            ("Modalidad", r"(?:MODALIDAD|2\.7\s*Modalidad:)\s*:?\s*(.*?)(?:\n|$)"),
            ("Creditos", r"(?:CREDITOS|CRÉDITOS|2\.2\s*Créditos:)\s*:?\s*(.*?)(?:\n|$)"),
        ]

        for nombre, patron in campos:
            info[nombre] = self.extract_field(text, patron)

        objetivos = re.findall(r"(?:Sesión|Objetivo)\s*\d*\s*:?\s*(.*?)(?:\n|$)", text)
        info['Objetivos'] = '; '.join(objetivos) if objetivos else self.extract_field(text, r"(?:OBJETIVOS|4\.\s*OBJETIVOS)(.*?)(?:\d+\.\s*COMPETENCIAS|\Z)")

        info['Competencias'] = self.extract_field(text, r"(?:COMPETENCIAS[^:]*:|5\.\s*COMPETENCIAS)(.*?)(?:\d+\.\s*RESULTADOS|\Z)")
        info['Resultados de Aprendizaje'] = self.extract_field(text, r"(?:RESULTADOS DE APRENDIZAJE|6\.\s*RESULTADOS)(.*?)(?:\d+\.\s*TEMAS|\Z)")
        info['Temas'] = self.extract_field(text, r"(?:TEMAS|7\.\s*TEMAS)(.*?)(?:\d+\.\s*PLAN|\Z)")
        info['Sistema de Evaluación'] = self.extract_field(text, r"(?:SISTEMA DE EVALUACIÓN|9\.\s*SISTEMA)(.*?)(?:\d+\.\s*REFERENCIAS|\Z)")
        info['Referencias Bibliográficas'] = self.extract_field(text, r"(?:REFERENCIAS BIBLIOGRÁFICAS|10\.\s*REFERENCIAS)(.*?)(?:\Z)")

        return info

    def process_pdf(self, pdf_path: str) -> Dict[str, str]:
        try:
            text = extract_text(pdf_path)
            return self.extract_info(text)
        except Exception as e:
            logging.error(f"Error procesando {pdf_path}: {e}")
            return {}

    def process_directory(self) -> List[Dict[str, str]]:
        results = []
        for filename in os.listdir(self.pdf_directory):
            if filename.endswith('.pdf'):
                pdf_path = os.path.join(self.pdf_directory, filename)
                logging.info(f"Procesando: {pdf_path}")
                result = self.process_pdf(pdf_path)
                if result:
                    result['Archivo'] = filename
                    results.append(result)
        return results

    def save_to_csv(self, data: List[Dict[str, str]]):
        if not data:
            logging.warning("No hay datos para guardar en el CSV")
            return

        fieldnames = ['Archivo', 'Carrera', 'Curso', 'Malla', 'Modalidad', 'Creditos',
                      'Objetivos', 'Competencias', 'Resultados de Aprendizaje',
                      'Temas', 'Sistema de Evaluación', 'Referencias Bibliográficas']

        with open(self.output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for row in data:
                writer.writerow(row)

        logging.info(f"Datos guardados en {self.output_file}")

    def run(self):
        logging.info("Iniciando procesamiento de sílabos")
        data = self.process_directory()
        self.save_to_csv(data)
        logging.info("Proceso finalizado")

if __name__ == "__main__":
    extractor = SilaboExtractor("./app/data/raw/syllabus_pdfs", "./app/data/syllabus_extracted.csv")
    extractor.run()
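For reference, a minimal sketch of how extract_field's patterns behave on a toy syllabus fragment; the sample text is invented for illustration, and only the regex and flags come from the file above.

import re

# Invented sample text; real input comes from pdfminer's extract_text.
sample = "CARRERA: Ciencia de la Computación\nCURSO: Tendencias de Mercado\nCRÉDITOS: 3\n"

# Same pattern and flags used for the "Carrera" field above.
pattern = r"(?:CARRERA|DEPARTAMENTO|DIRECCIÓN)\s*:?\s*(.*?)(?:\n|$)"
match = re.search(pattern, sample, re.IGNORECASE | re.DOTALL)
print(match.group(1))  # "Ciencia de la Computación"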
@@ -1,54 +1,162 @@
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.vectorstores import Qdrant
+import logging
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_qdrant import Qdrant
 from langchain.chains import ConversationalRetrievalChain
 from langchain.prompts import PromptTemplate
 from langchain_openai import OpenAI
 from app.config import config
 from qdrant_client import QdrantClient

-import logging
+from qdrant_client.models import Distance, VectorParams, PointStruct

 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)

 PROMPT_TEMPLATE = """
-Responde brevemente basándote en esta información del sílabo del curso preguntado por el estudiante:
+Eres un asistente virtual para estudiantes de la Universidad de Ingeniería y Tecnología (UTEC). Tu tarea es proporcionar información precisa basada en el contenido de los sílabos de los cursos.
+Contexto del sílabo:
 {context}
-Pregunta: {question}
+Pregunta del estudiante: {question}
+Instrucciones:
+1. Usa la información proporcionada en el contexto anterior para responder.
+2. Si se pregunta por referencias bibliográficas, busca específicamente una sección llamada "BIBLIOGRÁFICAS" o similar en el contexto.
+3. Si encuentras referencias bibliográficas relevantes, menciónalas directamente.
+4. Si la información exacta no está en el contexto, pero hay información parcial o relacionada, proporciona esa información y menciona que es parcial.
+5. Si no hay absolutamente ninguna información relevante, di "Lo siento, no tengo información específica sobre eso en el sílabo."
+6. No inventes ni inferas información que no esté explícitamente en el contexto.
-Respuesta muy concisa:
+Respuesta basada en la información del sílabo:
 """

 class QAModel:
     def __init__(self, texts):
         logger.info(f"QAModel inicializado con {len(texts)} documentos")

-        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
-        self.qdrant_client = QdrantClient(url=config.QDRANT_URL, api_key=config.QDRANT_API_KEY)
+        prompt = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=[
+            "context", "question"])
+
+        self.collection_name = config.QDRANT_COLLECTION_NAME
+        self.embeddings = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-mpnet-base-v2")
+        self.qdrant_client = QdrantClient(
+            url=config.QDRANT_URL, api_key=config.QDRANT_API_KEY)

         self.qdrant = Qdrant(
-            client = self.qdrant_client,
-            collection_name=config.QDRANT_COLLECTION_NAME,
+            client=self.qdrant_client,
+            collection_name=self.collection_name,
             embeddings=self.embeddings,
         )
-        self.llm = OpenAI(temperature=0, max_tokens=100)
-        prompt = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=["context", "question"])
+
+        self.llm = OpenAI(temperature=0.3, max_tokens=300)
         self.qa_chain = ConversationalRetrievalChain.from_llm(
             llm=self.llm,
-            retriever=self.qdrant.as_retriever(search_kwargs={"k": 1}),
-            combine_docs_chain_kwargs={"prompt": prompt}
+            retriever=self.qdrant.as_retriever(search_kwargs={"k": 3}),
+            combine_docs_chain_kwargs={"prompt": prompt},
+            return_source_documents=True
         )

-    def get_answer(self, question):
+    def create_collection_if_not_exists(self):
+        collections = self.qdrant_client.get_collections().collections
+        if not any(collection.name == self.collection_name for collection in collections):
+            logger.info(f"Creando nueva colección: {self.collection_name}")
+            self.qdrant_client.create_collection(
+                collection_name=self.collection_name,
+                vectors_config=VectorParams(
+                    size=768, distance=Distance.COSINE),
+            )
+        else:
+            logger.info(f"La colección {self.collection_name} ya existe")
+
+    def split_content(self, content, max_length=500):
+        sections = []
+        current_section = ""
+        for line in content.split('\n'):
+            if len(current_section) + len(line) > max_length:
+                sections.append(current_section.strip())
+                current_section = line
+            else:
+                current_section += "\n" + line
+        if current_section:
+            sections.append(current_section.strip())
+        return sections
+
+    def load_documents(self, texts):
+        logger.info(f"Cargando {len(texts)} documentos en Qdrant")
+        points = []
+        for i, text in enumerate(texts):
+            content = text.page_content
+            vector = self.embeddings.embed_query(content)
+            point = PointStruct(
+                id=str(i),
+                payload={'text': content, 'metadata': text.metadata},
+                vector=vector
+            )
+            points.append(point)
+
+        try:
+            operation_info = self.qdrant_client.upsert(
+                collection_name=self.collection_name,
+                points=points
+            )
+            logger.info(f"Operación de carga completada. Info: {operation_info}")
+        except Exception as e:
+            logger.error(f"Error al cargar documentos en Qdrant: {e}")
+            raise
+
+        logger.info("Documentos cargados exitosamente en Qdrant")
+
+    def getAnswer(self, question):
         logger.info(f"Procesando pregunta: {question}")
         if len(question.split()) < 3:
-            return "Por favor, haz una pregunta más específica sobre cualquier curso."
+            return "🤔 Por favor, proporciona más detalles para poder ayudarte mejor."
         try:
             logger.debug("Iniciando búsqueda en Qdrant")
-            result = self.qa_chain({"question": question, "chat_history": []})
-            logger.debug(f"Respuesta generada: {result['answer']}")
-            return result['answer']
+
+            # Realizar la búsqueda directamente en Qdrant
+            query_vector = self.embeddings.embed_query(question)
+            search_results = self.qdrant_client.search(
+                collection_name=self.collection_name,
+                query_vector=query_vector,
+                limit=3
+            )
+
+            full_context = []
+            for i, result in enumerate(search_results):
+                logger.debug(f"Documento {i+1}:")
+                logger.debug(f"ID: {result.id}, Score: {result.score}")
+                logger.debug(f"Contenido: {result.payload['text'][:200]}...")
+                full_context.append(f"Documento {i+1}:\n{result.payload['text']}")
+
+            context = "\n\n".join(full_context)
+            logger.debug(f"Contexto completo pasado al modelo:\n{context}")
+
+            # Usar el contexto completo para generar la respuesta
+            prompt = self.qa_chain.combine_docs_chain.llm_chain.prompt.format(
+                context=context,
+                question=question
+            )
+            response = self.qa_chain.combine_docs_chain.llm_chain.llm.predict(prompt)
+
+            logger.debug(f"Respuesta generada: {response}")
+            return response
         except Exception as e:
             logger.error(f"Error al procesar la pregunta: {str(e)}", exc_info=True)
-            return "Lo siento, no pude procesar tu pregunta. Por favor, intenta reformularla."
+            return "🙁 Lo siento, tuve un pequeño problema al procesar tu pregunta. ¿Podrías intentar reformularla?"
+
+    def test_retrieval(self, query):
+        logger.info(f"Probando recuperación para la consulta: {query}")
+        query_vector = self.embeddings.embed_query(query)
+        search_result = self.qdrant_client.search(
+            collection_name=self.collection_name,
+            query_vector=query_vector,
+            limit=3
+        )
+        logger.info("Resultados de búsqueda directa en Qdrant:")
+        for result in search_result:
+            logger.info(f"ID: {result.id}, Score: {result.score}")
+            logger.info(f"Contenido: {result.payload['text'][:200]}...")
+
+        # Probar la cadena completa
+        qa_result = self.getAnswer(query)
+        logger.info(f"Respuesta del modelo: {qa_result}")
@@ -1 +1 @@
-from .twilio_service import sendMessage
+from .twilio_service import *
@@ -1,28 +1,28 @@
 from app.models.qa_model import QAModel
-from app.utils.data_loader import load_and_split_data
+from app.utils.data_loader import loadAndSplitData
 import logging

 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)

 class ChatbotService:
     def __init__(self, data_path):
-        texts = load_and_split_data(data_path)
+        logger.info("ChatbotService inicializado")
+        texts = loadAndSplitData(data_path)
         self.qa_model = QAModel(texts)
+        self.qa_model.test_retrieval("¿Me recomiendas alguna referencia bibliografica del curso de tendencias de mercado?")
         self.chat_history = {}
-        logger.info("ChatbotService inicializado")

-    def process_message(self, from_phone, message):
+    def processMessage(self, from_phone, message):
         logger.info(f"Procesando mensaje de {from_phone}: {message}")
-        if from_phone not in self.chat_history:
-            self.chat_history[from_phone] = []
+        if from_phone not in self.chat_history: self.chat_history[from_phone] = []

         try:
-            answer = self.qa_model.get_answer(message)
+            answer = self.qa_model.getAnswer(message)
             self.chat_history[from_phone].append((message, answer))
             logger.info(f"Respuesta generada para {from_phone}: {answer}")
             return answer

         except Exception as e:
             logger.error(f"Error al procesar mensaje: {str(e)}", exc_info=True)
             return "Lo siento, ocurrió un error al procesar tu mensaje. Por favor, intenta de nuevo."
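A hypothetical usage sketch for ChatbotService. The module path, data path, and phone number below are assumptions; the commit does not show how the web layer constructs the service or what loadAndSplitData expects as input.

from app.services.chatbot_service import ChatbotService  # assumed module path

service = ChatbotService("./app/data/syllabus_extracted.csv")  # assumed data_path
reply = service.processMessage("+51999999999", "¿Qué temas cubre el curso de Tendencias de Mercado?")
print(reply)
# service.chat_history now maps "+51999999999" to a list of (message, answer) tuples.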