feat: beta v2 working with syllabus ocr new extraction
Showing 25 changed files with 296 additions and 73 deletions.
Binary files, one deleted file, and one large diff are not rendered in this view.
@@ -0,0 +1,98 @@
import os
import re
import csv
import logging
from typing import List, Dict
from pdfminer.high_level import extract_text

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class SilaboExtractor:
    def __init__(self, pdf_directory: str, output_file: str):
        self.pdf_directory = pdf_directory
        self.output_file = output_file

    def clean_text(self, text: str) -> str:
        text = re.sub(r'[.\n]+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_field(self, text: str, pattern: str) -> str:
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        if match:
            return self.clean_text(match.group(1))
        else:
            logging.warning(f"No se encontró coincidencia para el patrón '{pattern}'")
            return "No encontrado"

    def extract_info(self, text: str) -> Dict[str, str]:
        info = {}

        campos = [
            ("Carrera", r"(?:CARRERA|DEPARTAMENTO|DIRECCIÓN)\s*:?\s*(.*?)(?:\n|$)"),
            ("Curso", r"(?:CURSO|ASIGNATURA)\s*:?\s*(.*?)(?:\n|$)"),
            ("Malla", r"(?:MALLA|AÑO)\s*:?\s*(.*?)(?:\n|$)"),
            ("Modalidad", r"(?:MODALIDAD|2\.7\s*Modalidad:)\s*:?\s*(.*?)(?:\n|$)"),
            ("Creditos", r"(?:CREDITOS|CRÉDITOS|2\.2\s*Créditos:)\s*:?\s*(.*?)(?:\n|$)"),
        ]

        for nombre, patron in campos:
            info[nombre] = self.extract_field(text, patron)

        objetivos = re.findall(r"(?:Sesión|Objetivo)\s*\d*\s*:?\s*(.*?)(?:\n|$)", text)
        info['Objetivos'] = '; '.join(objetivos) if objetivos else self.extract_field(text, r"(?:OBJETIVOS|4\.\s*OBJETIVOS)(.*?)(?:\d+\.\s*COMPETENCIAS|\Z)")

        info['Competencias'] = self.extract_field(text, r"(?:COMPETENCIAS[^:]*:|5\.\s*COMPETENCIAS)(.*?)(?:\d+\.\s*RESULTADOS|\Z)")
        info['Resultados de Aprendizaje'] = self.extract_field(text, r"(?:RESULTADOS DE APRENDIZAJE|6\.\s*RESULTADOS)(.*?)(?:\d+\.\s*TEMAS|\Z)")
        info['Temas'] = self.extract_field(text, r"(?:TEMAS|7\.\s*TEMAS)(.*?)(?:\d+\.\s*PLAN|\Z)")
        info['Sistema de Evaluación'] = self.extract_field(text, r"(?:SISTEMA DE EVALUACIÓN|9\.\s*SISTEMA)(.*?)(?:\d+\.\s*REFERENCIAS|\Z)")
        info['Referencias Bibliográficas'] = self.extract_field(text, r"(?:REFERENCIAS BIBLIOGRÁFICAS|10\.\s*REFERENCIAS)(.*?)(?:\Z)")

        return info

    def process_pdf(self, pdf_path: str) -> Dict[str, str]:
        try:
            text = extract_text(pdf_path)
            return self.extract_info(text)
        except Exception as e:
            logging.error(f"Error procesando {pdf_path}: {e}")
            return {}

    def process_directory(self) -> List[Dict[str, str]]:
        results = []
        for filename in os.listdir(self.pdf_directory):
            if filename.endswith('.pdf'):
                pdf_path = os.path.join(self.pdf_directory, filename)
                logging.info(f"Procesando: {pdf_path}")
                result = self.process_pdf(pdf_path)
                if result:
                    result['Archivo'] = filename
                    results.append(result)
        return results

    def save_to_csv(self, data: List[Dict[str, str]]):
        if not data:
            logging.warning("No hay datos para guardar en el CSV")
            return

        fieldnames = ['Archivo', 'Carrera', 'Curso', 'Malla', 'Modalidad', 'Creditos',
                      'Objetivos', 'Competencias', 'Resultados de Aprendizaje',
                      'Temas', 'Sistema de Evaluación', 'Referencias Bibliográficas']

        with open(self.output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for row in data:
                writer.writerow(row)

        logging.info(f"Datos guardados en {self.output_file}")

    def run(self):
        logging.info("Iniciando procesamiento de sílabos")
        data = self.process_directory()
        self.save_to_csv(data)
        logging.info("Proceso finalizado")

if __name__ == "__main__":
    extractor = SilaboExtractor("./app/data/raw/syllabus_pdfs", "./app/data/syllabus_extracted.csv")
    extractor.run()
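For reference, a minimal sketch of how extract_field's patterns behave on a toy syllabus fragment; the sample text is invented for illustration, and only the regex and flags come from the file above.

import re

# Invented sample text; real input comes from pdfminer's extract_text.
sample = "CARRERA: Ciencia de la Computación\nCURSO: Tendencias de Mercado\nCRÉDITOS: 3\n"

# Same pattern and flags used for the "Carrera" field above.
pattern = r"(?:CARRERA|DEPARTAMENTO|DIRECCIÓN)\s*:?\s*(.*?)(?:\n|$)"
match = re.search(pattern, sample, re.IGNORECASE | re.DOTALL)
print(match.group(1))  # "Ciencia de la Computación"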
@@ -1,54 +1,162 @@
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.vectorstores import Qdrant
+import logging
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_qdrant import Qdrant
 from langchain.chains import ConversationalRetrievalChain
 from langchain.prompts import PromptTemplate
 from langchain_openai import OpenAI
 from app.config import config
 from qdrant_client import QdrantClient

-import logging
+from qdrant_client.models import Distance, VectorParams, PointStruct

 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)

 PROMPT_TEMPLATE = """
-Responde brevemente basándote en esta información del sílabo del curso preguntado por el estudiante:
+Eres un asistente virtual para estudiantes de la Universidad de Ingeniería y Tecnología (UTEC). Tu tarea es proporcionar información precisa basada en el contenido de los sílabos de los cursos.
+Contexto del sílabo:
 {context}
-Pregunta: {question}
+Pregunta del estudiante: {question}
+Instrucciones:
+1. Usa la información proporcionada en el contexto anterior para responder.
+2. Si se pregunta por referencias bibliográficas, busca específicamente una sección llamada "BIBLIOGRÁFICAS" o similar en el contexto.
+3. Si encuentras referencias bibliográficas relevantes, menciónalas directamente.
+4. Si la información exacta no está en el contexto, pero hay información parcial o relacionada, proporciona esa información y menciona que es parcial.
+5. Si no hay absolutamente ninguna información relevante, di "Lo siento, no tengo información específica sobre eso en el sílabo."
+6. No inventes ni inferas información que no esté explícitamente en el contexto.
-Respuesta muy concisa:
+Respuesta basada en la información del sílabo:
 """

 class QAModel:
     def __init__(self, texts):
         logger.info(f"QAModel inicializado con {len(texts)} documentos")

-        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
-        self.qdrant_client = QdrantClient(url=config.QDRANT_URL, api_key=config.QDRANT_API_KEY)
+        prompt = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=[
+            "context", "question"])
+
+        self.collection_name = config.QDRANT_COLLECTION_NAME
+        self.embeddings = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-mpnet-base-v2")
+        self.qdrant_client = QdrantClient(
+            url=config.QDRANT_URL, api_key=config.QDRANT_API_KEY)

         self.qdrant = Qdrant(
-            client = self.qdrant_client,
-            collection_name=config.QDRANT_COLLECTION_NAME,
+            client=self.qdrant_client,
+            collection_name=self.collection_name,
             embeddings=self.embeddings,
         )
-        self.llm = OpenAI(temperature=0, max_tokens=100)
-        prompt = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=["context", "question"])
+
+        self.llm = OpenAI(temperature=0.3, max_tokens=300)
         self.qa_chain = ConversationalRetrievalChain.from_llm(
             llm=self.llm,
-            retriever=self.qdrant.as_retriever(search_kwargs={"k": 1}),
-            combine_docs_chain_kwargs={"prompt": prompt}
+            retriever=self.qdrant.as_retriever(search_kwargs={"k": 3}),
+            combine_docs_chain_kwargs={"prompt": prompt},
+            return_source_documents=True
         )

-    def get_answer(self, question):
+    def create_collection_if_not_exists(self):
+        collections = self.qdrant_client.get_collections().collections
+        if not any(collection.name == self.collection_name for collection in collections):
+            logger.info(f"Creando nueva colección: {self.collection_name}")
+            self.qdrant_client.create_collection(
+                collection_name=self.collection_name,
+                vectors_config=VectorParams(
+                    size=768, distance=Distance.COSINE),
+            )
+        else:
+            logger.info(f"La colección {self.collection_name} ya existe")
+
+    def split_content(self, content, max_length=500):
+        sections = []
+        current_section = ""
+        for line in content.split('\n'):
+            if len(current_section) + len(line) > max_length:
+                sections.append(current_section.strip())
+                current_section = line
+            else:
+                current_section += "\n" + line
+        if current_section:
+            sections.append(current_section.strip())
+        return sections
+
+    def load_documents(self, texts):
+        logger.info(f"Cargando {len(texts)} documentos en Qdrant")
+        points = []
+        for i, text in enumerate(texts):
+            content = text.page_content
+            vector = self.embeddings.embed_query(content)
+            point = PointStruct(
+                id=str(i),
+                payload={'text': content, 'metadata': text.metadata},
+                vector=vector
+            )
+            points.append(point)
+
+        try:
+            operation_info = self.qdrant_client.upsert(
+                collection_name=self.collection_name,
+                points=points
+            )
+            logger.info(f"Operación de carga completada. Info: {operation_info}")
+        except Exception as e:
+            logger.error(f"Error al cargar documentos en Qdrant: {e}")
+            raise
+
+        logger.info("Documentos cargados exitosamente en Qdrant")
+
+    def getAnswer(self, question):
         logger.info(f"Procesando pregunta: {question}")
         if len(question.split()) < 3:
-            return "Por favor, haz una pregunta más específica sobre cualquier curso."
+            return "🤔 Por favor, proporciona más detalles para poder ayudarte mejor."
         try:
             logger.debug("Iniciando búsqueda en Qdrant")
-            result = self.qa_chain({"question": question, "chat_history": []})
-            logger.debug(f"Respuesta generada: {result['answer']}")
-            return result['answer']
+
+            # Realizar la búsqueda directamente en Qdrant
+            query_vector = self.embeddings.embed_query(question)
+            search_results = self.qdrant_client.search(
+                collection_name=self.collection_name,
+                query_vector=query_vector,
+                limit=3
+            )
+
+            full_context = []
+            for i, result in enumerate(search_results):
+                logger.debug(f"Documento {i+1}:")
+                logger.debug(f"ID: {result.id}, Score: {result.score}")
+                logger.debug(f"Contenido: {result.payload['text'][:200]}...")
+                full_context.append(f"Documento {i+1}:\n{result.payload['text']}")
+
+            context = "\n\n".join(full_context)
+            logger.debug(f"Contexto completo pasado al modelo:\n{context}")
+
+            # Usar el contexto completo para generar la respuesta
+            prompt = self.qa_chain.combine_docs_chain.llm_chain.prompt.format(
+                context=context,
+                question=question
+            )
+            response = self.qa_chain.combine_docs_chain.llm_chain.llm.predict(prompt)
+
+            logger.debug(f"Respuesta generada: {response}")
+            return response
         except Exception as e:
             logger.error(f"Error al procesar la pregunta: {str(e)}", exc_info=True)
-            return "Lo siento, no pude procesar tu pregunta. Por favor, intenta reformularla."
+            return "🙁 Lo siento, tuve un pequeño problema al procesar tu pregunta. ¿Podrías intentar reformularla?"
+
+    def test_retrieval(self, query):
+        logger.info(f"Probando recuperación para la consulta: {query}")
+        query_vector = self.embeddings.embed_query(query)
+        search_result = self.qdrant_client.search(
+            collection_name=self.collection_name,
+            query_vector=query_vector,
+            limit=3
+        )
+        logger.info("Resultados de búsqueda directa en Qdrant:")
+        for result in search_result:
+            logger.info(f"ID: {result.id}, Score: {result.score}")
+            logger.info(f"Contenido: {result.payload['text'][:200]}...")
+
+        # Probar la cadena completa
+        qa_result = self.getAnswer(query)
+        logger.info(f"Respuesta del modelo: {qa_result}")
@@ -1 +1 @@
-from .twilio_service import sendMessage
+from .twilio_service import *
@@ -1,28 +1,28 @@
 from app.models.qa_model import QAModel
-from app.utils.data_loader import load_and_split_data
+from app.utils.data_loader import loadAndSplitData
 import logging

 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)

 class ChatbotService:
     def __init__(self, data_path):
-        texts = load_and_split_data(data_path)
+        logger.info("ChatbotService inicializado")
+        texts = loadAndSplitData(data_path)
         self.qa_model = QAModel(texts)
+        self.qa_model.test_retrieval("¿Me recomiendas alguna referencia bibliografica del curso de tendencias de mercado?")
         self.chat_history = {}
-        logger.info("ChatbotService inicializado")

-    def process_message(self, from_phone, message):
+    def processMessage(self, from_phone, message):
         logger.info(f"Procesando mensaje de {from_phone}: {message}")
-        if from_phone not in self.chat_history:
-            self.chat_history[from_phone] = []
+        if from_phone not in self.chat_history: self.chat_history[from_phone] = []

         try:
-            answer = self.qa_model.get_answer(message)
+            answer = self.qa_model.getAnswer(message)
             self.chat_history[from_phone].append((message, answer))
             logger.info(f"Respuesta generada para {from_phone}: {answer}")
             return answer

         except Exception as e:
             logger.error(f"Error al procesar mensaje: {str(e)}", exc_info=True)
             return "Lo siento, ocurrió un error al procesar tu mensaje. Por favor, intenta de nuevo."
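A hypothetical usage sketch for ChatbotService. The module path, data path, and phone number below are assumptions; the commit does not show how the web layer constructs the service or what loadAndSplitData expects as input.

from app.services.chatbot_service import ChatbotService  # assumed module path

service = ChatbotService("./app/data/syllabus_extracted.csv")  # assumed data_path
reply = service.processMessage("+51999999999", "¿Qué temas cubre el curso de Tendencias de Mercado?")
print(reply)
# service.chat_history now maps "+51999999999" to a list of (message, answer) tuples.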