This section describes an agentic workflow that analyzes company information stored as PDFs and produces a report in the desired format. To analyze data from specific documents among several kinds of PDFs, you can either 1/ add related information to the document metadata, or 2/ use contextual embedding to add extra information to each chunk. Here we use both approaches: information such as the company name and the document creation date is extracted into metadata, and a prompt is used to summarize the content of each chunk, so that enough of the required information can be extracted from the documents. For report generation, the plan-and-execute pattern produces the outline and drafts, and reflection is used to provide sufficient context. Reflection extracts the main keywords from each paragraph and uses the documents retrieved through RAG. Since documents contain not only text but also images and tables, a multimodal model is used to pull enough information from figures and tables. Hierarchical chunking is used to raise RAG retrieval accuracy while still providing sufficient context. To improve readability of the generated report, markdown format is used, and a URL containing the HTML and markdown documents is created for external sharing. The report can also be converted to a format such as PDF and managed as a separate document.
The infrastructure for summarizing company information is designed as below, following the AWS serverless architecture pattern. This architecture responds flexibly to changing traffic and optimizes cost, and it can be deployed conveniently with the AWS CDK.

LangChain's OpenSearchVectorSearch is used to connect to Amazon OpenSearch, the knowledge store. Related documents are then retrieved using hierarchical chunking.
def get_answer_using_opensearch(chat, text, connectionId, requestId):
    global reference_docs

    msg = ""
    top_k = 4
    relevant_docs = []

    bedrock_embedding = get_embedding()

    vectorstore_opensearch = OpenSearchVectorSearch(
        index_name=index_name,
        is_aoss=False,
        ef_search=1024,  # 512 (default)
        m=48,
        # engine="faiss",  # default: nmslib
        embedding_function=bedrock_embedding,
        opensearch_url=opensearch_url,
        http_auth=(opensearch_account, opensearch_passwd),  # http_auth=awsauth,
    )

    if enalbeParentDocumentRetrival == 'true':  # parent/child chunking
        relevant_documents = get_documents_from_opensearch(vectorstore_opensearch, text, top_k)

        for i, document in enumerate(relevant_documents):
            parent_doc_id = document[0].metadata['parent_doc_id']
            doc_level = document[0].metadata['doc_level']

            excerpt, name, url = get_parent_content(parent_doc_id)  # use parent document
            relevant_docs.append(
                Document(
                    page_content=excerpt,
                    metadata={
                        'name': name,
                        'url': url,
                        'doc_level': doc_level,
                        'from': 'vector'
                    },
                )
            )
    else:
        relevant_documents = vectorstore_opensearch.similarity_search_with_score(
            query=text,
            k=top_k,
        )

        for i, document in enumerate(relevant_documents):
            name = document[0].metadata['name']
            url = document[0].metadata['url']
            content = document[0].page_content

            relevant_docs.append(
                Document(
                    page_content=content,
                    metadata={
                        'name': name,
                        'url': url,
                        'from': 'vector'
                    },
                )
            )

    filtered_docs = grade_documents(text, relevant_docs)  # grading
    filtered_docs = check_duplication(filtered_docs)  # check duplication

    relevant_context = ""
    for i, document in enumerate(filtered_docs):
        if document.page_content:
            content = document.page_content
            relevant_context = relevant_context + content + "\n\n"

    msg = query_using_RAG_context(connectionId, requestId, chat, relevant_context, text)

    reference_docs += filtered_docs
    return msg
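The helper get_parent_content, which fetches the parent chunk for a child hit, is not shown in this section. A minimal sketch, assuming an opensearch-py client and the default field layout ("text" and "metadata") that LangChain's OpenSearchVectorSearch writes, could look like this; the client variable and field names are assumptions:

from opensearchpy import OpenSearch

# Hypothetical client; host, auth, and index_name come from the environment in the real code.
os_client = OpenSearch(hosts=[opensearch_url], http_auth=(opensearch_account, opensearch_passwd))

def get_parent_content(parent_doc_id):
    # Look up the parent chunk by its OpenSearch document id and return text, name, and url.
    response = os_client.get(index=index_name, id=parent_doc_id)
    source = response["_source"]
    return source["text"], source["metadata"]["name"], source["metadata"]["url"]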
An LLM is used to grade the relevance of the retrieved documents, as shown below. For grading document relevance with an LLM, refer to "LLM์ผ๋ก RAG Grading ํ์ฉํ๊ธฐ".
def grade_documents(question, documents):
    print("###### grade_documents ######")

    filtered_docs = []
    if multi_region == 'enable':  # parallel processing
        print("start grading...")
        filtered_docs = grade_documents_using_parallel_processing(question, documents)
    else:
        # Score each doc
        chat = get_chat()
        retrieval_grader = get_retrieval_grader(chat)
        for i, doc in enumerate(documents):
            score = retrieval_grader.invoke({"question": question, "document": doc.page_content})
            grade = score.binary_score

            # Document relevant
            if grade.lower() == "yes":
                print("---GRADE: DOCUMENT RELEVANT---")
                filtered_docs.append(doc)
            # Document not relevant
            else:
                print("---GRADE: DOCUMENT NOT RELEVANT---")
                continue

    return filtered_docs
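grade_documents relies on get_retrieval_grader, which is not shown above. A minimal sketch, assuming the usual structured-output grader pattern (the GradeDocuments model name and prompt wording are illustrative), could be:

from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate

class GradeDocuments(BaseModel):
    """Binary relevance score for a retrieved document."""
    binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")

def get_retrieval_grader(chat):
    # Hypothetical grader: asks the model for a yes/no relevance judgment as structured output.
    system = (
        "You are a grader assessing relevance of a retrieved document to a user question. "
        "If the document contains keywords or semantic meaning related to the question, grade it as relevant. "
        "Give a binary score 'yes' or 'no'."
    )
    grade_prompt = ChatPromptTemplate.from_messages([
        ("system", system),
        ("human", "Retrieved document:\n\n{document}\n\nUser question: {question}"),
    ])
    return grade_prompt | chat.with_structured_output(GradeDocuments)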
The workflow for an agent that includes RAG is configured as shown below. The tools cover not only basic functions such as time (get_current_time), books (get_book_list), and weather (get_weather_info), but also web search (search_by_tavily) and company-information retrieval (search_by_opensearch).
class State(TypedDict):
    messages: Annotated[list, add_messages]

tools = [get_current_time, get_book_list, get_weather_info, search_by_tavily, search_by_opensearch]
tool_node = ToolNode(tools)

def buildChatAgent():
    workflow = StateGraph(State)

    workflow.add_node("agent", call_model)
    workflow.add_node("action", tool_node)
    workflow.add_edge(START, "agent")
    workflow.add_conditional_edges(
        "agent",
        should_continue,
        {
            "continue": "action",
            "end": END,
        },
    )
    workflow.add_edge("action", "agent")

    return workflow.compile()
The call_model node sets the agent's name and role, and generates an appropriate answer using the previous conversation and the information obtained from the tools.
def call_model(state: State):
    print("###### call_model ######")

    if isKorean(state["messages"][0].content) == True:
        system = (
            "๋น์ ์ ์ด๋ฆ์ ์์ฐ์ด๊ณ , ์ง๋ฌธ์ ์น๊ทผํ ๋ฐฉ์์ผ๋ก ๋๋ตํ๋๋ก ์ค๊ณ๋ ๋ํํ AI์
๋๋ค."
            "์ํฉ์ ๋ง๋ ๊ตฌ์ฒด์ ์ธ ์ธ๋ถ ์ ๋ณด๋ฅผ ์ถฉ๋ถํ ์ ๊ณตํฉ๋๋ค."
            "๋ชจ๋ฅด๋ ์ง๋ฌธ์ ๋ฐ์ผ๋ฉด ์์งํ ๋ชจ๋ฅธ๋ค๊ณ  ๋งํฉ๋๋ค."
            "์ต์ข
 ๋ต๋ณ์๋ ์กฐ์ฌํ ๋ด์ฉ์ ๋ฐ๋์ ํฌํจํฉ๋๋ค."
        )
    else:
        system = (
            "You are a conversational AI designed to answer in a friendly way to a question."
            "If you don't know the answer, just say that you don't know, don't try to make up an answer."
            "You will be acting as a thoughtful advisor."
        )

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system),
            MessagesPlaceholder(variable_name="messages"),
        ]
    )

    chain = prompt | model
    response = chain.invoke(state["messages"])
    return {"messages": [response]}
On a specific page, "Subject company" and "Rating date" identify the document's subject company and creation date.

When a document is uploaded to Amazon S3, the resulting put event triggers reading the document, and the company and date are identified from the information on that specific page.
Refer to the code in lambda-document-manager/lambda_function.py. Here, structured output is used to extract subject_company and rating_date from the text.
def get_profile_of_doc(content: str):
    """Provide profile of document."""
    class Profile(BaseModel):
        subject_company: str = Field(description="The value of 'Subject company'")
        rating_date: str = Field(description="The value of 'Rating date'")

    subject_company = rating_date = ""
    for attempt in range(5):
        chat = get_chat()
        structured_llm = chat.with_structured_output(Profile, include_raw=True)

        info = structured_llm.invoke(content)
        if info['parsed'] is not None:
            parsed_info = info['parsed']
            subject_company = parsed_info.subject_company
            rating_date = parsed_info.rating_date
            break

    return subject_company, rating_date
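As a usage sketch, the page text below is a hypothetical excerpt; the real input is the text of the page that carries "Subject company" and "Rating date":

# Hypothetical usage of get_profile_of_doc; the sample text is illustrative only.
page_text = "Subject company: Suzano SA\nRating date: 03 January 2024\n..."
subject_company, rating_date = get_profile_of_doc(page_text)
print(subject_company, rating_date)  # e.g. "Suzano SA", "03 January 2024"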
In lambda-document-manager/lambda_function.py, pdf_profile is referenced to remove the header and footer when extracting text from images. Adjust the header and footer positions to match your PDF.
pdf_profile = 'ocean'

def store_image_for_opensearch(key, page, subject_company, rating_date):
    image_obj = s3_client.get_object(Bucket=s3_bucket, Key=key)

    image_content = image_obj['Body'].read()
    img = Image.open(BytesIO(image_content))

    width, height = img.size
    print(f"(original) width: {width}, height: {height}, size: {width*height}")

    pos = key.rfind('/')
    prefix = key[pos+1:pos+5]
    print('img_prefix: ', prefix)
    if pdf_profile == 'ocean' and prefix == "img_":
        area = (0, 175, width, height-175)  # crop out the header and footer
        img = img.crop(area)

        width, height = img.size
        print(f"(cropped) width: {width}, height: {height}, size: {width*height}")
When a file is uploaded to Amazon S3, it is processed with PyMuPDF as shown below. Text is chunked, while images and tables are stored as images and interpreted with a multimodal model to extract their content as text.
def load_document(file_type, key):
    s3r = boto3.resource("s3")
    doc = s3r.Object(s3_bucket, key)

    files = []
    tables = []
    contents = ""
    subject_company = rating_date = ""

    if file_type == 'pdf':
        Byte_contents = doc.get()['Body'].read()

        texts = []
        nImages = []
        try:
            # pdf reader
            reader = PdfReader(BytesIO(Byte_contents))

            # extract text
            imgList = []
            for i, page in enumerate(reader.pages):
                if i == 0 and pdf_profile == 'ocean':  # profile page
                    print('skip the first page!')
                    continue

                texts.append(page.extract_text())

                nImage = 0
                if '/Resources' in page:
                    print(f"Resources[{i}]: {page['/Resources']}")
                    if '/ProcSet' in page['/Resources']:
                        print(f"Resources/ProcSet[{i}]: {page['/Resources']['/ProcSet']}")
                    if '/XObject' in page['/Resources']:
                        print(f"Resources/XObject[{i}]: {page['/Resources']['/XObject']}")

                        for j, image in enumerate(page['/Resources']['/XObject']):
                            print(f"image[{j}]: {image}")

                            if image in imgList:
                                print('Duplicated...')
                                continue
                            else:
                                imgList.append(image)

                            Im = page['/Resources']['/XObject'][image]
                            print(f"{image}[{j}]: {Im}")

                            nImage = nImage + 1

                print(f"# of images of page[{i}] = {nImage}")
                nImages.append(nImage)

                # extract metadata
                if pdf_profile == 'ocean' and i == 1:
                    print("---> extract metadata from document")
                    pageText = page.extract_text()
                    print('pageText: ', pageText)

                    subject_company, rating_date_ori = get_profile_of_doc(pageText)
                    print('subject_company: ', subject_company)

                    from datetime import datetime
                    d = datetime.strptime(rating_date_ori, '%d %B %Y')
                    rating_date = str(d)[:10]
                    print('rating_date: ', rating_date)

            contents = '\n'.join(texts)

            pages = fitz.open(stream=Byte_contents, filetype='pdf')

            # extract table data
            table_count = 0
            for i, page in enumerate(pages):
                page_tables = page.find_tables()

                if page_tables.tables:
                    print('page_tables.tables: ', len(page_tables.tables))

                    for tab in page_tables.tables:
                        if tab.row_count >= 2:
                            table_image = extract_table_image(page, i, table_count, tab.bbox, key, subject_company, rating_date)
                            table_count += 1

                            tables.append({
                                "body": tab.to_markdown(),
                                "page": str(i),
                                "name": table_image
                            })
                            files.append(table_image)

            # extract page images
            if enablePageImageExraction == 'true':
                for i, page in enumerate(pages):
                    imgInfo = page.get_image_info()
                    width = height = 0
                    for j, info in enumerate(imgInfo):
                        bbox = info['bbox']

                    print(f"nImages[{i}]: {nImages[i]}")  # number of XObjects
                    if nImages[i] >= 4 or \
                        (nImages[i] >= 1 and (width == 0 and height == 0)) or \
                        (nImages[i] >= 1 and (width >= 100 or height >= 100)):
                        pixmap = page.get_pixmap(dpi=200)  # dpi=300

                        # convert to png
                        img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)

                        pixels = BytesIO()
                        img.save(pixels, format='PNG')
                        pixels.seek(0, 0)

                        # get path from key
                        objectName = (key[key.find(s3_prefix)+len(s3_prefix)+1:len(key)])
                        folder = s3_prefix+'/captures/'+objectName+'/'

                        fname = 'img_'+key.split('/')[-1].split('.')[0]+f"_{i}"
                        print('fname: ', fname)

                        if pdf_profile == 'ocean':
                            img_meta = {
                                "ext": 'png',
                                "page": str(i),
                                "company": subject_company,
                                "date": rating_date
                            }
                        else:
                            img_meta = {
                                "ext": 'png',
                                "page": str(i)
                            }
                        print('img_meta: ', img_meta)

                        response = s3_client.put_object(
                            Bucket=s3_bucket,
                            Key=folder+fname+'.png',
                            ContentType='image/png',
                            Metadata=img_meta,
                            Body=pixels
                        )

                        files.append(folder+fname+'.png')

                contents = '\n'.join(texts)

            elif enableImageExtraction == 'true':
                image_files = extract_images_from_pdf(reader, key)

                for img in image_files:
                    files.append(img)

        except Exception:
            err_msg = traceback.format_exc()
            print('err_msg: ', err_msg)
If the PDF contains image files, they are extracted with pypdf and stored in Amazon S3.
from pypdf import PdfReader

reader = PdfReader(BytesIO(Byte_contents))
image_files = extract_images_from_pdf(reader, key)

for img in image_files:
    files.append(img)

def extract_images_from_pdf(reader, key):
    picture_count = 1

    extracted_image_files = []
    print('pages: ', len(reader.pages))
    for i, page in enumerate(reader.pages):
        for image_file_object in page.images:
            img_name = image_file_object.name
            if img_name in extracted_image_files:
                print('skip....')
                continue
            extracted_image_files.append(img_name)

            ext = img_name.split('.')[-1]
            contentType = ""
            if ext == 'png':
                contentType = 'image/png'
            elif ext == 'jpg' or ext == 'jpeg':
                contentType = 'image/jpeg'
            elif ext == 'gif':
                contentType = 'image/gif'
            elif ext == 'bmp':
                contentType = 'image/bmp'
            elif ext == 'tiff' or ext == 'tif':
                contentType = 'image/tiff'
            elif ext == 'svg':
                contentType = 'image/svg+xml'
            elif ext == 'webp':
                contentType = 'image/webp'
            elif ext == 'ico':
                contentType = 'image/x-icon'
            elif ext == 'eps':
                contentType = 'image/eps'

            if contentType:
                image_bytes = image_file_object.data

                pixels = BytesIO(image_bytes)
                pixels.seek(0, 0)

                # get path from key
                objectName = (key[key.find(s3_prefix)+len(s3_prefix)+1:len(key)])
                folder = s3_prefix+'/files/'+objectName+'/'
                img_key = folder+img_name

                response = s3_client.put_object(
                    Bucket=s3_bucket,
                    Key=img_key,
                    ContentType=contentType,
                    Body=pixels
                )

                picture_count += 1
                extracted_image_files.append(img_key)

    return extracted_image_files
The text and tables extracted from the loaded document are collected as Document objects and added to OpenSearch, the vector store.
def store_document_for_opensearch(file_type, key):
    contents, files, tables, subject_company, rating_date = load_document(file_type, key)

    if len(contents) == 0:
        print('no contents: ', key)
        return [], files

    print('length: ', len(contents))

    docs = []

    # text
    docs.append(Document(
        page_content=contents,
        metadata={
            'name': key,
            'url': path+parse.quote(key),
            'subject_company': subject_company,
            'rating_date': rating_date
        }
    ))

    # table
    for table in tables:
        docs.append(Document(
            page_content=table['body'],
            metadata={
                'name': table['name'],
                'url': path+parse.quote(table['name']),
                'page': table['page'],
                'subject_company': subject_company,
                'rating_date': rating_date
            }
        ))

    ids = add_to_opensearch(docs, key)

    return ids, files
When documents are added to OpenSearch, chunking is performed as shown below.
def add_to_opensearch(docs, key):
    if len(docs) == 0:
        return []

    objectName = (key[key.find(s3_prefix)+len(s3_prefix)+1:len(key)])
    print('objectName: ', objectName)
    metadata_key = meta_prefix+objectName+'.metadata.json'
    print('meta file name: ', metadata_key)
    delete_document_if_exist(metadata_key)

    ids = []
    if enalbeParentDocumentRetrival == 'true':
        parent_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=100,
            separators=["\n\n", "\n", ".", " ", ""],
            length_function=len,
        )
        child_splitter = RecursiveCharacterTextSplitter(
            chunk_size=400,
            chunk_overlap=50,
            # separators=["\n\n", "\n", ".", " ", ""],
            length_function=len,
        )

        parent_docs = parent_splitter.split_documents(docs)
        print('len(parent_docs): ', len(parent_docs))
        print('parent chunk[0]: ', parent_docs[0].page_content)

        parent_docs = get_contexual_docs(docs[-1], parent_docs)
        print('parent contextual chunk[0]: ', parent_docs[0].page_content)

        if len(parent_docs):
            for i, doc in enumerate(parent_docs):
                doc.metadata["doc_level"] = "parent"

            try:
                parent_doc_ids = vectorstore.add_documents(parent_docs, bulk_size=10000)
                print('parent_doc_ids: ', parent_doc_ids)
                print('len(parent_doc_ids): ', len(parent_doc_ids))

                child_docs = []
                for i, doc in enumerate(parent_docs):
                    _id = parent_doc_ids[i]
                    sub_docs = child_splitter.split_documents([doc])
                    for _doc in sub_docs:
                        _doc.metadata["parent_doc_id"] = _id
                        _doc.metadata["doc_level"] = "child"

                    child_docs.extend(sub_docs)

                print('child chunk[0]: ', child_docs[0].page_content)

                child_docs = get_contexual_docs(docs[-1], child_docs)
                print('child contextual chunk[0]: ', child_docs[0].page_content)

                child_doc_ids = vectorstore.add_documents(child_docs, bulk_size=10000)
                print('child_doc_ids: ', child_doc_ids)
                print('len(child_doc_ids): ', len(child_doc_ids))

                ids = parent_doc_ids + child_doc_ids
            except Exception:
                err_msg = traceback.format_exc()
                print('error message: ', err_msg)
    else:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            separators=["\n\n", "\n", ".", " ", ""],
            length_function=len,
        )

        documents = text_splitter.split_documents(docs)
        print('len(documents): ', len(documents))

        if len(documents):
            if enableContexualRetrieval == 'true':
                print('chunk[0]: ', documents[0].page_content)

                documents = get_contexual_docs(docs[-1], documents)
                print('contextual chunk[0]: ', documents[0].page_content)
            else:
                print('documents[0]: ', documents[0])

            try:
                ids = vectorstore.add_documents(documents, bulk_size=10000)
                print('response of adding documents: ', ids)
            except Exception:
                err_msg = traceback.format_exc()
                print('error message: ', err_msg)
                # raise Exception("Not able to add docs in opensearch")

    print('len(ids): ', len(ids))
    return ids
As with Contextual Retrieval, adding a description to each chunk using contextual embedding can improve retrieval accuracy. BM25 (keyword) search can also be implemented through OpenSearch hybrid search. For the full code, refer to lambda_function.py.
def get_contexual_docs(whole_doc, splitted_docs):
    contextual_template = (
        "<document>"
        "{WHOLE_DOCUMENT}"
        "</document>"
        "Here is the chunk we want to situate within the whole document."
        "<chunk>"
        "{CHUNK_CONTENT}"
        "</chunk>"
        "Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk."
        "Answer only with the succinct context and nothing else."
        "Put it in <result> tags."
    )

    contextual_prompt = ChatPromptTemplate([
        ('human', contextual_template)
    ])

    docs = []
    for i, doc in enumerate(splitted_docs):
        chat = get_contexual_retrieval_chat()

        contexual_chain = contextual_prompt | chat
        response = contexual_chain.invoke(
            {
                "WHOLE_DOCUMENT": whole_doc.page_content,
                "CHUNK_CONTENT": doc.page_content
            }
        )
        output = response.content
        contextualized_chunk = output[output.find('<result>')+8:len(output)-9]

        docs.append(
            Document(
                page_content=contextualized_chunk + "\n\n" + doc.page_content,
                metadata=doc.metadata
            )
        )
    return docs
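The hybrid (BM25 plus vector) search mentioned above is not shown in this section. A minimal sketch, assuming an opensearch-py client, the default "text" field written by OpenSearchVectorSearch, and a simple reciprocal rank fusion of the two result lists, could look like this; the client, index name, and field names are assumptions:

from opensearchpy import OpenSearch

# Hypothetical client, as in the earlier sketch; host and auth come from the environment.
os_client = OpenSearch(hosts=[opensearch_url], http_auth=(opensearch_account, opensearch_passwd))

def lexical_search(query: str, top_k: int):
    # Plain BM25 match query on the text field.
    body = {"query": {"match": {"text": query}}, "size": top_k}
    response = os_client.search(index=index_name, body=body)
    return [hit["_source"]["text"] for hit in response["hits"]["hits"]]

def hybrid_search(vectorstore, query: str, top_k: int = 4, k: int = 60):
    # Combine lexical (BM25) and vector results with reciprocal rank fusion.
    scores = {}
    vector_hits = [d.page_content for d, _ in
                   vectorstore.similarity_search_with_score(query=query, k=top_k)]
    for hits in (lexical_search(query, top_k), vector_hits):
        for rank, text in enumerate(hits):
            scores[text] = scores.get(text, 0) + 1.0 / (k + rank + 1)
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [text for text, _ in ranked][:top_k]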
The case below is data about a company's share ownership; the original chunk simply enumerates ownership percentages.
structure as of 3 January 2024 (date of last disclosure) is as follows:
Suzano Holding SA, Brazil - 27.76%
David Feffer - 4.04%
Daniel Feffer - 3.63%
Jorge Feffer - 3.60%
Ruben Feffer - 3.54%
Alden Fundo De Investimento Em Aรงรตes, Brazil - 1.98%
Other investors hold the remaining 55.45%
Suzano Holding SA is majority-owned by the founding Feffer family
Ultimate Beneficial Owners
and/or Persons with Significant
ControlFilings show that the beneficial owners/persons with significant control
are members of the Feffer family, namely David Feffer, Daniel Feffer,
Jorge Feffer, and Ruben Feffer
Directors Executive Directors:
Walter Schalka - Chief Executive Officer
Aires Galhardo - Executive Officer - Pulp Operation
Carlos Anรญbal de Almeida Jr - Executive Officer - Forestry, Logistics and
Procurement
Christian Orglmeister - Executive Officer - New Businesses, Strategy, IT,
Digital and Communication
Below is the contextualized chunk. It includes the company name and ownership information that the original chunk lacked.
This chunk provides details on the ownership structure and key executives of Suzano SA,
the company that is the subject of the overall document.
It is likely included to provide background information on the company's corporate structure and leadership.
Below is a chunk containing a company's financial information.
Type of Compilation Consolidated Consolidated Consolidated
Currency / UnitsBRL โ000 (USD 1 =
BRL 5.04)BRL โ000 (USD 1 =
BRL 5.29)BRL โ000 (USD 1 =
BRL 5.64)
Turnover 29,384,030 49,830,946 40,965,431
Gross results 11,082,919 25,009,658 20,349,843
Depreciation (5,294,748) (7,206,125) (6,879,132)
Operating profit (loss) 9,058,460 22,222,781 18,180,191
Interest income 1,215,644 967,010 272,556
Interest expense (3,483,674) (4,590,370) (4,221,301)
Other income (expense) 3,511,470 6,432,800 (9,347,234)
Profit (loss) before tax 12,569,930 8,832,957 (17,642,129)
Tax (2,978,271) (197,425) (6,928,009)
Net profit (loss) 9,591,659 23,394,887 8,635,532
Net profit (loss) attributable to
minorities/non-controlling
interests14,154 13,270 9,146
Net profit (loss) attributable to the
company9,575,938 23,119,235 8,751,864
Long-term assets 103,391,275 96,075,318 84,872,211
Fixed assets 57,718,542 50,656,634 38,169,703
Goodwill and other intangibles 14,877,234 15,192,971 16,034,339
Below is the contextualized chunk; it provides information, including the company name, that was missing from the chunk.
This chunk provides detailed financial information about Suzano SA,
including its turnover, gross results, operating profit, net profit, and asset details.
It is part of the overall assessment and rating of Suzano SA presented in the document.
Below is a chunk with the company's contact information.
|Telephone|+55 11 3503&#45;9000|
|Email|ri@suzano.com.br|
|Company Details||
|Company Type|Publicly Listed|
|Company Status|Operating|
|Sector|Industrial|
|Place of Incorporation|Brazil|
|Region of Incorporation|Bahia|
|Date of Incorporation|17 December 1987|
|Company Registered Number|CNPJ (Tax Id. No.): 16.404.287/0001&#45;55|
The contextualized chunk for this case is shown below; it can provide information about the company's contact details.
This chunk provides detailed company information about Suzano SA,
including its contact details, company type, status, sector, place and date of incorporation, and registered number.
This information is part of the overall assessment and rating of Suzano SA presented in the document.
Images are resized so that the LLM can process them, and then text is extracted. At this point, information such as the company name is provided so the LLM can extract the document's content.
def store_image_for_opensearch(key, page, subject_company, rating_date):
    image_obj = s3_client.get_object(Bucket=s3_bucket, Key=key)

    image_content = image_obj['Body'].read()
    img = Image.open(BytesIO(image_content))

    width, height = img.size

    pos = key.rfind('/')
    prefix = key[pos+1:pos+5]
    print('img_prefix: ', prefix)
    if pdf_profile == 'ocean' and prefix == "img_":
        area = (0, 175, width, height-175)
        img = img.crop(area)

        width, height = img.size
        print(f"(cropped) width: {width}, height: {height}, size: {width*height}")

    if width < 100 or height < 100:  # skip small size image
        return []

    isResized = False
    while (width * height > 5242880):
        width = int(width/2)
        height = int(height/2)
        isResized = True
        print(f"(resized) width: {width}, height: {height}, size: {width*height}")

    try:
        if isResized:
            img = img.resize((width, height))

        buffer = BytesIO()
        img.save(buffer, format="PNG")
        img_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

        # extract text from the image
        chat = get_multimodal()
        text = extract_text(chat, img_base64, subject_company)
        extracted_text = text[text.find('<result>')+8:len(text)-9]  # remove <result> tag

        summary = summary_image(chat, img_base64, subject_company)
        image_summary = summary[summary.find('<result>')+8:len(summary)-9]  # remove <result> tag

        if len(extracted_text) > 30:
            contents = f"[์ด๋ฏธ์ง ์์ฝ]\n{image_summary}\n\n[์ถ์ถ๋ ํ
์คํธ]\n{extracted_text}"
        else:
            contents = f"[์ด๋ฏธ์ง ์์ฝ]\n{image_summary}"
        print('image contents: ', contents)

        docs = []
        if len(contents) > 30:
            docs.append(
                Document(
                    page_content=contents,
                    metadata={
                        'name': key,
                        'url': path+parse.quote(key),
                        'page': page,
                        'subject_company': subject_company,
                        'rating_date': rating_date
                    }
                )
            )
        print('docs size: ', len(docs))

        return add_to_opensearch(docs, key)

    except Exception:
        err_msg = traceback.format_exc()
        print('error message: ', err_msg)

        return []
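extract_text and summary_image are referenced above but not shown here. A minimal sketch of summary_image, assuming a LangChain multimodal chat model that accepts base64 image content blocks (the prompt wording is illustrative), could be:

from langchain_core.messages import HumanMessage

def summary_image(chat, img_base64, subject_company):
    # Ask the multimodal model to summarize the image; the company name gives it context.
    query = (
        f"์ด ์ด๋ฏธ์ง๋ {subject_company}์ ๊ด๋ จ๋ ๋ฌธ์์ ์ผ๋ถ์
๋๋ค. "
        "์ด๋ฏธ์ง์ ํฌํจ๋ ๊ทธ๋ฆผ๊ณผ ํ์ ๋ด์ฉ์ ์์ฝํ๊ณ , ๊ฒฐ๊ณผ๋ฅผ <result> tag์ ๋ฃ์ด์ฃผ์ธ์."
    )
    message = HumanMessage(content=[
        {"type": "text", "text": query},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}},
    ])
    response = chat.invoke([message])
    return response.content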
The process of generating the report from the document's outline and drafts uses the plan-and-execute pattern together with reflection.
The plan-and-execute pattern is useful for avoiding duplicated sentences and producing natural transitions, because previously generated text can be referenced. Document retrieval and generation are organized as a workflow.
def buildPlanAndExecuteOceanWorkflow():
    workflow = StateGraph(State)

    # Add nodes
    workflow.add_node("plan", plan_node)
    workflow.add_node("retrieve", retrieve_node)
    workflow.add_node("generate", generate_node)
    workflow.add_node("revise_answers", revise_answers)  # reflection

    # Add edges
    workflow.add_edge(START, "plan")
    workflow.add_edge("plan", "retrieve")
    workflow.add_edge("retrieve", "generate")
    workflow.add_edge("generate", "revise_answers")
    workflow.add_edge("revise_answers", END)

    return workflow.compile()
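The State used by this workflow is not shown in this section. Judging from the node functions below, it carries at least the following fields; this sketch is an assumption for readability:

from typing import List, TypedDict

class State(TypedDict, total=False):
    subject_company: str           # company name entered by the user
    planning_steps: List[str]      # report outline produced by plan_node
    sub_queries: List[List[str]]   # keywords per outline section
    relevant_contexts: List[str]   # retrieved context per section
    references: list               # retrieved Documents used as references
    drafts: List[str]              # generated section drafts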
The workflow for the reflection pattern is as follows.
def buildReflection():
    workflow = StateGraph(ReflectionState)

    # Add nodes
    workflow.add_node("reflect_node", reflect_node)
    workflow.add_node("revise_draft", revise_draft)

    # Set entry point
    workflow.set_entry_point("reflect_node")

    workflow.add_conditional_edges(
        "revise_draft",
        should_continue,
        {
            "end": END,
            "continue": "reflect_node"
        }
    )

    # Add edges
    workflow.add_edge("reflect_node", "revise_draft")

    return workflow.compile()
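ReflectionState and the should_continue used for this loop are not shown in this section. A minimal sketch, assuming the loop stops after a fixed number of revisions (the field names and the limit are assumptions), could look like this:

class ReflectionState(TypedDict, total=False):
    draft: str           # current draft of a section
    reflection: list     # critique and keywords produced by reflect_node
    revision_number: int
    max_revisions: int

def should_continue(state: ReflectionState) -> str:
    # Stop revising once the configured number of revisions is reached.
    if state["revision_number"] > state.get("max_revisions", 1):
        return "end"
    return "continue"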
The report plan uses a predefined outline and keywords, as shown below.
def plan_node(state: State):
    print('###### plan_node ######')
    subject_company = state["subject_company"]

    planning_steps = [
        "1. ํ์ฌ ์๊ฐ",
        "2. ์ฃผ์ ์์
 ํ๋",
        "3. ์ฌ๋ฌด ํํฉ",
        "4. ์ ๋ ํํฉ",
        "5. ์ข
ํฉ ํ๊ฐ"
    ]

    sub_queries = [
        [
            "establish",
            "location",
            "management",
            "affiliated"
        ],
        [
            "cargo",
            "route",
            "owned/chartered",
            "strategy"
        ],
        [
            "financial performance",
            "route",
            "financial risk",
            "payment"
        ],
        [
            "fleet"
        ],
        [
            "rating",
            "assessment"
        ]
    ]

    return {
        "subject_company": subject_company,
        "planning_steps": planning_steps,
        "sub_queries": sub_queries
    }
Retrieval is performed by the retrieve node, which queries RAG with the sub-queries. To improve speed, parallel processing across multiple regions is applied.
def retrieve_node(state: State):
    print('###### retrieve_node ######')
    subject_company = state["subject_company"]
    planning_steps = state["planning_steps"]
    print(f"subject_company: {subject_company}, planning_steps: {planning_steps}")

    relevant_contexts = []
    references = []
    sub_queries = state["sub_queries"]

    for i, step in enumerate(planning_steps):
        print(f"{i}: {step}")

        contents = ""
        if multi_region == 'enable':
            relevant_docs = retrieve_for_parallel_processing(sub_queries[i], subject_company)
            for doc in relevant_docs:
                contents += doc.page_content
            references += relevant_docs
        else:
            for q in sub_queries[i]:
                docs = retrieve(q, subject_company)
                print(f"---> q: {sub_queries[i]}, docs: {docs}")
                for doc in docs:
                    contents += doc.page_content
                references += docs

        relevant_contexts.append(contents)

    return {
        "subject_company": subject_company,
        "planning_steps": planning_steps,
        "relevant_contexts": relevant_contexts,
        "references": references
    }
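retrieve_for_parallel_processing is not shown in this section. A minimal sketch that runs the per-keyword retrieve calls concurrently with a thread pool (the real implementation additionally spreads requests across several regions) could be:

from concurrent.futures import ThreadPoolExecutor

def retrieve_for_parallel_processing(queries, subject_company):
    # Run retrieve() for each sub-query concurrently and flatten the results.
    relevant_docs = []
    with ThreadPoolExecutor(max_workers=len(queries)) as executor:
        futures = [executor.submit(retrieve, q, subject_company) for q in queries]
        for future in futures:
            relevant_docs += future.result()
    return relevant_docs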
The RAG retrieval is shown below. Both hierarchical and fixed-size chunking are implemented, but hierarchical chunking is used to improve retrieval performance. Among the retrieved documents, those that best match the search term are selected and used.
def retrieve(query: str, subject_company: str):
    print(f'###### retrieve: {query} ######')
    global reference_docs

    top_k = 4
    docs = []

    bedrock_embedding = get_embedding()

    vectorstore_opensearch = OpenSearchVectorSearch(
        index_name=index_name,
        is_aoss=False,
        ef_search=1024,  # 512 (default)
        m=48,
        # engine="faiss",  # default: nmslib
        embedding_function=bedrock_embedding,
        opensearch_url=opensearch_url,
        http_auth=(opensearch_account, opensearch_passwd),  # http_auth=awsauth,
    )

    if enalbeParentDocumentRetrival == 'true':  # parent/child chunking
        relevant_documents = get_documents_from_opensearch_for_subject_company(vectorstore_opensearch, query, top_k, subject_company)

        for i, document in enumerate(relevant_documents):
            parent_doc_id = document[0].metadata['parent_doc_id']
            doc_level = document[0].metadata['doc_level']

            excerpt, name, url = get_parent_content(parent_doc_id)  # use parent document
            docs.append(
                Document(
                    page_content=excerpt,
                    metadata={
                        'name': name,
                        'url': url,
                        'doc_level': doc_level,
                        'from': 'vector'
                    },
                )
            )
    else:
        boolean_filter = {
            "bool": {
                "filter": [
                    {"match": {"metadata.subject_company": subject_company}},
                    {"term": {"metadata.doc_level": "child"}}
                ]
            }
        }
        relevant_documents = vectorstore_opensearch.similarity_search_with_score(
            query=query,
            k=top_k,
            search_type="script_scoring",
            pre_filter=boolean_filter
        )

        for i, document in enumerate(relevant_documents):
            name = document[0].metadata['name']
            url = document[0].metadata['url']
            content = document[0].page_content

            docs.append(
                Document(
                    page_content=content,
                    metadata={
                        'name': name,
                        'url': url,
                        'from': 'vector'
                    },
                )
            )

    filtered_docs = grade_documents(query, docs)  # grading
    filtered_docs = check_duplication(filtered_docs)  # check duplication

    reference_docs += filtered_docs  # add to reference

    return filtered_docs
Each part of the report is generated using the report instruction for the topic, the report steps, the text written so far, and the reference documents, as shown below.
def generate_node(state: State):
    print('###### generate_node ######')

    write_template = (
        "๋น์ ์ ๊ธฐ์
์ ๋ํ ๋ณด๊ณ ์๋ฅผ ์์ฑํ๋ ํ๋ฅญํ ๊ธ์ฐ๊ธฐ ๋์ฐ๋ฏธ์
๋๋ค."
        "์๋์ ๊ฐ์ด ์๋ณธ ๋ณด๊ณ ์ ์ง์์ฌํญ๊ณผ ๊ณํํ ๋ณด๊ณ ์ ๋จ๊ณ๋ฅผ ์ ๊ณตํ๊ฒ ์ต๋๋ค."
        "๋ํ ์ ๊ฐ ์ด๋ฏธ ์์ฑํ ํ
์คํธ๋ฅผ ์ ๊ณตํฉ๋๋ค."
        "๋ณด๊ณ ์ ์ง์์ฌํญ:"
        "<instruction>"
        "{instruction}"
        "</instruction>"
        "๋ณด๊ณ ์ ๋จ๊ณ:"
        "<plan>"
        "{plan}"
        "</plan>"
        "์ด๋ฏธ ์์ฑํ ํ
์คํธ:"
        "<text>"
        "{text}"
        "</text>"
        "์ฐธ๊ณ  ๋ฌธ์:"
        "<context>"
        "{context}"
        "</context>"
        "๋ณด๊ณ ์ ์ง์ ์ฌํญ, ๋ณด๊ณ ์ ๋จ๊ณ, ์ด๋ฏธ ์์ฑ๋ ํ
์คํธ, ์ฐธ๊ณ  ๋ฌธ์๋ฅผ ์ฐธ์กฐํ์ฌ ๋ค์ ๋จ๊ณ๋ฅผ ๊ณ์ ์์ฑํฉ๋๋ค."
        "๊ธฐ์
์ ๋ํ ๊ตฌ์ฒด์ ์ธ ์ ๋ณด๋ ๋ฐ๋์ ์ฐธ๊ณ  ๋ฌธ์๋ฅผ ์ด์ฉํด ์์ฑํ๊ณ , ๋ชจ๋ฅด๋ ๋ถ๋ถ์ ํฌํจํ์ง ์์ต๋๋ค."
        "๋ค์ ๋จ๊ณ:"
        "<step>"
        "{STEP}"
        "</step>"
        "๋ณด๊ณ ์์ ๋ด์ฉ์ด ๋์ด์ง์ง ์๊ณ  ์ ์ดํด๋๋๋ก ํ๋์ ๋ฌธ๋จ์ ์ถฉ๋ถํ ๊ธธ๊ฒ ์์ฑํฉ๋๋ค."
        "ํ์ํ๋ค๋ฉด ์์ ์์  ๋ถ์ ๋ฅผ ์ถ๊ฐํ  ์ ์์ต๋๋ค."
        "์ด๋ฏธ ์์ฑ๋ ํ
์คํธ๋ฅผ ๋ฐ๋ณตํ์ง ๋ง๊ณ  ์์ฑํ ๋ฌธ๋จ๋ง ์ถ๋ ฅํ์ธ์."
        "Markdown ํฌ๋งท์ผ๋ก ์์์ ์์ฑํ์ธ์."
        "์ต์ข
 ๊ฒฐ๊ณผ์ <result> tag๋ฅผ ๋ถ์ฌ์ฃผ์ธ์."
    )

    write_prompt = ChatPromptTemplate.from_messages([
        ("human", write_template)
    ])

    instruction = f"{state['subject_company']} ํ์ฌ์ ๋ํด ์๊ฐํด ์ฃผ์ธ์."
    planning_steps = state["planning_steps"]

    text = ""
    drafts = []
    for i, step in enumerate(planning_steps):
        context = state["relevant_contexts"][i]

        chat = get_chat()
        write_chain = write_prompt | chat
        try:
            result = write_chain.invoke({
                "instruction": instruction,
                "plan": planning_steps,
                "text": text,
                "context": context,
                "STEP": step
            })
            output = result.content
            draft = output[output.find('<result>')+8:len(output)-9]  # remove <result> tag

            if draft.find('#') != -1 and draft.find('#') != 0:
                draft = draft[draft.find('#'):]

            text += draft + '\n\n'
            drafts.append(draft)
        except Exception:
            err_msg = traceback.format_exc()
            print('error message: ', err_msg)

            raise Exception("Not able to request to LLM")

    return {
        "drafts": drafts
    }
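revise_answers, the reflection node referenced in buildPlanAndExecuteOceanWorkflow, is not shown in this section. A minimal sketch, assuming it runs the compiled reflection graph once per draft and that the revised text comes back in the "draft" field (these field names and the revision limit are assumptions), could be:

def revise_answers(state: State):
    # Run the reflection sub-graph for every draft and collect the revised versions.
    reflection_app = buildReflection()

    revised_drafts = []
    for draft in state["drafts"]:
        output = reflection_app.invoke({
            "draft": draft,
            "revision_number": 1,
            "max_revisions": 1,  # assumed limit, see should_continue above
        })
        revised_drafts.append(output["draft"])  # assumed output field

    return {"drafts": revised_drafts}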
To use this solution, the following preparation is required in advance.
- Prepare an account by following "AWS Account ์์ฑ" (creating an AWS account).
This hands-on uses the us-west-2 region. Deploy the infrastructure with the AWS CDK by following "์ธํ๋ผ ์ค์น" (infrastructure setup).
Select "RAG" from the chat menu, enter "Suzano์ ๋ํ ์ฐ๋ฝ์ฒ ์ ๋ณด๋?" (What is Suzano's contact information?), and check the result.
Select "Agentic RAG" from the chat menu and enter "Suzano๋ ์ด๋ค ํ์ฌ์ด์ง?" (What kind of company is Suzano?); as shown below, you can check the information obtained through RAG and web search, together with the related documents.
Go to the "RAG (OpenSearch)" menu and enter "Suzano์ Delta Corp Shipping์ ๋น๊ตํด์ฃผ์ธ์." (Compare Suzano and Delta Corp Shipping.). Even when a user's question contains items that require two separate searches, RAG searches with the question as-is, so only part of the material may be retrieved, as shown below.
Now enter "Delta Corp Shipping์ ๋ํด ์ค๋ช
ํด์ฃผ์ธ์." (Describe Delta Corp Shipping.); as shown below, RAG clearly has enough information about this company.
Next, move to the "Agentic RAG" menu and enter "Suzano์ Delta Corp Shipping์ ๋น๊ตํด์ฃผ์ธ์." again. While plain RAG could not handle a question that requires two searches, Agentic RAG compares the two companies properly, as shown below.
Checking the LangSmith logs shows that OpenSearch was queried separately for "Suzano" and "Delta Corp Shipping" and the final answer was produced from those results. Using query decomposition in this way can improve RAG retrieval results.
Selecting "Ocean Agent" from the menu produces an answer with plan-and-execute. Enter only the company name, "Suzano SA", in the input field. As in the result below, clicking the preview link shares the document content as HTML. Clicking the download link downloads the file in markdown format, which can then be shared via GitHub or email. The markdown document can be converted to other formats such as PDF as needed.
Selecting "Ocean Agent (Reflection)" and entering "Suzano SA" gives a result, as in the link below, with both plan-and-execute and reflection applied. It provides more detailed and richer information than plan-and-execute alone, but reflection requires more tokens and execution time.
Using OpenSearch, we built RAG and generated reports by analyzing company information stored as PDFs. By composing an agentic workflow, complex multi-step tasks can be implemented easily and workflows can be tailored to the purpose. To manage the infrastructure efficiently, OpenSearch was deployed with the AWS CDK, and the system was built around serverless services, which help with maintenance and with handling variable traffic.
Hallucination should be checked by comparing the original document with the generated one. Since there are cases where the base knowledge of Anthropic's Sonnet model is referenced, stronger restrictions in the prompt may be needed if hallucination becomes a problem.
Searching for "Panama Shipping" may also retrieve "Delta Corp Shipping" as a similar document. Because a summary of the beginning of the document is provided through contextual retrieval, this is unlikely to cause problems, but when documents with similar names are retrieved together it can affect the result. If it does become a problem, change the search from match to term, as sketched after the code below. With term, the company name must be entered exactly as its full name.
boolean_filter = {
    "bool": {
        "filter": [
            {"match": {"metadata.subject_company": subject_company}},
            {"term": {"metadata.doc_level": "child"}}
        ]
    }
}
result = vectorstore_opensearch.similarity_search_with_score(
    query=query,
    k=top_k*2,
    search_type="script_scoring",
    pre_filter=boolean_filter
)
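For reference, a sketch of the stricter term-based filter mentioned above would look like the following. Depending on the index mapping, a keyword sub-field (e.g. metadata.subject_company.keyword) may be needed for an exact match; that field name is an assumption.

# Hypothetical variant: filter the company name with an exact term match instead of match.
boolean_filter = {
    "bool": {
        "filter": [
            {"term": {"metadata.subject_company": subject_company}},
            {"term": {"metadata.doc_level": "child"}}
        ]
    }
}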
When the infrastructure is no longer needed, all resources can be deleted as follows.
- Go to the API Gateway console and delete "api-chatbot-for-ocean-agent" and "api-ocean-agent".
- Go to the Cloud9 console and delete everything with the command below.
cd ~/environment/ocean-agent/cdk-ocean-agent/ && cdk destroy --all