Skip to content

Commit 7d3e7aa

Browse files
authored
Update lambda_function.py
1 parent 5d3613b commit 7d3e7aa

File tree

1 file changed

+68
-6
lines changed

1 file changed

+68
-6
lines changed

src/transcription-job-state-change/lambda_function.py

Lines changed: 68 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
s3 = boto3.client('s3')
99
ses = boto3.client('ses')
1010
transcribe = boto3.client('transcribe')
11+
bedrock = boto3.client(service_name='bedrock-runtime', region_name='us-east-1')
1112

1213
media_bucket_name = os.environ['MEDIA_BUCKET_NAME']
1314

@@ -127,6 +128,20 @@ def send_email(to, subject, body):
127128
}
128129
)
129130

131+
def split_string_into_chunks(input_string, chunk_size=50000):
    """Split a string into consecutive chunks of at most ``chunk_size`` characters.

    Args:
        input_string (str): The original string to be split.
        chunk_size (int): Maximum length of each chunk, counted in characters
            (string slicing does not measure encoded bytes). Defaults to
            50,000. Must be positive.

    Returns:
        list: Substrings of ``input_string`` in their original order, each
        at most ``chunk_size`` characters long. An empty input yields an
        empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop the whole
    # input; a zero step raises an unrelated-looking error. Fail clearly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    return [
        input_string[start:start + chunk_size]
        for start in range(0, len(input_string), chunk_size)
    ]
143+
144+
130145

131146
def lambda_handler(event, context):
132147
transcription_job_name = event['detail']['TranscriptionJobName']
@@ -158,6 +173,7 @@ def lambda_handler(event, context):
158173
subject = f"Transcription failed for {media_uri}"
159174
body = f"Reason: {transcription_job['FailureReason']}"
160175
send_email(to, subject, body)
176+
161177
elif transcription_job_status == 'COMPLETED':
162178
# If the job is complete, get the transcript and send it in an email
163179
print(f"Transcription job complete: {transcription_job_name}")
@@ -168,18 +184,64 @@ def lambda_handler(event, context):
168184

169185
parsed_transcript = parse_transcript_data(transcript_data)
170186

187+
print(len(parsed_transcript))
188+
189+
190+
# We need to chunk the transcript into pieces of at most 50K characters, to keep under the prompt limits for this Claude model.
191+
192+
chunks = split_string_into_chunks(parsed_transcript)
193+
overall_summary = ""
194+
195+
for chunk in chunks:
196+
197+
llm_start = "Human: Here is a recording of a meeting. Start of Transcript:" + chunk
198+
199+
llm_end = """End of Transcript.
200+
Provide a chronology of the meeting, with time windows and topics. For each topic, provide a summary from the perspective of the meeting organizer.
201+
If possible, identify speakers by name.
202+
Then list any action items and next steps.
203+
Assistant:"""
204+
205+
concatenation = llm_start + llm_end
206+
207+
print("Processing 50K chunk")
208+
print(len(concatenation))
209+
210+
# Serialize the request payload that is passed as `body` to
# bedrock.invoke_model. The original had a stray ':' after the temperature
# value, which is a syntax error and prevented the module from loading.
prompt = json.dumps({
    "prompt": concatenation,
    "max_tokens_to_sample": 4096,
    "top_p": 0.999,
    "top_k": 250,
    "temperature": 0.8,
})
217+
218+
model = "anthropic.claude-instant-v1"
219+
#model = "anthropic.claude-v1"
220+
221+
response = bedrock.invoke_model(body=prompt, modelId=model)
222+
print(response)
223+
response_body = json.loads(response.get("body").read())
224+
225+
overall_summary += response_body.get("completion") + """
226+
\n ========================== End of Segment (~ 35 minutes) ========================== \n
227+
"""
228+
171229
body = ''.join([
172230
f"Original file: {s3_key_from_url(media_uri)}",
173-
'\n\n',
174-
'Below are a parsed and raw transcript of your audio.',
175-
'\n\n==========================\n\n',
176-
parsed_transcript,
177-
'\n\n==========================\n\n',
178-
transcript
231+
'\n',
232+
'Below is a summary of your recorded Chime meeting.',
233+
' Note: If your meetig is longet than (approx) 45 minutes, the output is processed in chunks'
234+
'\n========================== First Meeting Segment ( ~35 minutes) ========================== \n',
235+
overall_summary,
236+
'\n==========================\n',
237+
'Below is the transcript of your recorded Chime meeting. \n\n',
238+
parsed_transcript
239+
# chunk
179240
])
180241

181242
print(f"Sending notification to {notification_email}")
182243
send_email(notification_email, 'Transcript is complete', body)
244+
183245
else:
184246
# TODO
185247
pass

0 commit comments

Comments
 (0)