# AWS service clients are created once at module scope so that warm Lambda
# invocations reuse the same connections instead of reconstructing them per call.
s3 = boto3.client('s3')
ses = boto3.client('ses')
transcribe = boto3.client('transcribe')
# NOTE(review): Bedrock runtime region is hard-coded to us-east-1 — confirm this
# matches the deployment region / model availability before relying on it.
bedrock = boto3.client(service_name='bedrock-runtime', region_name='us-east-1')

# S3 bucket holding the recorded media; injected via the Lambda environment.
# Raises KeyError at import time if the variable is not configured (fail fast).
media_bucket_name = os.environ['MEDIA_BUCKET_NAME']
@@ -127,6 +128,20 @@ def send_email(to, subject, body):
127
128
}
128
129
)
129
130
131
def split_string_into_chunks(input_string, chunk_size=50000):
    """Split *input_string* into consecutive chunks of at most *chunk_size* characters.

    Used to keep each piece of a long transcript under the prompt-size limit
    of the downstream model. Note: Python ``str`` slicing counts characters,
    not bytes — for non-ASCII text a chunk may exceed ``chunk_size`` bytes
    when encoded.

    Args:
        input_string (str): The original string to be split.
        chunk_size (int): Maximum length of each chunk (default 50,000).

    Returns:
        list: Substrings of ``input_string`` in order. The final chunk may be
        shorter than ``chunk_size``; an empty input yields an empty list.
    """
    return [input_string[i:i + chunk_size]
            for i in range(0, len(input_string), chunk_size)]
130
145
131
146
def lambda_handler (event , context ):
132
147
transcription_job_name = event ['detail' ]['TranscriptionJobName' ]
@@ -158,6 +173,7 @@ def lambda_handler(event, context):
158
173
subject = f"Transcription failed for { media_uri } "
159
174
body = f"Reason: { transcription_job ['FailureReason' ]} "
160
175
send_email (to , subject , body )
176
+
161
177
elif transcription_job_status == 'COMPLETED' :
162
178
# If the job is complete, get the transcript and send it in an email
163
179
print (f"Transcription job complete: { transcription_job_name } " )
@@ -168,18 +184,64 @@ def lambda_handler(event, context):
168
184
169
185
parsed_transcript = parse_transcript_data (transcript_data )
170
186
187
+ print (len (parsed_transcript ))
188
+
189
+
190
+ # We need to chunk the transcript to 50K bytes, to keep under the prompt limits for this Claud model.
191
+
192
+ chunks = split_string_into_chunks (parsed_transcript )
193
+ overall_summary = ""
194
+
195
+ for chunk in chunks :
196
+
197
+ llm_start = "Human: Here is a recording of a meeting. Start of Transcript:" + chunk
198
+
199
+ llm_end = """End of Transcript.
200
+ Provide a chronology of the meeting, with time windows and topics. For each topic, provide a summary from the perspective of the meeting organizer.
201
+ If possible, identify speakers by name.
202
+ Then list any action items and next steps.
203
+ Assistant:"""
204
+
205
+ concatenation = llm_start + llm_end
206
+
207
+ print ("Processing 50K chunk" )
208
+ print (len (concatenation ))
209
+
210
+ prompt = json .dumps ({
211
+ "prompt" : concatenation ,
212
+ "max_tokens_to_sample" : 4096 ,
213
+ "top_p" : 0.999 ,
214
+ "top_k" : 250 ,
215
+ "temperature" : 0.8 :
216
+ })
217
+
218
+ model = "anthropic.claude-instant-v1"
219
+ #model = "anthropic.claude-v1"
220
+
221
+ response = bedrock .invoke_model (body = prompt , modelId = model )
222
+ print (response )
223
+ response_body = json .loads (response .get ("body" ).read ())
224
+
225
+ overall_summary += response_body .get ("completion" ) + """
226
+ \n ========================== End of Segment (~ 35 minutes) ========================== \n
227
+ """
228
+
171
229
body = '' .join ([
172
230
f"Original file: { s3_key_from_url (media_uri )} " ,
173
- '\n \n ' ,
174
- 'Below are a parsed and raw transcript of your audio.' ,
175
- '\n \n ==========================\n \n ' ,
176
- parsed_transcript ,
177
- '\n \n ==========================\n \n ' ,
178
- transcript
231
+ '\n ' ,
232
+ 'Below is a summary of your recorded Chime meeting.' ,
233
+ ' Note: If your meetig is longet than (approx) 45 minutes, the output is processed in chunks'
234
+ '\n ========================== First Meeting Segment ( ~35 minutes) ========================== \n ' ,
235
+ overall_summary ,
236
+ '\n ==========================\n ' ,
237
+ 'Below is the transcript of your recorded Chime meeting. \n \n ' ,
238
+ parsed_transcript
239
+ # chunk
179
240
])
180
241
181
242
print (f"Sending notification to { notification_email } " )
182
243
send_email (notification_email , 'Transcript is complete' , body )
244
+
183
245
else :
184
246
# TODO
185
247
pass
0 commit comments