import math

import assemblyai as aai
from openai import OpenAI

from openai_client import OpenAIClient

# Model requested for every completion issued by this module.
_MODEL = 'gpt-4o'

# Printed after each model answer to visually separate outputs on the console.
_OUTPUT_SEPARATOR = "--------------------------------------------- \n\n\n"

# Project glossary shared by the prompts of clean_summary() and verbale().
# It is runtime prompt text: keep the wording untouched unless the prompt
# itself is being deliberately revised.
_PDC_CONTEXT_INFO = """\
1 - The context of the conversation: is a meeting for the development the "PDC Project" where PDC stands for "Piattaforma di Consulenza".
2 - The PdC is a whealt management platform for the Fideuram Bank used by it's 6000 Private Banker.
3 - Related to the PDC there is te PO (Piattaforma Operativa) which is an appliation which manage the orders generated by the PDC.
4 - The platform is written through a framework called DARWIN which as front-end as e library called XDCE and as front a library called BEAR one based in angular the other in java spring boot)
5 - WEADD è un'altra piattaforma di consulenza di Intesa Private che dovrà convergere all'interno della PDC
6 - FFIT o FF (Forward Fit) è un progetto in corso di allineamento di tutte le anagrafiche clienti e portafogli del mondo Intesa (ISPB, Fideuram etc.)
7 - FINV Sistema di invio ordini di Intesa
8 - CJ (customer journey) Moduli software includibili all'interno della PDC
9 - MUP (Motore unico Portafoglio)
10 - AUS (Anagrafica unica Strumenti)
11 - LDD (Layer di Disaccoppiamento). E' sufficiente l'acronimo (non va mai messo per intero)
12 - Convergenza Fondi è un altro progetto di interesse per il mondo Intesa/Fideuram che andrà in parallelo con quello della PDC
13 - Il concetto di "pianificazione" è molto usato in questo contesto (parole simili potrebbero fare riferimento a questo)
14 - PDC is the name of the platform so it's often used in this context (similar words may refer to this)
15 - Pay attention to the word "Pianificazione" it could be the name of a PDC Module or the word "Pianificazione" in italian. When it's the module name put it into brakets.
"""


def split_into_chunks(text, num_chunks=5):
    """Split *text* into at most *num_chunks* word-based chunks of similar size.

    The text is split on whitespace and regrouped into chunks of
    ``ceil(word_count / num_chunks)`` words, so fewer than *num_chunks*
    chunks may be produced (for example 6 words with the default of 5
    yields 3 chunks of 2 words).

    Args:
        text: The text to split.
        num_chunks: Upper bound on the number of chunks.

    Returns:
        A list of chunk strings whose words are joined by single spaces.
        The list is empty when *text* contains no words or when
        *num_chunks* is smaller than 1.
    """
    words = text.split()
    # Guarding num_chunks here also avoids a division by zero below.
    if not words or num_chunks < 1:
        return []
    chunk_size = math.ceil(len(words) / num_chunks)
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]


def _last_part(text, n):
    """Return the final piece of *text* when it is cut into *n* character slices.

    The first ``n - 1`` slices are ``len(text) // n`` characters long and the
    last one also receives the remainder, so a text shorter than *n*
    characters is returned whole.
    """
    part_length = len(text) // n
    return text[(n - 1) * part_length:]


def _run_completion(client, system_prompt, question, **sampling):
    """Send one system + user exchange to the model and return the stripped answer.

    Args:
        client: Object exposing ``create_completion(model=..., messages=...,
            **sampling)`` whose result has a ``choices`` sequence (presumably
            the project's OpenAI client wrapper -- confirm against the caller).
        system_prompt: Content of the system message.
        question: Content of the user message.
        **sampling: Extra keyword arguments forwarded unchanged to
            ``create_completion`` (temperature, top_p, max_tokens, ...).

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or ``""`` when the response has no choices or the message
        carries no text content.
    """
    response = client.create_completion(
        model=_MODEL,
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': question},
        ],
        **sampling,
    )
    if not response.choices:
        return ""
    content = response.choices[0].message.content
    # The content can be None (e.g. a refusal); treat it as "no answer"
    # instead of failing on .strip().
    if content is None:
        return ""
    answer = content.strip()
    # Console trace of every model answer, followed by a separator.
    print(answer)
    print(_OUTPUT_SEPARATOR)
    return answer


def summarize_chunk(client, chunk, previous_summary=None, previous_chunk=None):
    """Update the running meeting notes with the content of one transcript chunk.

    When both a previous summary and a previous chunk are available, the
    prompt also carries the tail (last of ten character slices) of the
    previous chunk plus the summary produced so far, so the model can
    continue a sentence cut at the chunk boundary and extend the notes
    instead of restarting them.

    Args:
        client: Completion client, see ``_run_completion``.
        chunk: The transcript chunk to process.
        previous_summary: Notes produced for the preceding chunks, if any.
        previous_chunk: Text of the preceding chunk, if any.

    Returns:
        The updated notes, or ``""`` when the model returned no usable answer.
    """
    last_part = _last_part(previous_chunk, 10) if previous_chunk else ""

    context = ""
    if previous_summary and last_part:
        context = (
            f"Previous chunk: {last_part}\n"
            f"Elaboration: {previous_summary}\n"
        )

    # NOTE: an earlier variant of this prompt additionally asked the model to
    # be complete and detailed while avoiding unnecessary details, and to make
    # the final result as long and detailed as possible; both instructions
    # were dropped from the prompt below.
    system_prompt = (
        "You will find next a transcript of a Teams Meeting divided into several chunks (you will receive a chunk at a\n"
        "time and the last part of the previous one so that you can continue any interrupetd sentence or thought).\n"
        "[PAY ATTENTION: The text may not be perfect as it was transcribed from video try your best to find the correct meaning]\n"
        f"{context}\n"
        f"Actual chunk: {chunk}\n"
        "[GOAL]\n"
        'Return a text in line with the user request. The text will be called "elaboration" in your context.\n'
        'Please update the "Elaboration" considering the additional information contained in the chunk.\n'
        'If you find no useful information in the chunk, just copy the "Elaboration" without modifications otherwise:\n'
        'add all the new information to the previous "Elaboration" and in case remove information that is no more relevant.\n'
        "The result should be in business italian language.\n"
        "[PAY ATTENTION: due to the bad quality of the audio some naming could be incorrectly interpreted. Try to uniform names based on the whole context]\n"
        "\n"
    )

    # NOTE: an alternative request used previously asked for formal,
    # consulting-style minutes focused on the decisions taken and on the
    # topics to explore in the next meetings.
    question = (
        "Mi servono gli appunti di questa riunione come fossi uno studente che deve studiarla "
        "[dettaglia quante più informazioni possibile]. "
        "Segnala in coda un resoconto degli open-point e/o delle decisioni prese."
    )

    return _run_completion(
        client,
        system_prompt,
        question,
        temperature=0.1,
        top_p=0.5,
        max_tokens=16000,
    )


def clean_summary(client, summary):
    """Ask the model to correct wrong words and acronyms in *summary*.

    The prompt embeds the summary together with the shared project glossary
    so the model can fix terms that the transcription got wrong.

    Args:
        client: Completion client, see ``_run_completion``.
        summary: The automatically generated text to correct.

    Returns:
        The corrected text, or ``""`` when the model returned no usable answer.
    """
    system_prompt = (
        "Following an automated generated text from a video transcript. "
        "Due to audio quality and lack of knowledge about the context, the text may not be complete or accurate.\n"
        "[ORIGINAL TEXT]\n"
        f"{summary}\n"
        "[CONTEXT INFORMATION]\n"
        f"{_PDC_CONTEXT_INFO}"
        "[GOAL]\n"
        "Based on ORIGINAL TEXT and CONTEXT INFORMATION return what asked by the user.\n"
        "\n"
    )
    question = "Ritorna ORIGINAL TEXT corretto secondo il contesto passato in ingresso (correggi parole o acronimi errati)"

    return _run_completion(
        client,
        system_prompt,
        question,
        temperature=0.5,
        max_tokens=16000,
    )


def verbale(client, summary):
    """Turn a detailed meeting report into formal meeting minutes.

    The instructions and the project glossary travel in the system message;
    the report itself is sent as the user message.

    Args:
        client: Completion client, see ``_run_completion``.
        summary: The detailed report of the meeting.

    Returns:
        The minutes, or ``""`` when the model returned no usable answer.
    """
    system_prompt = (
        "L'utente ti passerà il resoconto dettagliato di una riunione.\n"
        "[CONTEXT INFORMATION]\n"
        f"{_PDC_CONTEXT_INFO}"
        "16 - Si può parlare di Stored Procedure (quelle del database)\n"
        "[OBIETTIVO]\n"
        "Basandoti sul resoconto e dal contesto\n"
        "Mi serve che mi crei un verbale della riunione.\n"
        "Mantieni l'ordine di quanto è scritto nel resoconto.\n"
        "Il contesto è quello di una riunione in una banca per parlare dello sviluppo di un software.\n"
        "Il contesto è business e software oriented.\n"
        "Evita l'ordine del giorno.\n"
        "Se trovi una sezione open-point anche questa va verbalizzata in modo riassuntivo "
        "(metti quelli più importanti o raggruppa dove puoi)\n"
        "\n"
    )

    return _run_completion(
        client,
        system_prompt,
        summary,
        temperature=0.5,
        max_tokens=16000,
    )


def summarize_transcription(client, transcript):
    """Summarize a whole transcript by folding its chunks into rolling notes.

    The transcript is split with ``split_into_chunks`` and every chunk is
    passed to ``summarize_chunk`` together with the notes accumulated so far
    and the preceding chunk. Progress is printed to the console.

    Args:
        client: Completion client, see ``_run_completion``.
        transcript: Full text of the transcription.

    Returns:
        The notes after the last chunk; ``None`` when the transcript yields
        no chunks; ``""`` when no chunk produced any answer.
    """
    current_summary = None
    previous_chunk = None

    for index, chunk in enumerate(split_into_chunks(transcript)):
        print(f"**************** Chunk {index} \n\n")
        updated = summarize_chunk(client, chunk, current_summary, previous_chunk)
        # An empty answer must not wipe the notes accumulated so far; it is
        # only stored while there is nothing to lose yet.
        if updated or current_summary is None:
            current_summary = updated
        previous_chunk = chunk

    return current_summary