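"""Recursive summarization of Teams meeting transcripts.

The transcript is split into chunks; a running "elaboration" is rolled through
GPT-4o one chunk at a time (summarize_chunk / summarize_transcription), names
and acronyms are corrected against project context (clean_summary), and a
meeting report is produced (verbale). All model calls go through the
OpenAIClient wrapper's create_completion() method.
"""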
from openai import OpenAI
import assemblyai as aai
from openai_client import OpenAIClient
import math


def split_into_chunks(text, num_chunks=5):
    """Split text into the specified number of chunks of roughly equal size."""
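    # For example (hypothetical numbers): a 1,000-word transcript with the default
    # num_chunks=5 gives chunk_size=200 and five 200-word chunks; with 1,001 words,
    # chunk_size becomes 201 and the final chunk is simply shorter, so no text is lost.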
    words = text.split()
    chunk_size = math.ceil(len(words) / num_chunks)
    chunks = []

    if chunk_size < 1:
        return chunks

    for i in range(0, len(words), chunk_size):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)

    return chunks


def summarize_chunk(client, chunk, previous_summary=None, previous_chunk=None):
    """Summarize a single chunk considering previous context."""

    # Split previous_chunk into n parts (character-based, not word-based).
    def split_into_n_parts(text, n):
        part_length = len(text) // n
        parts = [text[i * part_length:(i + 1) * part_length] for i in range(n - 1)]
        parts.append(text[(n - 1) * part_length:])  # Add the remaining text to the last part
        return parts

    parts = []
    if previous_chunk:
        parts = split_into_n_parts(previous_chunk, 10)

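    # Only the tail (the last tenth) of the previous chunk is carried forward: enough
    # for the model to finish an interrupted sentence without inflating the prompt.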
    last_part = parts[-1] if parts else ""

    context = ""
    if previous_summary and last_part:
        context = f"""Previous chunk: {last_part}
Elaboration: {previous_summary}
"""

    # Earlier version of the system prompt, kept commented out:
    # system_prompt = f"""You will find next a transcript of a Teams Meeting divided into several chunks (you will receive a chunk at a
    # time and the last part of the previous one so that you can continue any interrupted sentence or thought).
    # [PAY ATTENTION: The text may not be perfect as it was transcribed from video]

    # {context}

    # Actual chunk: {chunk}

    # [GOAL]
    # Return a text in line with the user request. The text will be called "elaboration" in your context.

    # Please update the "Elaboration" considering the additional information contained in the chunk.
    # If you find no useful information in the chunk, just copy the "Elaboration" without modifications; otherwise
    # add all the new information to the previous "Elaboration" and, where needed, remove information that is no longer relevant.

    # Be complete and detailed in your response but avoid unnecessary details.

    # The final result should be as long and detailed as possible.
    # The result should be in business Italian.

    # [PAY ATTENTION: due to the bad quality of the audio some names could be incorrectly interpreted. Try to make names uniform based on the whole context]

    # \n"""

    system_prompt = f"""You will find next a transcript of a Teams Meeting divided into several chunks (you will receive a chunk at a
time and the last part of the previous one so that you can continue any interrupted sentence or thought).
[PAY ATTENTION: The text may not be perfect as it was transcribed from video; try your best to find the correct meaning]

{context}

Actual chunk: {chunk}

[GOAL]
Return a text in line with the user request. The text will be called "elaboration" in your context.

Please update the "Elaboration" considering the additional information contained in the chunk.
If you find no useful information in the chunk, just copy the "Elaboration" without modifications; otherwise
add all the new information to the previous "Elaboration" and, where needed, remove information that is no longer relevant.

The result should be in business Italian.

[PAY ATTENTION: due to the bad quality of the audio some names could be incorrectly interpreted. Try to make names uniform based on the whole context]

\n"""
question = "Mi servono gli appunti di questa riunione come fossi uno studente che deve studiarla [dettaglia quante più informazioni possibile]. Segnala in coda un resoconto degli open-point e/o delle decisioni prese."
|
|
#question = "Mi serve il verbale di questa riunione (in formato società di consulenza in stile Deloitte). Di particolare importanza sono le decisioni prese e le cose da approfondire nei prossimi incontri."
|
|
|
|
|
|
|
|
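    # Low temperature and top_p keep the chunk summaries focused and repeatable.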
    response = client.create_completion(
        model='gpt-4o',
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': question}
        ],
        temperature=0.1,
        top_p=0.5,
        max_tokens=16000,
    )

    if response.choices:
        tmp = response.choices[0].message.content.strip()
        print(tmp)
        print("--------------------------------------------- \n\n\n")
        return tmp
    return ""


def clean_summary(client, summary):
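    """Correct names and acronyms in a generated summary using project context."""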
system_prompt = f"""Following an automated generated text from a video transcript. Due to audio quality and
|
|
lack of knowledge about the context, the text may not be complete or accurate.
|
|
|
|
[ORIGINAL TEXT]
|
|
{summary}
|
|
|
|
[CONTEXT INFORMATION]
|
|
1 - The context of the conversation: is a meeting for the development the "PDC Project" where PDC stands for "Piattaforma di Consulenza".
|
|
2 - The PdC is a whealt management platform for the Fideuram Bank used by it's 6000 Private Banker.
|
|
3 - Related to the PDC there is te PO (Piattaforma Operativa) which is an appliation which manage the orders generated by the PDC.
|
|
4 - The platform is written through a framework called DARWIN which as front-end as e library called XDCE and as front a library called BEAR
|
|
one based in angular the other in java spring boot)
|
|
5 - WEADD è un'altra piattaforma di consulenza di Intesa Private che dovrà convergere all'interno della PDC
|
|
6 - FFIT o FF (Forward Fit) è un progetto in corso di allineamento di tutte le anagrafiche clienti e portafogli del mondo Intesa (ISPB, Fideuram etc.)
|
|
7 - FINV Sistema di invio ordini di Intesa
|
|
8 - CJ (customer journey) Moduli software includibili all'interno della PDC
|
|
9 - MUP (Motore unico Portafoglio)
|
|
10 - AUS (Anagrafica unica Strumenti)
|
|
11 - LDD (Layer di Disaccoppiamento). E' sufficiente l'acronimo (non va mai messo per intero)
|
|
12 - Convergenza Fondi è un altro progetto di interesse per il mondo Intesa/Fideuram che andrà in parallelo con quello della PDC
|
|
13 - Il concetto di "pianificazione" è molto usato in questo contesto (parole simili potrebbero fare riferimento a questo)
|
|
14 - PDC is the name of the platform so it's often used in this context (similar words may refer to this)
|
|
15 - Pay attention to the word "Pianificazione" it could be the name of a PDC Module or the word "Pianificazione" in italian. When it's the module name put it into brakets.
|
|
|
|
[GOAL]
|
|
Based on ORIGINAL TEXT and CONTEXT INFORMATION return what asked by the user.
|
|
|
|
\n"""
|
|
question = "Ritorna ORIGINAL TEXT corretto secondo il contesto passato in ingresso (correggi parole o acronimi errati)"
|
|
|
|
    response = client.create_completion(
        model='gpt-4o',
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': question}
        ],
        temperature=0.5,
        max_tokens=16000,
    )

    if response.choices:
        tmp = response.choices[0].message.content.strip()
        print(tmp)
        print("--------------------------------------------- \n\n\n")
        return tmp
    return ""


def verbale(client, summary):
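    """Turn a detailed meeting summary into a formal meeting report (verbale)."""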
system_prompt = f"""L'utente ti passerà il resoconto dettagliato di una riunione.
|
|
|
|
[CONTEXT INFORMATION]
|
|
1 - The context of the conversation: is a meeting for the development the "PDC Project" where PDC stands for "Piattaforma di Consulenza".
|
|
2 - The PdC is a whealt management platform for the Fideuram Bank used by it's 6000 Private Banker.
|
|
3 - Related to the PDC there is te PO (Piattaforma Operativa) which is an appliation which manage the orders generated by the PDC.
|
|
4 - The platform is written through a framework called DARWIN which as front-end as e library called XDCE and as front a library called BEAR
|
|
one based in angular the other in java spring boot)
|
|
5 - WEADD è un'altra piattaforma di consulenza di Intesa Private che dovrà convergere all'interno della PDC
|
|
6 - FFIT o FF (Forward Fit) è un progetto in corso di allineamento di tutte le anagrafiche clienti e portafogli del mondo Intesa (ISPB, Fideuram etc.)
|
|
7 - FINV Sistema di invio ordini di Intesa
|
|
8 - CJ (customer journey) Moduli software includibili all'interno della PDC
|
|
9 - MUP (Motore unico Portafoglio)
|
|
10 - AUS (Anagrafica unica Strumenti)
|
|
11 - LDD (Layer di Disaccoppiamento). E' sufficiente l'acronimo (non va mai messo per intero)
|
|
12 - Convergenza Fondi è un altro progetto di interesse per il mondo Intesa/Fideuram che andrà in parallelo con quello della PDC
|
|
13 - Il concetto di "pianificazione" è molto usato in questo contesto (parole simili potrebbero fare riferimento a questo)
|
|
14 - PDC is the name of the platform so it's often used in this context (similar words may refer to this)
|
|
15 - Pay attention to the word "Pianificazione" it could be the name of a PDC Module or the word "Pianificazione" in italian. When it's the module name put it into brakets.
|
|
16 - Si può parlare di Stored Procedure (quelle del database)
|
|
|
|
[OBIETTIVO]
|
|
Basandoti sul resoconto e dal contesto Mi serve che mi crei un verbale della riunione.
|
|
Mantieni l'ordine di quanto è scritto nel resoconto.
|
|
Il contesto è quello di una riunione in una banca per parlare dello sviluppo di un software.
|
|
Il contesto è business e software oriented.
|
|
Evita l'ordine del giorno.
|
|
Se trovi una sezione open-point anche questa va verbalizzata in modo riassuntivo (metti quelli più importanti o raggruppa dove puoi)
|
|
\n"""
|
|
question = summary
|
|
|
|
    response = client.create_completion(
        model='gpt-4o',
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': question}
        ],
        temperature=0.5,
        max_tokens=16000,
    )

    if response.choices:
        tmp = response.choices[0].message.content.strip()
        print(tmp)
        print("--------------------------------------------- \n\n\n")
        return tmp
    return ""


def summarize_transcription(client, transcript):
    """Use GPT-4o to summarize the transcription chunk by chunk with a rolling summary."""

    # Split transcript into chunks
    chunks = split_into_chunks(transcript)

    # Initialize variables for the rolling summarization
    current_summary = None
    previous_chunk = None

    # Process each chunk, feeding the running summary and the previous chunk back in
    for i, chunk in enumerate(chunks):
        print("**************** " + f"Chunk {i} \n\n")
        current_summary = summarize_chunk(client, chunk, current_summary, previous_chunk)
        previous_chunk = chunk

    return current_summary
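

# Typical end-to-end flow (hypothetical sketch, not executed here; the OpenAIClient
# constructor arguments and the transcript source are assumptions):
#   client = OpenAIClient()
#   notes = summarize_transcription(client, transcript_text)
#   cleaned = clean_summary(client, notes)
#   minutes = verbale(client, cleaned)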