
import os
import tempfile
from moviepy import VideoFileClip
from openai import OpenAI
import math
from pydub import AudioSegment
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import pypandoc
from dotenv import load_dotenv
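
# Expected .env contents (illustrative placeholder; supply your own key):
# OPENAI_API_KEY=sk-...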

# Load environment variables
load_dotenv()

# Configuration
MAX_FILE_SIZE = 26214400  # 25 MB in bytes (Whisper API upload limit)
CHUNK_SIZE = MAX_FILE_SIZE // 2  # Split into ~12.5 MB chunks
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'outputs')
os.makedirs(OUTPUT_DIR, exist_ok=True)


def setup_api_keys():
    """Setup API keys and configurations."""
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        raise ValueError("OpenAI API key not found in environment variables")
    client = OpenAI(api_key=api_key)
    return client


def extract_audio(video_path):
    """Extract audio from video file and return path to audio file."""
    print("Extracting audio from video file...")
    temp_audio_path = os.path.join(OUTPUT_DIR, "temp_audio.mp3")
    video = VideoFileClip(video_path)
    video.audio.write_audiofile(temp_audio_path)
    video.close()
    return temp_audio_path


def split_audio_by_duration(file_path, duration_seconds=600):
    """Split an audio file into chunks of a specified duration (default 10 minutes)."""
    chunks_dir = os.path.join(OUTPUT_DIR, 'chunks')
    os.makedirs(chunks_dir, exist_ok=True)
    try:
        audio = AudioSegment.from_file(file_path)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return []
    chunks = []
    chunk_length_ms = duration_seconds * 1000  # Convert seconds to milliseconds
    for i, chunk_start in enumerate(range(0, len(audio), chunk_length_ms)):
        chunk_end = chunk_start + chunk_length_ms
        chunk = audio[chunk_start:chunk_end]
        chunk_filename = os.path.join(chunks_dir, f"chunk_{i}.mp3")
        try:
            chunk.export(chunk_filename, format="mp3")
            chunks.append(chunk_filename)
            print(f"Created chunk: {chunk_filename}")
        except Exception as e:
            print(f"Error exporting chunk: {e}")
    return chunks
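
# Note: 10-minute MP3 chunks typically stay well under the 25 MB Whisper upload
# limit, so each exported chunk can be transcribed in a single API call.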


def transcribe_audio(client, audio_chunks):
    """Transcribe audio chunks using OpenAI's Whisper API."""
    transcripts = []
    for chunk in audio_chunks:
        print(f"Transcribing chunk: {chunk}")
        # Use a context manager so each file handle is closed after its request
        with open(chunk, "rb") as audio_file:
            try:
                transcript = client.audio.transcriptions.create(
                    file=audio_file,
                    model="whisper-1",
                    response_format="text",
                    language="it",
                    # Italian prompt: "the audio is a segment of a recorded meeting;
                    # it may be the continuation of a previous audio segment"
                    prompt="L'audio è il pezzo di una riunione registrata; potrebbe essere la continuazione di un precedente pezzo di audio"
                )
                transcripts.append(transcript)
            except Exception as e:
                print(f"Error in transcription: {e}")
    return ' '.join(transcripts)


def summarize_transcription(client, transcript):
    """Use GPT-4o to summarize the transcription."""
    system_message = """You will find next a transcript of a Teams Meeting.
    [PAY ATTENTION: The text may not be perfect as it was transcribed from video]
    Your task is to create detailed notes from this meeting transcript,
    including as much information as possible. At the end, include a summary
    of open points and decisions made during the meeting.
    The result should be in business Italian language.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": transcript}
        ],
        temperature=0.1,
        max_tokens=16000
    )
    return response.choices[0].message.content


def clean_summary(client, summary):
    """Clean and format the summary using GPT."""
    system_message = """Following is an automated generated text from a video transcript.
    Due to audio quality and lack of knowledge about the context, the text may not be complete or accurate.
    [ORIGINAL TEXT]
    {0}
    Please clean this text, fixing any inconsistencies, formatting issues, or errors in transcription.
    The result should be well-structured and in business Italian language.
    """.format(summary)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": "Please clean and format this text."}
        ],
        temperature=0.5,
        max_tokens=16000
    )
    return response.choices[0].message.content


def generate_verbale(client, summary_clean):
    """Generate a formal meeting report (verbale) from the cleaned summary."""
    system_message = """You are tasked with creating a formal meeting report (verbale) from the following meeting summary.
    [SUMMARY]
    {0}
    Please structure the report in a professional format typical of consulting firms (like Deloitte).
    Focus particularly on decisions made and action items for future meetings.
    The report should be in business Italian language.
    """.format(summary_clean)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": "Generate a formal meeting report (verbale)."}
        ],
        temperature=0.5,
        max_tokens=16000
    )
    return response.choices[0].message.content


def convert_markdown_to_word(markdown_text, output_file):
    """Convert markdown text to MS Word document format using pypandoc."""
    try:
        # Write markdown to a temporary file
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.md', encoding='utf-8') as temp_file:
            temp_file.write(markdown_text)
            temp_md_file = temp_file.name
        # Ensure output directory exists
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        # Convert to docx using pypandoc (requires a pandoc installation on the system)
        pypandoc.convert_file(temp_md_file, 'docx', outputfile=output_file)
        # Clean up temp file
        os.unlink(temp_md_file)
        print(f"Successfully converted to Word: {output_file}")
        return True
    except Exception as e:
        print(f"Error converting to Word: {e}")
        return False


def process_video(file_path):
    """Process a video file and generate transcription, summaries, and documents."""
    base_filename = os.path.splitext(os.path.basename(file_path))[0]
    client = setup_api_keys()

    # Output file paths
    transcription_path = os.path.join(OUTPUT_DIR, f"{base_filename}_transcription.txt")
    summary_path = os.path.join(OUTPUT_DIR, f"{base_filename}_summary.md")
    summary_clean_path = os.path.join(OUTPUT_DIR, f"{base_filename}_summary_clean.md")
    verbale_path = os.path.join(OUTPUT_DIR, f"{base_filename}_verbale.md")
    docx_path = os.path.join(OUTPUT_DIR, f"{base_filename}_summary.docx")
    docx_verbale_path = os.path.join(OUTPUT_DIR, f"{base_filename}_verbale.docx")

    try:
        # Step 1: Extract audio from video
        audio_path = extract_audio(file_path)

        # Step 2: Split audio into chunks
        audio_chunks = split_audio_by_duration(audio_path)

        # Step 3: Transcribe audio
        transcription = transcribe_audio(client, audio_chunks)
        with open(transcription_path, "w", encoding='utf-8') as f:
            f.write(transcription)
        print(f"Saved transcription to: {transcription_path}")

        # Step 4: Generate summary
        summary = summarize_transcription(client, transcription)
        with open(summary_path, "w", encoding='utf-8') as f:
            f.write(summary)
        print(f"Saved summary to: {summary_path}")

        # Step 5: Clean and format summary
        summary_clean = clean_summary(client, summary)
        with open(summary_clean_path, "w", encoding='utf-8') as f:
            f.write(summary_clean)
        print(f"Saved cleaned summary to: {summary_clean_path}")

        # Step 6: Generate formal report (verbale)
        verbale = generate_verbale(client, summary_clean)
        with open(verbale_path, "w", encoding='utf-8') as f:
            f.write(verbale)
        print(f"Saved verbale to: {verbale_path}")

        # Step 7: Convert markdown to Word documents
        convert_markdown_to_word(summary_clean, docx_path)
        convert_markdown_to_word(verbale, docx_verbale_path)

        # Step 8: Clean up temporary files
        for chunk in audio_chunks:
            if os.path.exists(chunk):
                os.remove(chunk)
        if os.path.exists(audio_path):
            os.remove(audio_path)

        # Return paths to all generated files
        return {
            'transcription_path': transcription_path,
            'summary_path': summary_path,
            'summary_clean_path': summary_clean_path,
            'verbale_path': verbale_path,
            'docx_path': docx_path,
            'docx_verbale_path': docx_verbale_path
        }
    except Exception as e:
        print(f"Error processing video: {e}")
        raise
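

# Illustrative usage (the full script may define its own entry point elsewhere;
# "meeting.mp4" below is a placeholder for a real recording):
#   if __name__ == "__main__":
#       output_files = process_video("meeting.mp4")
#       for name, path in output_files.items():
#           print(f"{name}: {path}")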