"""Transcribe a meeting video with Whisper, summarize it with GPT-4o, and
export the results as Markdown and Word documents.

Pipeline: extract audio -> split into duration-based chunks -> transcribe
(Italian) -> summarize -> clean summary -> formal report ("verbale") ->
convert both Markdown outputs to .docx via pandoc.
"""

import os
import tempfile

from moviepy import VideoFileClip
from openai import OpenAI
import math  # NOTE(review): appears unused in this file — confirm before removing
from pydub import AudioSegment
from docx import Document  # NOTE(review): python-docx imports appear unused — confirm before removing
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import pypandoc
from dotenv import load_dotenv

# Load environment variables (e.g. OPENAI_API_KEY) from a .env file, if present.
load_dotenv()

# Configuration
MAX_FILE_SIZE = 26214400  # 25 MiB in bytes (Whisper API upload limit)
CHUNK_SIZE = MAX_FILE_SIZE // 2  # ~12.5 MiB; currently unused — splitting is done by duration below
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'outputs')
os.makedirs(OUTPUT_DIR, exist_ok=True)


def setup_api_keys():
    """Create and return an OpenAI client configured from the environment.

    Returns:
        OpenAI: a client authenticated with OPENAI_API_KEY.

    Raises:
        ValueError: if OPENAI_API_KEY is not set.
    """
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        raise ValueError("OpenAI API key not found in environment variables")
    client = OpenAI(api_key=api_key)
    return client


def extract_audio(video_path):
    """Extract the audio track from *video_path* into an MP3 file.

    Returns:
        str: path of the extracted MP3 inside OUTPUT_DIR.
    """
    print("Extracting audio from video file...")
    temp_audio_path = os.path.join(OUTPUT_DIR, "temp_audio.mp3")
    video = VideoFileClip(video_path)
    try:
        video.audio.write_audiofile(temp_audio_path)
    finally:
        # FIX: always release the clip's underlying file handles, even when
        # the audio export raises (the original only closed on success).
        video.close()
    return temp_audio_path


def split_audio_by_duration(file_path, duration_seconds=600):
    """Split an audio file into chunks of a specified duration (default 10 minutes).

    Chunks are exported as MP3 files into OUTPUT_DIR/chunks.

    Returns:
        list[str]: paths of the exported chunks; empty if the source file
        cannot be loaded (the error is printed, not raised).
    """
    chunks_dir = os.path.join(OUTPUT_DIR, 'chunks')
    os.makedirs(chunks_dir, exist_ok=True)

    try:
        audio = AudioSegment.from_file(file_path)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return []

    chunks = []
    chunk_length_ms = duration_seconds * 1000  # pydub slices in milliseconds
    for i, chunk_start in enumerate(range(0, len(audio), chunk_length_ms)):
        # Slicing past the end is safe: pydub clamps to the audio length.
        chunk = audio[chunk_start:chunk_start + chunk_length_ms]
        chunk_filename = os.path.join(chunks_dir, f"chunk_{i}.mp3")
        try:
            chunk.export(chunk_filename, format="mp3")
            chunks.append(chunk_filename)
            print(f"Created chunk: {chunk_filename}")
        except Exception as e:
            print(f"Error exporting chunk: {e}")
    return chunks


def transcribe_audio(client, audio_chunks):
    """Transcribe audio chunks with Whisper and join the texts with spaces.

    Chunks that fail to transcribe are skipped (the error is printed).

    Returns:
        str: the concatenated transcript.
    """
    transcripts = []
    for chunk in audio_chunks:
        print(f"Transcribing chunk: {chunk}")
        try:
            # FIX: use a context manager so the file handle is always closed
            # (the original opened each chunk and never closed it).
            with open(chunk, "rb") as audio_file:
                transcript = client.audio.transcriptions.create(
                    file=audio_file,
                    model="whisper-1",
                    response_format="text",
                    language="it",
                    # FIX: repaired mojibake ("รจ" -> "è") in the Italian prompt.
                    prompt="L'audio è il pezzo di una riunione registrata; potrebbe essere la continuazione di un precedente pezzo di audio"
                )
            transcripts.append(transcript)
        except Exception as e:
            print(f"Error in transcription: {e}")
    return ' '.join(transcripts)


def summarize_transcription(client, transcript):
    """Use GPT-4o to produce detailed Italian meeting notes from *transcript*."""
    system_message = """You will find next a transcript of a Teams Meeting. [PAY ATTENTION: The text may not be perfect as it was transcribed from video] Your task is to create detailed notes from this meeting transcript, including as much information as possible. At the end, include a summary of open points and decisions made during the meeting. The result should be in business Italian language. """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": transcript}
        ],
        temperature=0.1,  # low temperature: stay close to the source transcript
        max_tokens=16000
    )
    return response.choices[0].message.content


def clean_summary(client, summary):
    """Ask GPT-4o to clean and reformat *summary* (business Italian).

    The summary is embedded into the system prompt; the user turn only
    carries the instruction.
    """
    system_message = """Following is an automated generated text from a video transcript. Due to audio quality and lack of knowledge about the context, the text may not be complete or accurate. [ORIGINAL TEXT] {0} Please clean this text, fixing any inconsistencies, formatting issues, or errors in transcription. The result should be well-structured and in business Italian language. """.format(summary)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": "Please clean and format this text."}
        ],
        temperature=0.5,
        max_tokens=16000
    )
    return response.choices[0].message.content


def generate_verbale(client, summary_clean):
    """Generate a formal Italian meeting report ("verbale") from the cleaned summary."""
    system_message = """You are tasked with creating a formal meeting report (verbale) from the following meeting summary. [SUMMARY] {0} Please structure the report in a professional format typical of consulting firms (like Deloitte). Focus particularly on decisions made and action items for future meetings. The report should be in business Italian language. """.format(summary_clean)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": "Generate a formal meeting report (verbale)."}
        ],
        temperature=0.5,
        max_tokens=16000
    )
    return response.choices[0].message.content


def convert_markdown_to_word(markdown_text, output_file):
    """Convert *markdown_text* to a .docx at *output_file* using pypandoc.

    Returns:
        bool: True on success, False on failure (errors are printed, not raised).
    """
    temp_md_file = None
    try:
        # Pandoc needs a file on disk, so stage the markdown in a temp file.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.md', encoding='utf-8') as temp_file:
            temp_file.write(markdown_text)
            temp_md_file = temp_file.name

        # FIX: guard against a bare filename — os.makedirs("") raises.
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        pypandoc.convert_file(temp_md_file, 'docx', outputfile=output_file)
        print(f"Successfully converted to Word: {output_file}")
        return True
    except Exception as e:
        print(f"Error converting to Word: {e}")
        return False
    finally:
        # FIX: remove the temp .md even when conversion fails
        # (the original only unlinked it on the success path).
        if temp_md_file and os.path.exists(temp_md_file):
            os.unlink(temp_md_file)


def process_video(file_path):
    """Run the full pipeline on one video file.

    Steps: extract audio, split, transcribe, summarize, clean, generate the
    verbale, convert both Markdown documents to Word, and clean up temp files.

    Returns:
        dict: paths to every generated artifact.

    Raises:
        Exception: re-raises any pipeline failure after printing it.
    """
    base_filename = os.path.splitext(os.path.basename(file_path))[0]
    client = setup_api_keys()

    # Output file paths
    transcription_path = os.path.join(OUTPUT_DIR, f"{base_filename}_transcription.txt")
    summary_path = os.path.join(OUTPUT_DIR, f"{base_filename}_summary.md")
    summary_clean_path = os.path.join(OUTPUT_DIR, f"{base_filename}_summary_clean.md")
    verbale_path = os.path.join(OUTPUT_DIR, f"{base_filename}_verbale.md")
    docx_path = os.path.join(OUTPUT_DIR, f"{base_filename}_summary.docx")
    docx_verbale_path = os.path.join(OUTPUT_DIR, f"{base_filename}_verbale.docx")

    audio_path = None
    audio_chunks = []
    try:
        # Step 1: Extract audio from video
        audio_path = extract_audio(file_path)

        # Step 2: Split audio into chunks
        audio_chunks = split_audio_by_duration(audio_path)

        # Step 3: Transcribe audio
        transcription = transcribe_audio(client, audio_chunks)
        with open(transcription_path, "w", encoding='utf-8') as f:
            f.write(transcription)
        print(f"Saved transcription to: {transcription_path}")

        # Step 4: Generate summary
        summary = summarize_transcription(client, transcription)
        with open(summary_path, "w", encoding='utf-8') as f:
            f.write(summary)
        print(f"Saved summary to: {summary_path}")

        # Step 5: Clean and format summary
        summary_clean = clean_summary(client, summary)
        with open(summary_clean_path, "w", encoding='utf-8') as f:
            f.write(summary_clean)
        print(f"Saved cleaned summary to: {summary_clean_path}")

        # Step 6: Generate formal report (verbale)
        verbale = generate_verbale(client, summary_clean)
        with open(verbale_path, "w", encoding='utf-8') as f:
            f.write(verbale)
        print(f"Saved verbale to: {verbale_path}")

        # Step 7: Convert markdown to Word documents
        convert_markdown_to_word(summary_clean, docx_path)
        convert_markdown_to_word(verbale, docx_verbale_path)

        # Return paths to all generated files
        return {
            'transcription_path': transcription_path,
            'summary_path': summary_path,
            'summary_clean_path': summary_clean_path,
            'verbale_path': verbale_path,
            'docx_path': docx_path,
            'docx_verbale_path': docx_verbale_path
        }
    except Exception as e:
        print(f"Error processing video: {e}")
        raise
    finally:
        # FIX: clean up temporary audio artifacts even when a step fails
        # (the original only removed them on the success path).
        for chunk in audio_chunks:
            if os.path.exists(chunk):
                os.remove(chunk)
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)