Versione con array di folder.

This commit is contained in:
c.rosati 2025-03-21 11:05:30 +01:00
parent 3c4904521f
commit 014b45bbe2
19 changed files with 14448 additions and 135 deletions

3
.gitignore vendored
View File

@ -5,3 +5,6 @@
venv
__pycache__
videos/*.*
elaborati
node_modules

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -387,32 +387,16 @@ def processo_verbale():
convert_markdown_to_word(verbaleGen, docx_full_verbale_file_name)
if __name__ == "__main__":
    # Meeting folders whose .mp4 recordings should be processed in batch.
    folders = [
        "C:\\Users\\rosat\\ARMUNDIA GROUP SRL\\Trasformazione PdC - Documents\\03 - Analisi e Disegno\\meetings\\18 mar 2025 TAVOLO 1 - CONSOLIDAMENTO MACRO ANALISI DI SESSIONE UTENTE E PIANIFICAZIONE CON IMPATTI CONV. FONDI",
        "C:\\Users\\rosat\\ARMUNDIA GROUP SRL\\Trasformazione PdC - Documents\\03 - Analisi e Disegno\\meetings\\20 Mar 2025 PDC Armundia DB Schemi Cat Prodotti e Consulenza Unica + Mappatura - Configurazione dei Prodotti e Relativi Servizi",
        "C:\\Users\\rosat\\ARMUNDIA GROUP SRL\\Trasformazione PdC - Documents\\03 - Analisi e Disegno\\meetings\\20 Mar 2025 PDC Armundia Accesso Sorgenti Batch + Giro NEO4J + Microservizi",
        "C:\\Users\\rosat\\ARMUNDIA GROUP SRL\\Trasformazione PdC - Documents\\03 - Analisi e Disegno\\meetings\\20 Mar 2025 FFIT Replatforming PDC PO Analisi Funzionale"
    ]
    # One-off standalone recording processed before the batch.
    main("E:\\obs\\PdC Review Piano Parte1 - 2025-03-05 09-11-44.mp4")
    # Process every .mp4 found in each configured folder.
    for folder in folders:
        for file in os.listdir(folder):
            if file.endswith(".mp4"):
                main(os.path.join(folder, file))

View File

@ -0,0 +1,3 @@
registry=https://registry.npmjs.org/
//nexus.armundia.com/repository/:_authToken=NpmToken.514b83be-94d3-3327-95aa-2f723888b405
strict-ssl=false

13881
react-flask-app/client/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,46 @@
{
"name": "video-ai-client",
"version": "0.1.0",
"private": true,
"dependencies": {
"@emotion/react": "^11.11.1",
"@emotion/styled": "^11.11.0",
"@mui/icons-material": "^5.14.15",
"@mui/material": "^5.14.15",
"@mui/x-data-grid": "^6.16.3",
"@testing-library/jest-dom": "^5.17.0",
"@testing-library/react": "^13.4.0",
"@testing-library/user-event": "^13.5.0",
"axios": "^1.5.1",
"react": "^18.2.0",
"react-dom": "^18.2.0",
"react-router-dom": "^6.17.0",
"react-scripts": "5.0.1",
"web-vitals": "^2.1.4"
},
"scripts": {
"start": "react-scripts start",
"build": "react-scripts build",
"test": "react-scripts test",
"eject": "react-scripts eject"
},
"eslintConfig": {
"extends": [
"react-app",
"react-app/jest"
]
},
"browserslist": {
"production": [
">0.2%",
"not dead",
"not op_mini all"
],
"development": [
"last 1 chrome version",
"last 1 firefox version",
"last 1 safari version"
]
},
"proxy": "http://localhost:5000"
}

View File

@ -0,0 +1,24 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<link rel="icon" href="%PUBLIC_URL%/favicon.ico" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="theme-color" content="#000000" />
<meta
name="description"
content="Video AI Processing Application"
/>
<link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" />
<link rel="manifest" href="%PUBLIC_URL%/manifest.json" />
<link
rel="stylesheet"
href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&display=swap"
/>
<title>Video AI Processor</title>
</head>
<body>
<noscript>You need to enable JavaScript to run this app.</noscript>
<div id="root"></div>
</body>
</html>

View File

@ -0,0 +1,25 @@
{
"short_name": "Video AI",
"name": "Video AI Processing Application",
"icons": [
{
"src": "favicon.ico",
"sizes": "64x64 32x32 24x24 16x16",
"type": "image/x-icon"
},
{
"src": "logo192.png",
"type": "image/png",
"sizes": "192x192"
},
{
"src": "logo512.png",
"type": "image/png",
"sizes": "512x512"
}
],
"start_url": ".",
"display": "standalone",
"theme_color": "#000000",
"background_color": "#ffffff"
}

View File

@ -0,0 +1,2 @@
OPENAI_API_KEY=your_openai_api_key_here
MONGO_URI=mongodb://localhost:27017/video_ai

View File

@ -0,0 +1,192 @@
import json
import os
import tempfile
import uuid
from datetime import datetime

from bson.errors import InvalidId
from bson.objectid import ObjectId
from dotenv import load_dotenv
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
from pymongo import MongoClient
from werkzeug.utils import secure_filename

from processing import process_video
# Load environment variables from a local .env file (OPENAI_API_KEY, MONGO_URI).
load_dotenv()
app = Flask(__name__)
CORS(app)  # enable cross-origin requests so the separate client app can call the API
# MongoDB configuration.
# SECURITY(review): the fallback URI embeds real-looking Atlas credentials in
# source control — remove the hardcoded default and rotate that password.
MONGO_URI = os.getenv('MONGO_URI', 'mongodb+srv://human:kY5ORJCzW0unboME@cluster0.8cnqn.mongodb.net/VIDEO-AI')
client = MongoClient(MONGO_URI)
db = client.get_database()  # database name comes from the URI path
documents_collection = db.documents
# Folders for uploaded videos and generated documents, created next to this file.
UPLOAD_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'uploads')
DOCUMENT_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'documents')
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(DOCUMENT_FOLDER, exist_ok=True)
# File extensions accepted for upload.
ALLOWED_EXTENSIONS = {'mp4'}

def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Accept an .mp4 upload, store it on disk and register it in MongoDB.

    Returns 201 with the new document id, or 400 when the request has no
    file part, an empty filename, or a disallowed extension.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    if not allowed_file(file.filename):
        return jsonify({'error': 'File type not allowed'}), 400
    # Sanitize the client-supplied name, then prefix a UUID so two uploads
    # of the same file never collide on disk.
    filename = secure_filename(file.filename)
    # FIX: the stored name must carry the sanitized original name (and its
    # extension); the previous literal suffix discarded both.
    unique_filename = f"{uuid.uuid4()}_{filename}"
    file_path = os.path.join(UPLOAD_FOLDER, unique_filename)
    file.save(file_path)
    # Create the tracking record; 'status' drives the processing workflow
    # (uploaded -> processing -> completed/failed).
    document_id = documents_collection.insert_one({
        'original_filename': filename,
        'stored_filename': unique_filename,
        'upload_date': datetime.now(),
        'status': 'uploaded',
        'file_path': file_path
    }).inserted_id
    # Return the document ID so the client can request processing.
    return jsonify({
        'message': 'File uploaded successfully',
        'document_id': str(document_id)
    }), 201
@app.route('/api/process/<document_id>', methods=['POST'])
def process_document(document_id):
    """Run the video pipeline for a previously uploaded document.

    Status transitions: uploaded -> processing -> completed | failed.
    Returns 200 on success, 400 for a malformed id, 404 when the document
    does not exist, 500 when the pipeline raises.
    """
    # FIX: a malformed id string makes ObjectId() raise InvalidId, which
    # previously surfaced as an unhandled 500. Answer 400 instead.
    try:
        oid = ObjectId(document_id)
    except (InvalidId, TypeError):
        return jsonify({'error': 'Invalid document id'}), 400
    document = documents_collection.find_one({'_id': oid})
    if not document:
        return jsonify({'error': 'Document not found'}), 404
    # Mark as in-progress so clients polling the list see the state change.
    documents_collection.update_one(
        {'_id': oid},
        {'$set': {'status': 'processing'}}
    )
    try:
        # Run the full pipeline (transcription, summaries, docx exports).
        result = process_video(document['file_path'])
        # Record all generated artifact paths on the document.
        documents_collection.update_one(
            {'_id': oid},
            {'$set': {
                'status': 'completed',
                'transcription_path': result['transcription_path'],
                'summary_path': result['summary_path'],
                'summary_clean_path': result['summary_clean_path'],
                'verbale_path': result['verbale_path'],
                'docx_path': result['docx_path'],
                'docx_verbale_path': result['docx_verbale_path'],
                'completion_date': datetime.now()
            }}
        )
        return jsonify({
            'message': 'Document processed successfully',
            'document_id': document_id
        }), 200
    except Exception as e:
        # Persist the failure reason so the client can inspect it later.
        documents_collection.update_one(
            {'_id': oid},
            {'$set': {
                'status': 'failed',
                'error': str(e)
            }}
        )
        return jsonify({
            'error': f'Processing failed: {str(e)}'
        }), 500
@app.route('/api/documents', methods=['GET'])
def get_documents():
    """Return a JSON list of all documents (summary fields only)."""
    projection = {
        'original_filename': 1,
        'upload_date': 1,
        'status': 1,
        'completion_date': 1
    }
    documents = []
    for doc in documents_collection.find({}, projection):
        # ObjectId and datetime values are not JSON-serializable as-is.
        doc['_id'] = str(doc['_id'])
        for key in ('upload_date', 'completion_date'):
            if key in doc:
                doc[key] = doc[key].isoformat()
        documents.append(doc)
    return jsonify(documents), 200
@app.route('/api/documents/<document_id>', methods=['GET'])
def get_document(document_id):
    """Return the full record for one document, made JSON-serializable."""
    record = documents_collection.find_one({'_id': ObjectId(document_id)})
    if not record:
        return jsonify({'error': 'Document not found'}), 404
    # ObjectId and datetime values are not JSON-serializable as-is.
    record['_id'] = str(record['_id'])
    for key in ('upload_date', 'completion_date'):
        if key in record:
            record[key] = record[key].isoformat()
    return jsonify(record), 200
@app.route('/api/download/<document_id>/<file_type>', methods=['GET'])
def download_file(document_id, file_type):
    """Send one of a document's generated artifacts as an attachment."""
    doc = documents_collection.find_one({'_id': ObjectId(document_id)})
    if not doc:
        return jsonify({'error': 'Document not found'}), 404
    # Map the public file_type token onto the stored path field.
    file_type_mapping = {
        'docx': 'docx_path',
        'verbale_docx': 'docx_verbale_path',
        'summary': 'summary_path',
        'summary_clean': 'summary_clean_path',
        'verbale': 'verbale_path',
        'transcription': 'transcription_path'
    }
    path_key = file_type_mapping.get(file_type)
    if path_key is None:
        return jsonify({'error': 'Invalid file type'}), 400
    # Missing key or empty/None value both mean the artifact was never produced.
    if not doc.get(path_key):
        return jsonify({'error': f'{file_type} not available for this document'}), 404
    try:
        return send_file(doc[path_key], as_attachment=True)
    except Exception as e:
        return jsonify({'error': f'Error downloading file: {str(e)}'}), 500
if __name__ == '__main__':
    # Development entry point only: debug=True enables the reloader and the
    # interactive debugger — never run this way in production.
    app.run(debug=True, port=5000)

View File

@ -0,0 +1,251 @@
import os
import tempfile
from moviepy import VideoFileClip
from openai import OpenAI
import math
from pydub import AudioSegment
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import pypandoc
from dotenv import load_dotenv
# Load environment variables (OPENAI_API_KEY) from a local .env file.
load_dotenv()
# Configuration
MAX_FILE_SIZE = 26214400  # 25MB in bytes — presumably the transcription API's upload limit; confirm
# NOTE(review): CHUNK_SIZE is not referenced anywhere in this module —
# splitting is done by duration in split_audio_by_duration(); candidate for removal.
CHUNK_SIZE = MAX_FILE_SIZE // 2  # Split into ~12MB chunks
# All generated artifacts go into an 'outputs' folder next to this file.
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'outputs')
os.makedirs(OUTPUT_DIR, exist_ok=True)
def setup_api_keys():
    """Build an OpenAI client from the OPENAI_API_KEY environment variable.

    Raises:
        ValueError: when the key is not set.
    """
    api_key = os.getenv('OPENAI_API_KEY')
    if api_key:
        return OpenAI(api_key=api_key)
    raise ValueError("OpenAI API key not found in environment variables")
def extract_audio(video_path):
    """Extract the audio track of *video_path* to an mp3 in OUTPUT_DIR.

    Returns the path of the written audio file. The caller is expected to
    delete it when done (process_video does).
    """
    print("Extracting audio from video file...")
    # NOTE(review): fixed output name — concurrent runs would clobber each
    # other's temp audio; confirm single-run usage.
    temp_audio_path = os.path.join(OUTPUT_DIR, "temp_audio.mp3")
    video = VideoFileClip(video_path)
    try:
        video.audio.write_audiofile(temp_audio_path)
    finally:
        # FIX: close the clip even when the write fails (was leaked on error).
        video.close()
    return temp_audio_path
def split_audio_by_duration(file_path, duration_seconds=600):
    """Split an audio file into fixed-length chunks (default 10 minutes).

    Returns the list of chunk file paths, or an empty list when the
    source file cannot be loaded. Chunks that fail to export are
    skipped (logged to stdout).
    """
    chunks_dir = os.path.join(OUTPUT_DIR, 'chunks')
    os.makedirs(chunks_dir, exist_ok=True)
    try:
        audio = AudioSegment.from_file(file_path)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return []
    chunk_length_ms = duration_seconds * 1000  # pydub slices in milliseconds
    chunks = []
    index = 0
    start = 0
    total_ms = len(audio)
    while start < total_ms:
        segment = audio[start:start + chunk_length_ms]
        chunk_filename = os.path.join(chunks_dir, f"chunk_{index}.mp3")
        try:
            segment.export(chunk_filename, format="mp3")
            chunks.append(chunk_filename)
            print(f"Created chunk: {chunk_filename}")
        except Exception as e:
            print(f"Error exporting chunk: {e}")
        start += chunk_length_ms
        index += 1
    return chunks
def transcribe_audio(client, audio_chunks):
    """Transcribe audio chunks using OpenAI's Whisper API.

    Chunks that fail to transcribe are skipped (error printed); the
    successful transcripts are joined with single spaces.
    """
    transcripts = []
    for chunk in audio_chunks:
        print(f"Transcribing chunk: {chunk}")
        # FIX: context manager guarantees the handle is closed even when the
        # API call raises (the original left the file open on every path).
        with open(chunk, "rb") as audio_file:
            try:
                transcript = client.audio.transcriptions.create(
                    file=audio_file,
                    model="whisper-1",
                    response_format="text",
                    language="it",
                    prompt="L'audio è il pezzo di una riunione registrata; potrebbe essere la continuazione di un precedente pezzo di audio"
                )
                transcripts.append(transcript)
            except Exception as e:
                print(f"Error in transcription: {e}")
    return ' '.join(transcripts)
def summarize_transcription(client, transcript):
    """Use GPT-4 to summarize the transcription.

    Sends the whole transcript as the user message; the system prompt asks
    for detailed meeting notes in business Italian plus open points and
    decisions. Returns the model's text response.
    """
    system_message = """You will find next a transcript of a Teams Meeting.
[PAY ATTENTION: The text may not be perfect as it was transcribed from video]
Your task is to create detailed notes from this meeting transcript,
including as much information as possible. At the end, include a summary
of open points and decisions made during the meeting.
The result should be in business Italian language.
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": transcript}
        ],
        temperature=0.1,  # low temperature: keep the notes close to the source text
        max_tokens=16000
    )
    return response.choices[0].message.content
def clean_summary(client, summary):
    """Clean and format the summary using GPT.

    Returns the cleaned text in business Italian.

    NOTE(review): the summary is interpolated into the *system* message
    while the user message is a fixed instruction — unusual but apparently
    intentional; confirm before restructuring.
    """
    system_message = """Following is an automated generated text from a video transcript.
Due to audio quality and lack of knowledge about the context, the text may not be complete or accurate.
[ORIGINAL TEXT]
{0}
Please clean this text, fixing any inconsistencies, formatting issues, or errors in transcription.
The result should be well-structured and in business Italian language.
""".format(summary)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": "Please clean and format this text."}
        ],
        temperature=0.5,  # allow some rewording while cleaning
        max_tokens=16000
    )
    return response.choices[0].message.content
def generate_verbale(client, summary_clean):
    """Generate a formal meeting report (verbale) from the cleaned summary.

    Returns the report text in business Italian.
    """
    # The cleaned summary is embedded in the system prompt; the user turn
    # only triggers generation.
    system_message = """You are tasked with creating a formal meeting report (verbale) from the following meeting summary.
[SUMMARY]
{0}
Please structure the report in a professional format typical of consulting firms (like Deloitte).
Focus particularly on decisions made and action items for future meetings.
The report should be in business Italian language.
""".format(summary_clean)
    completion = client.chat.completions.create(
        model="gpt-4o",
        temperature=0.5,
        max_tokens=16000,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": "Generate a formal meeting report (verbale)."}
        ]
    )
    return completion.choices[0].message.content
def convert_markdown_to_word(markdown_text, output_file):
    """Convert markdown text to an MS Word document using pypandoc.

    Returns True on success, False on any failure (errors are printed,
    never raised).
    """
    temp_md_file = None
    try:
        # Pandoc works on files, so stage the markdown in a temp .md file.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.md', encoding='utf-8') as temp_file:
            temp_file.write(markdown_text)
            temp_md_file = temp_file.name
        # FIX: os.makedirs('') raises, so only create the directory when
        # the output path actually has a directory component.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        pypandoc.convert_file(temp_md_file, 'docx', outputfile=output_file)
        print(f"Successfully converted to Word: {output_file}")
        return True
    except Exception as e:
        print(f"Error converting to Word: {e}")
        return False
    finally:
        # FIX: always remove the staged temp file (original leaked it
        # whenever the conversion raised).
        if temp_md_file and os.path.exists(temp_md_file):
            os.unlink(temp_md_file)
def process_video(file_path):
    """Process a video file and generate transcription, summaries, and documents.

    Pipeline: extract audio -> split into chunks -> Whisper transcription ->
    GPT summary -> cleaned summary -> formal report (verbale) -> .docx exports.

    Args:
        file_path: path to the source video file.

    Returns:
        dict mapping artifact names (transcription_path, summary_path,
        summary_clean_path, verbale_path, docx_path, docx_verbale_path)
        to the paths of the generated files in OUTPUT_DIR.

    Raises:
        Re-raises any exception from the pipeline after printing it
        (the caller records the failure status).
    """
    base_filename = os.path.splitext(os.path.basename(file_path))[0]
    client = setup_api_keys()
    # Output file paths — every artifact is named after the source video.
    transcription_path = os.path.join(OUTPUT_DIR, f"{base_filename}_transcription.txt")
    summary_path = os.path.join(OUTPUT_DIR, f"{base_filename}_summary.md")
    summary_clean_path = os.path.join(OUTPUT_DIR, f"{base_filename}_summary_clean.md")
    verbale_path = os.path.join(OUTPUT_DIR, f"{base_filename}_verbale.md")
    docx_path = os.path.join(OUTPUT_DIR, f"{base_filename}_summary.docx")
    docx_verbale_path = os.path.join(OUTPUT_DIR, f"{base_filename}_verbale.docx")
    try:
        # Step 1: Extract audio from video.
        # NOTE(review): extract_audio writes a fixed "temp_audio.mp3" name,
        # so concurrent invocations would clobber each other — confirm.
        audio_path = extract_audio(file_path)
        # Step 2: Split audio into chunks (10-minute pieces by default).
        audio_chunks = split_audio_by_duration(audio_path)
        # Step 3: Transcribe audio chunks and persist the joined text.
        transcription = transcribe_audio(client, audio_chunks)
        with open(transcription_path, "w", encoding='utf-8') as f:
            f.write(transcription)
        print(f"Saved transcription to: {transcription_path}")
        # Step 4: Generate the detailed meeting summary.
        summary = summarize_transcription(client, transcription)
        with open(summary_path, "w", encoding='utf-8') as f:
            f.write(summary)
        print(f"Saved summary to: {summary_path}")
        # Step 5: Clean and format the summary.
        summary_clean = clean_summary(client, summary)
        with open(summary_clean_path, "w", encoding='utf-8') as f:
            f.write(summary_clean)
        print(f"Saved cleaned summary to: {summary_clean_path}")
        # Step 6: Generate the formal report (verbale) from the cleaned summary.
        verbale = generate_verbale(client, summary_clean)
        with open(verbale_path, "w", encoding='utf-8') as f:
            f.write(verbale)
        print(f"Saved verbale to: {verbale_path}")
        # Step 7: Convert both markdown artifacts to Word documents.
        # (Return values are ignored: a failed conversion is non-fatal.)
        convert_markdown_to_word(summary_clean, docx_path)
        convert_markdown_to_word(verbale, docx_verbale_path)
        # Step 8: Clean up intermediate audio files (chunks + extracted track).
        for chunk in audio_chunks:
            if os.path.exists(chunk):
                os.remove(chunk)
        if os.path.exists(audio_path):
            os.remove(audio_path)
        # Return paths to all generated files.
        return {
            'transcription_path': transcription_path,
            'summary_path': summary_path,
            'summary_clean_path': summary_clean_path,
            'verbale_path': verbale_path,
            'docx_path': docx_path,
            'docx_verbale_path': docx_verbale_path
        }
    except Exception as e:
        print(f"Error processing video: {e}")
        raise

View File

@ -0,0 +1,11 @@
flask==2.3.3
flask-cors==4.0.0
pymongo==4.5.0
python-dotenv==1.0.0
pydub==0.25.1
moviepy==1.0.3
python-docx==0.8.11
pypandoc==1.11
openpyxl==3.1.2
openai==1.3.7
werkzeug==2.3.7

File diff suppressed because one or more lines are too long