#!/usr/bin/env python3
|
|
import argparse
|
|
import csv
|
|
import json
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
from pathlib import Path
|
|
from functools import lru_cache
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
from core.config_helper import load_config_xml
|
|
from core.logger_helper import setup_logger
|
|
|
|
# =============================
# Setup logger
# =============================
# Log files go into a "logs" folder next to this script; setup_logger comes
# from the project's core.logger_helper.
LOG_FOLDER = Path(__file__).parent / "logs"
logger = setup_logger(LOG_FOLDER)

# =============================
# Tracker CSV
# =============================
# Append-only CSV recording one row per completed conversion.  Created with a
# header row on first import; later rows are appended by process_folder().
TRACKER_FILE = Path(__file__).parent / "conversion_tracker.csv"
if not TRACKER_FILE.exists():
    with open(TRACKER_FILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([
            "type","show","filename","original_size_MB","processed_size_MB","percentage","method"
        ])
|
# =============================
# FFPROBE CACHING
# =============================
@lru_cache(maxsize=256)
def get_audio_streams_cached(input_file_str: str):
    """Cached ffprobe call to avoid redundant queries.

    Takes the path as a string (not Path) so lru_cache can hash the argument.
    Returns the parsed ffprobe JSON dict.

    Fix: the original ran json.loads(result.stdout) unconditionally, which
    raised json.JSONDecodeError whenever ffprobe failed (missing/unreadable
    file leaves stdout empty).  A failed probe now returns {"streams": []}
    so callers see "no audio streams" instead of crashing.
    """
    input_file = Path(input_file_str)
    cmd = [
        "ffprobe","-v","error","-select_streams","a",
        "-show_entries","stream=index,channels,duration,bit_rate,tags=language",
        "-of","json", str(input_file)
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    # Guard the parse: a failed probe leaves stdout empty or non-JSON.
    if result.returncode != 0 or not result.stdout.strip():
        logger.warning(f"ffprobe failed for {input_file.name}; treating as no audio streams")
        return {"streams": []}
    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError:
        logger.warning(f"ffprobe returned unparseable output for {input_file.name}")
        return {"streams": []}
|
|
|
|
# =============================
# AUDIO BUCKET LOGIC
# =============================
def choose_audio_bitrate(channels: int, bitrate_kbps: int, audio_config: dict) -> int:
    """Map a source audio bitrate onto a configured low/medium/high bucket.

    Stereo (exactly 2 channels) and everything else ("multi_channel") use
    different cut-off points; the returned value is taken straight from
    audio_config[<layout>][<bucket>].
    """
    if channels == 2:
        layout, (low_cap, medium_cap) = "stereo", (100, 130)
    else:
        layout, (low_cap, medium_cap) = "multi_channel", (390, 515)

    if bitrate_kbps < low_cap:
        bucket = "low"
    elif bitrate_kbps < medium_cap:
        bucket = "medium"
    else:
        bucket = "high"
    return audio_config[layout][bucket]
|
# =============================
# PATH NORMALIZATION
# =============================
def normalize_path_for_service(local_path: str, path_mappings: dict) -> str:
    """Translate a local Windows-style path into the service's Linux-style path.

    path_mappings maps Windows prefixes to Linux prefixes.  The prefix match
    is case-insensitive and the first matching mapping wins.  Backslashes are
    always converted to forward slashes, even when no mapping applies.

    Fix: the original used local_path.replace(win_path, linux_path), which is
    case-SENSITIVE even though the startswith check is case-insensitive — a
    path whose case differed from the mapping key passed the check but was
    never remapped (and replace() could also hit the prefix string anywhere
    in the path, not just at the start).  We now splice the mapped prefix
    onto the original path's tail instead.
    """
    for win_path, linux_path in path_mappings.items():
        if local_path.lower().startswith(win_path.lower()):
            # Swap only the (case-insensitively matched) prefix; keep the tail.
            return (linux_path + local_path[len(win_path):]).replace("\\", "/")
    return local_path.replace("\\", "/")
|
|
# =============================
# AUDIO STREAMS DETECTION
# =============================
def get_audio_streams(input_file: Path):
    """Probe *input_file* with ffprobe and return one tuple per audio stream.

    Each tuple is (index, channels, avg_bitrate_kbps, language, metadata_kbps):
    avg_bitrate_kbps comes from stream metadata when present, otherwise from a
    container-size estimate, otherwise falls back to 128; metadata_kbps is the
    raw bit_rate tag in kbps (0 when absent).

    NOTE(review): the size-based estimate divides the WHOLE file size by the
    number of AUDIO streams, so video bytes inflate it heavily — treat it as a
    rough upper bound and confirm whether that is intentional.
    """
    cmd = [
        "ffprobe","-v","error","-select_streams","a",
        "-show_entries","stream=index,channels,duration,bit_rate,tags=language",
        "-of","json", str(input_file)
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    # NOTE(review): raises json.JSONDecodeError if ffprobe produced no output
    # (bad path / unreadable file) — callers do not guard this.
    data = json.loads(result.stdout)
    streams = []
    for s in data.get("streams", []):
        index = s["index"]
        channels = s.get("channels", 2)  # assume stereo when ffprobe omits it
        src_lang = s.get("tags", {}).get("language", "und")
        bit_rate_meta = int(s.get("bit_rate", 0)) if s.get("bit_rate") else 0
        try:
            duration = float(s.get("duration", 0))
            if duration and bit_rate_meta == 0:
                # No per-stream bitrate in metadata: probe the container once
                # and spread the total file size over the audio streams.
                fmt_cmd = [
                    "ffprobe","-v","error","-show_entries","format=size,duration",
                    "-of","json", str(input_file)
                ]
                fmt_result = subprocess.run(fmt_cmd, capture_output=True, text=True)
                fmt_data = json.loads(fmt_result.stdout)
                size_bytes = int(fmt_data.get("format", {}).get("size", 0))
                total_duration = float(fmt_data.get("format", {}).get("duration", duration))
                n_streams = len(data.get("streams", []))
                avg_bitrate_kbps = int((size_bytes*8/n_streams)/total_duration/1000)
            elif duration and bit_rate_meta:
                avg_bitrate_kbps = int(bit_rate_meta / 1000)
            else:
                # No usable duration: conservative default.
                avg_bitrate_kbps = 128
        except Exception:
            # Any probe/parse hiccup degrades to the default rather than failing.
            avg_bitrate_kbps = 128
        streams.append((index, channels, avg_bitrate_kbps, src_lang, int(bit_rate_meta / 1000)))
    return streams
|
|
# =============================
# OUTPUT VALIDATION
# =============================
def validate_output(input_file: Path, output_file: Path, expected_width: int, expected_height: int) -> bool:
    """Probe the encoded file and confirm its video resolution is near the target.

    Returns False when there is no video stream or the resolution is off by
    more than 10 px in either dimension; returns True otherwise — including
    when the probe itself errors out, since validation is best-effort and must
    not fail an otherwise-successful encode.
    """
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "stream=width,height",
        "-of", "json", str(output_file)
    ]
    try:
        probe = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=10)
        info = json.loads(probe.stdout)

        video_streams = info.get("streams")
        if not video_streams:
            logger.warning(f"❌ Validation failed: No video stream in {output_file.name}")
            return False

        first = video_streams[0]
        width = first.get("width", 0)
        height = first.get("height", 0)

        # Scaling with force_original_aspect_ratio may land slightly off target,
        # so accept a +/-10 px tolerance on each axis.
        within_tolerance = abs(width - expected_width) <= 10 and abs(height - expected_height) <= 10
        if not within_tolerance:
            logger.warning(f"❌ Validation failed: Resolution {width}x{height}, expected ~{expected_width}x{expected_height}")
            return False

        logger.info(f"✅ Validation passed: {output_file.name} ({width}x{height})")
        return True
    except Exception as e:
        logger.warning(f"⚠️ Validation skipped (probe error): {e}")
        return True  # Don't fail on validation errors
|
# =============================
# FFmpeg ENCODE (GPU + CPU fallback, per-resolution CPU preset)
# =============================
def run_ffmpeg(input_file: Path, output_file: Path, cq: int, scale_width: int, scale_height: int,
               filter_flags: str, audio_config: dict, method: str, crf_cpu: int, verbose: bool = False):
    """Encode *input_file* to *output_file* with av1_nvenc, falling back to libsvtav1.

    method selects rate control: "CQ" uses NVENC constant quality (-cq);
    anything else uses a capped bitrate chosen by target height.  Audio is
    re-encoded to AAC with per-stream bucketed bitrates (720p is always
    downmixed to stereo); subtitles are stream-copied.

    Returns (orig_size_bytes, out_size_bytes, reduction_ratio).
    Raises subprocess.CalledProcessError if both GPU and CPU encodes fail.
    """
    streams = get_audio_streams(input_file)
    encoder_name = "av1_nvenc"
    pix_fmt = "p010le"  # 10-bit pixel format for the NVENC AV1 path
    header = (
        f"\n🧩 ENCODE SETTINGS\n"
        f" • Resolution: {scale_width}x{scale_height}\n"
        f" • Scale Filter: {filter_flags}\n"
        f" • CQ: {cq if method=='CQ' else 'N/A'}\n"
        f" • CPU CRF: {crf_cpu}\n"
        f" • Video Encoder: {encoder_name} (preset p1, pix_fmt {pix_fmt})\n"
        f" • Audio Streams:"
    )
    logger.info(header)
    print(header)

    # First pass over the streams is purely informational (console + log).
    for (index, channels, avg_bitrate, src_lang, meta_bitrate) in streams:
        output_channels = 2 if scale_height <= 720 else (6 if channels >= 6 else 2)
        br = choose_audio_bitrate(output_channels, avg_bitrate, audio_config)
        line = (
            f" - Stream #{index}: {channels}ch→{output_channels}ch, src={src_lang}, "
            f"avg_bitrate={avg_bitrate}kbps, metadata={meta_bitrate}kbps, bucket_target={br/1000:.1f}kbps"
        )
        print(line)
        logger.info(line)

    # Base command: scale (preserving aspect ratio), map all video/audio and
    # any subtitles, GPU AV1 encoder.
    cmd = [
        "ffmpeg", "-y", "-i", str(input_file),
        "-vf", f"scale={scale_width}:{scale_height}:flags={filter_flags}:force_original_aspect_ratio=decrease",
        "-map", "0:v", "-map", "0:a", "-map", "0:s?",
        "-c:v", encoder_name, "-preset", "p1", "-pix_fmt", pix_fmt
    ]

    # Video quality
    if method == "CQ":
        cmd += ["-cq", str(cq)]
    else:
        # Capped-bitrate mode: targets depend on output height.
        if scale_height >= 1080:
            vb, maxrate, bufsize = "1500k", "1750k", "2250k"
        else:
            vb, maxrate, bufsize = "900k", "1250k", "1600k"
        cmd += ["-b:v", vb, "-maxrate", maxrate, "-bufsize", bufsize]

    # Audio streams
    for i, (index, channels, avg_bitrate, src_lang, meta_bitrate) in enumerate(streams):
        # Determine output channels: 720p -> 2ch, 1080p -> 6ch if input>=6 else 2ch
        output_channels = 2 if scale_height <= 720 else (6 if channels >= 6 else 2)
        # Choose bitrate based on OUTPUT channels, not input
        br = choose_audio_bitrate(output_channels, avg_bitrate, audio_config)
        # NOTE(review): "-ac:{i}" addresses OUTPUT stream #i overall (which
        # includes the video stream), not audio stream #i; "-ac:a:{i}" looks
        # like the intended specifier — confirm against ffmpeg docs.
        cmd += [f"-c:a:{i}", "aac", f"-b:a:{i}", str(br), f"-ac:{i}", str(output_channels)]

    cmd += ["-c:s", "copy", str(output_file)]

    print(f"\n🎬 Running {method} encode: {output_file.name}")
    logger.info(f"Running {method} encode: {output_file.name}")
    if verbose:
        logger.info(f"FFmpeg command: {' '.join(cmd)}")

    # Try GPU encoder first
    try:
        if verbose:
            subprocess.run(cmd, check=True)
        else:
            # Quiet mode: swallow ffmpeg's stdout/stderr.
            subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ FFmpeg failed with GPU encoder on {input_file.name}: {e}")
        logger.error(f"GPU encode failed for {input_file.name}. Command: {' '.join(cmd)}")

        # CPU fallback: clone the command and swap in the software encoder.
        cmd_cpu = cmd.copy()
        idx = cmd_cpu.index(encoder_name)
        cmd_cpu[idx] = "libsvtav1"

        # CPU preset based on resolution
        cpu_preset = "8" if scale_height <= 720 else "6"  # faster for 720p, slower for 1080p
        preset_idx = cmd_cpu.index("p1")
        cmd_cpu[preset_idx] = cpu_preset

        # Replace -cq with -crf (NVENC-only flag -> SVT-AV1 equivalent)
        if "-cq" in cmd_cpu:
            cq_idx = cmd_cpu.index("-cq")
            cmd_cpu[cq_idx] = "-crf"
            cmd_cpu[cq_idx + 1] = str(crf_cpu)

        try:
            if verbose:
                subprocess.run(cmd_cpu, check=True)
            else:
                subprocess.run(cmd_cpu, check=True, capture_output=True)
            print("✅ CPU fallback succeeded")
            logger.info("CPU fallback succeeded")
        except subprocess.CalledProcessError as e_cpu:
            # Both encoders failed — propagate so the caller can clean up.
            print(f"❌ CPU fallback also failed for {input_file.name}: {e_cpu}")
            logger.error(f"CPU fallback failed for {input_file.name}. Command: {' '.join(cmd_cpu)}")
            raise e_cpu

    # Report size change (reached via GPU success or CPU fallback success).
    orig_size = input_file.stat().st_size
    out_size = output_file.stat().st_size
    reduction_ratio = out_size / orig_size
    msg = f"📦 Original: {orig_size/1e6:.2f} MB → Encoded: {out_size/1e6:.2f} MB ({reduction_ratio:.1%} of original)"
    print(msg)
    logger.info(msg)

    return orig_size, out_size, reduction_ratio
|
|
# =============================
# PROCESS FOLDER
# =============================
def process_folder(folder: Path, cq: int, resolution: str, config: dict, dry_run: bool = False,
                   verbose: bool = False, backup: bool = False, cleanup: bool = False, parallel: int = 1):
    """Scan *folder* recursively and re-encode every eligible video file.

    Per file: copy into a scratch "processing" folder, encode via run_ffmpeg
    (CQ first, bitrate retry when the size target is missed), validate the
    output resolution, move the result back next to the original, append a
    row to the tracker CSV, then delete the original and the scratch copy.

    cq may be None, in which case a per-type/per-resolution default is read
    from config.  With dry_run=True nothing is encoded or deleted.
    """
    if not folder.exists():
        print(f"❌ Folder not found: {folder}")
        logger.error(f"Folder not found: {folder}")
        return

    audio_config = config["audio"]
    filters_config = config["encode"]["filters"]
    suffix = config["suffix"]            # appended to output filename stems
    extensions = config["extensions"]    # eligible file suffixes
    ignore_tags = config["ignore_tags"]  # filename substrings marking files to skip
    reduction_ratio_threshold = config["reduction_ratio_threshold"]
    res_height = 1080 if resolution == "1080" else 720
    res_width = 1920 if resolution == "1080" else 1280

    # Determine type and resolution keys (TV libraries use a different scaler)
    folder_lower = str(folder).lower()
    if "\\tv\\" in folder_lower or "/tv/" in folder_lower:
        type_key = "tv"
        filter_flags = filters_config.get("tv", "bicubic")
    else:
        type_key = "movie"
        filter_flags = filters_config.get("default", "lanczos")

    res_key = "1080" if resolution == "1080" else "720"

    # Get CQ and CRF from config (keyed like "tv_1080"); an explicit cq wins
    cq_default = config["encode"]["cq"].get(f"{type_key}_{res_key}", 32)
    crf_cpu = config["encode"]["crf"].get(f"{type_key}_{res_key}", 32)
    if cq is None:
        cq = cq_default

    processing_folder = Path(config["processing_folder"])
    processing_folder.mkdir(parents=True, exist_ok=True)

    # Cleanup old processing folder if requested
    if cleanup and processing_folder.exists():
        print(f"🧹 Cleaning up old processing folder: {processing_folder}")
        logger.info(f"Cleaning up old processing folder: {processing_folder}")
        shutil.rmtree(processing_folder, ignore_errors=True)
        processing_folder.mkdir(parents=True, exist_ok=True)

    # Backup folder setup
    backup_folder = None
    if backup:
        backup_folder = folder.parent / f"{folder.name}_backup"
        backup_folder.mkdir(parents=True, exist_ok=True)
        print(f"💾 Backup enabled: {backup_folder}")
        logger.info(f"Backup folder: {backup_folder}")

    # Dry-run message
    if dry_run:
        print("🔍 DRY-RUN MODE: No files will be encoded or deleted")
        logger.info("DRY-RUN MODE: No files will be encoded or deleted")

    # Track if we switch to bitrate mode
    # NOTE(review): never assigned True anywhere in this function, so every
    # file starts in CQ mode and only the per-file retry below uses Bitrate —
    # confirm whether a global switch was intended.
    use_bitrate = False

    # Collect all files to process first
    files_to_process = []
    for file in folder.rglob("*"):
        if file.suffix.lower() not in extensions:
            continue
        if any(tag.lower() in file.name.lower() for tag in ignore_tags):
            print(f"⏭️ Skipping: {file.name}")
            logger.info(f"Skipping: {file.name}")
            continue
        files_to_process.append(file)

    if not files_to_process:
        print("❌ No files found to process")
        logger.info("No files found to process")
        return

    print(f"📋 Found {len(files_to_process)} file(s) to process")

    # Define the encoding task
    def encode_file(file: Path):
        """Encodes a single file - used for parallel processing"""
        try:
            print("="*60)
            logger.info(f"Processing: {file.name}")
            print(f"📁 Processing: {file.name}")

            # Work on a scratch copy so the library file stays untouched
            # until the encode is validated.
            temp_input = processing_folder / file.name
            shutil.copy2(file, temp_input)
            logger.info(f"Copied {file.name} → {temp_input.name}")
            temp_output = processing_folder / f"{file.stem}{suffix}{file.suffix}"

            method = "Bitrate" if use_bitrate else "CQ"

            if dry_run:
                print(f"🔍 [DRY-RUN] Would encode: {temp_output}")
                logger.info(f"[DRY-RUN] Would encode: {temp_output}")
                return None

            try:
                orig_size, out_size, reduction_ratio = run_ffmpeg(
                    temp_input, temp_output, cq, res_width, res_height, filter_flags,
                    audio_config, method, crf_cpu, verbose
                )
            except subprocess.CalledProcessError as e:
                # Both GPU and CPU encodes failed: drop scratch files, keep original.
                print(f"❌ FFmpeg failed: {e}")
                logger.error(f"FFmpeg failed: {e}")
                temp_input.unlink(missing_ok=True)
                temp_output.unlink(missing_ok=True)
                return None

            # Validate output
            if not validate_output(temp_input, temp_output, res_width, res_height):
                print(f"⚠️ Validation failed for {temp_output.name}, keeping original")
                logger.warning(f"Validation failed for {temp_output.name}")
                temp_input.unlink(missing_ok=True)
                temp_output.unlink(missing_ok=True)
                return None

            # Handle fallback if CQ/Bitrate didn't reach target
            if method == "CQ" and reduction_ratio >= reduction_ratio_threshold:
                print(f"⚠️ CQ encode did not achieve target size ({reduction_ratio:.1%} >= {reduction_ratio_threshold:.1%}). Retrying with Bitrate.")
                logger.warning(f"CQ encode failed target ({reduction_ratio:.1%}). Retrying with Bitrate.")
                try:
                    temp_output.unlink(missing_ok=True)
                    orig_size, out_size, reduction_ratio = run_ffmpeg(
                        temp_input, temp_output, cq, res_width, res_height, filter_flags,
                        audio_config, "Bitrate", crf_cpu, verbose
                    )
                    if reduction_ratio >= reduction_ratio_threshold:
                        # Even the bitrate pass missed the target: give up on this file.
                        print("❌ Bitrate encode also failed target.")
                        logger.error("Bitrate encode failed target.")
                        temp_input.unlink(missing_ok=True)
                        temp_output.unlink(missing_ok=True)
                        return None
                except subprocess.CalledProcessError as e:
                    print(f"❌ Bitrate retry failed: {e}")
                    logger.error(f"Bitrate retry failed: {e}")
                    temp_input.unlink(missing_ok=True)
                    temp_output.unlink(missing_ok=True)
                    return None
            elif reduction_ratio >= reduction_ratio_threshold:
                # Already in Bitrate mode and still missed the target.
                print("❌ Encode failed target. Stopping.")
                logger.error("Encode failed target.")
                temp_input.unlink(missing_ok=True)
                temp_output.unlink(missing_ok=True)
                return None

            # Move final file back to original folder
            dest_file = file.parent / temp_output.name
            if not dry_run:
                shutil.move(temp_output, dest_file)
                print(f"🚚 Moved {temp_output.name} → {dest_file.name}")
                logger.info(f"Moved {temp_output.name} → {dest_file.name}")

            # Backup original if requested
            if backup and not dry_run:
                backup_dest = backup_folder / file.name
                shutil.copy2(file, backup_dest)
                logger.info(f"Backed up original to {backup_dest}")

            # Determine folder type and show (for the tracker CSV row)
            folder_parts = [p.lower() for p in folder.parts]
            if "tv" in folder_parts:
                f_type = "tv"
                tv_index = folder_parts.index("tv")
                show = folder.parts[tv_index + 1] if len(folder.parts) > tv_index + 1 else "Unknown"
            elif "anime" in folder_parts:
                f_type = "anime"
                anime_index = folder_parts.index("anime")
                show = folder.parts[anime_index + 1] if len(folder.parts) > anime_index + 1 else "Unknown"
            else:
                f_type = "movie"
                show = "N/A"

            orig_size_mb = round(orig_size / 1e6, 2)
            proc_size_mb = round(out_size / 1e6, 2)
            # NOTE(review): an original under ~5 kB rounds to 0.0 MB and would
            # raise ZeroDivisionError here — unlikely for video, but unguarded.
            percentage = round(proc_size_mb / orig_size_mb * 100, 1)

            # Log conversion in tracker CSV (skip in dry-run)
            if not dry_run:
                with open(TRACKER_FILE, "a", newline="", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([f_type, show, dest_file.name, orig_size_mb, proc_size_mb, percentage, method])

                logger.info(f"Tracked conversion: {dest_file.name}, {orig_size_mb}MB → {proc_size_mb}MB ({percentage}%), method={method}")
                print(f"📝 Logged conversion: {dest_file.name} ({percentage}%), method={method}")

            # Delete temporary and original files
            if not dry_run:
                try:
                    temp_input.unlink()
                    file.unlink()
                    logger.info(f"Deleted original and processing copy for {file.name}")
                except Exception as e:
                    # Best-effort cleanup: a locked/missing file is logged, not fatal.
                    print(f"⚠️ Could not delete files: {e}")
                    logger.warning(f"Could not delete files: {e}")

            return {"file": file.name, "orig": orig_size_mb, "proc": proc_size_mb, "pct": percentage}

        except Exception as e:
            # Catch-all so one bad file never aborts the whole batch.
            logger.error(f"Unexpected error processing {file.name}: {e}", exc_info=True)
            return None

    # Process files sequentially or in parallel
    # NOTE(review): per-file result dicts are collected but never aggregated
    # or reported in either branch.
    if parallel > 1:
        with ThreadPoolExecutor(max_workers=parallel) as executor:
            futures = [executor.submit(encode_file, f) for f in files_to_process]
            for future in as_completed(futures):
                result = future.result()
    else:
        for file in files_to_process:
            encode_file(file)

    if dry_run:
        print("🔍 DRY-RUN COMPLETE: No actual changes made")
    else:
        print("✅ Processing complete!")
|
|
|
# =============================
# MAIN
# =============================
def main():
    """CLI entry point: parse arguments, load the XML config, run the batch."""
    arg_parser = argparse.ArgumentParser(description="Batch encode videos with logging and tracker")
    arg_parser.add_argument("folder", help="Path to folder containing videos")
    arg_parser.add_argument("--cq", type=int, help="Override default CQ")
    arg_parser.add_argument("--r", "--resolution", dest="resolution", default="1080", choices=["720","1080"], help="Target resolution")
    arg_parser.add_argument("--dry-run", action="store_true", help="Preview files without encoding")
    arg_parser.add_argument("--verbose", "-v", action="store_true", help="Show FFmpeg output")
    arg_parser.add_argument("--backup", action="store_true", help="Backup original files before encoding")
    arg_parser.add_argument("--cleanup", action="store_true", help="Clean old processing folder on startup")
    arg_parser.add_argument("--parallel", type=int, default=1, metavar="N", help="Encode N files in parallel (experimental)")
    arg_parser.add_argument("--ratio", type=float, help="Reduction ratio threshold (default 0.5 from config)")
    opts = arg_parser.parse_args()

    # Config lives next to this script.
    cfg = load_config_xml(Path(__file__).parent / "config.xml")

    # CLI override for the size-reduction target
    # (truthiness check: an explicit --ratio 0.0 is ignored)
    if opts.ratio:
        cfg["reduction_ratio_threshold"] = opts.ratio

    process_folder(
        Path(opts.folder), opts.cq, opts.resolution, cfg,
        dry_run=opts.dry_run, verbose=opts.verbose, backup=opts.backup,
        cleanup=opts.cleanup, parallel=opts.parallel,
    )
|
|
|
|
# Standard script entry guard: run the CLI only when executed directly.
if __name__ == "__main__":
    main()