import os
import shutil
import subprocess
import sys
import tempfile
import threading
import unittest
from typing import List, Optional


def run_cli_redact(
    script_path: str,
    input_file: Optional[str],
    output_dir: str,
    task: str = "redact",
    timeout: int = 600,  # 10-minute timeout
    # --- General Arguments ---
    input_dir: Optional[str] = None,
    language: Optional[str] = None,
    allow_list: Optional[str] = None,
    pii_detector: Optional[str] = None,
    username: Optional[str] = None,
    save_to_user_folders: Optional[bool] = None,
    local_redact_entities: Optional[List[str]] = None,
    aws_redact_entities: Optional[List[str]] = None,
    aws_access_key: Optional[str] = None,
    aws_secret_key: Optional[str] = None,
    cost_code: Optional[str] = None,
    aws_region: Optional[str] = None,
    s3_bucket: Optional[str] = None,
    do_initial_clean: Optional[bool] = None,
    save_logs_to_csv: Optional[bool] = None,
    save_logs_to_dynamodb: Optional[bool] = None,
    display_file_names_in_logs: Optional[bool] = None,
    upload_logs_to_s3: Optional[bool] = None,
    s3_logs_prefix: Optional[str] = None,
    # --- PDF/Image Redaction Arguments ---
    ocr_method: Optional[str] = None,
    page_min: Optional[int] = None,
    page_max: Optional[int] = None,
    images_dpi: Optional[float] = None,
    chosen_local_ocr_model: Optional[str] = None,
    preprocess_local_ocr_images: Optional[bool] = None,
    compress_redacted_pdf: Optional[bool] = None,
    return_pdf_end_of_redaction: Optional[bool] = None,
    deny_list_file: Optional[str] = None,
    allow_list_file: Optional[str] = None,
    redact_whole_page_file: Optional[str] = None,
    handwrite_signature_extraction: Optional[List[str]] = None,
    extract_forms: Optional[bool] = None,
    extract_tables: Optional[bool] = None,
    extract_layout: Optional[bool] = None,
    # --- Word/Tabular Anonymisation Arguments ---
    anon_strategy: Optional[str] = None,
    text_columns: Optional[List[str]] = None,
    excel_sheets: Optional[List[str]] = None,
    fuzzy_mistakes: Optional[int] = None,
    match_fuzzy_whole_phrase_bool: Optional[bool] = None,
    # --- Duplicate Detection Arguments ---
    duplicate_type: Optional[str] = None,
    similarity_threshold: Optional[float] = None,
    min_word_count: Optional[int] = None,
    min_consecutive_pages: Optional[int] = None,
    greedy_match: Optional[bool] = None,
    combine_pages: Optional[bool] = None,
    remove_duplicate_rows: Optional[bool] = None,
    # --- Textract Batch Operations Arguments ---
    textract_action: Optional[str] = None,
    job_id: Optional[str] = None,
    extract_signatures: Optional[bool] = None,
    textract_bucket: Optional[str] = None,
    textract_input_prefix: Optional[str] = None,
    textract_output_prefix: Optional[str] = None,
    s3_textract_document_logs_subfolder: Optional[str] = None,
    local_textract_document_logs_subfolder: Optional[str] = None,
    poll_interval: Optional[int] = None,
    max_poll_attempts: Optional[int] = None,
) -> bool:
    """
    Executes the cli_redact.py script with specified arguments using a subprocess.

    The script is launched with the interpreter that is running this code
    (``sys.executable``) and with its own folder as the working directory.
    Its stdout/stderr are captured, echoed, and scanned by
    ``analyze_test_output``; any detected error pattern makes the call fail
    even when the exit code is 0.

    File-path options (``allow_list``, ``deny_list_file``, ``allow_list_file``,
    ``redact_whole_page_file``) are only forwarded when the file exists;
    otherwise a warning is printed and the option is omitted. AWS key values
    are masked in the echoed command line.

    Args:
        script_path (str): The path to the cli_redact.py script.
        input_file (str): The path to the input file to process. May be None
            only when task is 'textract' and textract_action is 'retrieve' or 'list'.
        output_dir (str): The path to the directory for output files (created if missing).
        task (str): The main task to perform ('redact', 'deduplicate', or 'textract').
        timeout (int): Timeout in seconds for the subprocess.

        # General Arguments
        input_dir (str): Directory for all input files.
        language (str): Language of the document content.
        allow_list (str): Path to a CSV file with words to exclude from redaction.
        pii_detector (str): Core PII detection method (Local, AWS Comprehend, or None).
        username (str): Username for the session.
        save_to_user_folders (bool): Whether to save to user folders or not.
        local_redact_entities (List[str]): Local redaction entities to use.
        aws_redact_entities (List[str]): AWS redaction entities to use.
        aws_access_key (str): Your AWS Access Key ID.
        aws_secret_key (str): Your AWS Secret Access Key.
        cost_code (str): Cost code for tracking usage.
        aws_region (str): AWS region for cloud services.
        s3_bucket (str): S3 bucket name for cloud operations.
        do_initial_clean (bool): Perform initial text cleaning for tabular data.
        save_logs_to_csv (bool): Save processing logs to CSV files.
        save_logs_to_dynamodb (bool): Save processing logs to DynamoDB.
        display_file_names_in_logs (bool): Include file names in log outputs.
        upload_logs_to_s3 (bool): Upload log files to S3 after processing.
        s3_logs_prefix (str): S3 prefix for usage log files.

        # PDF/Image Redaction Arguments
        ocr_method (str): OCR method for text extraction from images.
        page_min (int): First page to redact.
        page_max (int): Last page to redact.
        images_dpi (float): DPI for image processing.
        chosen_local_ocr_model (str): Local OCR model to use.
        preprocess_local_ocr_images (bool): Preprocess images before OCR.
        compress_redacted_pdf (bool): Compress the final redacted PDF.
        return_pdf_end_of_redaction (bool): Return PDF at end of redaction process.
        deny_list_file (str): Custom words file to recognize for redaction.
        allow_list_file (str): Custom words file listing terms to exclude from redaction.
        redact_whole_page_file (str): File for pages to redact completely.
        handwrite_signature_extraction (List[str]): Handwriting and signature extraction options.
        extract_forms (bool): Extract forms during Textract analysis.
        extract_tables (bool): Extract tables during Textract analysis.
        extract_layout (bool): Extract layout during Textract analysis.

        # Word/Tabular Anonymisation Arguments
        anon_strategy (str): The anonymisation strategy to apply.
        text_columns (List[str]): A list of column names to anonymise or deduplicate.
        excel_sheets (List[str]): Specific Excel sheet names to process.
        fuzzy_mistakes (int): Number of allowed spelling mistakes for fuzzy matching.
        match_fuzzy_whole_phrase_bool (bool): Match fuzzy whole phrase boolean.

        # Duplicate Detection Arguments
        duplicate_type (str): Type of duplicate detection (pages or tabular).
        similarity_threshold (float): Similarity threshold (0-1) to consider content as duplicates.
        min_word_count (int): Minimum word count for text to be considered.
        min_consecutive_pages (int): Minimum number of consecutive pages to consider as a match.
        greedy_match (bool): Use greedy matching strategy for consecutive pages.
        combine_pages (bool): Combine text from the same page number within a file.
        remove_duplicate_rows (bool): Remove duplicate rows from the output.

        # Textract Batch Operations Arguments
        textract_action (str): Textract action to perform (submit, retrieve, or list).
        job_id (str): Textract job ID for retrieve action.
        extract_signatures (bool): Extract signatures during Textract analysis.
        textract_bucket (str): S3 bucket name for Textract operations.
        textract_input_prefix (str): S3 prefix for input files in Textract operations.
        textract_output_prefix (str): S3 prefix for output files in Textract operations.
        s3_textract_document_logs_subfolder (str): S3 prefix for logs in Textract operations.
        local_textract_document_logs_subfolder (str): Local prefix for logs in Textract operations.
        poll_interval (int): Polling interval in seconds for Textract job status.
        max_poll_attempts (int): Maximum number of polling attempts for Textract job completion.

    Returns:
        bool: True if the script exited with code 0 and no error patterns were
        found in its output; False otherwise (including timeouts and
        unexpected exceptions while running the subprocess).

    Raises:
        ValueError: If input_file is None for a task that requires one.
        FileNotFoundError: If the input file or the script does not exist.
    """
    # 1. Get absolute paths and perform pre-checks
    script_abs_path = os.path.abspath(script_path)
    output_abs_dir = os.path.abspath(output_dir)

    # Textract "retrieve" and "list" actions work without an input file;
    # every other task/action combination requires one.
    if task == "textract" and textract_action in ("retrieve", "list"):
        input_abs_path = None
    else:
        if input_file is None:
            raise ValueError("Input file is required for this task")
        input_abs_path = os.path.abspath(input_file)
        if not os.path.isfile(input_abs_path):
            raise FileNotFoundError(f"Input file not found: {input_abs_path}")

    if not os.path.isfile(script_abs_path):
        raise FileNotFoundError(f"Script not found: {script_abs_path}")

    if not os.path.isdir(output_abs_dir):
        print(f"Output directory not found. Creating: {output_abs_dir}")
        # exist_ok guards against another process creating it in the meantime
        os.makedirs(output_abs_dir, exist_ok=True)

    script_folder = os.path.dirname(script_abs_path)

    # 2. Dynamically build the command list
    # Use the interpreter running this code so the child sees the same
    # environment/virtualenv; fall back to "python" if it is unknown.
    command = [
        sys.executable or "python",
        script_abs_path,
        "--output_dir",
        output_abs_dir,
        "--task",
        task,
    ]

    def add_value(flag, value):
        """Append ``flag value`` when value is truthy."""
        if value:
            command.extend([flag, value])

    def add_setting(flag, value):
        """Append ``flag str(value)`` when value is not None (keeps False and 0)."""
        if value is not None:
            command.extend([flag, str(value)])

    def add_list(flag, values):
        """Append ``flag`` followed by every item when the list is non-empty."""
        if values:
            command.append(flag)
            command.extend(values)

    def add_switch(flag, enabled):
        """Append the bare ``flag`` when enabled is truthy."""
        if enabled:
            command.append(flag)

    def add_existing_file(flag, path):
        """Append ``flag <absolute path>`` when path names an existing file."""
        if not path:
            return
        if os.path.isfile(path):
            # Absolute path, because the child runs with a different cwd
            command.extend([flag, os.path.abspath(path)])
        else:
            print(f"Warning: file for {flag} not found, option skipped: {path}")

    if input_abs_path is not None:
        command.extend(["--input_file", input_abs_path])

    # General arguments
    add_value("--input_dir", input_dir)
    add_value("--language", language)
    add_existing_file("--allow_list", allow_list)
    add_value("--pii_detector", pii_detector)
    add_value("--username", username)
    add_setting("--save_to_user_folders", save_to_user_folders)
    add_list("--local_redact_entities", local_redact_entities)
    add_list("--aws_redact_entities", aws_redact_entities)
    add_value("--aws_access_key", aws_access_key)
    add_value("--aws_secret_key", aws_secret_key)
    add_value("--cost_code", cost_code)
    add_value("--aws_region", aws_region)
    add_value("--s3_bucket", s3_bucket)
    add_setting("--do_initial_clean", do_initial_clean)
    add_setting("--save_logs_to_csv", save_logs_to_csv)
    add_setting("--save_logs_to_dynamodb", save_logs_to_dynamodb)
    add_setting("--display_file_names_in_logs", display_file_names_in_logs)
    add_setting("--upload_logs_to_s3", upload_logs_to_s3)
    add_value("--s3_logs_prefix", s3_logs_prefix)

    # PDF/Image redaction arguments
    add_value("--ocr_method", ocr_method)
    add_setting("--page_min", page_min)
    add_setting("--page_max", page_max)
    add_setting("--images_dpi", images_dpi)
    add_value("--chosen_local_ocr_model", chosen_local_ocr_model)
    add_setting("--preprocess_local_ocr_images", preprocess_local_ocr_images)
    add_setting("--compress_redacted_pdf", compress_redacted_pdf)
    add_setting("--return_pdf_end_of_redaction", return_pdf_end_of_redaction)
    add_existing_file("--deny_list_file", deny_list_file)
    add_existing_file("--allow_list_file", allow_list_file)
    add_existing_file("--redact_whole_page_file", redact_whole_page_file)
    add_list("--handwrite_signature_extraction", handwrite_signature_extraction)
    add_switch("--extract_forms", extract_forms)
    add_switch("--extract_tables", extract_tables)
    add_switch("--extract_layout", extract_layout)

    # Word/Tabular anonymisation arguments
    add_value("--anon_strategy", anon_strategy)
    add_list("--text_columns", text_columns)
    add_list("--excel_sheets", excel_sheets)
    add_setting("--fuzzy_mistakes", fuzzy_mistakes)
    add_setting("--match_fuzzy_whole_phrase_bool", match_fuzzy_whole_phrase_bool)

    # Duplicate detection arguments
    add_value("--duplicate_type", duplicate_type)
    add_setting("--similarity_threshold", similarity_threshold)
    add_setting("--min_word_count", min_word_count)
    add_setting("--min_consecutive_pages", min_consecutive_pages)
    add_setting("--greedy_match", greedy_match)
    add_setting("--combine_pages", combine_pages)
    add_setting("--remove_duplicate_rows", remove_duplicate_rows)

    # Textract batch operations arguments
    add_value("--textract_action", textract_action)
    add_value("--job_id", job_id)
    add_switch("--extract_signatures", extract_signatures)
    add_value("--textract_bucket", textract_bucket)
    add_value("--textract_input_prefix", textract_input_prefix)
    add_value("--textract_output_prefix", textract_output_prefix)
    add_value(
        "--s3_textract_document_logs_subfolder", s3_textract_document_logs_subfolder
    )
    add_value(
        "--local_textract_document_logs_subfolder",
        local_textract_document_logs_subfolder,
    )
    add_setting("--poll_interval", poll_interval)
    add_setting("--max_poll_attempts", max_poll_attempts)

    # Echo the command, masking credential values so they never reach logs
    secret_flags = {"--aws_access_key", "--aws_secret_key"}
    printable_args = []
    mask_next = False
    for arg in command:
        printable_args.append("****" if mask_next else str(arg))
        mask_next = arg in secret_flags
    command_str = " ".join(printable_args)
    print(f"Executing command: {command_str}")

    # 3. Execute the command using subprocess
    try:
        # subprocess.run kills and reaps the child itself when the timeout
        # expires, so no process or pipe is left behind.
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            errors="replace",  # undecodable output must not abort the run
            cwd=script_folder,  # Important for relative paths within the script
            timeout=timeout,
        )
        stdout, stderr = completed.stdout, completed.stderr

        print("--- SCRIPT STDOUT ---")
        if stdout:
            print(stdout)
        print("--- SCRIPT STDERR ---")
        if stderr:
            print(stderr)
        print("---------------------")

        # Analyze the output for errors and success indicators
        analysis = analyze_test_output(stdout, stderr)

        # Error patterns in the output take precedence over the exit code
        if analysis["has_errors"]:
            print("❌ Errors detected in output:")
            for i, error_type in enumerate(analysis["error_types"], start=1):
                print(f"   {i}. {error_type}")
            if analysis["error_messages"]:
                print("   Error messages:")
                # Show first 3 error messages
                for msg in analysis["error_messages"][:3]:
                    print(f"     - {msg}")
            return False

        if completed.returncode == 0:
            success_msg = "✅ Script executed successfully."
            if analysis["success_indicators"]:
                indicators = ", ".join(analysis["success_indicators"][:3])
                success_msg += f" (Success indicators: {indicators})"
            print(success_msg)
            return True

        print(f"❌ Command failed with return code {completed.returncode}")
        return False

    except subprocess.TimeoutExpired:
        print(f"❌ Subprocess timed out after {timeout} seconds.")
        return False
    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
        return False


def analyze_test_output(stdout: str, stderr: str) -> dict:
    """
    Analyze test output to provide detailed error information.

    Both streams are scanned case-insensitively for fixed substrings. Each
    matching error description and each offending output line is reported
    once, in first-seen order, even if it matches several patterns.

    Args:
        stdout (str): Standard output from the test (None is treated as empty)
        stderr (str): Standard error from the test (None is treated as empty)

    Returns:
        dict: Analysis results with the keys ``has_errors`` (bool),
        ``error_types``, ``error_messages``, ``success_indicators`` and
        ``warning_indicators`` (lists of str).
    """
    # Join with a newline so the last stdout line and the first stderr line
    # are never fused into one line (or into a spurious pattern match).
    combined_output = "\n".join(part for part in (stdout, stderr) if part)
    lowered_output = combined_output.lower()

    # (stripped line, lower-cased line) pairs, computed once for all patterns
    output_lines = [
        (line.strip(), line.lower()) for line in combined_output.split("\n")
    ]

    analysis = {
        "has_errors": False,
        "error_types": [],
        "error_messages": [],
        "success_indicators": [],
        "warning_indicators": [],
    }

    # Error patterns
    error_patterns = {
        "An error occurred": "General error message",
        "Error:": "Error prefix",
        "Exception:": "Exception occurred",
        "Traceback": "Python traceback",
        "Failed to": "Operation failure",
        "Cannot": "Operation not possible",
        "Unable to": "Operation not possible",
        "KeyError:": "Missing key/dictionary error",
        "AttributeError:": "Missing attribute error",
        "TypeError:": "Type mismatch error",
        "ValueError:": "Invalid value error",
        "FileNotFoundError:": "File not found",
        "ImportError:": "Import failure",
        "ModuleNotFoundError:": "Module not found",
    }

    # Success indicators
    success_patterns = [
        "Successfully",
        "Completed",
        "Finished",
        "Processed",
        "Redacted",
        "Extracted",
    ]

    # Warning indicators (matching is case-insensitive, so "Warning:" and
    # "WARNING:" are always reported together)
    warning_patterns = ["Warning:", "WARNING:", "Deprecated", "DeprecationWarning"]

    # Check for errors
    seen_messages = set()
    for pattern, description in error_patterns.items():
        needle = pattern.lower()
        if needle not in lowered_output:
            continue

        analysis["has_errors"] = True
        # Several patterns share a description; list each description once
        if description not in analysis["error_types"]:
            analysis["error_types"].append(description)

        # Extract the actual error lines; a line such as "ValueError: ..."
        # also matches "Error:", so skip lines that were already recorded.
        for stripped, lowered in output_lines:
            if needle in lowered and stripped not in seen_messages:
                seen_messages.add(stripped)
                analysis["error_messages"].append(stripped)

    # Check for success indicators
    analysis["success_indicators"] = [
        pattern for pattern in success_patterns if pattern.lower() in lowered_output
    ]

    # Check for warnings
    analysis["warning_indicators"] = [
        pattern for pattern in warning_patterns if pattern.lower() in lowered_output
    ]

    return analysis


class TestCLIRedactExamples(unittest.TestCase):
    """Test suite for CLI redaction examples from the epilog."""

    @classmethod
    def setUpClass(cls):
        """Set up test environment before running tests."""
        cls.script_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), "cli_redact.py"
        )
        cls.example_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), "example_data"
        )
        cls.temp_output_dir = tempfile.mkdtemp(prefix="test_output_")

        # Verify script exists
        if not os.path.isfile(cls.script_path):
            raise FileNotFoundError(f"CLI script not found: {cls.script_path}")

        print(f"Test setup complete. Script: {cls.script_path}")
        print(f"Example data directory: {cls.example_data_dir}")
        print(f"Temp output directory: {cls.temp_output_dir}")

        # Debug: Check if example data directory exists and list contents
        if os.path.exists(cls.example_data_dir):
            print("Example data directory exists. Contents:")
            for item in os.listdir(cls.example_data_dir):
                item_path = os.path.join(cls.example_data_dir, item)
                if os.path.isfile(item_path):
                    print(f"  File: {item} ({os.path.getsize(item_path)} bytes)")
                else:
                    print(f"  Directory: {item}")
        else:
            print(f"Example data directory does not exist: {cls.example_data_dir}")

    @classmethod
    def tearDownClass(cls):
        """Clean up test environment after running tests."""
        if os.path.exists(cls.temp_output_dir):
            shutil.rmtree(cls.temp_output_dir)
        print(f"Cleaned up temp directory: {cls.temp_output_dir}")

    def test_pdf_redaction_default_settings(self):
        """Test: Redact a PDF with default settings (local OCR)"""
        print("\n=== Testing PDF redaction with default settings ===")
        input_file = os.path.join(
            self.example_data_dir,
            "example_of_emails_sent_to_a_professor_before_applying.pdf",
        )

        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")

        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
        )

        self.assertTrue(result, "PDF redaction with default settings should succeed")
        print("✅ PDF redaction with default settings passed")

    def test_pdf_text_extraction_only(self):
        """Test: Extract text from a PDF only (i.e. no redaction), using local OCR"""
        print("\n=== Testing PDF text extraction only ===")
        input_file = os.path.join(
            self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
        )
        whole_page_file = os.path.join(
            self.example_data_dir, "partnership_toolkit_redact_some_pages.csv"
        )

        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        if not os.path.isfile(whole_page_file):
            self.skipTest(f"Whole page file not found: {whole_page_file}")

        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            redact_whole_page_file=whole_page_file,
            pii_detector="None",
        )

        self.assertTrue(result, "PDF text extraction should succeed")
        print("✅ PDF text extraction only passed")

    def test_pdf_text_extraction_with_whole_page_redaction(self):
        """Test: Extract text from a PDF only with a whole page redaction list"""
        print("\n=== Testing PDF text extraction with whole page redaction ===")
        input_file = os.path.join(
            self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
        )
        whole_page_file = os.path.join(
            self.example_data_dir, "partnership_toolkit_redact_some_pages.csv"
        )

        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        if not os.path.isfile(whole_page_file):
            self.skipTest(f"Whole page file not found: {whole_page_file}")

        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            redact_whole_page_file=whole_page_file,
            pii_detector="Local",
            local_redact_entities=["CUSTOM"],
        )

        self.assertTrue(
            result, "PDF text extraction with whole page redaction should succeed"
        )
        print("✅ PDF text extraction with whole page redaction passed")

    def test_pdf_redaction_with_allow_list(self):
        """Test: Redact a PDF with allow list (local OCR) and custom list of redaction entities"""
        print("\n=== Testing PDF redaction with allow list ===")
        input_file = os.path.join(
            self.example_data_dir, "graduate-job-example-cover-letter.pdf"
        )
        allow_list_file = os.path.join(
            self.example_data_dir, "test_allow_list_graduate.csv"
        )

        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        if not os.path.isfile(allow_list_file):
            self.skipTest(f"Allow list file not found: {allow_list_file}")

        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            allow_list_file=allow_list_file,
            local_redact_entities=["TITLES", "PERSON", "DATE_TIME"],
        )

        self.assertTrue(result, "PDF redaction with allow list should succeed")
        print("✅ PDF redaction with allow list passed")

    def test_pdf_redaction_limited_pages_with_custom_fuzzy(self):
        """Test: Redact a PDF with limited pages and text extraction method with custom fuzzy matching"""
        print("\n=== Testing PDF redaction with limited pages and fuzzy matching ===")
        input_file = os.path.join(
            self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
        )
        deny_list_file = os.path.join(
            self.example_data_dir,
            "Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv",
        )

        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        if not os.path.isfile(deny_list_file):
            self.skipTest(f"Deny list file not found: {deny_list_file}")

        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            deny_list_file=deny_list_file,
            local_redact_entities=["CUSTOM_FUZZY"],
            page_min=1,
            page_max=3,
            ocr_method="Local text",
            fuzzy_mistakes=3,
        )

        self.assertTrue(
            result, "PDF redaction with limited pages and fuzzy matching should succeed"
        )
        print("✅ PDF redaction with limited pages and fuzzy matching passed")

    def test_pdf_redaction_with_custom_lists(self):
        """Test: Redaction with custom deny list, allow list, and whole page redaction list"""
        print("\n=== Testing PDF redaction with custom lists ===")
        input_file = os.path.join(
            self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
        )
        deny_list_file = os.path.join(
            self.example_data_dir, "partnership_toolkit_redact_custom_deny_list.csv"
        )
        whole_page_file = os.path.join(
            self.example_data_dir, "partnership_toolkit_redact_some_pages.csv"
        )
        allow_list_file = os.path.join(
            self.example_data_dir, "test_allow_list_partnership.csv"
        )

        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")
        if not os.path.isfile(deny_list_file):
            self.skipTest(f"Deny list file not found: {deny_list_file}")
        if not os.path.isfile(whole_page_file):
            self.skipTest(f"Whole page file not found: {whole_page_file}")
        if not os.path.isfile(allow_list_file):
            self.skipTest(f"Allow list file not found: {allow_list_file}")

        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            deny_list_file=deny_list_file,
            redact_whole_page_file=whole_page_file,
            allow_list_file=allow_list_file,
        )

        self.assertTrue(result, "PDF redaction with custom lists should succeed")
        print("✅ PDF redaction with custom lists passed")

    def test_image_redaction(self):
        """Test: Redact an image"""
        print("\n=== Testing image redaction ===")
        input_file = os.path.join(self.example_data_dir, "example_complaint_letter.jpg")

        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")

        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
        )

        self.assertTrue(result, "Image redaction should succeed")
        print("✅ Image redaction passed")

    def test_csv_anonymisation_specific_columns(self):
        """Test: Anonymise csv file with specific columns"""
        print("\n=== Testing CSV anonymisation with specific columns ===")
        input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")

        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")

        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            text_columns=["Case Note", "Client"],
            anon_strategy="replace_redacted",
        )

        self.assertTrue(
            result, "CSV anonymisation with specific columns should succeed"
        )
        print("✅ CSV anonymisation with specific columns passed")

    def test_csv_anonymisation_different_strategy(self):
        """Test: Anonymise csv file with a different strategy (remove text completely)"""
        print("\n=== Testing CSV anonymisation with different strategy ===")
        input_file = os.path.join(self.example_data_dir, "combined_case_notes.csv")

        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")

        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            text_columns=["Case Note", "Client"],
            anon_strategy="redact",
        )

        self.assertTrue(
            result, "CSV anonymisation with different strategy should succeed"
        )
        print("✅ CSV anonymisation with different strategy passed")

    def test_word_document_anonymisation(self):
        """Test: Anonymise a word document"""
        print("\n=== Testing Word document anonymisation ===")
        input_file = os.path.join(
            self.example_data_dir, "Bold minimalist professional cover letter.docx"
        )

        if not os.path.isfile(input_file):
            self.skipTest(f"Example file not found: {input_file}")

        result = run_cli_redact(
            script_path=self.script_path,
            input_file=input_file,
            output_dir=self.temp_output_dir,
            anon_strategy="replace_redacted",
        )

        self.assertTrue(result, "Word document anonymisation should succeed")
        print("✅ Word document anonymisation passed")

    def test_aws_textract_comprehend_redaction(self):
        """Test: Use Textract and Comprehend for redaction

        Invokes the CLI with "AWS Textract" as the OCR method and
        "AWS Comprehend" as the PII detector. The CLI's return value is not
        asserted on; the test only reports that the call completed. The test
        is skipped when the example PDF is not present.
        """
        print("\n=== Testing AWS Textract and Comprehend redaction ===")
        pdf_name = "example_of_emails_sent_to_a_professor_before_applying.pdf"
        pdf_path = os.path.join(self.example_data_dir, pdf_name)

        if not os.path.isfile(pdf_path):
            self.skipTest(f"Example file not found: {pdf_path}")

        # No credential check is performed here. The result is deliberately
        # discarded so that an environment without AWS access does not turn
        # a False return value into a test failure.
        aws_kwargs = {
            "script_path": self.script_path,
            "input_file": pdf_path,
            "output_dir": self.temp_output_dir,
            "ocr_method": "AWS Textract",
            "pii_detector": "AWS Comprehend",
        }
        run_cli_redact(**aws_kwargs)

        # Reaching this line is the only success criterion for this test.
        print("✅ AWS Textract and Comprehend redaction test completed")

    def test_aws_textract_signature_extraction(self):
        """Test: Redact specific pages with AWS OCR and signature extraction

        Invokes the CLI on pages 6 to 7 of the partnership agreement PDF with
        "AWS Textract" OCR plus the handwriting and signature extraction
        options. The CLI's return value is not asserted on. The test is
        skipped when the example PDF is not present.
        """
        print("\n=== Testing AWS Textract with signature extraction ===")
        pdf_path = os.path.join(
            self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
        )

        if not os.path.isfile(pdf_path):
            self.skipTest(f"Example file not found: {pdf_path}")

        extraction_options = ["Extract handwriting", "Extract signatures"]

        # No credential check is performed here; the result is deliberately
        # discarded so a False return value does not fail the test.
        aws_kwargs = {
            "script_path": self.script_path,
            "input_file": pdf_path,
            "output_dir": self.temp_output_dir,
            "page_min": 6,
            "page_max": 7,
            "ocr_method": "AWS Textract",
            "handwrite_signature_extraction": extraction_options,
        }
        run_cli_redact(**aws_kwargs)

        # Reaching this line is the only success criterion for this test.
        print("✅ AWS Textract with signature extraction test completed")

    def test_duplicate_pages_detection(self):
        """Test: Find duplicate pages in OCR files

        Runs the CLI "deduplicate" task with ``duplicate_type="pages"`` and a
        similarity threshold of 0.95 against the example OCR output CSV, and
        expects a truthy result. The test is skipped when that CSV is not
        present.
        """
        print("\n=== Testing duplicate pages detection ===")
        ocr_csv_path = os.path.join(
            self.example_data_dir,
            "example_outputs",
            "doubled_output_joined.pdf_ocr_output.csv",
        )

        if not os.path.isfile(ocr_csv_path):
            self.skipTest(f"Example OCR file not found: {ocr_csv_path}")

        dedup_kwargs = {
            "script_path": self.script_path,
            "input_file": ocr_csv_path,
            "output_dir": self.temp_output_dir,
            "task": "deduplicate",
            "duplicate_type": "pages",
            "similarity_threshold": 0.95,
        }
        succeeded = run_cli_redact(**dedup_kwargs)

        self.assertTrue(succeeded, "Duplicate pages detection should succeed")
        print("✅ Duplicate pages detection passed")

    def test_duplicate_line_level_detection(self):
        """Test: Find duplicate in OCR files at the line level

        Same input and task as the page-level duplicate test, but with
        ``combine_pages=False`` and ``min_word_count=3`` passed to the CLI.
        Expects a truthy result. The test is skipped when the example OCR
        CSV is not present.
        """
        print("\n=== Testing duplicate line level detection ===")
        ocr_csv_path = os.path.join(
            self.example_data_dir,
            "example_outputs",
            "doubled_output_joined.pdf_ocr_output.csv",
        )

        if not os.path.isfile(ocr_csv_path):
            self.skipTest(f"Example OCR file not found: {ocr_csv_path}")

        # duplicate_type stays "pages"; combine_pages=False is the argument
        # that differs from the page-level test.
        dedup_kwargs = {
            "script_path": self.script_path,
            "input_file": ocr_csv_path,
            "output_dir": self.temp_output_dir,
            "task": "deduplicate",
            "duplicate_type": "pages",
            "similarity_threshold": 0.95,
            "combine_pages": False,
            "min_word_count": 3,
        }
        succeeded = run_cli_redact(**dedup_kwargs)

        self.assertTrue(succeeded, "Duplicate line level detection should succeed")
        print("✅ Duplicate line level detection passed")

    def test_duplicate_tabular_detection(self):
        """Test: Find duplicate rows in tabular data

        Runs the CLI "deduplicate" task with ``duplicate_type="tabular"`` on
        the "text" column of the example CSV, using a similarity threshold
        of 0.95, and expects a truthy result. The test is skipped when the
        example CSV is not present.
        """
        print("\n=== Testing duplicate tabular detection ===")
        table_path = os.path.join(
            self.example_data_dir, "Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv"
        )

        if not os.path.isfile(table_path):
            self.skipTest(f"Example CSV file not found: {table_path}")

        dedup_kwargs = {
            "script_path": self.script_path,
            "input_file": table_path,
            "output_dir": self.temp_output_dir,
            "task": "deduplicate",
            "duplicate_type": "tabular",
            "text_columns": ["text"],
            "similarity_threshold": 0.95,
        }
        succeeded = run_cli_redact(**dedup_kwargs)

        self.assertTrue(succeeded, "Duplicate tabular detection should succeed")
        print("✅ Duplicate tabular detection passed")

    def test_textract_submit_document(self):
        """Test: Submit document to Textract for basic text analysis

        Calls the CLI "textract" task with the "submit" action on the example
        emails PDF. Any exception raised by the call is printed rather than
        propagated, and the return value is not asserted on. The test is
        skipped when the example PDF is not present.
        """
        print("\n=== Testing Textract document submission ===")
        pdf_name = "example_of_emails_sent_to_a_professor_before_applying.pdf"
        pdf_path = os.path.join(self.example_data_dir, pdf_name)

        if not os.path.isfile(pdf_path):
            self.skipTest(f"Example file not found: {pdf_path}")

        submit_kwargs = {
            "script_path": self.script_path,
            "input_file": pdf_path,
            "output_dir": self.temp_output_dir,
            "task": "textract",
            "textract_action": "submit",
        }

        # Best-effort call: an error is reported on stdout and the test
        # carries on to its completion message.
        try:
            run_cli_redact(**submit_kwargs)
        except Exception as exc:
            print(f"Textract test failed (expected without AWS credentials): {exc}")

        print("✅ Textract document submission test completed")

    def test_textract_submit_with_signatures(self):
        """Test: Submit document to Textract for analysis with signature extraction

        Calls the CLI "textract" task with the "submit" action and
        ``extract_signatures=True`` on the partnership agreement PDF. Any
        exception raised by the call is printed rather than propagated, and
        the return value is not asserted on. The test is skipped when the
        example PDF is not present.
        """
        print("\n=== Testing Textract submission with signature extraction ===")
        pdf_path = os.path.join(
            self.example_data_dir, "Partnership-Agreement-Toolkit_0_0.pdf"
        )

        if not os.path.isfile(pdf_path):
            self.skipTest(f"Example file not found: {pdf_path}")

        submit_kwargs = {
            "script_path": self.script_path,
            "input_file": pdf_path,
            "output_dir": self.temp_output_dir,
            "task": "textract",
            "textract_action": "submit",
            "extract_signatures": True,
        }

        # Best-effort call: an error is reported on stdout and the test
        # carries on to its completion message.
        try:
            run_cli_redact(**submit_kwargs)
        except Exception as exc:
            print(f"Textract test failed (expected without AWS credentials): {exc}")

        print("✅ Textract submission with signature extraction test completed")

    def test_textract_retrieve_results(self):
        """Test: Retrieve Textract results by job ID

        Calls the CLI "textract" task with the "retrieve" action, no input
        file, and a placeholder job ID. Any exception raised by the call is
        printed rather than propagated, and the return value is not asserted
        on.
        """
        print("\n=== Testing Textract results retrieval ===")

        # The job ID below is a dummy value, not one returned by a real
        # submission, so the retrieval itself is not expected to succeed.
        retrieve_kwargs = {
            "script_path": self.script_path,
            "input_file": None,  # the retrieve action is called without a file
            "output_dir": self.temp_output_dir,
            "task": "textract",
            "textract_action": "retrieve",
            "job_id": "12345678-1234-1234-1234-123456789012",
        }

        try:
            run_cli_redact(**retrieve_kwargs)
        except Exception as exc:
            print(f"Textract test failed (expected without AWS credentials): {exc}")

        print("✅ Textract results retrieval test completed")

    def test_textract_list_jobs(self):
        """Test: List recent Textract jobs

        Calls the CLI "textract" task with the "list" action and no input
        file. Any exception raised by the call is printed rather than
        propagated, and the return value is not asserted on.
        """
        print("\n=== Testing Textract jobs listing ===")

        list_kwargs = {
            "script_path": self.script_path,
            "input_file": None,  # the list action is called without a file
            "output_dir": self.temp_output_dir,
            "task": "textract",
            "textract_action": "list",
        }

        # Best-effort call: an error is reported on stdout and the test
        # carries on to its completion message.
        try:
            run_cli_redact(**list_kwargs)
        except Exception as exc:
            print(f"Textract test failed (expected without AWS credentials): {exc}")

        print("✅ Textract jobs listing test completed")


class TestGUIApp(unittest.TestCase):
    """Test suite for GUI application loading and basic functionality.

    The tests import the ``app`` module and ``tools.config`` from the
    directory one level above this file.
    """

    @classmethod
    def setUpClass(cls):
        """Set up test environment for GUI tests.

        Raises:
            FileNotFoundError: If ``app.py`` is not found one directory above
                this file.
        """
        cls.app_path = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), "app.py"
        )

        # Verify app.py exists
        if not os.path.isfile(cls.app_path):
            raise FileNotFoundError(f"App file not found: {cls.app_path}")

        print(f"GUI test setup complete. App: {cls.app_path}")

    @staticmethod
    def _ensure_parent_dir_on_path():
        """Put the directory above this file on ``sys.path`` (once)."""
        parent_dir = os.path.dirname(os.path.dirname(__file__))
        if parent_dir not in sys.path:
            sys.path.insert(0, parent_dir)

    @staticmethod
    def _with_environment_hint(message, error):
        """Return ``message``, extended with a conda hint when relevant.

        The hint is appended only when the text of ``error`` mentions
        ``gradio_image_annotation``.
        """
        if "gradio_image_annotation" in str(error):
            message += "\n\nNOTE: This test requires the 'redaction' conda environment to be activated."
            message += "\nPlease run: conda activate redaction"
            message += "\nThen run this test again."
        return message

    def test_app_import_and_initialization(self):
        """Test: Import app.py and check if the Gradio app object is created successfully."""
        print("\n=== Testing GUI app import and initialization ===")

        self._ensure_parent_dir_on_path()

        # Only the imports are guarded, so that assertion failures further
        # down are reported as-is instead of as "unexpected errors".
        try:
            import app
            import gradio as gr
        except ImportError as e:
            self.fail(
                self._with_environment_hint(f"Failed to import app module: {e}", e)
            )
        except Exception as e:
            self.fail(f"Unexpected error during app initialization: {e}")

        # Check if the app object exists and is a Gradio Blocks object
        self.assertTrue(
            hasattr(app, "blocks"), "App object should exist in the module"
        )
        self.assertIsInstance(
            app.blocks, gr.Blocks, "App should be a Gradio Blocks instance"
        )

        print("✅ GUI app import and initialisation passed")

    def test_app_launch_headless(self):
        """Test: Launch the app in headless mode to verify it starts without errors."""
        print("\n=== Testing GUI app launch in headless mode ===")

        self._ensure_parent_dir_on_path()

        try:
            import app
        except Exception as e:
            self.fail(
                self._with_environment_hint(
                    f"Unexpected error during app launch test: {e}", e
                )
            )

        # Signals that the launch attempt has finished, successfully or not.
        app_launched = threading.Event()
        # The worker thread records its exception here. A list is used so the
        # nested function can hand the error back without rebinding a name
        # in the enclosing scope.
        launch_errors = []

        def launch_app():
            try:
                # NOTE(review): the import test checks ``app.blocks`` while
                # this launches ``app.app`` -- confirm which object is meant.
                app.app.launch(
                    show_error=True,
                    inbrowser=False,  # Don't open browser
                    server_port=0,  # Use any available port
                    quiet=True,  # Suppress output
                    prevent_thread_lock=True,  # Don't block the main thread
                )
            except Exception as e:
                # Keep the error so the test can fail with it; without this
                # a failed launch would be indistinguishable from a success.
                launch_errors.append(e)
            app_launched.set()

        # Start the app in a separate thread
        launch_thread = threading.Thread(target=launch_app)
        launch_thread.daemon = True
        launch_thread.start()

        # Wait for the app to launch (with timeout)
        if not app_launched.wait(timeout=10):  # 10 second timeout
            self.fail("App launch timed out after 10 seconds")

        if launch_errors:
            self.fail(f"App launch failed: {launch_errors[0]}")

        print("✅ GUI app launch in headless mode passed")

    def test_app_configuration_loading(self):
        """Test: Verify that the app can load its configuration without errors."""
        print("\n=== Testing GUI app configuration loading ===")

        self._ensure_parent_dir_on_path()

        # Only ``tools.config`` is imported here; the ``app`` module itself
        # is not needed to read these settings.
        try:
            from tools.config import (
                DEFAULT_LANGUAGE,
                GRADIO_SERVER_PORT,
                MAX_FILE_SIZE,
                PII_DETECTION_MODELS,
            )
        except ImportError as e:
            self.fail(
                self._with_environment_hint(
                    f"Failed to import configuration: {e}", e
                )
            )
        except Exception as e:
            self.fail(
                self._with_environment_hint(
                    f"Unexpected error during configuration test: {e}", e
                )
            )

        # Verify these are not None
        self.assertIsNotNone(
            GRADIO_SERVER_PORT, "GRADIO_SERVER_PORT should be configured"
        )
        self.assertIsNotNone(MAX_FILE_SIZE, "MAX_FILE_SIZE should be configured")
        self.assertIsNotNone(
            DEFAULT_LANGUAGE, "DEFAULT_LANGUAGE should be configured"
        )
        self.assertIsNotNone(
            PII_DETECTION_MODELS, "PII_DETECTION_MODELS should be configured"
        )

        print("✅ GUI app configuration loading passed")


def run_all_tests():
    """Execute the CLI and GUI test cases and print a summary.

    Returns:
        bool: True when the run produced no failures and no errors,
        otherwise False.
    """
    divider = "=" * 80

    banner_lines = (
        divider,
        "DOCUMENT REDACTION TEST SUITE",
        divider,
        "This test suite includes:",
        "- CLI examples from the epilog",
        "- GUI application loading and initialization tests",
        "Tests will be skipped if required example files are not found.",
        "AWS-related tests may fail if credentials are not configured.",
        divider,
    )
    for banner_line in banner_lines:
        print(banner_line)

    # Assemble one suite: CLI cases first, then GUI cases.
    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    for test_case in (TestCLIRedactExamples, TestGUIApp):
        suite.addTests(loader.loadTestsFromTestCase(test_case))

    # Run tests with detailed output
    result = unittest.TextTestRunner(verbosity=2, stream=None).run(suite)

    skipped_count = len(result.skipped) if hasattr(result, "skipped") else 0

    print("\n" + divider)
    print("TEST SUMMARY")
    print(divider)
    print(f"Tests run: {result.testsRun}")
    print(f"Failures: {len(result.failures)}")
    print(f"Errors: {len(result.errors)}")
    print(f"Skipped: {skipped_count}")

    # Each entry pairs a test with its formatted traceback text.
    if result.failures:
        print("\nFAILURES:")
        for failed_test, details in result.failures:
            print(f"- {failed_test}: {details}")

    if result.errors:
        print("\nERRORS:")
        for errored_test, details in result.errors:
            print(f"- {errored_test}: {details}")

    success = not result.failures and not result.errors
    verdict = "✅ PASSED" if success else "❌ FAILED"
    print(f"\nOverall result: {verdict}")
    print(divider)

    return success


if __name__ == "__main__":
    # Run the test suite and report the outcome through the process exit
    # status: 0 when everything passed, 1 otherwise.
    success = run_all_tests()
    # sys.exit is used instead of the bare exit() helper, which is injected
    # by the site module for interactive use and is not always available.
    sys.exit(0 if success else 1)