#!/usr/bin/env python3
"""
Simple PDF reader to test pypdf behavior for vulnerability CVE-2026-24688.

Usage:
    python simple_read_pdf.py <path_to_pdf>

This will attempt to:
1. Open the PDF
2. Access basic metadata
3. Access outline/bookmarks (THIS is where circular ref vulnerability triggers)
4. Extract text from first page

⚠️ WARNING: If the PDF has circular outline references, this will hang!
"""

import sys
import time
from pathlib import Path

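# Put this script's directory first on sys.path so that a pypdf package
# located next to it takes precedence when `pypdf` is imported below.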
sys.path.insert(0, str(Path(__file__).parent))

from pypdf import PdfReader


def read_pdf(pdf_path: str) -> int:
    """
    Read a PDF with pypdf and print a four-step report to stdout.

    The steps are: open the file, print basic metadata, read the
    outline/bookmarks, and preview the text of the first page. Metadata and
    text-extraction failures are reported as warnings and do not stop the
    run; a failure to open the file or to read the outline ends it.

    Args:
        pdf_path: Path to PDF file

    Returns:
        0 when the file was opened and the outline was read, 1 when the
        path is missing or not a regular file, the file cannot be opened,
        or reading the outline fails or is interrupted with Ctrl+C.
    """
    path = Path(pdf_path)

    if not path.exists():
        print(f"❌ Error: File not found: {path}")
        return 1
    if not path.is_file():
        # A directory (or other non-regular path) passes exists(); reject it
        # here with a clear message instead of handing it to PdfReader.
        print(f"❌ Error: Not a file: {path}")
        return 1

    rule = "=" * 70
    print(rule)
    print(f"📄 Reading PDF: {path.name}")
    print(rule)
    print()

    reader = _open_reader(path)
    if reader is None:
        return 1
    print()

    _print_metadata(reader)
    print()

    if not _print_outline(reader):
        return 1
    print()

    _print_first_page_text(reader)
    print()
    print(rule)
    print("✅ PDF read successfully - No vulnerabilities detected")
    print(rule)

    return 0


# How many top-level outline entries / leading characters of text to preview.
_OUTLINE_PREVIEW_COUNT = 5
_TEXT_PREVIEW_CHARS = 200


def _open_reader(path: Path) -> "PdfReader | None":
    """Step 1: open *path* with PdfReader; return the reader, or None on failure."""
    print("Step 1: Opening PDF...")
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    try:
        # Keep the try body to the one call that is expected to fail.
        reader = PdfReader(str(path))
    except Exception as e:
        print(f"❌ Failed to open: {e}")
        return None
    elapsed = time.perf_counter() - start
    print(f"✅ Opened successfully ({elapsed:.3f}s)")
    return reader


def _print_metadata(reader: "PdfReader") -> None:
    """Step 2: print document info, page count and encryption flag (best effort)."""
    print("Step 2: Reading metadata...")
    try:
        metadata = reader.metadata
        if metadata:
            for label in ("Title", "Author", "Subject", "Creator"):
                print(f"  {label}: {metadata.get('/' + label, 'N/A')}")
        else:
            print("  No metadata found")
        print(f"  Pages: {len(reader.pages)}")
        print(f"  Encrypted: {reader.is_encrypted}")
    except Exception as e:
        # Metadata is informational only; report the problem and carry on.
        print(f"⚠️  Warning: {e}")


def _print_outline(reader: "PdfReader") -> bool:
    """Step 3: read and preview the outline; return False if it could not be read."""
    print("Step 3: Reading outline/bookmarks...")
    print("⚠️  THIS IS WHERE CIRCULAR REFERENCE VULNERABILITY TRIGGERS!")
    print("⏳  If this hangs, you'll need to Ctrl+C to kill it...")
    print()

    start = time.perf_counter()
    try:
        # Accessing the outline is the operation this script exists to
        # exercise: it is the call expected to hang or recurse on a crafted
        # file, hence the dedicated handlers below.
        outline = reader.outline
        elapsed = time.perf_counter() - start

        if not outline:
            print(f"✅ No outline/bookmarks ({elapsed:.3f}s)")
            return True

        print(f"✅ Outline read successfully ({elapsed:.3f}s)")
        print(f"  Bookmark count: {len(outline)}")

        # Show first few bookmarks
        print("\n  First few bookmarks:")
        for number, item in enumerate(outline[:_OUTLINE_PREVIEW_COUNT], start=1):
            if isinstance(item, list):
                # A list entry holds the children of the preceding bookmark.
                print(f"    [{number}] (nested outline)")
            else:
                title = item.get('/Title', 'Untitled')
                print(f"    [{number}] {title}")

        if len(outline) > _OUTLINE_PREVIEW_COUNT:
            print(f"    ... and {len(outline) - _OUTLINE_PREVIEW_COUNT} more")
        return True

    except KeyboardInterrupt:
        print()
        print()
        print("=" * 70)
        print("❌ KILLED BY USER (Ctrl+C)")
        print("=" * 70)
        print()
        print("🔥 This PDF has CIRCULAR OUTLINE REFERENCES!")
        print("   The code was stuck in an infinite loop.")
        print()
        print("This demonstrates the vulnerability:")
        print("  Location: pypdf/_doc_common.py:858-873")
        print("  Issue: No cycle detection in outline traversal")
        print("  Impact: Denial of Service (infinite loop)")
        print()
        return False

    except RecursionError as e:
        # Must precede the generic handler: RecursionError is an Exception.
        elapsed = time.perf_counter() - start
        print(f"❌ RecursionError after {elapsed:.3f}s: {e}")
        print()
        print("🔥 This PDF has NESTED CIRCULAR REFERENCES!")
        print("   The code exceeded Python's recursion limit.")
        print()
        return False

    except Exception as e:
        elapsed = time.perf_counter() - start
        print(f"⚠️  Error after {elapsed:.3f}s: {e}")
        return False


def _print_first_page_text(reader: "PdfReader") -> None:
    """Step 4: print a short preview of the first page's text (best effort)."""
    print("Step 4: Extracting text from first page...")
    try:
        if len(reader.pages) == 0:
            print("  No pages in PDF")
            return

        text = reader.pages[0].extract_text()
        if not text:
            print("  No text found on first page")
            return

        print(f"✅ Text extracted ({len(text)} characters)")
        print(f"\n  First {_TEXT_PREVIEW_CHARS} characters:")
        print("  " + "-" * 66)
        # Indent continuation lines so the preview lines up under the header.
        preview = text[:_TEXT_PREVIEW_CHARS].replace('\n', '\n  ')
        print(f"  {preview}")
        if len(text) > _TEXT_PREVIEW_CHARS:
            print("  ...")
        print("  " + "-" * 66)
    except Exception as e:
        # Text extraction is informational only; report and carry on.
        print(f"⚠️  Warning: {e}")


def main():
    """
    Command-line entry point.

    With at least one argument, the first one is passed to read_pdf() and
    its result is returned. With no arguments, the usage text is printed
    and 1 is returned.
    """
    if len(sys.argv) >= 2:
        # Only the first argument is used; any further ones are ignored.
        return read_pdf(sys.argv[1])

    usage_lines = (
        "Simple PDF Reader (pypdf)",
        "",
        "Usage:",
        "  python simple_read_pdf.py <path_to_pdf>",
        "",
        "Examples:",
        "  python simple_read_pdf.py document.pdf",
        "  python simple_read_pdf.py /path/to/file.pdf",
        "",
        "To test the circular reference vulnerability:",
        "  python simple_read_pdf.py malicious_circular_outline.pdf",
        "",
        "⚠️  WARNING: Malicious PDFs will hang! Use Ctrl+C to kill.",
        "",
    )
    for line in usage_lines:
        print(line)
    return 1


if __name__ == "__main__":
    # Run the CLI and turn a Ctrl+C that reaches this level into a short
    # banner and exit status 1.
    try:
        exit_status = main()
    except KeyboardInterrupt:
        separator = "=" * 70
        for line in ("", "", separator, "⚠️  INTERRUPTED BY USER", separator):
            print(line)
        exit_status = 1
    sys.exit(exit_status)
