"""Source-normalizer route tests (markdown passthrough, text wrap, scanned PDF)."""

import pytest

from app.services.kb.pdf_extractor import ScannedPDFError, detect_scanned
from app.services.kb.source_normalizer import normalize_markdown, normalize_text


def test_normalize_markdown_strips_trailing_whitespace():
    """Markdown input comes back with trailing whitespace trimmed and no metadata."""
    # Trailing spaces after the sentence plus a surplus blank line at the end.
    raw_markdown = "# Voi\n\nVoi to.   \n\n"

    normalized, metadata = normalize_markdown(raw_markdown)

    # Exactly one terminating newline is expected to remain.
    assert normalized == "# Voi\n\nVoi to.\n"
    assert metadata == {}


def test_normalize_text_wraps_under_h1():
    """Plain text is wrapped under an H1 heading built from the given title."""
    title = "Voi"
    plain_text = "Voi là động vật to."

    wrapped, metadata = normalize_text(plain_text, title)

    # Heading first, followed by a blank line, then the original text somewhere after.
    assert wrapped.startswith("# Voi\n\n")
    assert plain_text in wrapped
    # The title is echoed back to the caller as a suggestion.
    assert metadata == {"suggested_title": "Voi"}


def test_normalize_text_default_title_when_blank():
    """An empty title falls back to an "Untitled" H1 heading."""
    blank_title = ""

    # Metadata is not under test here; only the rendered heading matters.
    wrapped, _ = normalize_text("body", blank_title)

    assert wrapped.startswith("# Untitled")


def test_detect_scanned_heuristic():
    """Check the scanned-PDF heuristic against the three baseline scenarios."""
    # (extracted body, page count, expected verdict)
    scenarios = (
        ("", 5, True),            # empty body + pages → likely scanned
        ("# x\nbody", 5, False),  # body present → not scanned
        ("", 0, False),           # 0 pages → not scanned (no info)
    )

    for extracted_body, page_count, expected in scenarios:
        # Identity check: the helper must return a real bool, not just a truthy value.
        assert detect_scanned(extracted_body, page_count) is expected


def test_scanned_pdf_error_class_hierarchy():
    """ScannedPDFError must remain catchable via a broad ``except ValueError``."""
    # Callers rely on the ValueError base to handle this error broadly.
    derives_from_value_error = issubclass(ScannedPDFError, ValueError)

    assert derives_from_value_error
