"""Chunker unit tests — covers metadata, title-boost, degenerate paths."""

from types import SimpleNamespace

from app.services.kb.chunker import chunk_article, chunk_articles


def _mk(**kw):
    """Build a stand-in article object for the chunker tests.

    Every attribute has a default; pass keyword arguments to override
    individual ones. Keywords that do not name a known attribute are ignored.
    """
    # Rebuilt on every call so the default ``tags`` list is never shared.
    defaults = {
        "article_id": "voi",
        "tenant_id": "global",
        "title": "Voi",
        "body_md": "",
        "tags": [],
        "age_min": None,
        "age_max": None,
        "language": "vi",
        "version": 1,
    }
    attrs = {name: kw.get(name, fallback) for name, fallback in defaults.items()}
    return SimpleNamespace(**attrs)


def test_empty_body_returns_no_chunks():
    """An empty or whitespace-only body must produce an empty chunk list."""
    for blank_body in ("", "   \n   "):
        article = _mk(body_md=blank_body)
        assert chunk_article(article) == []


def test_title_boost_prepended_on_every_chunk():
    """Every chunk of a titled article starts with the title-boost prefix."""
    filler = "Voi to lắm. " * 100
    chunks = chunk_article(_mk(body_md="# Voi\n\n" + filler))
    assert chunks
    for chunk in chunks:
        text = chunk.page_content
        # Show the head of the offending chunk when the prefix is missing.
        assert text.startswith("Tiêu đề: Voi"), text[:50]


def test_title_boost_omitted_when_title_empty():
    """With an empty title, no chunk may start with the title-boost prefix."""
    # The repetition applies to the whole literal, heading included, so the
    # body is thirty back-to-back copies of "# Section\n\nbody text ".
    body = "# Section\n\nbody text " * 30
    chunks = chunk_article(_mk(title="", body_md=body))
    assert chunks
    assert not any(c.page_content.startswith("Tiêu đề:") for c in chunks)


def test_metadata_includes_required_fields():
    """The first chunk's metadata has all required keys and the article's values.

    Checks key presence first (with a diagnostic listing the keys that are
    present), then the concrete values propagated from the article: ids,
    tags and the age bounds.
    """
    art = _mk(
        body_md="# H1\n\n## H2\n\n" + ("voi ăn lá cây. " * 80),
        tags=["dong-vat"],
        age_min=4,
        age_max=10,
    )
    docs = chunk_article(art)
    assert docs
    m = docs[0].metadata
    for key in (
        "source_id",
        "article_id",
        "tenant_id",
        "title",
        "tags",
        "language",
        "article_version",
        "chunk_idx",
    ):
        assert key in m, f"metadata missing key {key}: {sorted(m.keys())}"
    assert m["source_id"] == m["article_id"] == "voi"
    assert m["tags"] == ["dong-vat"]
    # The age keys are not covered by the presence loop above, so read them
    # with .get(): a missing key then fails as an assertion naming the bound
    # instead of surfacing as a bare KeyError. One assert per bound so the
    # failure says which one is wrong.
    assert m.get("age_min") == 4, f"age_min not propagated: {m.get('age_min')!r}"
    assert m.get("age_max") == 10, f"age_max not propagated: {m.get('age_max')!r}"


def test_chunk_idx_is_monotonic():
    """``chunk_idx`` values run 0, 1, 2, ... in the order chunks are returned."""
    art = _mk(body_md="# Voi\n\n" + ("Voi sống ở rừng. " * 200))
    docs = chunk_article(art)
    # Without this guard an empty result would pass vacuously, because
    # [] == list(range(0)).
    assert docs
    indices = [d.metadata["chunk_idx"] for d in docs]
    assert indices == list(range(len(docs)))


def test_chunk_articles_flattens():
    """``chunk_articles`` returns chunks from every input article."""
    # The repetition applies to each whole literal, heading included.
    articles = [
        _mk(article_id="a", body_md="# A\n\nbody " * 30),
        _mk(article_id="b", body_md="# B\n\nbody " * 30),
    ]
    chunks = chunk_articles(articles)
    seen_ids = set()
    for chunk in chunks:
        seen_ids.add(chunk.metadata["article_id"])
    assert seen_ids == {"a", "b"}
