"""Local proof run: Qwen2.5-0.5B-Instruct through llm-foundry's reflection,
memory compression, retrieval, and benchmark layers."""
from __future__ import annotations

import sys
from pathlib import Path

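# Prepend the repo's src/ to sys.path so the in-repo package imports without installation.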
REPO_SRC = Path('/home/.z/workspaces/con_2gAuSTkawPiOse8J/llm-foundry/src')
if str(REPO_SRC) not in sys.path:
    sys.path.insert(0, str(REPO_SRC))

from llm_foundry.adapters import HuggingFacePipelineBackend
from llm_foundry.benchmarks import BenchmarkCase, BenchmarkSuite
from llm_foundry.memory import CompressionEngine, ObsidianMemoryVault
from llm_foundry.rag import LocalRetriever
from llm_foundry.reasoning import ReflectionEngine

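# Model checkpoint plus filesystem locations: ROOT is the llm-foundry checkout,
# OUT is where the report and the memory vault land.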
MODEL = 'Qwen/Qwen2.5-0.5B-Instruct'
ROOT = Path('/home/.z/workspaces/con_2gAuSTkawPiOse8J/llm-foundry')
OUT = Path('/home/workspace/Deliverables/llm-foundry-small-model')


def main() -> None:
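    """Run the full small-model proof: reflect, compress, retrieve, benchmark, report."""
    # Wire the 0.5B chat model into every layer of the stack.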
    backend = HuggingFacePipelineBackend(MODEL, max_new_tokens=96)
    reflection = ReflectionEngine(backend)
    vault = ObsidianMemoryVault(OUT / 'memory-vault')
    compressor = CompressionEngine(vault=vault)
    retriever = LocalRetriever(ROOT)
    suite = BenchmarkSuite(backend)

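    # 1. Reflection: the engine drafts, critiques its own draft, then finalizes.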
    prompt = 'In one paragraph, explain why semantic retrieval matters in an assistant.'
    result = reflection.answer(prompt)

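    # 2. Memory compression: squeeze a short transcript into a ~180-token context block.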
    transcript = [
        'I need a compact memo about semantic retrieval.',
        'The repository now uses embeddings instead of only keyword search.',
        'Keep the important facts and drop the filler.',
        'The point is to stay useful when wording changes.',
    ]
    context = compressor.compress_transcript(
        task='Write an internal memo about the retrieval layer',
        transcript=transcript,
        memory_query='semantic retrieval embeddings',
        target_tokens=180,
    )

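    # 3. Retrieval: top-3 matches for the query across the repo tree.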
    hits = retriever.search('semantic retrieval embeddings', top_k=3)
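    # 4. Mini benchmark: one exact-match case, one keyword-coverage case.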
    report = suite.run([
        BenchmarkCase(name='exact_blue', prompt='Reply with exactly: blue',
                      expected_exact='blue'),
        BenchmarkCase(name='reasoning_keywords',
                      prompt='Explain why a model should verify its own answer in one sentence.',
                      expected_contains=('verify', 'answer')),
    ])

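    # Render every stage into a single plain-text report.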
    lines: list[str] = []
    lines += ['== LLM Foundry x Qwen2.5-0.5B local proof run ==', f'model={MODEL}',
              'backend=HuggingFacePipelineBackend', '']
    lines += ['$ python -m llm_foundry demo --backend hf --model Qwen/Qwen2.5-0.5B-Instruct '
              '--prompt "In one paragraph, explain why semantic retrieval matters in an assistant."', '']
    lines += ['DRAFT:', result.draft, '', 'CRITIQUE:', result.critique, '', 'FINAL:', result.final, '']
    lines += ['== Memory compression ==', f'before_tokens={context.token_estimate_before}',
              f'after_tokens={context.token_estimate_after}', 'compressed_prompt:']
    lines += context.to_prompt().splitlines()
    lines += ['']
    lines += ['== Retrieval hits ==']
    if hits:
        for hit in hits:
            lines.append(f'{hit.path} | score={hit.score:.3f} | {hit.text[:180]}')
    else:
        lines.append('no hits')
    lines += ['']
    lines += ['== Mini benchmark ==', f'passed={report.passed}/{report.total}',
              f'pass_rate={report.pass_rate:.2%}']
    for item in report.results:
        lines.append(
            f'{item.name}: passed={str(item.passed).lower()} exact={str(item.exact_match).lower()} '
            f'keyword_hits={item.keyword_hits} risk={item.delayed_harm_risk:.3f}'
        )
    lines += ['']
    lines += ['== Why this matters ==',
              'This is the scaffolding around the model that turns a chat toy into '
              'something that can remember, recover context, and be tested.']
    lines += ['']
    lines += ['GitHub: https://github.com/AmSach/llm-foundry',
              'GitHub profile: https://github.com/AmSach',
              'Instagram: https://www.instagram.com/i.amsach',
              'LinkedIn: https://www.linkedin.com/in/theamansachan']

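    # Write the report under OUT and print its path.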
    OUT.mkdir(parents=True, exist_ok=True)
    (OUT / 'demo_output.txt').write_text('\n'.join(lines) + '\n', encoding='utf-8')
    print(OUT / 'demo_output.txt')


if __name__ == '__main__':
    main()
