from __future__ import annotations

import json
import resource
import sys
import textwrap
from pathlib import Path

# Repo layout: local llm-foundry checkout and its src/ package root.
ROOT = Path('/home/.z/workspaces/con_2gAuSTkawPiOse8J/llm-foundry')
SRC = ROOT / 'src'
# Destination directory for the generated report/transcript/JSON deliverables.
OUT = Path('/home/workspace/Deliverables/qwen-sky-proof')
# Local instruct model and generation cap applied to both runs.
MODEL = 'Qwen/Qwen2.5-0.5B-Instruct'
MAX_NEW = 64

# Make the in-repo llm_foundry package importable without installing it.
sys.path.insert(0, str(SRC))

from llm_foundry.adapters import HuggingFacePipelineBackend
from llm_foundry.memory import CompressionEngine, ObsidianMemoryVault
from llm_foundry.tokenizer import estimate_token_count

def rss_mb():
    """Return this process's peak resident set size, scaled to MB.

    NOTE(review): ``ru_maxrss`` is reported in kilobytes on Linux but in
    bytes on macOS — the ``/ 1024`` scaling assumes Linux; confirm the
    target platform.
    """
    usage = resource.getrusage(resource.RUSAGE_SELF)
    return usage.ru_maxrss / 1024.0

def run_prompt(backend, prompt):
    """Generate a completion for *prompt* on *backend* and time it.

    Args:
        backend: any object exposing ``generate(prompt) -> str``.
        prompt: the full prompt text to send.

    Returns:
        ``(output, elapsed_ms)`` — the backend's output and wall-clock
        latency in milliseconds (``time.perf_counter`` based).

    Fix: the original relied on a module-level ``import time`` that
    appears *after* this definition in the file; importing locally makes
    the function self-contained and immune to import reordering.
    """
    import time

    start = time.perf_counter()
    output = backend.generate(prompt)
    elapsed_ms = (time.perf_counter() - start) * 1000
    return output, elapsed_ms

import time

def esc(t):
    """HTML-escape *t* (None becomes ''); '&' is replaced first so the
    entities introduced by later replacements are not double-escaped."""
    text = t or ''
    for char, entity in (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    ):
        text = text.replace(char, entity)
    return text

def make_report():
    """Run one task twice on a local model — bare vs. memory-augmented —
    and write ``report.html``, ``terminal_transcript.txt`` and
    ``comparison.json`` into ``OUT``.

    Fixes vs. original: two SyntaxErrors inside the HTML f-string (an
    assignment expression and mismatched braces in the two delta rows),
    ``mkdir`` crashing on re-run, a missing closing paren in the CSS
    ``linear-gradient``, a possible division by zero on an empty prompt,
    and missing ``encoding=`` on two ``write_text`` calls.
    """
    # exist_ok=True so re-running the script doesn't crash on the existing dir.
    OUT.mkdir(parents=True, exist_ok=True)
    backend = HuggingFacePipelineBackend(MODEL, max_new_tokens=MAX_NEW)

    # Memory vault seeded with facts the model can use
    vault = ObsidianMemoryVault(OUT / 'memory-vault')
    vault.add_note('sky-scattering', 'Short wavelengths scatter more than long ones. Blue/violet scatter first, leaving red/orange at sunset.', tags=['science','physics'])
    vault.add_note('sunset-color', 'At sunset light travels longer through atmosphere, scattering blue away, leaving red/orange hues.', tags=['science','sunset'])
    vault.add_note('day-sky', 'During day we see scattered blue light overhead. Short waves scatter in all directions.', tags=['science','sky'])

    compressor = CompressionEngine(vault=vault)

    # Both get SAME question+task. After gets relevant memory injected.
    TASK = 'Answer: why is the sky blue during the day and red at sunset? Use 3 bullet points. Plain English.'
    NO_CONTEXT = TASK
    CONTEXT = compressor.compress_transcript(
        task=TASK,
        transcript=[
            'The sky changes colour because of how sunlight interacts with the atmosphere.',
            'Shorter wavelengths scatter first, making the day sky blue.',
            'At sunset light passes through more atmosphere, scattering blue away, leaving red.',
        ],
        memory_query='sky blue red sunset scattering atmosphere',
        target_tokens=60,
    )
    WITH_CONTEXT = f"{TASK}\n\nCONTEXT:\n{CONTEXT.to_prompt()}"

    no_ctx_out, no_ctx_ms = run_prompt(backend, NO_CONTEXT)
    with_ctx_out, with_ctx_ms = run_prompt(backend, WITH_CONTEXT)

    no_ctx_toks = estimate_token_count(NO_CONTEXT)
    ctx_toks = estimate_token_count(WITH_CONTEXT)
    peak = rss_mb()

    # Token delta; guard the (theoretical) zero-token baseline to avoid
    # a ZeroDivisionError. Positive == the after-prompt is smaller.
    tok_delta_pct = 100.0 * (1 - ctx_toks / no_ctx_toks) if no_ctx_toks else 0.0
    ms_delta = with_ctx_ms - no_ctx_ms

    transcript = textwrap.dedent(f"""\
        == LLM Foundry local model proof ==
        model={MODEL}
        max_new_tokens={MAX_NEW}
        before_tokens={no_ctx_toks}  after_tokens={ctx_toks}
        before_latency_ms={no_ctx_ms:.1f}  after_latency_ms={with_ctx_ms:.1f}

        TASK (identical for both):
        {TASK}

        BEFORE (task only, no memory):
        {NO_CONTEXT}

        BEFORE OUTPUT:
        {no_ctx_out}

        AFTER (task + compressed memory context):
        {WITH_CONTEXT}

        AFTER OUTPUT:
        {with_ctx_out}

        DELTA:
        token_delta_pct={tok_delta_pct:.1f}%  (saved on context)
        latency_delta_ms={ms_delta:.1f}  (higher because context is larger)
        peak_rss_mb={peak:.0f}
    """)

    def section(title, pre_text, caption=''):
        # One titled <pre> panel; all model/user text is HTML-escaped via esc().
        cap = f'<p class="cap">{esc(caption)}</p>' if caption else ''
        return (
            f'<section class="panel">'
            f'<h3>{esc(title)}</h3>{cap}'
            f'<pre>{esc(pre_text or "(no output)")}</pre>'
            f'</section>'
        )

    def metric(val, label, note=''):
        # One big-number metric card with optional footnote.
        note_html = f'<div class="note">{esc(note)}</div>' if note else ''
        return (
            f'<div class="metric">'
            f'<div class="val">{esc(val)}</div>'
            f'<div class="lbl">{esc(label)}</div>'
            f'{note_html}'
            f'</div>'
        )

    def row(a_lbl, a_val, b_lbl, b_val, diff, diff_note):
        # Comparison table row; diff > 0 renders green/up, otherwise red/down.
        cls = 'pos' if diff > 0 else 'neg'
        arrow = '&#8593;' if diff > 0 else '&#8595;'
        return (
            f'<tr>'
            f'<td>{esc(a_lbl)}</td><td class="val">{esc(a_val)}</td>'
            f'<td>{esc(b_lbl)}</td><td class="val">{esc(b_val)}</td>'
            f'<td class="{cls}">{arrow} {diff_note}</td>'
            f'</tr>'
        )

    html = f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width,initial-scale=1"/>
<title>LLM Foundry local model proof</title>
<style>
:root{{color-scheme:dark;}}
body{{margin:0;background:#060c18;color:#d8e4ff;font-family:Inter,system-ui,sans-serif;}}
.wrap{{max-width:1380px;margin:0 auto;padding:24px;}}
.hero,section{{background:linear-gradient(135deg,rgba(255,255,255,.05),rgba(255,255,255,.02));border:1px solid rgba(124,156,255,.1);border-radius:20px;padding:22px;margin:14px 0;}}
h1{{font-size:clamp(1.8rem,4.5vw,3.2rem);line-height:1.05;margin:0 0 8px;}}
.sub{{color:#7a90c4;margin:0 0 14px;font-size:1.05rem;}}
.chips{{display:flex;flex-wrap:wrap;gap:8px;}}
.chip{{padding:5px 12px;border-radius:999px;background:rgba(124,156,255,.12);border:1px solid rgba(124,156,255,.2);font-size:.88rem;}}
.mgrid{{display:grid;grid-template-columns:repeat(3,1fr);gap:14px;margin:14px 0;}}
.metric{{background:#0c1830;border:1px solid rgba(255,255,255,.07);border-radius:16px;padding:16px 14px;}}
.val{{font-size:2.1rem;font-weight:800;color:#86efac;line-height:1;}}
.lbl{{color:#8a9bca;margin-top:6px;font-size:.92rem;}}
.note{{color:#5a6b94;font-size:.82rem;margin-top:4px;}}
.compare{{display:grid;grid-template-columns:1fr 1fr;gap:18px;margin:14px 0;}}
.col{{display:flex;flex-direction:column;gap:14px;}}
.panel{{background:#0b1628;border:1px solid rgba(255,255,255,.08);border-radius:16px;padding:16px 18px;}}
.panel h3{{margin:0 0 10px;font-size:.88rem;text-transform:uppercase;letter-spacing:.1em;color:#86efac;}}
.cap{{color:#6070a0;font-size:.85rem;margin:4px 0 0;}}
pre{{background:#08101e;border:1px solid rgba(255,255,255,.06);border-radius:12px;padding:14px;margin:0;white-space:pre-wrap;word-break:break-word;font-size:.94rem;line-height:1.6;max-height:260px;overflow:auto;}}
.table{{background:#0b1628;border:1px solid rgba(255,255,255,.08);border-radius:16px;padding:18px 20px;margin:14px 0;}}
.table h3{{margin:0 0 12px;font-size:.88rem;text-transform:uppercase;letter-spacing:.1em;color:#86efac;}}
table{{width:100%;border-collapse:collapse;}}
th{{text-align:left;padding:8px 10px;border-bottom:1px solid rgba(255,255,255,.07);font-size:.78rem;text-transform:uppercase;letter-spacing:.1em;color:#86efac;}}
td{{padding:9px 10px;border-bottom:1px solid rgba(255,255,255,.04);font-size:.95rem;}}
.pos{{color:#86efac;}} .neg{{color:#f38ba8;}}
.footer a{{color:#8ab4ff;}}
@media(max-width:700px){{.compare,.mgrid{{grid-template-columns:1fr;}}}}
</style>
</head>
<body>
<div class="wrap">
  <div class="hero">
    <h1>Same local model. Same task. Real before vs after memory.</h1>
    <p class="sub">Qwen2.5-0.5B-Instruct through LLM Foundry's memory layer. No API calls. No cloud. 100% local.</p>
    <div class="chips">
      <span class="chip">model={MODEL}</span>
      <span class="chip">HuggingFacePipelineBackend</span>
      <span class="chip">Local memory vault + embeddings</span>
      <span class="chip">0 cloud APIs</span>
    </div>
  </div>

  <div class="mgrid">
    {metric(f'{no_ctx_ms:.0f} ms', 'Before latency', 'raw task, no memory')}
    {metric(f'{with_ctx_ms:.0f} ms', 'After latency', '+ memory context')}
    {metric(f'{peak:.0f} MB', 'Peak RAM', 'both runs')}
  </div>

  <div class="compare">
    <div class="col">
      {section('BEFORE — task only', NO_CONTEXT, 'no memory, no compression')}
      {section('BEFORE output', no_ctx_out or '(no output)')}
    </div>
    <div class="col">
      {section('AFTER — task + memory context', WITH_CONTEXT, '+ compressed memory context from vault')}
      {section('AFTER output', with_ctx_out or '(no output)')}
    </div>
  </div>

  <div class="table">
    <h3>Delta</h3>
    <table>
      <thead><tr><th>Metric</th><th>Before</th><th>After</th><th>Change</th></tr></thead>
      <tbody>
        {row('Latency', f'{no_ctx_ms:.0f} ms', 'Latency', f'{with_ctx_ms:.0f} ms', ms_delta, f'{"faster" if ms_delta < 0 else "slower"} by {abs(ms_delta):.0f} ms')}
        {row('Context tokens', f'{no_ctx_toks}', 'Context tokens', f'{ctx_toks}', -tok_delta_pct, f'{"saved" if tok_delta_pct > 0 else "added"} {abs(tok_delta_pct):.0f}% via compression')}
        {row('Peak RAM', f'{peak:.0f} MB', '—', '—', 0, '')}
      </tbody>
    </table>
  </div>

  <div class="footer" style="background:#0b1628;border:1px solid rgba(124,156,255,.1);border-radius:16px;padding:18px 20px;margin:14px 0;">
    <p style="margin:0 0 8px;color:#7a90c4;font-size:.92rem;">
      model={MODEL} &middot; backend=HuggingFacePipelineBackend &middot; max_new_tokens={MAX_NEW} &middot; 100% local
    </p>
    <p style="margin:0;color:#8ab4ff;font-size:.9rem;">
      <a href="https://github.com/AmSach/llm-foundry">GitHub: AmSach/llm-foundry</a> &middot;
      <a href="https://www.instagram.com/i.amsach">Instagram @i.amsach</a> &middot;
      <a href="https://www.linkedin.com/in/theamansachan">LinkedIn TheAmanSachan</a>
    </p>
  </div>
</div>
</body>
</html>"""

    # Write all three artifacts with explicit UTF-8 (was inconsistent before).
    (OUT / 'report.html').write_text(html, encoding='utf-8')
    (OUT / 'terminal_transcript.txt').write_text(transcript, encoding='utf-8')
    (OUT / 'comparison.json').write_text(json.dumps({
        'model': MODEL,
        'before': {'prompt': NO_CONTEXT, 'output': no_ctx_out, 'latency_ms': no_ctx_ms, 'tokens': no_ctx_toks},
        'after': {'prompt': WITH_CONTEXT, 'output': with_ctx_out, 'latency_ms': with_ctx_ms, 'tokens': ctx_toks},
        'tok_delta_pct': tok_delta_pct,
        'latency_delta_ms': ms_delta,
        'peak_rss_mb': peak,
    }, indent=2), encoding='utf-8')
    print(f"DONE tok_delta_pct={tok_delta_pct:.1f} ms_delta={ms_delta:.1f}")
    print(f"BEFORE:\n{no_ctx_out}\n")
    print(f"AFTER:\n{with_ctx_out}\n")

# Script entry point — build the full before/after report when run directly.
if __name__ == '__main__':
    make_report()
