#!/usr/bin/env bun

import { execSync } from "child_process";
import { mkdirSync, writeFileSync, utimesSync, existsSync } from "fs";
import { join } from "path";

// Page size for the archive listing: sent as the `limit` query parameter and
// also used as the offset increment when paginating.
const BATCH_SIZE = 12; // Substack API max per request
// Pause, in milliseconds, passed to Bun.sleep between successive API requests.
const DELAY_MS = 300;

/**
 * Prints the command-line help text to stdout and then terminates the
 * process with exit status 1.
 */
function usage() {
  const helpLines = [
    "Usage: bun run archive.ts <substack-url-or-subdomain> <output-dir> [--since YYYY-MM-DD] [--free-only]",
    "",
    "Arguments:",
    "  substack-url-or-subdomain   Full URL or domain (e.g. newsletter.squishy.computer)",
    "  output-dir                  Directory to save markdown files",
    "  --since YYYY-MM-DD          Only fetch posts on or after this date (default: all)",
    "  --free-only                 Skip paid/subscriber-only posts",
    "",
    "Examples:",
    "  bun run archive.ts https://newsletter.squishy.computer Articles/brander",
    "  bun run archive.ts stratechery.com Articles/stratechery --since 2023-01-01",
    "  bun run archive.ts contraptions.venkateshrao.com Articles/vgr --free-only",
  ];

  console.log(helpLines.join("\n"));
  process.exit(1);
}

/**
 * Parses `process.argv` into the settings used by the archiver.
 *
 * Positional arguments: the publication URL (or bare domain) and the output
 * directory. Optional flags: `--since YYYY-MM-DD` and `--free-only`.
 * `--help` / `-h` anywhere on the command line prints the usage text.
 * Unrecognised trailing arguments are ignored.
 *
 * On missing positionals or an invalid/missing `--since` value this calls
 * `usage()`, which prints the help text and exits the process.
 *
 * @returns `baseUrl` with an explicit scheme and no trailing slash,
 *   `outputDir` as given, `since` as a `YYYY-MM-DD` string or `null`, and
 *   the `freeOnly` flag.
 */
function parseArgs() {
  const args = process.argv.slice(2);

  // usage() exits the process; the throw only exists so the compiler (and a
  // reader) can see that execution never continues past this helper.
  const exitWithUsage = (): never => {
    usage();
    throw new Error("unreachable: usage() terminates the process");
  };

  if (args.includes("--help") || args.includes("-h")) exitWithUsage();

  const [target, outputDir, ...flags] = args;
  if (target === undefined || outputDir === undefined) {
    return exitWithUsage();
  }

  let since: string | null = null;
  let freeOnly = false;

  for (let i = 0; i < flags.length; i++) {
    const flag = flags[i];
    if (flag === "--since") {
      const value = flags[++i];
      // The value is later compared lexicographically against ISO timestamps,
      // so anything other than a zero-padded YYYY-MM-DD date would filter
      // incorrectly. This also rejects a following flag being taken as the date.
      const isValidDate =
        value !== undefined && /^\d{4}-\d{2}-\d{2}$/.test(value) && !Number.isNaN(Date.parse(value));
      if (!isValidDate) {
        const shown = value === undefined ? "nothing" : JSON.stringify(value);
        console.error(`Invalid --since value: expected YYYY-MM-DD, got ${shown}\n`);
        return exitWithUsage();
      }
      since = value;
    } else if (flag === "--free-only") {
      freeOnly = true;
    }
  }

  // Only prepend a scheme when none is present. A plain startsWith("http")
  // check would misclassify hosts such as "httpbin.org".
  const withScheme = /^https?:\/\//i.test(target) ? target : `https://${target}`;
  const baseUrl = withScheme.replace(/\/+$/, "");

  return { baseUrl, outputDir, since, freeOnly };
}

/**
 * Summary of one post, built from an entry of the `/api/v1/archive` listing.
 * The fields are copied from the API response without validation.
 */
interface PostMeta {
  /** Post title; shown in progress output and written to the front matter. */
  title: string;
  /** Identifier used to fetch the full post and to name the output file. */
  slug: string;
  /**
   * Publication timestamp string. It is compared lexicographically with the
   * `--since` value, its first 10 characters are used as the front matter
   * date, and it is parsed with `new Date` for the file timestamps.
   */
  post_date: string;
  /** Audience marker; the value "only_paid" is what `--free-only` filters out. */
  audience: string;
}

/**
 * Full post payload as returned by `/api/v1/posts/<slug>`. The response JSON
 * is used as-is, so this shape is assumed rather than checked at runtime.
 */
interface PostFull {
  title: string;
  slug: string;
  post_date: string;
  /** Post body as HTML; a post without it is skipped. */
  body_html: string;
  /** Preferred public URL; when empty, a `<baseUrl>/p/<slug>` URL is used instead. */
  canonical_url: string;
  /** Optional author list; only the first entry's name is written out. */
  publishedBylines?: { name: string }[];
}

/**
 * Pages through the publication's archive endpoint and collects post
 * summaries.
 *
 * Paging ends when a request fails (the error is logged and whatever was
 * collected so far is returned), when a page is empty or not an array, when
 * a page is shorter than BATCH_SIZE, or as soon as a post dated before
 * `since` is seen. The listing is requested with `sort=new`, so that early
 * exit assumes newest-first ordering.
 *
 * @param baseUrl  Publication origin without a trailing slash.
 * @param since    Lower date bound compared as a string, or null for no bound.
 * @param freeOnly When true, posts whose audience is "only_paid" are dropped.
 * @returns The collected post summaries in the order the API returned them.
 */
async function fetchPostList(baseUrl: string, since: string | null, freeOnly: boolean): Promise<PostMeta[]> {
  const collected: PostMeta[] = [];

  for (let offset = 0; ; offset += BATCH_SIZE) {
    const response = await fetch(`${baseUrl}/api/v1/archive?sort=new&limit=${BATCH_SIZE}&offset=${offset}`);
    if (!response.ok) {
      console.error(`API error at offset ${offset}: ${response.status} ${response.statusText}`);
      return collected;
    }

    const page = await response.json();
    if (!Array.isArray(page) || page.length === 0) {
      return collected;
    }

    for (const entry of page) {
      // Everything from here on is older than the cutoff, so stop entirely.
      if (since && entry.post_date < since) {
        return collected;
      }

      const paywalled = entry.audience === "only_paid";
      if (!(freeOnly && paywalled)) {
        const { title, slug, post_date, audience } = entry;
        collected.push({ title, slug, post_date, audience });
      }
    }

    // A short page means there is nothing further to request.
    if (page.length < BATCH_SIZE) {
      return collected;
    }

    await Bun.sleep(DELAY_MS);
  }
}

/**
 * Requests a single post from the publication's posts endpoint.
 *
 * @param baseUrl Publication origin without a trailing slash.
 * @param slug    Post identifier appended to the endpoint path.
 * @returns The parsed JSON body, or null when the response status is not OK.
 */
async function fetchPost(baseUrl: string, slug: string): Promise<PostFull | null> {
  const endpoint = [baseUrl, "api", "v1", "posts", slug].join("/");
  const response = await fetch(endpoint);

  if (response.ok) {
    return response.json();
  }
  return null;
}

/**
 * Converts an HTML fragment to Markdown by piping it through the `pandoc`
 * command-line tool.
 *
 * @param html HTML text sent to pandoc on stdin.
 * @returns pandoc's stdout, or the unmodified input HTML if running the
 *   command throws for any reason.
 */
function htmlToMarkdown(html: string): string {
  const pandocCommand = "pandoc -f html -t markdown --wrap=none";
  const outputLimitBytes = 10 * 1024 * 1024;

  let converted = html;
  try {
    converted = execSync(pandocCommand, {
      input: html,
      encoding: "utf-8",
      maxBuffer: outputLimitBytes,
    });
  } catch {
    // Conversion failed: keep the raw HTML as the result.
  }
  return converted;
}

/** Characters with syntactic meaning in YAML; their presence anywhere forces quoting. */
const YAML_INDICATOR_CHARS = /[:"'#\[\]{}|>&*!%@`]/;

/** Control characters and the Unicode line breaks that cannot appear raw in a one-line scalar. */
const YAML_CONTROL_CHARS = /[\x00-\x1f\x7f\u0085\u2028\u2029]/;

/** A leading "- " or "? " (or a lone "-" / "?") would start a sequence entry or complex key. */
const YAML_LEADING_INDICATOR = /^[-?](?:\s|$)/;

/** Plain scalars a YAML parser would read as null, boolean, number or date rather than text. */
const YAML_AMBIGUOUS_SCALAR =
  /^(?:~|null|true|false|yes|no|on|off|[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?|0x[\da-f]+|0o[0-7]+|[-+]?\.inf|\.nan|\d{4}-\d{2}-\d{2})$/i;

/** Renders one control character as a YAML double-quoted escape sequence. */
function escapeYamlControlChar(ch: string): string {
  switch (ch) {
    case "\n":
      return "\\n";
    case "\r":
      return "\\r";
    case "\t":
      return "\\t";
    default:
      return `\\u${ch.charCodeAt(0).toString(16).padStart(4, "0")}`;
  }
}

/**
 * Formats a string as a single-line YAML scalar value.
 *
 * The string is returned unchanged when it is safe as a plain scalar.
 * It is wrapped in double quotes, with backslashes, double quotes and
 * control characters escaped, when any of the following holds:
 *   - it is empty (a bare empty value parses as null);
 *   - it has leading or trailing whitespace;
 *   - it contains a YAML indicator character;
 *   - it contains a newline or other control character;
 *   - it starts with a sequence/key indicator;
 *   - it would parse as a non-string (e.g. `true`, `null`, `2023`).
 *
 * @param s Raw text to embed after a `key: ` prefix.
 * @returns The plain or double-quoted YAML representation.
 */
function escapeYaml(s: string): string {
  const mustQuote =
    s === "" ||
    s.trim() !== s ||
    YAML_INDICATOR_CHARS.test(s) ||
    YAML_CONTROL_CHARS.test(s) ||
    YAML_LEADING_INDICATOR.test(s) ||
    YAML_AMBIGUOUS_SCALAR.test(s);

  if (!mustQuote) return s;

  // Backslashes first, so the escapes added afterwards are not doubled.
  const escaped = s
    .replace(/\\/g, "\\\\")
    .replace(/"/g, '\\"')
    .replace(/[\x00-\x1f\x7f\u0085\u2028\u2029]/g, escapeYamlControlChar);

  return `"${escaped}"`;
}

/**
 * Runs the archiver end to end.
 *
 * Parses the command line, creates the output directory, fetches the list of
 * posts and then, for each post: downloads it, converts the HTML body to
 * Markdown, writes `<slug>.md` with a YAML front matter block, and sets the
 * file's access/modification times to the publication date.
 *
 * A post that has no content, or whose download or write throws, is counted
 * as failed and the run continues with the next post. Errors from parsing
 * arguments, creating the directory or fetching the post list still
 * propagate to the caller.
 */
async function main() {
  const { baseUrl, outputDir, since, freeOnly } = parseArgs();

  mkdirSync(outputDir, { recursive: true });

  console.log(`Fetching post list from ${baseUrl}...${freeOnly ? " (free only)" : ""}`);
  const posts = await fetchPostList(baseUrl, since, freeOnly);
  console.log(`Found ${posts.length} posts${since ? ` since ${since}` : ""}${freeOnly ? " (free)" : ""}\n`);

  if (posts.length === 0) {
    console.log("Nothing to do.");
    return;
  }

  let saved = 0;
  let failed = 0;

  for (const [i, meta] of posts.entries()) {
    const filepath = join(outputDir, `${meta.slug}.md`);

    // No newline: the outcome (OK / SKIP / FAIL) is appended to this line.
    process.stdout.write(`[${i + 1}/${posts.length}] ${meta.title}... `);

    try {
      const post = await fetchPost(baseUrl, meta.slug);
      if (!post || !post.body_html) {
        console.log("SKIP (no content)");
        failed++;
      } else {
        const md = htmlToMarkdown(post.body_html);
        const dateStr = meta.post_date.slice(0, 10);
        const author = post.publishedBylines?.[0]?.name ?? "";
        const postUrl = post.canonical_url || `${baseUrl}/p/${meta.slug}`;

        const frontmatter = [
          "---",
          `title: ${escapeYaml(meta.title)}`,
          `date: ${dateStr}`,
          `url: ${postUrl}`,
          ...(author ? [`author: ${escapeYaml(author)}`] : []),
          "---",
          "",
        ].join("\n");

        writeFileSync(filepath, frontmatter + md);

        // Stamp the file with the publication date, but only when it parsed:
        // utimesSync throws on an Invalid Date.
        const pubDate = new Date(meta.post_date);
        if (!Number.isNaN(pubDate.getTime())) {
          utimesSync(filepath, pubDate, pubDate);
        }

        console.log(`OK (${md.length} chars)`);
        saved++;
      }
    } catch (e: unknown) {
      // One bad post (network error, unwritable path, ...) must not abort
      // the remaining downloads.
      const reason = e instanceof Error ? e.message : String(e);
      console.log(`FAIL (${reason})`);
      failed++;
    }

    // Pause between requests regardless of the outcome.
    await Bun.sleep(DELAY_MS);
  }

  console.log(`\nDone! ${saved} saved, ${failed} failed → ${outputDir}`);
}

// Script entry point: run the archiver; if it rejects, print the error and
// exit with status 1.
const reportFatalError = (error: unknown): never => {
  console.error(error);
  process.exit(1);
};

main().catch(reportFatalError);
