smol-training-playbook

Running on CPU Upgrade

tfrere HF Staff committed on Feb 5

Commit

2b16052

1 Parent(s): aaaea48

feat: add txt/docx export scripts and fix MDX angle bracket parsing

- Add export-txt.mjs and export-docx.mjs for alternative export formats
- Fix MDX parser error by escaping angle brackets before numbers (e.g., <30B → &lt;30B)
- Update article content from Notion
- Minor improvements to export-pdf.mjs and screenshot-elements.mjs

Files changed (13) hide show

.gitignore +1 -0
app/package-lock.json +0 -0
app/package.json +0 -0
app/scripts/README-TXT-EXPORT.md +129 -0
app/scripts/export-docx.mjs +303 -0
app/scripts/export-pdf.mjs +16 -4
app/scripts/export-txt.mjs +527 -0
app/scripts/notion-importer/mdx-converter.mjs +30 -0
app/scripts/screenshot-elements.mjs +87 -16
app/src/content/article.mdx +0 -0
app/src/pages/dataviz.astro +1 -1
app/src/pages/index.astro +30 -0
app/yarn.lock +0 -0

.gitignore CHANGED Viewed

@@ -43,3 +43,4 @@ app/public/data/**/*
 .temp-*/
 .backup-*/

 .temp-*/
 .backup-*/
+*.docx

app/package-lock.json CHANGED Viewed

Binary files a/app/package-lock.json and b/app/package-lock.json differ

app/package.json CHANGED Viewed

Binary files a/app/package.json and b/app/package.json differ

app/scripts/README-TXT-EXPORT.md ADDED Viewed

	@@ -0,0 +1,129 @@

+# TXT Export for Book Publishing
+This script exports the article to a simple text format suitable for book publishing software, with custom tags for special elements.
+## Usage
+```bash
+npm run export:txt
+```
+Or with custom filename:
+```bash
+node scripts/export-txt.mjs --filename=my-article
+```
+## Output
+The script generates a `.txt` file in the `dist/` folder with the following format:
+### Text Tags
+#### Figures/Images
+```
+<f> NAME | ANCHOR | DESCRIPTION </f>
+```
+- **NAME**: Figure name (e.g., "Figure 1")
+- **ANCHOR**: HTML anchor/ID for cross-references (omitted when the figure has no ID)
+- **DESCRIPTION**: Figure caption/description
+Example:
+```
+<f>Figure 1 | placeholder-image | A placeholder image description</f>
+```
+#### Tables
+```
+<t> NAME | DESCRIPTION </t>
+```
+- **NAME**: Table name (e.g., "Table 1")
+- **DESCRIPTION**: Table caption/description
+Example:
+```
+<t>Table 1 | Comparison of model architectures</t>
+```
+#### Code Blocks
+```
+<c> CODE | DESCRIPTION </c>
+```
+- **CODE**: The actual code content
+- **DESCRIPTION**: Optional description or caption
+Example:
+```
+<c>function hello() {
+  console.log("Hello world");
+} | JavaScript example function</c>
+```
+#### Inline Code
+```
+<ic> CODE </ic>
+```
+Example:
+```
+Use the <ic>npm install</ic> command to install dependencies.
+```
+#### LaTeX Formulas
+```
+<l> katex-number </l>
+```
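+References to exported KaTeX formula PNGs. The number is the formula's global index among all visual elements (embeds, tables, images, figures, display formulas) in DOM order, so formula numbers are not necessarily consecutive.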
+Example:
+```
+The equation <l>katex-1</l> shows the relationship...
+```
+The corresponding PNG files should be exported separately (e.g., `katex-1.png`, `katex-2.png`, etc.)
+## Standard Markdown Elements
+The script also preserves standard markdown formatting:
+- **Headings**: `# ## ###` etc.
+- **Paragraphs**: Plain text with line breaks
+- **Lists**: Bulleted (`-`) and numbered (`1. 2. 3.`)
+- **Blockquotes**: `> Text`
+## How It Works
+1. **Build**: Builds the Astro site (if not already built)
+2. **Launch**: Starts a preview server
+3. **Extract**: Uses Playwright to load the page and extract content from the DOM
+4. **Convert**: Transforms HTML elements into the custom tag format
+5. **Export**: Writes the result to `dist/<slugified-article-title>.txt` (or `dist/<filename>.txt` when `--filename` is given) and copies it to `public/`
+## Example Output
+```
+# Introduction
+This is a paragraph with <ic>inline code</ic> and a reference to <l>katex-1</l>.
+<f>Figure 1 | training-loss | Training loss over time for SmolLM3</f>
+## Methods
+We used the following approach:
+- First step
+- Second step
+- Third step
+<c>def train_model():
+    return model | Python training function</c>
+<t>Table 1 | Hyperparameters used in training</t>
+```
+## Notes
+- The script reuses the same infrastructure as PDF export (`export-pdf.mjs`)
+- It's designed to work with the existing Astro build pipeline
+- All custom components (Image, HtmlEmbed, Note, etc.) are properly handled
+- KaTeX formulas are numbered sequentially for easy reference to exported PNGs

app/scripts/export-docx.mjs ADDED Viewed

	@@ -0,0 +1,303 @@

+#!/usr/bin/env node
+/**
+ * Export TXT to DOCX format for book publishing
+ *
+ * This script converts the exported TXT file to a simple DOCX document:
+ * - Preserves headings, paragraphs, lists
+ * - Keeps custom tags (<f>, <t>, <l>, <ic>, <il>, <n>) as-is for manual processing
+ * - Formats code blocks
+ * - Creates a clean document ready for further editing
+ *
+ * Usage:
+ *   node scripts/export-docx.mjs [--input=path/to/file.txt]
+ *   npm run export:docx
+ */
+import { Document, Packer, Paragraph, TextRun, HeadingLevel, AlignmentType } from 'docx';
+import { promises as fs } from 'node:fs';
+import { resolve } from 'node:path';
+import process from 'node:process';
+function parseArgs(argv) {
+  const out = {};
+  for (const arg of argv.slice(2)) {
+    if (!arg.startsWith('--')) continue;
+    const [k, v] = arg.replace(/^--/, '').split('=');
+    out[k] = v === undefined ? true : v;
+  }
+  return out;
+}
+function detectHeadingLevel(line) {
+  const match = line.match(/^(#{1,6})\s+(.+)$/);
+  if (!match) return null;
+  const level = match[1].length;
+  const text = match[2].trim();
+  return { level, text };
+}
+function parseInlineFormatting(text) {
+  const runs = [];
+  let currentPos = 0;
+  // Parse inline tags: <ic>, <il>, <n> (keep as-is with special formatting)
+  const tagRegex = /<(ic|il|n)>([^<]*)<\/\1>/g;
+  let match;
+  while ((match = tagRegex.exec(text)) !== null) {
+    // Add text before the tag
+    if (match.index > currentPos) {
+      const beforeText = text.substring(currentPos, match.index);
+      if (beforeText) {
+        runs.push(new TextRun(beforeText));
+      }
+    }
+    // Add the tagged content with special formatting
+    const tagType = match[1];
+    const content = match[2];
+    if (tagType === 'ic') {
+      // Inline code: monospace, gray background
+      runs.push(new TextRun({
+        text: content,
+        font: 'Courier New',
+        color: '333333',
+        shading: { fill: 'E8E8E8', type: 'clear' }
+      }));
+    } else if (tagType === 'il') {
+      // Inline LaTeX: italic, keep as-is
+      runs.push(new TextRun({
+        text: content,
+        italics: true,
+        color: '0066CC'
+      }));
+    } else if (tagType === 'n') {
+      // Note: keep tag for manual processing
+      runs.push(new TextRun({
+        text: `<n>${content}</n>`,
+        color: 'FF6B00',
+        italics: true
+      }));
+    }
+    currentPos = match.index + match[0].length;
+  }
+  // Add remaining text
+  if (currentPos < text.length) {
+    runs.push(new TextRun(text.substring(currentPos)));
+  }
+  return runs.length > 0 ? runs : [new TextRun(text)];
+}
+async function convertTxtToDocx(txtPath, outputPath) {
+  console.log(`📖 Reading TXT file: ${txtPath}`);
+  const content = await fs.readFile(txtPath, 'utf-8');
+  const lines = content.split('\n');
+  const paragraphs = [];
+  let inCodeBlock = false;
+  let codeLines = [];
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i];
+    // Skip empty lines unless in code block
+    if (!line.trim() && !inCodeBlock) {
+      paragraphs.push(new Paragraph({ text: '' }));
+      continue;
+    }
+    // Handle code blocks <c>...</c>
+    if (line.trim().startsWith('<c>')) {
+      inCodeBlock = true;
+      codeLines = [];
+      const firstLine = line.replace(/^<c>\s*/, '');
+      if (firstLine && !firstLine.startsWith('</c>')) {
+        codeLines.push(firstLine);
+      }
+      continue;
+    }
+    if (line.trim().endsWith('</c>')) {
+      const lastLine = line.replace(/<\/c>\s*$/, '');
+      if (lastLine) codeLines.push(lastLine);
+      // Add code block as paragraph(s)
+      if (codeLines.length > 0) {
+        paragraphs.push(new Paragraph({
+          text: codeLines.join('\n'),
+          font: 'Courier New',
+          size: 20,
+          shading: { fill: 'F5F5F5', type: 'clear' },
+          spacing: { before: 200, after: 200 }
+        }));
+      }
+      inCodeBlock = false;
+      codeLines = [];
+      continue;
+    }
+    if (inCodeBlock) {
+      codeLines.push(line);
+      continue;
+    }
+    // Handle figure tags <f>...</f>
+    if (line.trim().startsWith('<f>')) {
+      paragraphs.push(new Paragraph({
+        children: [new TextRun({
+          text: line.trim(),
+          color: '0066CC',
+          bold: true
+        })],
+        spacing: { before: 200, after: 100 }
+      }));
+      continue;
+    }
+    // Handle table tags <t>...</t>
+    if (line.trim().startsWith('<t>')) {
+      paragraphs.push(new Paragraph({
+        children: [new TextRun({
+          text: line.trim(),
+          color: '009688',
+          bold: true
+        })],
+        spacing: { before: 200, after: 100 }
+      }));
+      continue;
+    }
+    // Handle LaTeX display tags <l>...</l>
+    if (line.trim().startsWith('<l>')) {
+      paragraphs.push(new Paragraph({
+        children: [new TextRun({
+          text: line.trim(),
+          color: '9C27B0',
+          bold: true
+        })],
+        alignment: AlignmentType.CENTER,
+        spacing: { before: 200, after: 200 }
+      }));
+      continue;
+    }
+    // Handle headings
+    const heading = detectHeadingLevel(line);
+    if (heading) {
+      const headingLevels = {
+        1: HeadingLevel.HEADING_1,
+        2: HeadingLevel.HEADING_2,
+        3: HeadingLevel.HEADING_3,
+        4: HeadingLevel.HEADING_4,
+        5: HeadingLevel.HEADING_5,
+        6: HeadingLevel.HEADING_6
+      };
+      paragraphs.push(new Paragraph({
+        text: heading.text,
+        heading: headingLevels[heading.level],
+        spacing: { before: 400, after: 200 }
+      }));
+      continue;
+    }
+    // Handle list items
+    if (line.trim().startsWith('- ')) {
+      const text = line.trim().substring(2);
+      paragraphs.push(new Paragraph({
+        children: parseInlineFormatting(text),
+        bullet: { level: 0 },
+        spacing: { before: 100, after: 100 }
+      }));
+      continue;
+    }
+    // Handle numbered lists
+    const numberedMatch = line.trim().match(/^(\d+)\.\s+(.+)$/);
+    if (numberedMatch) {
+      const text = numberedMatch[2];
+      paragraphs.push(new Paragraph({
+        children: parseInlineFormatting(text),
+        numbering: { reference: 'default-numbering', level: 0 },
+        spacing: { before: 100, after: 100 }
+      }));
+      continue;
+    }
+    // Handle blockquotes
+    if (line.trim().startsWith('> ')) {
+      const text = line.trim().substring(2);
+      paragraphs.push(new Paragraph({
+        children: parseInlineFormatting(text),
+        italics: true,
+        indent: { left: 720 },
+        spacing: { before: 200, after: 200 }
+      }));
+      continue;
+    }
+    // Regular paragraph
+    if (line.trim()) {
+      paragraphs.push(new Paragraph({
+        children: parseInlineFormatting(line.trim()),
+        spacing: { before: 100, after: 100 }
+      }));
+    }
+  }
+  console.log(`📝 Creating DOCX with ${paragraphs.length} paragraphs...`);
+  const doc = new Document({
+    sections: [{
+      properties: {},
+      children: paragraphs
+    }]
+  });
+  console.log(`💾 Writing DOCX to: ${outputPath}`);
+  const buffer = await Packer.toBuffer(doc);
+  await fs.writeFile(outputPath, buffer);
+  console.log(`✅ DOCX created successfully!`);
+}
+async function main() {
+  const cwd = process.cwd();
+  const args = parseArgs(process.argv);
+  const inputPath = args.input || resolve(cwd, 'dist', 'the-smol-training-playbook-the-secrets-to-building-world-class-llms.txt');
+  const outputPath = args.output || inputPath.replace('.txt', '.docx');
+  // Check if input exists
+  try {
+    await fs.access(inputPath);
+  } catch {
+    console.error(`❌ Error: Input file not found: ${inputPath}`);
+    console.error('   Run "npm run export:txt" first to generate the TXT file.');
+    process.exit(1);
+  }
+  await convertTxtToDocx(inputPath, outputPath);
+  // Also copy to public folder
+  const publicPath = outputPath.replace('/dist/', '/public/');
+  try {
+    await fs.mkdir(resolve(cwd, 'public'), { recursive: true });
+    await fs.copyFile(outputPath, publicPath);
+    console.log(`✅ DOCX copied to: ${publicPath}`);
+  } catch (e) {
+    console.warn('Unable to copy DOCX to public/:', e?.message || e);
+  }
+}
+main().catch((err) => {
+  console.error('❌ Error:', err.message);
+  console.error(err);
+  process.exit(1);
+});

app/scripts/export-pdf.mjs CHANGED Viewed

@@ -246,6 +246,18 @@ iframe, embed, object { width: 100% !important; max-width: 100% !important; heig
 .html-embed, .html-embed__card { max-width: 100% !important; width: 100% !important; }
 .html-embed__card > div[id^="frag-"] { width: 100% !important; max-width: 100% !important; }
 /* Banner centering */
 .hero .points { mix-blend-mode: normal !important; }
 .hero-banner, .hero .hero-banner, [class*="hero-banner"] {
@@ -282,8 +294,8 @@ iframe, embed, object { width: 100% !important; max-width: 100% !important; heig
   width: auto !important;
   height: auto !important;
   max-width: 100% !important;
-  /* Limit height to fit on a single page (~250mm = 945px at 96dpi, minus margins) */
-  max-height: 800px !important;
   display: block !important;
   object-fit: contain !important;
   margin-left: auto !important;
@@ -727,8 +739,8 @@ async function main() {
     const browser = await chromium.launch({ headless: true });
     try {
-      // Use 2x scale factor for retina-quality screenshots
-      const deviceScaleFactor = 2;
       const context = await browser.newContext({
         deviceScaleFactor
       });

 .html-embed, .html-embed__card { max-width: 100% !important; width: 100% !important; }
 .html-embed__card > div[id^="frag-"] { width: 100% !important; max-width: 100% !important; }
+/* Wide mode: remove blur/mask effects for print */
+.wide, .html-embed--wide {
+  -webkit-mask: none !important;
+  mask: none !important;
+  background: transparent !important;
+  padding: 0 !important;
+  width: 100% !important;
+  margin-left: 0 !important;
+  transform: none !important;
+  border-radius: 0 !important;
+}
 /* Banner centering */
 .hero .points { mix-blend-mode: normal !important; }
 .hero-banner, .hero .hero-banner, [class*="hero-banner"] {
   width: auto !important;
   height: auto !important;
   max-width: 100% !important;
+  /* Limit height to fit on a single page (~269mm printable = ~1015px, with margin) */
+  max-height: 950px !important;
   display: block !important;
   object-fit: contain !important;
   margin-left: auto !important;
     const browser = await chromium.launch({ headless: true });
     try {
+      // Use 4x scale factor for high-DPI screenshots
+      const deviceScaleFactor = 4;
       const context = await browser.newContext({
         deviceScaleFactor
       });

app/scripts/export-txt.mjs ADDED Viewed

	@@ -0,0 +1,527 @@

+#!/usr/bin/env node
+/**
+ * Export article to TXT format for book publishing
+ *
+ * This script exports the article to a simple text format with custom tags:
+ * - <f> NAME | ANCHOR | DESCRIPTION </f> for figures/images
+ * - <t> NAME | DESCRIPTION </t> for tables
+ * - <c> CODE | DESCRIPTION </c> for code blocks
+ * - <ic> CODE </ic> for inline code
+ * - <il> FORMULA </il> for inline LaTeX formulas
+ * - <l> katex-N </l> for display LaTeX formulas (references exported PNGs)
+ * - <n> TITLE | CONTENT </n> for notes and sidenotes
+ *
+ * Usage:
+ *   node scripts/export-txt.mjs
+ *   npm run export:txt
+ *
+ * Output: dist/<slugified-article-title>.txt (or dist/<filename>.txt with --filename)
+ */
+import { spawn } from 'node:child_process';
+import { setTimeout as delay } from 'node:timers/promises';
+import { chromium } from 'playwright';
+import { resolve } from 'node:path';
+import { promises as fs } from 'node:fs';
+import process from 'node:process';
+async function run(command, args = [], options = {}) {
+  return new Promise((resolvePromise, reject) => {
+    const child = spawn(command, args, { stdio: 'inherit', shell: false, ...options });
+    child.on('error', reject);
+    child.on('exit', (code) => {
+      if (code === 0) resolvePromise(undefined);
+      else reject(new Error(`${command} ${args.join(' ')} exited with code ${code}`));
+    });
+  });
+}
+async function waitForServer(url, timeoutMs = 60000) {
+  const start = Date.now();
+  while (Date.now() - start < timeoutMs) {
+    try {
+      const res = await fetch(url);
+      if (res.ok) return;
+    } catch { }
+    await delay(500);
+  }
+  throw new Error(`Server did not start in time: ${url}`);
+}
+function parseArgs(argv) {
+  const out = {};
+  for (const arg of argv.slice(2)) {
+    if (!arg.startsWith('--')) continue;
+    const [k, v] = arg.replace(/^--/, '').split('=');
+    out[k] = v === undefined ? true : v;
+  }
+  return out;
+}
+function slugify(text) {
+  return String(text || '')
+    .normalize('NFKD')
+    .replace(/\p{Diacritic}+/gu, '')
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, '-')
+    .replace(/^-+|-+$/g, '')
+    .slice(0, 120) || 'article';
+}
+/**
+ * Clean text content: remove extra whitespace, normalize line breaks
+ */
+function cleanText(text) {
+  return String(text || '')
+    .replace(/\s+/g, ' ')
+    .trim();
+}
+/**
+ * Strip HTML tags from text
+ */
+function stripHtml(html) {
+  return String(html || '')
+    .replace(/<[^>]*>/g, '')
+    .replace(/&nbsp;/g, ' ')
+    .replace(/&amp;/g, '&')
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .trim();
+}
+/**
+ * Convert heading level to markdown syntax
+ */
+function headingToMarkdown(level, text) {
+  const hashes = '#'.repeat(Math.min(level, 6));
+  return `${hashes} ${text}`;
+}
+/**
+ * Extract and convert article content to TXT format
+ */
+async function extractArticleContent(page) {
+  return await page.evaluate(() => {
+    const output = [];
+    let globalCounter = 0; // Global counter for all visual elements (matches screenshot script)
+    const katexMap = new Map(); // Track unique katex formulas for referencing
+    // Helper: clean text
+    const cleanText = (text) => String(text || '').replace(/\s+/g, ' ').trim();
+    // Helper: strip HTML
+    const stripHtml = (html) => {
+      const div = document.createElement('div');
+      div.innerHTML = html;
+      return cleanText(div.textContent || '');
+    };
+    // Helper: get element ID or generate anchor
+    const getAnchor = (el) => {
+      if (el.id) return el.id;
+      // Try to find ID in parent figure
+      const figure = el.closest('figure');
+      if (figure?.id) return figure.id;
+      return '';
+    };
+    // Helper: parse caption to extract name and description
+    const parseCaptionText = (captionText, type = 'Figure') => {
+      if (!captionText) return { name: '', description: '' };
+      // Try to match patterns like:
+      // "Figure 1: Description"
+      // "Table 2: Description"
+      // "Fig. 3: Description"
+      const patterns = [
+        new RegExp(`^(${type}\\s*\\d+[a-z]?)\\s*[:\\-–—]\\s*(.+)$`, 'i'),
+        new RegExp(`^(Fig\\.?\\s*\\d+[a-z]?)\\s*[:\\-–—]\\s*(.+)$`, 'i'),
+        new RegExp(`^(Table\\s*\\d+[a-z]?)\\s*[:\\-–—]\\s*(.+)$`, 'i'),
+      ];
+      for (const pattern of patterns) {
+        const match = captionText.match(pattern);
+        if (match) {
+          return { name: match[1].trim(), description: match[2].trim() };
+        }
+      }
+      // No pattern found, entire text is description
+      return { name: '', description: captionText.trim() };
+    };
+    // Process main content
+    const main = document.querySelector('main');
+    if (!main) return 'Error: main element not found';
+    // Helper: get all visual elements in DOM order (same as screenshot script)
+    const allVisualElements = Array.from(main.querySelectorAll('.html-embed, .table-scroll > table, .image-wrapper, figure, .katex-display'));
+    const elementIndexMap = new Map();
+    // Pre-process: assign global indices to visual elements
+    allVisualElements.forEach((el, idx) => {
+      elementIndexMap.set(el, idx + 1);
+    });
+    // Walk through all child nodes
+    const processNode = (node) => {
+      const tag = node.tagName?.toLowerCase();
+      // Headings
+      if (/^h[1-6]$/.test(tag)) {
+        const level = parseInt(tag[1]);
+        const text = cleanText(node.textContent);
+        const hashes = '#'.repeat(level);
+        output.push(`\n${hashes} ${text}\n`);
+        return;
+      }
+      // Paragraphs
+      if (tag === 'p') {
+        const text = node.textContent?.trim();
+        if (text) {
+          // Process inline elements within paragraph
+          let processedText = '';
+          const processInline = (n) => {
+            if (n.nodeType === Node.TEXT_NODE) {
+              processedText += n.textContent;
+            } else if (n.tagName === 'CODE' && !n.closest('pre')) {
+              // Inline code
+              const code = cleanText(n.textContent);
+              processedText += `<ic>${code}</ic>`;
+            } else if (n.classList?.contains('katex')) {
+              // Inline katex - wrap in <il> tags
+              const formula = cleanText(n.textContent || '');
+              processedText += `<il>${formula}</il>`;
+            } else if (n.childNodes) {
+              n.childNodes.forEach(processInline);
+            }
+          };
+          node.childNodes.forEach(processInline);
+          output.push(processedText.trim() + '\n');
+        }
+        return;
+      }
+      // Display math (KaTeX)
+      if (node.classList?.contains('katex-display')) {
+        const globalIndex = elementIndexMap.get(node);
+        if (globalIndex) {
+          output.push(`<l>katex-${globalIndex}</l>\n`);
+        }
+        return;
+      }
+      // Code blocks
+      if (tag === 'pre') {
+        const code = node.querySelector('code');
+        if (code) {
+          const codeText = code.textContent || '';
+          const language = code.className.match(/language-(\w+)/)?.[1] || '';
+          // Try to find description from parent or next sibling
+          let description = '';
+          const figure = node.closest('figure');
+          if (figure) {
+            const caption = figure.querySelector('figcaption');
+            if (caption) description = stripHtml(caption.innerHTML);
+          }
+          if (description) {
+            output.push(`<c>${codeText.trim()} | ${description}</c>\n`);
+          } else {
+            output.push(`<c>${codeText.trim()}</c>\n`);
+          }
+        }
+        return;
+      }
+      // Tables
+      if (tag === 'table') {
+        // Check if this table is in a .table-scroll container (visual element)
+        const tableScroll = node.closest('.table-scroll');
+        const globalIndex = tableScroll ? elementIndexMap.get(node) : null;
+        // Skip if not a tracked table, but still recurse
+        if (!globalIndex) {
+          return;
+        }
+        const figure = node.closest('figure');
+        let name = '';
+        let description = '';
+        let anchor = '';
+        if (figure) {
+          anchor = getAnchor(figure);
+          const caption = figure.querySelector('figcaption');
+          if (caption) {
+            const captionText = stripHtml(caption.innerHTML);
+            const parsed = parseCaptionText(captionText, 'Table');
+            name = parsed.name;
+            description = parsed.description;
+          }
+        }
+        // If no name found, generate one with global index (matching filename format)
+        if (!name) {
+          name = `table-${globalIndex}`;
+        }
+        // Build the tag
+        const parts = [name];
+        if (anchor) parts.push(anchor);
+        if (description) parts.push(description);
+        output.push(`<t>${parts.join(' | ')}</t>\n`);
+        // Extract table as simple text representation
+        const rows = Array.from(node.querySelectorAll('tr'));
+        const tableText = rows.map(row => {
+          const cells = Array.from(row.querySelectorAll('th, td'));
+          return cells.map(cell => cleanText(cell.textContent)).join(' | ');
+        }).join('\n');
+        output.push(tableText + '\n\n');
+        return;
+      }
+      // Figures (images, embeds)
+      if (tag === 'figure') {
+        const img = node.querySelector('img');
+        const htmlEmbed = node.querySelector('.html-embed, .html-embed--screenshot');
+        const imageWrapper = node.querySelector('.image-wrapper');
+        const caption = node.querySelector('figcaption');
+        // Skip if it's not really a figure (no img, no embed, no caption)
+        if (!img && !htmlEmbed && !imageWrapper && !caption) return;
+        // Try to find the global index from the visual element
+        const visualElement = htmlEmbed || imageWrapper || node;
+        const globalIndex = elementIndexMap.get(visualElement);
+        if (!globalIndex) return; // Skip if not tracked
+        let name = '';
+        let anchor = getAnchor(node);
+        let description = '';
+        if (caption) {
+          const captionText = stripHtml(caption.innerHTML);
+          const parsed = parseCaptionText(captionText, 'Figure');
+          name = parsed.name;
+          description = parsed.description;
+        }
+        // Get image alt text as fallback for description
+        if (!description && img?.alt) {
+          description = img.alt;
+        }
+        // If no name found in caption, generate one with global index (matching filename format)
+        if (!name) {
+          // Determine type for naming (matches screenshot script naming)
+          const type = htmlEmbed ? 'embed' : 'image';
+          name = `${type}-${globalIndex}`;
+        }
+        // Build the tag: <f> NAME ANCHOR DESCRIPTION </f>
+        const parts = [name];
+        if (anchor) parts.push(anchor);
+        if (description) parts.push(description);
+        output.push(`<f>${parts.join(' | ')}</f>\n\n`);
+        return;
+      }
+      // Lists
+      if (tag === 'ul' || tag === 'ol') {
+        const items = Array.from(node.querySelectorAll(':scope > li'));
+        items.forEach((item, idx) => {
+          const bullet = tag === 'ul' ? '-' : `${idx + 1}.`;
+          const text = cleanText(item.textContent);
+          output.push(`${bullet} ${text}\n`);
+        });
+        output.push('\n');
+        return;
+      }
+      // Blockquotes
+      if (tag === 'blockquote') {
+        const text = cleanText(node.textContent);
+        output.push(`> ${text}\n\n`);
+        return;
+      }
+      // Notes (Note component and Sidenote)
+      if (node.classList?.contains('note') || node.classList?.contains('sidenote')) {
+        const title = node.querySelector('.note__title, .note-title')?.textContent || '';
+        const content = cleanText(node.textContent);
+        if (title) {
+          output.push(`<n>${title} | ${content}</n>\n\n`);
+        } else {
+          output.push(`<n>${content}</n>\n\n`);
+        }
+        return;
+      }
+      // Recurse through children for unhandled elements
+      if (node.children && node.children.length > 0 && !['pre', 'code', 'table', 'figure'].includes(tag)) {
+        try {
+          Array.from(node.children).forEach(processNode);
+        } catch (e) {
+          console.error('Error processing children:', e);
+        }
+      }
+    };
+    // Process all direct children of main
+    Array.from(main.children).forEach(processNode);
+    // Add metadata about visual elements
+    const katexCount = Array.from(main.querySelectorAll('.katex-display')).length;
+    if (katexCount > 0) {
+      output.push(`\n\n<!-- Visual elements are numbered globally in DOM order (1, 2, 3...) to match exported screenshots -->\n`);
+      output.push(`<!-- KaTeX formulas: ${katexCount} formulas exported as N-katex.png where N is the global index -->\n`);
+    }
+    return output.join('');
+  });
+}
+async function main() {
+  const cwd = process.cwd();
+  const args = parseArgs(process.argv);
+  let outFileBase = args.filename || 'article';
+  outFileBase = outFileBase.replace(/\.txt$/i, '');
+  // Build only if dist/ does not exist
+  const distDir = resolve(cwd, 'dist');
+  let hasDist = false;
+  try {
+    const st = await fs.stat(distDir);
+    hasDist = st && st.isDirectory();
+  } catch { }
+  if (!hasDist) {
+    console.log('> Building Astro site…');
+    await run('npm', ['run', 'build']);
+  } else {
+    console.log('> Skipping build (dist/ exists)…');
+  }
+  console.log('> Starting Astro preview…');
+  // Capture stdout to detect the actual port used
+  let capturedPort = 8080;
+  const preview = spawn('npm', ['run', 'preview'], {
+    cwd,
+    stdio: ['ignore', 'pipe', 'pipe'],
+    detached: true
+  });
+  // Listen for port in output
+  preview.stdout.on('data', (data) => {
+    const output = data.toString();
+    process.stdout.write(output);
+    const match = output.match(/http:\/\/localhost:(\d+)/);
+    if (match) {
+      capturedPort = parseInt(match[1]);
+    }
+  });
+  preview.stderr.on('data', (data) => {
+    process.stderr.write(data);
+  });
+  const previewExit = new Promise((resolvePreview) => {
+    preview.on('close', (code, signal) => resolvePreview({ code, signal }));
+  });
+  // Wait a bit for the server to start and output the port
+  await delay(3000);
+  const baseUrl = `http://localhost:${capturedPort}/`;
+  try {
+    await waitForServer(baseUrl, 60000);
+    console.log('> Server ready, extracting content…');
+    const browser = await chromium.launch({ headless: true });
+    try {
+      const context = await browser.newContext();
+      const page = await context.newPage();
+      // Set viewport
+      await page.setViewportSize({ width: 1200, height: 1400 });
+      // Load page (use 'load' instead of 'networkidle' to avoid timeout on heavy pages)
+      await page.goto(baseUrl, { waitUntil: 'load', timeout: 60000 });
+      // Wait for content to be ready
+      await page.waitForTimeout(3000);
+      // Wait for main content to be present
+      await page.waitForSelector('main', { timeout: 10000 });
+      // Get article title for filename
+      if (!args.filename) {
+        const title = await page.evaluate(() => {
+          const h1 = document.querySelector('h1.hero-title');
+          const t = h1 ? h1.textContent : document.title;
+          return (t || '').replace(/\s+/g, ' ').trim();
+        });
+        outFileBase = slugify(title);
+      }
+      console.log('> Extracting article content…');
+      const txtContent = await extractArticleContent(page);
+      // Write output
+      const outPath = resolve(cwd, 'dist', `${outFileBase}.txt`);
+      await fs.writeFile(outPath, txtContent, 'utf-8');
+      console.log(`✅ TXT exported: ${outPath}`);
+      // Copy to public folder
+      const publicPath = resolve(cwd, 'public', `${outFileBase}.txt`);
+      try {
+        await fs.mkdir(resolve(cwd, 'public'), { recursive: true });
+        await fs.copyFile(outPath, publicPath);
+        console.log(`✅ TXT copied to: ${publicPath}`);
+      } catch (e) {
+        console.warn('Unable to copy TXT to public/:', e?.message || e);
+      }
+    } finally {
+      await browser.close();
+    }
+  } finally {
+    // Clean shutdown
+    try {
+      if (process.platform !== 'win32') {
+        try { process.kill(-preview.pid, 'SIGINT'); } catch { }
+      }
+      try { preview.kill('SIGINT'); } catch { }
+      await Promise.race([previewExit, delay(3000)]);
+      if (!preview.killed) {
+        try {
+          if (process.platform !== 'win32') {
+            try { process.kill(-preview.pid, 'SIGKILL'); } catch { }
+          }
+          try { preview.kill('SIGKILL'); } catch { }
+        } catch { }
+        await Promise.race([previewExit, delay(1000)]);
+      }
+    } catch { }
+  }
+}
+main().catch((err) => {
+  console.error('❌ Error:', err.message);
+  console.error(err);
+  process.exit(1);
+});

app/scripts/notion-importer/mdx-converter.mjs CHANGED Viewed

@@ -670,6 +670,33 @@ function addSpacingAroundComponents(content) {
     return processedContent;
 }
 /**
  * Fix smart quotes (curly quotes) and replace them with straight quotes
  * @param {string} content - Markdown content
@@ -732,6 +759,9 @@ async function processMdxContent(content, pageId = null, notionToken = null, out
     // Fix smart quotes first
     processedContent = fixSmartQuotes(processedContent);
     // Process external images first (before other transformations)
     if (outputDir) {
         // Create a temporary external images directory in the output folder

     return processedContent;
 }
+/**
+ * Escape angle brackets before numbers to prevent MDX parsing errors
+ * In MDX, <30B would be interpreted as a JSX element, but element names can't start with numbers
+ *
+ * Code is left untouched: lines inside fenced code blocks (backtick or tilde
+ * fences) and single-line inline code spans are returned verbatim, because an
+ * entity written inside code would be displayed literally as "&lt;" instead of "<".
+ * @param {string} content - Markdown content
+ * @returns {string} - Content with escaped angle brackets
+ */
+function escapeAngleBracketsBeforeNumbers(content) {
+    console.log('  🔧 Escaping angle brackets before numbers...');
+    let fixedCount = 0;
+    // Marker of the fence that opened the current code block, or null when outside one
+    let openFence = null;
+    // A fence is 3+ backticks (whose info string may not contain a backtick) or 3+ tildes
+    const fenceStart = /^[ \t]*(`{3,}(?=[^`]*$)|~{3,})/;
+    // Alternative 1: an inline code span (backtick run ... run of the same length), kept as-is
+    // Alternative 2: a "<" immediately followed by a digit, which gets escaped
+    const inlineCodeOrBracket = /(?<!`)(`+)(?!`).+?(?<!`)\1(?!`)|<(?=\d)/g;
+    const lines = content.split('\n').map((line) => {
+        const fenceMarker = line.match(fenceStart)?.[1];
+        if (openFence !== null) {
+            // A closing fence uses the same character, is at least as long as the
+            // opening one, and has nothing else on its line
+            const closesFence = fenceMarker !== undefined
+                && fenceMarker[0] === openFence[0]
+                && fenceMarker.length >= openFence.length
+                && line.trim() === fenceMarker;
+            if (closesFence) {
+                openFence = null;
+            }
+            return line;
+        }
+        if (fenceMarker !== undefined) {
+            openFence = fenceMarker;
+            return line;
+        }
+        return line.replace(inlineCodeOrBracket, (match, backticks) => {
+            if (backticks !== undefined) {
+                // Inline code span: keep the original text
+                return match;
+            }
+            fixedCount++;
+            return '&lt;';
+        });
+    });
+    const processed = lines.join('\n');
+    if (fixedCount > 0) {
+        console.log(`    ✅ Escaped ${fixedCount} angle bracket(s) before numbers`);
+    } else {
+        console.log('    ℹ️  No angle brackets before numbers found');
+    }
+    return processed;
+}
 /**
  * Fix smart quotes (curly quotes) and replace them with straight quotes
  * @param {string} content - Markdown content
     // Fix smart quotes first
     processedContent = fixSmartQuotes(processedContent);
+    // Escape angle brackets before numbers (e.g., <30B -> &lt;30B)
+    processedContent = escapeAngleBracketsBeforeNumbers(processedContent);
     // Process external images first (before other transformations)
     if (outputDir) {
         // Create a temporary external images directory in the output folder

app/scripts/screenshot-elements.mjs CHANGED Viewed

@@ -2,11 +2,16 @@ import { chromium } from 'playwright';
 import { mkdir } from 'fs/promises';
 import { join } from 'path';
 const URL = 'http://localhost:4321/?viz=true';
 const OUTPUT_DIR = './screenshots';
 const SELECTORS = ['.html-embed', '.table-scroll > table', '.image-wrapper', '.katex-display'];
-const DEVICE_SCALE_FACTOR = 2; // Retina quality
 const BASE_VIEWPORT = { width: 1200, height: 800 };
 const slugify = (value) =>
   String(value || '')
@@ -20,6 +25,7 @@ async function main() {
   await mkdir(OUTPUT_DIR, { recursive: true });
   console.log('🚀 Launching browser...');
   const browser = await chromium.launch({ headless: true });
   const context = await browser.newContext({
     deviceScaleFactor: DEVICE_SCALE_FACTOR,
@@ -97,7 +103,7 @@ async function main() {
     });
     const slug = slugify(label);
-    const baseName = `${i + 1}-${type}${slug ? `--${slug}` : ''}`;
     const filename = `${baseName}.png`;
     const filepath = join(OUTPUT_DIR, filename);
@@ -108,7 +114,7 @@ async function main() {
       }
       if (type !== 'table' && type !== 'katex') {
-        await element.evaluate((el) => {
           const stash = (node) => {
             if (!node || !(node instanceof HTMLElement)) return;
             node.dataset.__prevStyle = node.getAttribute('style') ?? '';
@@ -131,19 +137,65 @@ async function main() {
             // Aggressive cleanup only for banners
             const all = el.querySelectorAll('*');
             all.forEach((node) => stash(node));
             const svgRects = el.querySelectorAll('svg rect');
-            svgRects.forEach((rect) => {
-              rect.setAttribute('rx', '0');
-              rect.setAttribute('ry', '0');
-              rect.setAttribute('stroke', 'none');
             });
           }
-        });
       }
       if (type === 'table') {
-        const cloneId = await element.evaluate((el, idx) => {
           const existing = document.getElementById(`__table-clone-wrapper-${idx}`);
           if (existing) existing.remove();
@@ -176,12 +228,18 @@ async function main() {
           clone.style.minWidth = '0';
           clone.style.maxWidth = 'none';
           clone.style.tableLayout = 'auto';
           const cells = clone.querySelectorAll('th, td');
           cells.forEach(cell => {
             cell.style.width = 'auto';
             cell.style.minWidth = '0';
             cell.style.maxWidth = 'none';
           });
           tableScroll.appendChild(clone);
@@ -191,7 +249,7 @@ async function main() {
           document.body.appendChild(wrapper);
           return clone.id;
-        }, i);
         const wrapperSelector = `#__table-clone-wrapper-${i}`;
         const cloneSelector = `#${cloneId}`;
@@ -213,7 +271,8 @@ async function main() {
         await page.locator(cloneSelector).screenshot({
           path: filepath,
-          type: 'png'
         });
         await page.evaluate((selector) => {
@@ -221,7 +280,7 @@ async function main() {
           if (el) el.remove();
         }, wrapperSelector);
       } else if (type === 'katex') {
-        const cloneId = await element.evaluate((el, idx) => {
           const existing = document.getElementById(`__katex-clone-wrapper-${idx}`);
           if (existing) existing.remove();
@@ -243,12 +302,22 @@ async function main() {
           clone.style.width = 'max-content';
           clone.style.maxWidth = 'none';
           clone.style.margin = '0';
           wrapper.appendChild(clone);
           document.body.appendChild(wrapper);
           return clone.id;
-        }, i);
         const wrapperSelector = `#__katex-clone-wrapper-${i}`;
         const cloneSelector = `#${cloneId}`;
@@ -270,7 +339,8 @@ async function main() {
         await page.locator(cloneSelector).screenshot({
           path: filepath,
-          type: 'png'
         });
         await page.evaluate((selector) => {
@@ -280,7 +350,8 @@ async function main() {
       } else {
         await element.screenshot({
           path: filepath,
-          type: 'png'
         });
       }
@@ -316,7 +387,7 @@ async function main() {
             });
             await page.waitForTimeout(150);
-            await element.screenshot({ path: openFilepath, type: 'png' });
             console.log(`  ✅ ${openFilename}`);
             await selectHandle.evaluate((el) => {

 import { mkdir } from 'fs/promises';
 import { join } from 'path';
+// Parse CLI arguments
+const args = process.argv.slice(2);
+const TRANSPARENT = args.includes('--transparent');
 const URL = 'http://localhost:4321/?viz=true';
 const OUTPUT_DIR = './screenshots';
 const SELECTORS = ['.html-embed', '.table-scroll > table', '.image-wrapper', '.katex-display'];
+const DEVICE_SCALE_FACTOR = 4; // 4x for high-quality print
 const BASE_VIEWPORT = { width: 1200, height: 800 };
+const FILENAME_SUFFIX = TRANSPARENT ? '-transparent' : '';
 const slugify = (value) =>
   String(value || '')
   await mkdir(OUTPUT_DIR, { recursive: true });
   console.log('🚀 Launching browser...');
+  if (TRANSPARENT) console.log('🔲 Transparent mode enabled (omitBackground: true)');
   const browser = await chromium.launch({ headless: true });
   const context = await browser.newContext({
     deviceScaleFactor: DEVICE_SCALE_FACTOR,
     });
     const slug = slugify(label);
+    const baseName = `${i + 1}-${type}${slug ? `--${slug}` : ''}${FILENAME_SUFFIX}`;
     const filename = `${baseName}.png`;
     const filepath = join(OUTPUT_DIR, filename);
       }
       if (type !== 'table' && type !== 'katex') {
+        await element.evaluate((el, isTransparent) => {
           const stash = (node) => {
             if (!node || !(node instanceof HTMLElement)) return;
             node.dataset.__prevStyle = node.getAttribute('style') ?? '';
             // Aggressive cleanup only for banners
             const all = el.querySelectorAll('*');
             all.forEach((node) => stash(node));
+          }
+          // Also target d3-loss-curves (banner component)
+          const lossCurves = el.querySelector('.d3-loss-curves');
+          if (lossCurves) {
+            lossCurves.style.background = 'transparent';
+            lossCurves.style.border = 'none';
+            lossCurves.style.borderRadius = '0';
+          }
+          // In transparent mode, neutralize backgrounds but preserve UI elements
+          if (isTransparent) {
+            // Step 1: Save computed backgrounds of UI elements we want to preserve
+            const uiSelectors = '.legend, [class*="legend"], .tooltip, [class*="tooltip"], .d3-tooltip, select, button, input, [class*="swatch"], [class*="label"]';
+            const uiElements = el.querySelectorAll(uiSelectors);
+            const savedStyles = new Map();
+            uiElements.forEach((uiEl) => {
+              const computed = window.getComputedStyle(uiEl);
+              savedStyles.set(uiEl, {
+                background: computed.background,
+                backgroundColor: computed.backgroundColor
+              });
+            });
+            // Step 2: Apply transparency to EVERYTHING
+            el.style.setProperty('background', 'transparent', 'important');
+            el.style.setProperty('background-color', 'transparent', 'important');
+            el.style.setProperty('background-image', 'none', 'important');
+            const allElements = el.querySelectorAll('*');
+            allElements.forEach((node) => {
+              if (node instanceof HTMLElement) {
+                node.style.setProperty('background', 'transparent', 'important');
+                node.style.setProperty('background-color', 'transparent', 'important');
+                node.style.setProperty('background-image', 'none', 'important');
+              }
+            });
+            // Step 3: Restore UI elements backgrounds
+            savedStyles.forEach((styles, uiEl) => {
+              if (styles.backgroundColor && styles.backgroundColor !== 'rgba(0, 0, 0, 0)') {
+                uiEl.style.setProperty('background-color', styles.backgroundColor, 'important');
+              }
+            });
+            // Target SVG rect elements that look like backgrounds
             const svgRects = el.querySelectorAll('svg rect');
+            svgRects.forEach((rect, idx) => {
+              const fill = (rect.getAttribute('fill') || '').toLowerCase();
+              if (idx === 0 || fill === 'white' || fill.startsWith('#fff') || fill.includes('255, 255, 255')) {
+                rect.setAttribute('fill', 'none');
+              }
             });
           }
+        }, TRANSPARENT);
       }
       if (type === 'table') {
+        const cloneId = await element.evaluate((el, idx, isTransparent) => {
           const existing = document.getElementById(`__table-clone-wrapper-${idx}`);
           if (existing) existing.remove();
           clone.style.minWidth = '0';
           clone.style.maxWidth = 'none';
           clone.style.tableLayout = 'auto';
+          if (isTransparent) {
+            clone.style.background = 'transparent';
+          }
           const cells = clone.querySelectorAll('th, td');
           cells.forEach(cell => {
             cell.style.width = 'auto';
             cell.style.minWidth = '0';
             cell.style.maxWidth = 'none';
+            if (isTransparent) {
+              cell.style.background = 'transparent';
+            }
           });
           tableScroll.appendChild(clone);
           document.body.appendChild(wrapper);
           return clone.id;
+        }, i, TRANSPARENT);
         const wrapperSelector = `#__table-clone-wrapper-${i}`;
         const cloneSelector = `#${cloneId}`;
         await page.locator(cloneSelector).screenshot({
           path: filepath,
+          type: 'png',
+          omitBackground: TRANSPARENT
         });
         await page.evaluate((selector) => {
           if (el) el.remove();
         }, wrapperSelector);
       } else if (type === 'katex') {
+        const cloneId = await element.evaluate((el, idx, isTransparent) => {
           const existing = document.getElementById(`__katex-clone-wrapper-${idx}`);
           if (existing) existing.remove();
           clone.style.width = 'max-content';
           clone.style.maxWidth = 'none';
           clone.style.margin = '0';
+          if (isTransparent) {
+            clone.style.background = 'transparent';
+            // Neutralize white backgrounds in katex elements
+            const allElements = clone.querySelectorAll('*');
+            allElements.forEach((node) => {
+              if (node instanceof HTMLElement) {
+                node.style.background = 'transparent';
+              }
+            });
+          }
           wrapper.appendChild(clone);
           document.body.appendChild(wrapper);
           return clone.id;
+        }, i, TRANSPARENT);
         const wrapperSelector = `#__katex-clone-wrapper-${i}`;
         const cloneSelector = `#${cloneId}`;
         await page.locator(cloneSelector).screenshot({
           path: filepath,
+          type: 'png',
+          omitBackground: TRANSPARENT
         });
         await page.evaluate((selector) => {
       } else {
         await element.screenshot({
           path: filepath,
+          type: 'png',
+          omitBackground: TRANSPARENT
         });
       }
             });
             await page.waitForTimeout(150);
+            await element.screenshot({ path: openFilepath, type: 'png', omitBackground: TRANSPARENT });
             console.log(`  ✅ ${openFilename}`);
             await selectHandle.evaluate((el) => {

app/src/content/article.mdx CHANGED Viewed

The diff for this file is too large to render. See raw diff

app/src/pages/dataviz.astro CHANGED Viewed

@@ -247,7 +247,7 @@ const visualsWithMeta = visuals.map((item: any) => {
                                             <p class="header-desc">{item.desc || item.caption}</p>
                                         )}
                                         {item.anchorId && (
-                                            <a href={`/#${item.anchorId}`} class="header-link" target="_blank" rel="noopener">
                                                 View in article →
                                             </a>
                                         )}

                                             <p class="header-desc">{item.desc || item.caption}</p>
                                         )}
                                         {item.anchorId && (
+                                            <a href={`/#${item.anchorId}`} class="header-link">
                                                 View in article →
                                             </a>
                                         )}

app/src/pages/index.astro CHANGED Viewed

@@ -197,6 +197,36 @@ const licence =
         } catch {}
       })();
     </script>
     <script type="module" src="/scripts/color-palettes.js"></script>
     <script src="https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js"></script>

         } catch {}
       })();
     </script>
+    <!-- Hash Router for HF Spaces compatibility -->
+    <script is:inline>
+      (() => {
+        // Lookup table: the path written after '#' -> the real page path.
+        const hashRoutes = {
+          '/dataviz': '/dataviz',
+          '/trackio': '/trackio',
+        };
+        // Redirect when the current hash is a known '#/route'. Any other hash
+        // (for example an ordinary in-page anchor) is left untouched.
+        const redirectFromHash = () => {
+          const { hash } = window.location;
+          if (!hash.startsWith('#/')) return;
+          // Drop the leading '#' to obtain the lookup key.
+          const destination = hashRoutes[hash.slice(1)];
+          if (destination) {
+            window.location.href = destination;
+          }
+        };
+        // Run once for the URL the page was loaded with...
+        redirectFromHash();
+        // ...and again whenever the hash changes afterwards.
+        window.addEventListener('hashchange', redirectFromHash);
+      })();
+    </script>
     <script type="module" src="/scripts/color-palettes.js"></script>
     <script src="https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js"></script>

app/yarn.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff