Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions src/commands/import.js
Original file line number Diff line number Diff line change
Expand Up @@ -3074,6 +3074,16 @@ const LLMS_PROBE_CAP = 30
// list hundreds of URLs; without this a single root hit could schedule
// dozens of unrelated probes (`/blog`, `/pricing`, …).
const LLMS_CHILDREN_PER_NODE = 12
// Budget for the explicit-link phase of discovery (phase 2 in discoverLlmsTxt).
// Separate from LLMS_PROBE_CAP so following authored `llms.txt` links — a
// strong, deliberate signal, e.g. docs.snowflake.com's root index listing ~40
// per-section files — isn't starved by the speculative slug-walk's budget.
// Counted per URL fetched in phase 2 (hits and misses alike); once the count
// reaches this value, phase 2 stops and any remaining explicit links are not
// followed.
const EXPLICIT_LLMS_PROBE_CAP = 200

// Matches ONLY the canonical index filename, not sibling dumps like
// `llms-full.txt` (concatenated page content, not a link index) — those would
// add no link rows and just waste a fetch.
// Anchored at the end and case-insensitive; it is tested against a URL's
// `pathname`, so query strings and fragments never take part in the match.
const LLMS_INDEX_RE = /\/llms\.txt$/i

/**
* Convert a path like `/docs/quickstart` into its llms.txt URL.
Expand All @@ -3083,6 +3093,39 @@ function pathToLlmsUrl(origin, path) {
return `${origin}${path}/llms.txt`
}

/**
 * Map an llms.txt URL to the directory path it describes: the URL's pathname
 * with a trailing `/llms.txt` (any letter case) removed. A file sitting at the
 * site root maps to `/`. Explicit-link hits need this so they carry the same
 * `path` shape as slug-walk hits — mergeValidHits sorts by it (deepest wins)
 * and narrowToDocsSubtreeIfNeeded reads it.
 *
 * @param {string} llmsUrl Absolute URL of a fetched llms.txt.
 * @returns {string} Pathname without the `/llms.txt` suffix, or `/` when
 *   nothing is left.
 * @throws {TypeError} If `llmsUrl` is not a parseable absolute URL.
 */
function llmsPathFromUrl(llmsUrl) {
  const { pathname } = new URL(llmsUrl)
  const stripped = pathname.replace(/\/llms\.txt$/i, '')
  // A root-level `/llms.txt` strips down to the empty string; report it as `/`.
  if (stripped === '') return '/'
  return stripped
}

/**
 * Pull explicit child-index links out of a parsed llms.txt — item rows whose
 * URL literally points at another `llms.txt` (e.g. Snowflake's root file
 * listing `…/data-integration/llms.txt`). Restricted to the same origin as the
 * source: cross-origin llms.txt would drag in unrelated third-party docs, and
 * the slug-walk's path math assumes one origin.
 *
 * Returned URLs have any `#fragment` removed. A fragment is never sent to the
 * server, so `…/llms.txt#top` and `…/llms.txt` are the same fetch; leaving it
 * in would let the pair slip past this function's dedupe and the caller's
 * tried-URL set, costing a duplicate probe and a duplicate hit.
 *
 * @param {{sections?: Array<{items?: Array<{url?: string}>}>}} parsed
 *   Parsed llms.txt; a missing `sections` or `items` list is treated as empty.
 * @param {string} originScope Origin (e.g. `https://docs.example.com`) that a
 *   link must share to be returned.
 * @returns {string[]} Deduped, fragment-free absolute URLs in first-seen order.
 */
function extractNestedLlmsUrls(parsed, originScope) {
  const found = new Set()
  for (const section of parsed?.sections ?? []) {
    for (const item of section?.items ?? []) {
      let link
      try {
        link = new URL(item?.url)
      } catch {
        // Relative, missing, or otherwise unparseable URL — not a candidate.
        // Only the parse is guarded so unrelated failures are not swallowed.
        continue
      }
      if (link.origin !== originScope || !LLMS_INDEX_RE.test(link.pathname)) continue
      link.hash = ''
      found.add(link.toString())
    }
  }
  return [...found]
}

/**
* Given a current path + a parsed llms.txt, return the next paths to probe:
* the parent directory (UP) and distinct first-segment-children extracted
Expand Down Expand Up @@ -3186,6 +3229,55 @@ async function discoverLlmsTxt(sourceUrl) {
frontier = next
}

// Phase 2 — follow EXPLICIT llms.txt links authored inside the files phase 1
// discovered, recursively. No slug-guessing here: we only fetch URLs that
// literally point at an llms.txt (a root index listing per-section files,
// like docs.snowflake.com). Each new file's own explicit links are followed
// in turn until none remain. Dedupes against every URL phase 1 touched
// (hits + misses) and against itself; phase 1's `tried` set is keyed by path,
// so we track full llms.txt URLs separately here and leave phase 1 untouched.
const triedUrls = new Set([...hits.map((h) => h.llmsUrl), ...misses.map((m) => m.url)])
let explicitFrontier = []
for (const hit of hits) {
for (const u of extractNestedLlmsUrls(hit.parsed, sourceUrl.origin)) {
if (!triedUrls.has(u)) explicitFrontier.push(u)
}
}

let explicitProbes = 0
while (explicitFrontier.length > 0 && explicitProbes < EXPLICIT_LLMS_PROBE_CAP) {
const ring = []
for (const u of explicitFrontier) {
if (triedUrls.has(u)) continue
triedUrls.add(u)
ring.push(u)
explicitProbes++
if (explicitProbes >= EXPLICIT_LLMS_PROBE_CAP) break
}
if (ring.length === 0) break

const results = await Promise.all(ring.map(async (llmsUrl) => ({ llmsUrl, res: await fetchLlmsTxt(llmsUrl) })))

const next = []
for (const { llmsUrl, res } of results) {
if (!res.ok) {
misses.push({ url: llmsUrl, status: res.status, error: res.error })
continue
}
const linkItems = res.stats?.linkItems ?? 0
if (linkItems === 0) {
misses.push({ url: llmsUrl, status: 'no link items' })
continue
}
hits.push({ ...res, llmsUrl, path: llmsPathFromUrl(llmsUrl) })
if (!res.usable) skipped.push({ url: llmsUrl, reason: res.reason })
for (const nested of extractNestedLlmsUrls(res.parsed, sourceUrl.origin)) {
if (!triedUrls.has(nested)) next.push(nested)
}
}
explicitFrontier = next
}

return { hits, misses, skipped }
}

Expand Down