Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 44 additions & 163 deletions data/onPostBuild/llmstxt.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import { GatsbyNode } from 'gatsby';
import * as path from 'path';
import * as fs from 'fs';
import languageInfo from '../../src/data/languages/languageInfo';

/**
* This script is used to create a file called llms.txt that contains a list of all the pages in the site.
Expand All @@ -21,46 +20,12 @@ const LLMS_TXT_PREAMBLE = `# Ably Documentation

// Prefix placed at the start of the Gatsby reporter messages emitted by this script,
// so that they can be traced back to the onPostBuild step.
const REPORTER_PREFIX = 'onPostBuild:';

// Language keys that are allowed to produce language-specific entries in llms.txt.
// Languages detected in a page's code fences that are not listed here are ignored.
const VALID_LANGUAGES: string[] = [
  // Web / server-side JavaScript
  'javascript',
  'nodejs',
  // Other SDK languages
  'csharp',
  'flutter',
  'java',
  'objc',
  'php',
  'python',
  'ruby',
  'swift',
  'go',
  'kotlin',
  // UI framework
  'react',
];

/**
 * Resolves the human-readable label for a language key.
 *
 * @param languageKey - Key to look up in `languageInfo`.
 * @returns The entry's `label` when the key is known and the label is non-empty;
 * otherwise the key itself.
 */
const getLanguageLabel = (languageKey: string): string => {
  const entry = languageInfo[languageKey as keyof typeof languageInfo];
  const label = entry?.label;

  // A missing entry and an empty label both fall back to the raw key.
  return label ? label : languageKey;
};

interface DocumentQueryResult {
site: {
siteMetadata: {
siteUrl: string;
};
};
allFileHtml: {
edges: {
node: {
slug: string;
meta: {
title: string;
meta_description: string;
languages?: string[];
};
};
}[];
};
allMdx: {
nodes: {
parent: {
Expand All @@ -71,9 +36,6 @@ interface DocumentQueryResult {
title?: string;
meta_description?: string;
};
internal: {
contentFilePath?: string;
};
}[];
};
}
Expand All @@ -96,15 +58,13 @@ interface CategoryStructure {
pages?: Array<{
slug: string;
meta: { title: string; meta_description: string };
languages: string[];
}>;
subcategories: {
[subcategory: string]: {
title: string;
pages: Array<{
slug: string;
meta: { title: string; meta_description: string };
languages: string[];
}>;
};
};
Expand Down Expand Up @@ -157,6 +117,12 @@ const categorizePage = (slug: string): { category: string; subcategory?: string

// LiveSync
livesync: { category: 'LiveSync' },

// AI Transport
'ai-transport': { category: 'AI Transport' },

// General - FAQs
faq: { category: 'General', subcategory: 'FAQs' },
};

// Try to match two-part path first (e.g., "platform/account"), then single part (e.g., "platform")
Expand Down Expand Up @@ -199,6 +165,11 @@ const categorizePage = (slug: string): { category: string; subcategory?: string
return { category: 'Platform', subcategory: 'Control API' };
}

// Special handling for guides/ai-transport - route to AI Transport category with Guides subcategory
if (firstPart === 'guides' && secondPart === 'ai-transport') {
return { category: 'AI Transport', subcategory: 'Guides' };
}

if (categoryMap[firstPart]) {
return categoryMap[firstPart];
}
Expand All @@ -207,48 +178,6 @@ const categorizePage = (slug: string): { category: string; subcategory?: string
return { category: 'General', subcategory: 'Documentation' };
};

/**
 * Collects the languages named on fenced code blocks in a file.
 *
 * Every run of three backticks immediately followed by word characters is treated
 * as a fence tag. A tag is split on `_` and only its final segment is kept, so
 * `realtime_javascript` is recorded as `javascript`; a tag whose final segment is
 * empty (for example `rest_`) is dropped.
 *
 * @param filePath - Path of the file to scan.
 * @returns The set of languages found. The set is empty when the file does not
 * exist or when reading it fails (failures are logged with `console.error`).
 */
const extractCodeLanguages = async (filePath: string): Promise<Set<string>> => {
  const found = new Set<string>();

  try {
    // Nothing to scan when the path does not point at an existing file.
    if (!fs.existsSync(filePath)) {
      return found;
    }

    const source = fs.readFileSync(filePath, 'utf8');
    const fencePattern = /```(\w+)/g;

    for (let hit = fencePattern.exec(source); hit !== null; hit = fencePattern.exec(source)) {
      const tag = hit[1];
      if (!tag) {
        continue;
      }

      // A tag without `_` splits into a single segment, so the same
      // "keep the last segment" rule covers plain and prefixed tags alike.
      const segments = tag.split('_');
      const language = segments[segments.length - 1];
      if (language) {
        found.add(language);
      }
    }

    return found;
  } catch (error) {
    console.error(`Error extracting code element classes from ${filePath}:`, error);
    return new Set();
  }
};

export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter, basePath }) => {
const query = `
query {
Expand All @@ -258,19 +187,6 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter
}
}

allFileHtml {
edges {
node {
slug
meta {
title
meta_description
languages
}
}
}
}

allMdx {
nodes {
parent {
Expand All @@ -283,9 +199,6 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter
title
meta_description
}
internal {
contentFilePath
}
}
}
}
Expand All @@ -309,38 +222,30 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter
throw new Error('Site URL not found.');
}

// Process MDX pages (allMdx) and extract languages from files
const pages = await Promise.all(
queryRecords.allMdx.nodes
.filter((node) => {
// Only include pages from docs directory that have the required frontmatter
return (
node.parent.relativeDirectory.startsWith('docs') &&
node.frontmatter?.title &&
node.frontmatter?.meta_description
);
})
.map(async (node) => {
// Create slug from parent file info - remove 'docs/' prefix since it's already in relativeDirectory
const slug = (
node.parent.relativeDirectory + (node.parent.name === 'index' ? '' : `/${node.parent.name}`)
).replace(/^docs\//, '');

// Extract valid languages from the file content
const filePath = node.internal.contentFilePath || '';
const detectedLanguages = await extractCodeLanguages(filePath);
const languages = Array.from(detectedLanguages).filter((lang) => VALID_LANGUAGES.includes(lang));

return {
slug,
meta: {
title: node.frontmatter.title!,
meta_description: node.frontmatter.meta_description!,
},
languages,
};
}),
);
// Process MDX pages (allMdx)
const pages = queryRecords.allMdx.nodes
.filter((node) => {
// Only include pages from docs directory that have the required frontmatter
return (
node.parent.relativeDirectory.startsWith('docs') &&
node.frontmatter?.title &&
node.frontmatter?.meta_description
);
})
.map((node) => {
// Create slug from parent file info - remove 'docs/' prefix since it's already in relativeDirectory
const slug = (
node.parent.relativeDirectory + (node.parent.name === 'index' ? '' : `/${node.parent.name}`)
).replace(/^docs\//, '');

return {
slug,
meta: {
title: node.frontmatter.title!,
meta_description: node.frontmatter.meta_description!,
},
};
});

reporter.info(`${REPORTER_PREFIX} Found ${pages.length} pages to place into llms.txt`);

Expand Down Expand Up @@ -382,7 +287,7 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter
const serializedPages = [LLMS_TXT_PREAMBLE];

// Define the order of categories
const categoryOrder = ['Platform', 'Pub/Sub', 'Chat', 'Spaces', 'LiveObjects', 'LiveSync', 'General'];
const categoryOrder = ['Platform', 'Pub/Sub', 'Chat', 'Spaces', 'LiveObjects', 'LiveSync', 'AI Transport', 'General'];

// Sort categories by defined order
const sortedCategories = Object.keys(categoryStructure).sort((a, b) => {
Expand All @@ -401,49 +306,25 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter
});

// Helper function to serialize pages
// Note: We only generate the base .md URL since the markdown endpoint returns
// the same content regardless of language parameter - all language code snippets
// are included in the single markdown file.
const serializePages = (
pages: Array<{ slug: string; meta: { title: string; meta_description: string }; languages: string[] }>,
pages: Array<{ slug: string; meta: { title: string; meta_description: string } }>,
) => {
for (const page of pages) {
const { slug, meta, languages } = page;
const { slug, meta } = page;
const { title, meta_description } = meta;

try {
const baseUrl = prefixPath({ url: `/docs/${slug}`, siteUrl, pathPrefix: basePath });
const baseUrl = prefixPath({ url: `/docs/${slug}.md`, siteUrl, pathPrefix: basePath });
const safeTitle = escapeMarkdown(title);

// Generate base page entry (without language parameter)
// Generate base page entry only (no language-specific variants needed)
// The markdown file contains all language code snippets
const baseLink = `[${safeTitle}](${baseUrl})`;
const baseLine = `- ${[baseLink, meta_description].join(': ')}`;
serializedPages.push(baseLine);

// Generate language-specific entries if the page has multiple languages
// Skip language variants that match the page's primary language (e.g., skip ?lang=go for /getting-started/go)
// Only generate language variants if there are 2 or more languages
if (languages && languages.length > 1) {
// Extract the last part of the slug to check if it matches a language
const slugParts = slug.split('/');
const slugLastPart = slugParts[slugParts.length - 1];

// Map slug names to their corresponding language codes
const slugToLangMap: Record<string, string> = {
dotnet: 'csharp',
'objective-c': 'objc',
};

// Get the primary language for this page (either direct match or mapped)
const primaryLanguage = slugToLangMap[slugLastPart] || slugLastPart;

for (const language of languages) {
// Skip if the language matches the page's primary language
if (language !== primaryLanguage) {
const langUrl = `${baseUrl}?lang=${language}`;
const langLink = `[${safeTitle} (${getLanguageLabel(language)})](${langUrl})`;
const langLine = `- ${[langLink, meta_description].join(': ')}`;
serializedPages.push(langLine);
}
}
}
} catch (err) {
reporter.panic(`${REPORTER_PREFIX} Error serializing pages`, err as Error);
}
Expand Down