Commit 6c1ef56

Merge pull request #96 from evalstate/feat/markdown-docs
prefer markdown if offered by hugging face docs
2 parents 74c979f + 0be578a commit 6c1ef56
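
What the merge changes: document fetches now send an `accept: text/markdown` header to the docs host and keep the response as-is when markdown (or plain text) comes back, only falling back to HTML-to-Markdown conversion otherwise. A minimal sketch of that negotiation, written as a standalone helper rather than the `DocFetchTool` class edited below (`fetchDocAsMarkdown` and `htmlToMarkdown` are illustrative names, not part of this PR):

```ts
// Sketch only: the real logic lives in DocFetchTool.fetch (see the doc-fetch.ts diff below).
async function fetchDocAsMarkdown(
  url: string,
  htmlToMarkdown: (html: string) => string, // stand-in for the turndown-based conversion
): Promise<string> {
  // Ask the docs host for markdown up front.
  const response = await fetch(url, { headers: { accept: 'text/markdown' } });
  if (!response.ok) {
    throw new Error(`Failed to fetch document: ${response.status} ${response.statusText}`);
  }

  const body = await response.text();
  const contentType = response.headers.get('content-type') || '';

  // If the host already served markdown or plain text, use it unchanged;
  // otherwise convert the HTML as before.
  if (contentType.includes('text/markdown') || contentType.includes('text/plain')) {
    return body;
  }
  return htmlToMarkdown(body);
}
```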

File tree

2 files changed: +146 -53 lines changed


packages/mcp/src/docs-search/doc-fetch.test.ts

Lines changed: 94 additions & 17 deletions
@@ -1,9 +1,37 @@
-import { describe, it, expect, vi } from 'vitest';
+import { describe, it, expect, vi, afterEach } from 'vitest';
 import { DocFetchTool, normalizeDocUrl } from './doc-fetch.js';
 
+const createMockResponse = ({
+  content,
+  contentType = 'text/html',
+  status = 200,
+  statusText = 'OK',
+}: {
+  content: string;
+  contentType?: string;
+  status?: number;
+  statusText?: string;
+}) =>
+  new Response(content, {
+    status,
+    statusText,
+    headers: { 'content-type': contentType },
+  });
+
+const stubFetch = (factory: () => Response) => {
+  const fetchMock = vi.fn().mockImplementation(() => Promise.resolve(factory()));
+  vi.stubGlobal('fetch', fetchMock);
+  return fetchMock;
+};
+
 describe('DocFetchTool', () => {
   const tool = new DocFetchTool();
 
+  afterEach(() => {
+    vi.clearAllMocks();
+    vi.unstubAllGlobals();
+  });
+
   describe('URL validation', () => {
     it('should accept valid HF and Gradio docs URLs', () => {
       const validUrls = [
@@ -38,13 +66,30 @@ describe('DocFetchTool', () => {
   });
 
   describe('document chunking', () => {
+    it('uses markdown content from host when available', async () => {
+      const markdown = '# Heading\nBody content';
+      const fetchMock = stubFetch(() =>
+        createMockResponse({
+          content: markdown,
+          contentType: 'text/markdown',
+        }),
+      );
+
+      const result = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test' });
+      expect(fetchMock).toHaveBeenCalledWith('https://huggingface.co/docs/test', {
+        headers: { accept: 'text/markdown' },
+      });
+      expect(result).toBe(markdown);
+    });
+
     it('should return small documents without chunking', async () => {
       // Mock fetch to return HTML that converts to short markdown
-      global.fetch = vi.fn().mockResolvedValue({
-        ok: true,
-        text: () => Promise.resolve('<h1>Short Document</h1><p>This is a short document.</p>'),
-      });
+      stubFetch(() =>
+        createMockResponse({
+          content: '<h1>Short Document</h1><p>This is a short document.</p>',
+        }),
+      );
 
       const result = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test' });
 
@@ -57,10 +102,11 @@ describe('DocFetchTool', () => {
       // Mock fetch to return HTML that converts to long markdown
       const longHtml = '<h1>Long Document</h1>' + '<p>This is a very long sentence that will be repeated many times to create a document that exceeds the 7500 token limit for testing chunking functionality.</p>'.repeat(200);
 
-      global.fetch = vi.fn().mockResolvedValue({
-        ok: true,
-        text: () => Promise.resolve(longHtml),
-      });
+      stubFetch(() =>
+        createMockResponse({
+          content: longHtml,
+        }),
+      );
 
       const result = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test' });
 
@@ -74,21 +120,51 @@ describe('DocFetchTool', () => {
         { in: 'https://gradio.app/guides/x', out: 'https://www.gradio.app/guides/x' },
         { in: 'https://www.gradio.app/guides/x', out: 'https://www.gradio.app/guides/x' },
         { in: 'https://huggingface.co/docs/transformers', out: 'https://huggingface.co/docs/transformers' },
+        { in: '/docs/diffusers/index', out: 'https://huggingface.co/docs/diffusers/index' },
+        { in: './docs/diffusers/index', out: 'https://huggingface.co/docs/diffusers/index' },
         { in: 'not a url', out: 'not a url' },
       ];
       for (const c of cases) {
         expect(normalizeDocUrl(c.in)).toBe(c.out);
       }
     });
 
+    it('normalizes relative doc paths to the huggingface docs host', async () => {
+      const fetchMock = stubFetch(() =>
+        createMockResponse({
+          content: '<h1>Title</h1><p>Body</p>',
+        }),
+      );
+
+      const result = await tool.fetch({ doc_url: '/docs/test' });
+      expect(fetchMock).toHaveBeenCalledWith('https://huggingface.co/docs/test', {
+        headers: { accept: 'text/markdown' },
+      });
+      expect(result).toContain('# Title');
+    });
+
+    it('normalizes ./docs paths to the huggingface docs host', async () => {
+      const fetchMock = stubFetch(() =>
+        createMockResponse({
+          content: '<h1>Another Title</h1><p>Body</p>',
+        }),
+      );
+
+      await tool.fetch({ doc_url: './docs/another' });
+      expect(fetchMock).toHaveBeenCalledWith('https://huggingface.co/docs/another', {
+        headers: { accept: 'text/markdown' },
+      });
+    });
+
     it('should return subsequent chunks with offset', async () => {
       // Mock fetch to return the same long HTML
       const longHtml = '<h1>Long Document</h1>' + '<p>This is a very long sentence that will be repeated many times to create a document that exceeds the 7500 token limit for testing chunking functionality.</p>'.repeat(200);
 
-      global.fetch = vi.fn().mockResolvedValue({
-        ok: true,
-        text: () => Promise.resolve(longHtml),
-      });
+      stubFetch(() =>
+        createMockResponse({
+          content: longHtml,
+        }),
+      );
 
       // Get first chunk
       const firstChunk = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test' });
@@ -106,10 +182,11 @@ describe('DocFetchTool', () => {
     });
 
     it('should handle offset beyond document length', async () => {
-      global.fetch = vi.fn().mockResolvedValue({
-        ok: true,
-        text: () => Promise.resolve('<h1>Short Document</h1><p>This is short.</p>'),
-      });
+      stubFetch(() =>
+        createMockResponse({
+          content: '<h1>Short Document</h1><p>This is short.</p>',
+        }),
+      );
 
       const result = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test', offset: 10000 });
 
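The test changes above replace direct assignment to `global.fetch` with vitest's `vi.stubGlobal`, returning real `Response` objects so the code under test can read the `content-type` header. A stripped-down version of that pattern (the test body here is illustrative, not one of the tests added in this commit):

```ts
import { afterEach, expect, it, vi } from 'vitest';

// A real Response lets the code under test inspect status and content-type.
const createMockResponse = (content: string, contentType = 'text/html') =>
  new Response(content, { status: 200, headers: { 'content-type': contentType } });

afterEach(() => {
  // Restore the real global fetch after each test.
  vi.unstubAllGlobals();
});

it('serves a markdown response straight through', async () => {
  const fetchMock = vi.fn().mockResolvedValue(createMockResponse('# Heading', 'text/markdown'));
  vi.stubGlobal('fetch', fetchMock);

  const response = await fetch('https://huggingface.co/docs/test');
  expect(fetchMock).toHaveBeenCalledWith('https://huggingface.co/docs/test');
  expect(response.headers.get('content-type')).toBe('text/markdown');
  expect(await response.text()).toBe('# Heading');
});
```
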
packages/mcp/src/docs-search/doc-fetch.ts

Lines changed: 52 additions & 36 deletions
@@ -67,7 +67,8 @@ export class DocFetchTool {
       const nodeName = ((node as unknown as { nodeName?: string }).nodeName || '').toLowerCase();
       if (nodeName === 'img') {
         try {
-          const src = (node as unknown as { getAttribute?: (name: string) => string | null }).getAttribute?.('src') ??
+          const src =
+            (node as unknown as { getAttribute?: (name: string) => string | null }).getAttribute?.('src') ??
             ((node as unknown as { src?: string }).src || '');
           if (
             /\.svg(\?|$)/i.test(src) ||
@@ -87,21 +88,30 @@ export class DocFetchTool {
     this.turndownService.addRule('dropHeadingAnchors', {
       filter: (node) => {
         try {
-          const n = node as unknown as { nodeName?: string; getAttribute?: (k: string) => string | null; textContent?: string; childNodes?: Array<{ nodeName?: string }> };
+          const n = node as unknown as {
+            nodeName?: string;
+            getAttribute?: (k: string) => string | null;
+            textContent?: string;
+            childNodes?: Array<{ nodeName?: string }>;
+          };
           if ((n.nodeName || '').toLowerCase() !== 'a') return false;
           const href = n.getAttribute?.('href') || '';
           if (!href || !href.startsWith('#')) return false;
           const text = (n.textContent || '').trim();
           const children = (n as unknown as { childNodes?: Array<{ nodeName?: string }> }).childNodes || [];
-          const onlyIcons = children.length > 0 && children.every((c) => ((c.nodeName || '').toLowerCase() === 'img' || (c.nodeName || '').toLowerCase() === 'svg'));
+          const onlyIcons =
+            children.length > 0 &&
+            children.every(
+              (c) => (c.nodeName || '').toLowerCase() === 'img' || (c.nodeName || '').toLowerCase() === 'svg'
+            );
           const looksLikeEncodedSvg = /data:image\/svg\+xml|%3csvg|svg%2bxml/i.test(text);
           const noAlnumText = text.length <= 3 && !/[a-z0-9]/i.test(text);
           return onlyIcons || looksLikeEncodedSvg || noAlnumText;
         } catch {
           return false;
         }
       },
-      replacement: () => ''
+      replacement: () => '',
     });
   }
 
@@ -136,37 +146,35 @@ export class DocFetchTool {
       const normalizedUrl = normalizeDocUrl(params.doc_url);
       this.validateUrl(normalizedUrl);
 
-      const response = await fetch(normalizedUrl);
-
+      const response = await fetch(normalizedUrl, { headers: { accept: 'text/markdown' } });
       if (!response.ok) {
         throw new Error(`Failed to fetch document: ${response.status} ${response.statusText}`);
       }
-
-      const htmlContent = await response.text();
-
-      // Convert HTML to Markdown
-      let fullMarkdownContent = this.turndownService.turndown(htmlContent);
-
-      // Post-process: strip any leftover SVG images that slipped past DOM filters
-      // - Markdown images pointing to data:image/svg+xml or *.svg
-      // - Empty links left behind after image removal: [](...)
-      fullMarkdownContent = fullMarkdownContent
-        .replace(/!\[[^\]]*\]\(\s*(?:data:image\/svg\+xml[^)]*|[^)]*\.svg(?:\?[^)]*)?)\s*\)/gi, '')
-        .replace(/\[\s*\]\(\s*[^)]*\s*\)/g, '');
-
-      // Remove anchors whose link text still contains encoded SVG payloads (edge cases)
-      fullMarkdownContent = fullMarkdownContent
-        .replace(/\[[^\]]*(?:data:image\/svg\+xml|%3csvg|svg%2bxml)[^\]]*\]\([^)]*\)/gi, '');
+      let content = await response.text();
+      const contentType = response.headers.get('content-type') || '';
+      const isPlainOrMarkdown = contentType.includes('text/plain') || contentType.includes('text/markdown');
+      if (!isPlainOrMarkdown) {
+        // attempt conversion to markdown
+        content = this.turndownService.turndown(content);
+
+        // Post-process: strip any leftover SVG images that slipped past DOM filters
+        // - Markdown images pointing to data:image/svg+xml or *.svg
+        // - Empty links left behind after image removal: [](...)
+        content = content
+          .replace(/!\[[^\]]*\]\(\s*(?:data:image\/svg\+xml[^)]*|[^)]*\.svg(?:\?[^)]*)?)\s*\)/gi, '')
+          .replace(/\[\s*\]\(\s*[^)]*\s*\)/g, '');
+
+        // Remove anchors whose link text still contains encoded SVG payloads (edge cases)
+        content = content.replace(/\[[^\]]*(?:data:image\/svg\+xml|%3csvg|svg%2bxml)[^\]]*\]\([^)]*\)/gi, '');
+      }
 
       // Apply chunking logic
-      return this.applyChunking(fullMarkdownContent, params.offset || 0);
+      return this.applyChunking(content, params.offset || 0);
     } catch (error) {
       throw new Error(`Failed to fetch document: ${error instanceof Error ? error.message : 'Unknown error'}`);
     }
   }
 
-
-
   /**
    * Apply chunking logic to markdown content
    */
@@ -213,15 +221,23 @@
  * - Convert gradio.app → www.gradio.app so pages resolve correctly
  */
 export function normalizeDocUrl(input: string): string {
-  try {
-    const url = new URL(input);
-    const host = url.hostname.toLowerCase();
-    if (host === 'gradio.app') {
-      url.hostname = 'www.gradio.app';
-      return url.toString();
-    }
-    return input;
-  } catch {
-    return input;
-  }
+  try {
+    const trimmed = input.trim();
+    if (trimmed.startsWith('/docs')) {
+      return `https://huggingface.co${trimmed}`;
+    }
+    if (trimmed.startsWith('./docs')) {
+      return `https://huggingface.co/${trimmed.slice(2)}`;
+    }
+
+    const url = new URL(trimmed);
+    const host = url.hostname.toLowerCase();
+    if (host === 'gradio.app') {
+      url.hostname = 'www.gradio.app';
+      return url.toString();
+    }
+    return trimmed;
+  } catch {
+    return input;
+  }
 }
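
Taken together, `normalizeDocUrl` now resolves the relative `/docs/...` and `./docs/...` links that Hugging Face doc pages emit, and `DocFetchTool.fetch` requests markdown before falling back to conversion. A usage sketch, assuming the exports shown in the diff above (the `/docs/transformers/index` path is just an example URL):

```ts
import { DocFetchTool, normalizeDocUrl } from './doc-fetch.js';

// Relative doc paths resolve to the canonical huggingface.co docs host.
console.log(normalizeDocUrl('/docs/diffusers/index'));        // https://huggingface.co/docs/diffusers/index
console.log(normalizeDocUrl('./docs/diffusers/index'));       // https://huggingface.co/docs/diffusers/index
console.log(normalizeDocUrl('https://gradio.app/guides/x'));  // https://www.gradio.app/guides/x

// fetch() sends `accept: text/markdown`; markdown/plain responses are returned
// as-is, HTML responses are converted and post-processed as before.
const tool = new DocFetchTool();
const markdown = await tool.fetch({ doc_url: '/docs/transformers/index' });
console.log(markdown.slice(0, 200));
```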
