Commit 6c1ef56

Merge pull request #96 from evalstate/feat/markdown-docs
prefer markdown if offered by hugging face docs
2 parents 74c979f + 0be578a commit 6c1ef56
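
What the merge changes: document fetches now send an `accept: text/markdown` header to the docs host and keep the response as-is when markdown (or plain text) comes back, only falling back to HTML-to-Markdown conversion otherwise. A minimal sketch of that negotiation, written as a standalone helper rather than the `DocFetchTool` class edited below (`fetchDocAsMarkdown` and `htmlToMarkdown` are illustrative names, not part of this PR):

```ts
// Sketch only: the real logic lives in DocFetchTool.fetch (see the doc-fetch.ts diff below).
async function fetchDocAsMarkdown(
  url: string,
  htmlToMarkdown: (html: string) => string, // stand-in for the turndown-based conversion
): Promise<string> {
  // Ask the docs host for markdown up front.
  const response = await fetch(url, { headers: { accept: 'text/markdown' } });
  if (!response.ok) {
    throw new Error(`Failed to fetch document: ${response.status} ${response.statusText}`);
  }

  const body = await response.text();
  const contentType = response.headers.get('content-type') || '';

  // If the host already served markdown or plain text, use it unchanged;
  // otherwise convert the HTML as before.
  if (contentType.includes('text/markdown') || contentType.includes('text/plain')) {
    return body;
  }
  return htmlToMarkdown(body);
}
```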

File tree

2 files changed: +146 -53 lines changed


packages/mcp/src/docs-search/doc-fetch.test.ts

Lines changed: 94 additions & 17 deletions
@@ -1,9 +1,37 @@
-import { describe, it, expect, vi } from 'vitest';
+import { describe, it, expect, vi, afterEach } from 'vitest';
 import { DocFetchTool, normalizeDocUrl } from './doc-fetch.js';
 
+const createMockResponse = ({
+  content,
+  contentType = 'text/html',
+  status = 200,
+  statusText = 'OK',
+}: {
+  content: string;
+  contentType?: string;
+  status?: number;
+  statusText?: string;
+}) =>
+  new Response(content, {
+    status,
+    statusText,
+    headers: { 'content-type': contentType },
+  });
+
+const stubFetch = (factory: () => Response) => {
+  const fetchMock = vi.fn().mockImplementation(() => Promise.resolve(factory()));
+  vi.stubGlobal('fetch', fetchMock);
+  return fetchMock;
+};
+
 describe('DocFetchTool', () => {
   const tool = new DocFetchTool();
 
+  afterEach(() => {
+    vi.clearAllMocks();
+    vi.unstubAllGlobals();
+  });
+
   describe('URL validation', () => {
     it('should accept valid HF and Gradio docs URLs', () => {
       const validUrls = [
@@ -38,13 +66,30 @@ describe('DocFetchTool', () => {
   });
 
   describe('document chunking', () => {
+    it('uses markdown content from host when available', async () => {
+      const markdown = '# Heading\nBody content';
+      const fetchMock = stubFetch(() =>
+        createMockResponse({
+          content: markdown,
+          contentType: 'text/markdown',
+        }),
+      );
+
+      const result = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test' });
+      expect(fetchMock).toHaveBeenCalledWith('https://huggingface.co/docs/test', {
+        headers: { accept: 'text/markdown' },
+      });
+      expect(result).toBe(markdown);
+    });
+
     it('should return small documents without chunking', async () => {
       // Mock fetch to return HTML that converts to short markdown
-      global.fetch = vi.fn().mockResolvedValue({
-        ok: true,
-        text: () => Promise.resolve('<h1>Short Document</h1><p>This is a short document.</p>'),
-      });
+      stubFetch(() =>
+        createMockResponse({
+          content: '<h1>Short Document</h1><p>This is a short document.</p>',
+        }),
+      );
 
       const result = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test' });
 
@@ -57,10 +102,11 @@ describe('DocFetchTool', () => {
       // Mock fetch to return HTML that converts to long markdown
       const longHtml = '<h1>Long Document</h1>' + '<p>This is a very long sentence that will be repeated many times to create a document that exceeds the 7500 token limit for testing chunking functionality.</p>'.repeat(200);
 
-      global.fetch = vi.fn().mockResolvedValue({
-        ok: true,
-        text: () => Promise.resolve(longHtml),
-      });
+      stubFetch(() =>
+        createMockResponse({
+          content: longHtml,
+        }),
+      );
 
       const result = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test' });
 
@@ -74,21 +120,51 @@ describe('DocFetchTool', () => {
         { in: 'https://gradio.app/guides/x', out: 'https://www.gradio.app/guides/x' },
         { in: 'https://www.gradio.app/guides/x', out: 'https://www.gradio.app/guides/x' },
         { in: 'https://huggingface.co/docs/transformers', out: 'https://huggingface.co/docs/transformers' },
+        { in: '/docs/diffusers/index', out: 'https://huggingface.co/docs/diffusers/index' },
+        { in: './docs/diffusers/index', out: 'https://huggingface.co/docs/diffusers/index' },
         { in: 'not a url', out: 'not a url' },
       ];
       for (const c of cases) {
         expect(normalizeDocUrl(c.in)).toBe(c.out);
       }
     });
 
+    it('normalizes relative doc paths to the huggingface docs host', async () => {
+      const fetchMock = stubFetch(() =>
+        createMockResponse({
+          content: '<h1>Title</h1><p>Body</p>',
+        }),
+      );
+
+      const result = await tool.fetch({ doc_url: '/docs/test' });
+      expect(fetchMock).toHaveBeenCalledWith('https://huggingface.co/docs/test', {
+        headers: { accept: 'text/markdown' },
+      });
+      expect(result).toContain('# Title');
+    });
+
+    it('normalizes ./docs paths to the huggingface docs host', async () => {
+      const fetchMock = stubFetch(() =>
+        createMockResponse({
+          content: '<h1>Another Title</h1><p>Body</p>',
+        }),
+      );
+
+      await tool.fetch({ doc_url: './docs/another' });
+      expect(fetchMock).toHaveBeenCalledWith('https://huggingface.co/docs/another', {
+        headers: { accept: 'text/markdown' },
+      });
+    });
+
     it('should return subsequent chunks with offset', async () => {
       // Mock fetch to return the same long HTML
       const longHtml = '<h1>Long Document</h1>' + '<p>This is a very long sentence that will be repeated many times to create a document that exceeds the 7500 token limit for testing chunking functionality.</p>'.repeat(200);
 
-      global.fetch = vi.fn().mockResolvedValue({
-        ok: true,
-        text: () => Promise.resolve(longHtml),
-      });
+      stubFetch(() =>
+        createMockResponse({
+          content: longHtml,
+        }),
+      );
 
       // Get first chunk
       const firstChunk = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test' });
@@ -106,10 +182,11 @@ describe('DocFetchTool', () => {
     });
 
     it('should handle offset beyond document length', async () => {
-      global.fetch = vi.fn().mockResolvedValue({
-        ok: true,
-        text: () => Promise.resolve('<h1>Short Document</h1><p>This is short.</p>'),
-      });
+      stubFetch(() =>
+        createMockResponse({
+          content: '<h1>Short Document</h1><p>This is short.</p>',
+        }),
+      );
 
       const result = await tool.fetch({ doc_url: 'https://huggingface.co/docs/test', offset: 10000 });
 
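The test changes above replace direct assignment to `global.fetch` with vitest's `vi.stubGlobal`, returning real `Response` objects so the code under test can read the `content-type` header. A stripped-down version of that pattern (the test body here is illustrative, not one of the tests added in this commit):

```ts
import { afterEach, expect, it, vi } from 'vitest';

// A real Response lets the code under test inspect status and content-type.
const createMockResponse = (content: string, contentType = 'text/html') =>
  new Response(content, { status: 200, headers: { 'content-type': contentType } });

afterEach(() => {
  // Restore the real global fetch after each test.
  vi.unstubAllGlobals();
});

it('serves a markdown response straight through', async () => {
  const fetchMock = vi.fn().mockResolvedValue(createMockResponse('# Heading', 'text/markdown'));
  vi.stubGlobal('fetch', fetchMock);

  const response = await fetch('https://huggingface.co/docs/test');
  expect(fetchMock).toHaveBeenCalledWith('https://huggingface.co/docs/test');
  expect(response.headers.get('content-type')).toBe('text/markdown');
  expect(await response.text()).toBe('# Heading');
});
```
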
packages/mcp/src/docs-search/doc-fetch.ts

Lines changed: 52 additions & 36 deletions
@@ -67,7 +67,8 @@ export class DocFetchTool {
       const nodeName = ((node as unknown as { nodeName?: string }).nodeName || '').toLowerCase();
       if (nodeName === 'img') {
         try {
-          const src = (node as unknown as { getAttribute?: (name: string) => string | null }).getAttribute?.('src') ??
+          const src =
+            (node as unknown as { getAttribute?: (name: string) => string | null }).getAttribute?.('src') ??
             ((node as unknown as { src?: string }).src || '');
           if (
             /\.svg(\?|$)/i.test(src) ||
@@ -87,21 +88,30 @@ export class DocFetchTool {
     this.turndownService.addRule('dropHeadingAnchors', {
       filter: (node) => {
         try {
-          const n = node as unknown as { nodeName?: string; getAttribute?: (k: string) => string | null; textContent?: string; childNodes?: Array<{ nodeName?: string }> };
+          const n = node as unknown as {
+            nodeName?: string;
+            getAttribute?: (k: string) => string | null;
+            textContent?: string;
+            childNodes?: Array<{ nodeName?: string }>;
+          };
           if ((n.nodeName || '').toLowerCase() !== 'a') return false;
           const href = n.getAttribute?.('href') || '';
           if (!href || !href.startsWith('#')) return false;
           const text = (n.textContent || '').trim();
           const children = (n as unknown as { childNodes?: Array<{ nodeName?: string }> }).childNodes || [];
-          const onlyIcons = children.length > 0 && children.every((c) => ((c.nodeName || '').toLowerCase() === 'img' || (c.nodeName || '').toLowerCase() === 'svg'));
+          const onlyIcons =
+            children.length > 0 &&
+            children.every(
+              (c) => (c.nodeName || '').toLowerCase() === 'img' || (c.nodeName || '').toLowerCase() === 'svg'
+            );
           const looksLikeEncodedSvg = /data:image\/svg\+xml|%3csvg|svg%2bxml/i.test(text);
           const noAlnumText = text.length <= 3 && !/[a-z0-9]/i.test(text);
           return onlyIcons || looksLikeEncodedSvg || noAlnumText;
         } catch {
           return false;
         }
       },
-      replacement: () => ''
+      replacement: () => '',
     });
   }
 
@@ -136,37 +146,35 @@ export class DocFetchTool {
       const normalizedUrl = normalizeDocUrl(params.doc_url);
       this.validateUrl(normalizedUrl);
 
-      const response = await fetch(normalizedUrl);
-
+      const response = await fetch(normalizedUrl, { headers: { accept: 'text/markdown' } });
       if (!response.ok) {
         throw new Error(`Failed to fetch document: ${response.status} ${response.statusText}`);
       }
-
-      const htmlContent = await response.text();
-
-      // Convert HTML to Markdown
-      let fullMarkdownContent = this.turndownService.turndown(htmlContent);
-
-      // Post-process: strip any leftover SVG images that slipped past DOM filters
-      // - Markdown images pointing to data:image/svg+xml or *.svg
-      // - Empty links left behind after image removal: [](...)
-      fullMarkdownContent = fullMarkdownContent
-        .replace(/!\[[^\]]*\]\(\s*(?:data:image\/svg\+xml[^)]*|[^)]*\.svg(?:\?[^)]*)?)\s*\)/gi, '')
-        .replace(/\[\s*\]\(\s*[^)]*\s*\)/g, '');
-
-      // Remove anchors whose link text still contains encoded SVG payloads (edge cases)
-      fullMarkdownContent = fullMarkdownContent
-        .replace(/\[[^\]]*(?:data:image\/svg\+xml|%3csvg|svg%2bxml)[^\]]*\]\([^)]*\)/gi, '');
+      let content = await response.text();
+      const contentType = response.headers.get('content-type') || '';
+      const isPlainOrMarkdown = contentType.includes('text/plain') || contentType.includes('text/markdown');
+      if (!isPlainOrMarkdown) {
+        // attempt conversion to markdown
+        content = this.turndownService.turndown(content);
+
+        // Post-process: strip any leftover SVG images that slipped past DOM filters
+        // - Markdown images pointing to data:image/svg+xml or *.svg
+        // - Empty links left behind after image removal: [](...)
+        content = content
+          .replace(/!\[[^\]]*\]\(\s*(?:data:image\/svg\+xml[^)]*|[^)]*\.svg(?:\?[^)]*)?)\s*\)/gi, '')
+          .replace(/\[\s*\]\(\s*[^)]*\s*\)/g, '');
+
+        // Remove anchors whose link text still contains encoded SVG payloads (edge cases)
+        content = content.replace(/\[[^\]]*(?:data:image\/svg\+xml|%3csvg|svg%2bxml)[^\]]*\]\([^)]*\)/gi, '');
+      }
 
       // Apply chunking logic
-      return this.applyChunking(fullMarkdownContent, params.offset || 0);
+      return this.applyChunking(content, params.offset || 0);
     } catch (error) {
       throw new Error(`Failed to fetch document: ${error instanceof Error ? error.message : 'Unknown error'}`);
     }
   }
 
-
-
   /**
    * Apply chunking logic to markdown content
    */
@@ -213,15 +221,23 @@
  * - Convert gradio.app → www.gradio.app so pages resolve correctly
  */
 export function normalizeDocUrl(input: string): string {
-  try {
-    const url = new URL(input);
-    const host = url.hostname.toLowerCase();
-    if (host === 'gradio.app') {
-      url.hostname = 'www.gradio.app';
-      return url.toString();
-    }
-    return input;
-  } catch {
-    return input;
-  }
+  try {
+    const trimmed = input.trim();
+    if (trimmed.startsWith('/docs')) {
+      return `https://huggingface.co${trimmed}`;
+    }
+    if (trimmed.startsWith('./docs')) {
+      return `https://huggingface.co/${trimmed.slice(2)}`;
+    }
+
+    const url = new URL(trimmed);
+    const host = url.hostname.toLowerCase();
+    if (host === 'gradio.app') {
+      url.hostname = 'www.gradio.app';
+      return url.toString();
+    }
+    return trimmed;
+  } catch {
+    return input;
+  }
 }
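
Taken together, `normalizeDocUrl` now resolves the relative `/docs/...` and `./docs/...` links that Hugging Face doc pages emit, and `DocFetchTool.fetch` requests markdown before falling back to conversion. A usage sketch, assuming the exports shown in the diff above (the `/docs/transformers/index` path is just an example URL):

```ts
import { DocFetchTool, normalizeDocUrl } from './doc-fetch.js';

// Relative doc paths resolve to the canonical huggingface.co docs host.
console.log(normalizeDocUrl('/docs/diffusers/index'));        // https://huggingface.co/docs/diffusers/index
console.log(normalizeDocUrl('./docs/diffusers/index'));       // https://huggingface.co/docs/diffusers/index
console.log(normalizeDocUrl('https://gradio.app/guides/x'));  // https://www.gradio.app/guides/x

// fetch() sends `accept: text/markdown`; markdown/plain responses are returned
// as-is, HTML responses are converted and post-processed as before.
const tool = new DocFetchTool();
const markdown = await tool.fetch({ doc_url: '/docs/transformers/index' });
console.log(markdown.slice(0, 200));
```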
