Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions apps/api/src/scraper/WebScraper/scrapers/fetch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,39 @@ import axios from "axios";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";

/**
 * Detects whether scraped content is (binary) PDF data rather than HTML/text.
 *
 * Heuristics, in decreasing order of reliability:
 *  1. The "%PDF-" header signature. Some producers prepend junk bytes before
 *     the header, and readers conventionally accept the signature anywhere in
 *     the first 1024 bytes (Adobe implementation note), so we scan the first
 *     1024 characters instead of only the (trimmed) start of the string.
 *  2. PDF object-stream markers. "endobj" and "endstream" contain "obj" and
 *     "stream" as substrings, so checking the "end*" forms alone is both
 *     sufficient and equivalent to the four-way check.
 *  3. A high ratio (>10%) of non-printable characters over a non-trivial
 *     length, typical of binary PDF data decoded as text.
 *
 * @param content The content to check
 * @returns true if the content looks like a PDF
 */
function isPDFContent(content: string): boolean {
  // Defensive runtime guard: callers may pass non-string response bodies.
  if (!content || typeof content !== 'string') {
    return false;
  }

  // 1. Header signature within the first 1024 characters (covers both a
  //    header at offset 0 and one preceded by whitespace/junk bytes).
  if (content.slice(0, 1024).includes('%PDF-')) {
    return true;
  }

  // 2. Object-stream markers; "endobj"/"endstream" imply "obj"/"stream".
  if (content.includes('endobj') && content.includes('endstream')) {
    return true;
  }

  // 3. High ratio of control/non-printable characters suggests binary data.
  const nonPrintableChars = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]/g) || []).length;
  const totalChars = content.length;
  if (totalChars > 100 && nonPrintableChars / totalChars > 0.1) {
    return true;
  }

  return false;
}

/**
* Scrapes a URL with Axios
* @param url The URL to scrape
Expand Down Expand Up @@ -44,6 +77,19 @@ export async function scrapeWithFetch(
}

const text = response.data;

// Check if the content is a PDF file
if (isPDFContent(text)) {
Logger.debug(`⛏️ fetch: Detected PDF content for ${url}, skipping PDF processing`);
logParams.error_message = "PDF content detected - not suitable for text extraction";
logParams.response_code = response.status;
return {
content: "",
pageStatusCode: response.status,
pageError: "PDF content detected - not suitable for text extraction",
};
}

logParams.success = true;
logParams.html = text;
logParams.response_code = response.status;
Expand Down
46 changes: 46 additions & 0 deletions apps/api/src/scraper/WebScraper/scrapers/playwright.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,39 @@ import { generateRequestParams } from "../single_url";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";

/**
 * Detects whether scraped content is (binary) PDF data rather than HTML/text.
 *
 * Heuristics, in decreasing order of reliability:
 *  1. The "%PDF-" header signature. Some producers prepend junk bytes before
 *     the header, and readers conventionally accept the signature anywhere in
 *     the first 1024 bytes (Adobe implementation note), so we scan the first
 *     1024 characters instead of only the (trimmed) start of the string.
 *  2. PDF object-stream markers. "endobj" and "endstream" contain "obj" and
 *     "stream" as substrings, so checking the "end*" forms alone is both
 *     sufficient and equivalent to the four-way check.
 *  3. A high ratio (>10%) of non-printable characters over a non-trivial
 *     length, typical of binary PDF data decoded as text.
 *
 * @param content The content to check
 * @returns true if the content looks like a PDF
 */
function isPDFContent(content: string): boolean {
  // Defensive runtime guard: callers may pass non-string response bodies.
  if (!content || typeof content !== 'string') {
    return false;
  }

  // 1. Header signature within the first 1024 characters (covers both a
  //    header at offset 0 and one preceded by whitespace/junk bytes).
  if (content.slice(0, 1024).includes('%PDF-')) {
    return true;
  }

  // 2. Object-stream markers; "endobj"/"endstream" imply "obj"/"stream".
  if (content.includes('endobj') && content.includes('endstream')) {
    return true;
  }

  // 3. High ratio of control/non-printable characters suggests binary data.
  const nonPrintableChars = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]/g) || []).length;
  const totalChars = content.length;
  if (totalChars > 100 && nonPrintableChars / totalChars > 0.1) {
    return true;
  }

  return false;
}

/**
* Scrapes a URL with Playwright
* @param url The URL to scrape
Expand Down Expand Up @@ -64,6 +97,19 @@ export async function scrapeWithPlaywright(
try {
const data = JSON.parse(textData);
const html = data.content;

// Check if the content is a PDF file
if (isPDFContent(html)) {
Logger.debug(`⛏️ Playwright: Detected PDF content for ${url}, skipping PDF processing`);
logParams.error_message = "PDF content detected - not suitable for text extraction";
logParams.response_code = data.pageStatusCode;
return {
content: "",
pageStatusCode: data.pageStatusCode,
pageError: "PDF content detected - not suitable for text extraction",
};
}

logParams.success = true;
logParams.html = html;
logParams.response_code = data.pageStatusCode;
Expand Down