Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions apps/api/src/scraper/WebScraper/scrapers/fetch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,39 @@ import axios from "axios";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";

/**
 * Detects whether scraped content is (binary) PDF data rather than HTML/text.
 *
 * Heuristics, in decreasing order of reliability:
 *  1. The "%PDF-" header signature. Some producers prepend junk bytes before
 *     the header, and readers conventionally accept the signature anywhere in
 *     the first 1024 bytes (Adobe implementation note), so we scan the first
 *     1024 characters instead of only the (trimmed) start of the string.
 *  2. PDF object-stream markers. "endobj" and "endstream" contain "obj" and
 *     "stream" as substrings, so checking the "end*" forms alone is both
 *     sufficient and equivalent to the four-way check.
 *  3. A high ratio (>10%) of non-printable characters over a non-trivial
 *     length, typical of binary PDF data decoded as text.
 *
 * @param content The content to check
 * @returns true if the content looks like a PDF
 */
function isPDFContent(content: string): boolean {
  // Defensive runtime guard: callers may pass non-string response bodies.
  if (!content || typeof content !== 'string') {
    return false;
  }

  // 1. Header signature within the first 1024 characters (covers both a
  //    header at offset 0 and one preceded by whitespace/junk bytes).
  if (content.slice(0, 1024).includes('%PDF-')) {
    return true;
  }

  // 2. Object-stream markers; "endobj"/"endstream" imply "obj"/"stream".
  if (content.includes('endobj') && content.includes('endstream')) {
    return true;
  }

  // 3. High ratio of control/non-printable characters suggests binary data.
  const nonPrintableChars = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]/g) || []).length;
  const totalChars = content.length;
  if (totalChars > 100 && nonPrintableChars / totalChars > 0.1) {
    return true;
  }

  return false;
}

/**
* Scrapes a URL with Axios
* @param url The URL to scrape
Expand Down Expand Up @@ -44,6 +77,19 @@ export async function scrapeWithFetch(
}

const text = response.data;

// Check if the content is a PDF file
if (isPDFContent(text)) {
Logger.debug(`⛏️ fetch: Detected PDF content for ${url}, skipping PDF processing`);
logParams.error_message = "PDF content detected - not suitable for text extraction";
logParams.response_code = response.status;
return {
content: "",
pageStatusCode: response.status,
pageError: "PDF content detected - not suitable for text extraction",
};
}

logParams.success = true;
logParams.html = text;
logParams.response_code = response.status;
Expand Down
46 changes: 46 additions & 0 deletions apps/api/src/scraper/WebScraper/scrapers/playwright.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,39 @@ import { generateRequestParams } from "../single_url";
import { universalTimeout } from "../global";
import { Logger } from "../../../lib/logger";

/**
 * Detects whether scraped content is (binary) PDF data rather than HTML/text.
 *
 * Heuristics, in decreasing order of reliability:
 *  1. The "%PDF-" header signature. Some producers prepend junk bytes before
 *     the header, and readers conventionally accept the signature anywhere in
 *     the first 1024 bytes (Adobe implementation note), so we scan the first
 *     1024 characters instead of only the (trimmed) start of the string.
 *  2. PDF object-stream markers. "endobj" and "endstream" contain "obj" and
 *     "stream" as substrings, so checking the "end*" forms alone is both
 *     sufficient and equivalent to the four-way check.
 *  3. A high ratio (>10%) of non-printable characters over a non-trivial
 *     length, typical of binary PDF data decoded as text.
 *
 * @param content The content to check
 * @returns true if the content looks like a PDF
 */
function isPDFContent(content: string): boolean {
  // Defensive runtime guard: callers may pass non-string response bodies.
  if (!content || typeof content !== 'string') {
    return false;
  }

  // 1. Header signature within the first 1024 characters (covers both a
  //    header at offset 0 and one preceded by whitespace/junk bytes).
  if (content.slice(0, 1024).includes('%PDF-')) {
    return true;
  }

  // 2. Object-stream markers; "endobj"/"endstream" imply "obj"/"stream".
  if (content.includes('endobj') && content.includes('endstream')) {
    return true;
  }

  // 3. High ratio of control/non-printable characters suggests binary data.
  const nonPrintableChars = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]/g) || []).length;
  const totalChars = content.length;
  if (totalChars > 100 && nonPrintableChars / totalChars > 0.1) {
    return true;
  }

  return false;
}

/**
* Scrapes a URL with Playwright
* @param url The URL to scrape
Expand Down Expand Up @@ -64,6 +97,19 @@ export async function scrapeWithPlaywright(
try {
const data = JSON.parse(textData);
const html = data.content;

// Check if the content is a PDF file
if (isPDFContent(html)) {
Logger.debug(`⛏️ Playwright: Detected PDF content for ${url}, skipping PDF processing`);
logParams.error_message = "PDF content detected - not suitable for text extraction";
logParams.response_code = data.pageStatusCode;
return {
content: "",
pageStatusCode: data.pageStatusCode,
pageError: "PDF content detected - not suitable for text extraction",
};
}

logParams.success = true;
logParams.html = html;
logParams.response_code = data.pageStatusCode;
Expand Down