8,517 changes: 8,517 additions & 0 deletions apps/api/package-lock.json

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion apps/api/package.json
@@ -41,7 +41,7 @@
"@types/ws": "^8.5.12",
"async": "^3.2.5",
"async-mutex": "^0.5.0",
"axios": "^1.3.4",
"axios": "^1.12.2",
"bullmq": "^5.11.0",
"cacheable-lookup": "^6.1.0",
"cheerio": "^1.0.0-rc.12",
@@ -62,6 +62,7 @@
"turndown": "^7.1.3",
"turndown-plugin-gfm": "^1.0.2",
"uuid": "^10.0.0",
"winston": "^3.11.0",
"ws": "^8.18.0",
"xml2js": "^0.6.2",
"zod": "^3.23.8"
2,036 changes: 1,164 additions & 872 deletions apps/api/pnpm-lock.yaml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion apps/api/src/controllers/auth.ts
@@ -20,7 +20,7 @@ export async function authenticateUser(
}
function setTrace(team_id: string, api_key: string) {
try {
console.log("Setting trace attributes");
Logger.debug("Setting trace attributes");
} catch (error) {
Logger.error(`Error setting trace attributes: ${error.message}`);
}
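A note on the logging swap above: the PR replaces raw console calls with a shared Logger and adds winston to package.json, but the Logger module itself is outside this diff. A minimal sketch of what such a winston-backed wrapper could look like (the file path, log-level variable, and exact API surface are assumptions, not shown in the PR):

// Hypothetical src/lib/logger.ts backing the Logger.debug/Logger.error calls in this diff.
import winston from "winston";

const base = winston.createLogger({
  level: process.env.LOGGING_LEVEL ?? "debug",   // assumed env var, not part of this PR
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.json()                        // structured output so metadata objects survive intact
  ),
  transports: [new winston.transports.Console()],
});

export const Logger = {
  debug: (message: string, meta?: Record<string, unknown>) => base.debug(message, meta),
  error: (message: string, meta?: Record<string, unknown>) => base.error(message, meta),
};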
2 changes: 1 addition & 1 deletion apps/api/src/controllers/v0/crawl-status.ts
@@ -41,7 +41,7 @@ export async function crawlStatusController(req: Request, res: Response) {

const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
const jobStatus = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";

const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);

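The status derivation above (repeated in v0/status.ts below) now reports a cancelled crawl as "cancelled" instead of folding it into "failed". The same logic written out as a standalone helper, purely for illustration (the PR keeps it as an inline ternary):

// Illustration only: mirrors the inline expression in both v0 status controllers.
function deriveCrawlStatus(
  cancelled: boolean,
  jobStates: string[]
): "cancelled" | "completed" | "failed" | "active" {
  if (cancelled) return "cancelled";                              // previously reported as "failed"
  if (jobStates.every((s) => s === "completed")) return "completed";
  if (jobStates.some((s) => s === "failed")) return "failed";
  return "active";
}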
2 changes: 1 addition & 1 deletion apps/api/src/controllers/v0/scrape.ts
@@ -88,7 +88,7 @@ export async function scrapeHelper(
await job.remove();

if (!doc) {
console.error("!!! PANIC DOC IS", doc, job);
Logger.error("Panic: Document processing failed", { doc, job: job.id });
return {
success: true,
error: "No page found",
2 changes: 1 addition & 1 deletion apps/api/src/controllers/v0/status.ts
@@ -14,7 +14,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons

const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
const jobStatus = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";

const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);

98 changes: 98 additions & 0 deletions apps/api/src/controllers/v1/bulk-scrape.ts
@@ -0,0 +1,98 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
BulkScrapeRequest,
bulkScrapeRequestSchema,
CrawlResponse,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {
addCrawlJobs,
lockURLs,
saveCrawl,
StoredCrawl,
} from "../../lib/crawl-redis";
// import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";

export async function bulkScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BulkScrapeRequest>,
res: Response<CrawlResponse>
) {
req.body = bulkScrapeRequestSchema.parse(req.body);

const id = uuidv4();

// await logCrawl(id, req.auth.team_id);

// Credit checking not available in firecrawl-simple
// let { remainingCredits } = req.account;
// const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
// if(!useDbAuthentication){
// remainingCredits = Infinity;
// }

const pageOptions = legacyScrapeOptions(req.body);

const sc: StoredCrawl = {
crawlerOptions: null,
pageOptions,
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
};

await saveCrawl(id, sc);

let jobPriority = 20;

// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if(req.body.urls.length > 1000){
// set base to 21
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
}

const jobs = req.body.urls.map((x) => {
const uuid = uuidv4();
return {
name: uuid,
data: {
url: x,
mode: "single_urls",
team_id: req.auth.team_id,
plan: req.auth.plan,
crawlerOptions: null,
pageOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
v1: true,
},
opts: {
jobId: uuid,
priority: 20,
},
};
});

await lockURLs(
id,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
);
await getScrapeQueue().addBulk(jobs);

const protocol = process.env.ENV === "local" ? req.protocol : "https";

return res.status(200).json({
success: true,
id,
url: `${protocol}://${req.get("host")}/v1/bulk/scrape/${id}`,
});
}
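For context on how the new controller would be called: the response URL it builds implies the route is mounted at /v1/bulk/scrape, so a request would look roughly like the sketch below. The host, auth header, and exact route registration are assumptions; only the request body shape comes from bulkScrapeRequestSchema.

// Hypothetical client call against the new bulk scrape endpoint.
const response = await fetch("https://<your-host>/v1/bulk/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer <API_KEY>",   // auth scheme assumed, not shown in this diff
  },
  body: JSON.stringify({
    urls: ["https://example.com/a", "https://example.com/b"],  // validated by the url schema in types.ts
    // other scrapeOptions fields are accepted as well, since the schema extends scrapeOptions
  }),
});
const { success, id, url } = await response.json();  // `url` points at /v1/bulk/scrape/{id} for status polling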
25 changes: 19 additions & 6 deletions apps/api/src/controllers/v1/crawl.ts
@@ -92,7 +92,11 @@ export async function crawlController(
) {
req.body = crawlRequestSchema.parse(req.body);

Logger.debug(`[Crawl] Request: ${JSON.stringify(req.body)}`);
Logger.debug(`[Crawl] Request received`, {
body: req.body,
teamId: req.auth.team_id,
plan: req.auth.plan
});

const id = uuidv4();

@@ -135,9 +139,8 @@
sc.robots = await crawler.getRobotsTxt();
} catch (e) {
Logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
e
)}`
`[Crawl] Failed to get robots.txt (this is probably fine!)`,
{ error: e.message, url: req.body.url }
);
}

@@ -149,18 +152,28 @@
: await crawler.tryGetSitemap();

if (sitemap !== null && sitemap.length > 0) {
// FIX: Respect the limit parameter when processing sitemap URLs
const limit = crawlerOptions.limit || 10000;
const limitedSitemap = sitemap.slice(0, limit);

Logger.debug(`[Crawl] Sitemap found with URLs, applying limit`, {
totalUrls: sitemap.length,
limit: limit,
limitedUrls: limitedSitemap.length
});

let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if (sitemap.length > 1000) {
if (limitedSitemap.length > 1000) {
// set base to 21
jobPriority = await getJobPriority({
plan: req.auth.plan,
team_id: req.auth.team_id,
basePriority: 21,
});
}
const jobs = sitemap.map((x) => {
const jobs = limitedSitemap.map((x) => {
const url = x.url;
const uuid = uuidv4();
return {
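To make the effect of the limit fix concrete, here is a small worked example with illustrative numbers: with a 12,000-URL sitemap and crawlerOptions.limit set to 500, only the first 500 URLs are enqueued, and the default cap of 10,000 applies when no limit is given.

// Illustrative values only: what the new slice does before the jobs array is built.
const crawlerOptions = { limit: 500 };
const sitemap = Array.from({ length: 12000 }, (_, i) => ({ url: `https://example.com/page/${i}` }));

const limit = crawlerOptions.limit || 10000;     // falls back to 10000 when no limit is set
const limitedSitemap = sitemap.slice(0, limit);  // 500 entries; before this fix all 12000 were queued

console.log(limitedSitemap.length);              // 500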
3 changes: 2 additions & 1 deletion apps/api/src/controllers/v1/scrape.ts
@@ -79,6 +79,7 @@ export async function scrapeController(
pageOptions,
origin: req.body.origin,
is_scrape: true,
crawl_id: jobId,
},
{},
jobId,
@@ -106,7 +107,7 @@
await job.remove();

if (!doc) {
console.error("!!! PANIC DOC IS", doc, job);
Logger.error("Panic: Document processing failed", { doc, job: job.id });
return res.status(200).json({
success: true,
warning: "No page found",
13 changes: 11 additions & 2 deletions apps/api/src/controllers/v1/types.ts
@@ -18,7 +18,7 @@ export const url = z.preprocess(
.url()
.regex(/^https?:\/\//, "URL uses unsupported protocol")
.refine(
(x) => /\.[a-z]{2,}(\/|$)/i.test(x),
(x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x),
"URL must have a valid top-level domain or be a valid path"
)
.refine((x) => {
@@ -78,6 +78,13 @@ export const scrapeRequestSchema = scrapeOptions

export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;

export const bulkScrapeRequestSchema = scrapeOptions.extend({
urls: url.array(),
origin: z.string().optional().default("api"),
}).strict(strictMessage);

export type BulkScrapeRequest = z.infer<typeof bulkScrapeRequestSchema>;

const crawlerOptions = z
.object({
includePaths: z.string().array().default([]),
@@ -157,6 +164,7 @@ export type Document = {
articleSection?: string;
sourceURL?: string;
statusCode?: number;
scrapeId?: string;
error?: string;
};
};
@@ -280,7 +288,7 @@ export function legacyDocumentConverter(doc: any): Document {

return {
markdown: doc.markdown,
links: doc.linksOnPage.filter((x: any) => x !== null),
links: doc.linksOnPage ? doc.linksOnPage.filter((x: any) => x !== null) : [],
rawHtml: doc.rawHtml,
html: doc.html,
extract: doc.llm_extraction,
@@ -291,6 +299,7 @@
pageStatusCode: undefined,
error: doc.metadata.pageError,
statusCode: doc.metadata.pageStatusCode,
scrapeId: doc.metadata.scrapeId,
},
};
}
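The relaxed TLD refinement in the url schema above is easiest to see with an example: the old pattern required a "/" or end-of-string immediately after the TLD, so URLs that go straight into a query string or fragment were rejected. A quick illustrative check:

// Illustration of the refinement change in the url schema.
const oldTld = /\.[a-z]{2,}(\/|$)/i;
const newTld = /\.[a-z]{2,}([\/?#]|$)/i;

console.log(oldTld.test("https://example.com?page=2"));   // false: rejected before this PR
console.log(newTld.test("https://example.com?page=2"));   // true:  a query string right after the TLD now passes
console.log(newTld.test("https://example.com#section"));  // true:  fragments pass as well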
18 changes: 15 additions & 3 deletions apps/api/src/lib/crawl-redis.ts
@@ -2,7 +2,7 @@ import { WebCrawler } from "../scraper/WebScraper/crawler";
import { redisConnection } from "../services/queue-service";

export type StoredCrawl = {
originUrl: string;
originUrl?: string;
crawlerOptions: any;
pageOptions: any;
team_id: string;
@@ -108,7 +108,7 @@ export async function lockURL(
): Promise<boolean> {
if (typeof sc.crawlerOptions?.limit === "number") {
if (
(await redisConnection.scard("crawl:" + id + ":visited")) >=
(await redisConnection.scard("crawl:" + id + ":visited_unique")) >=
sc.crawlerOptions.limit
) {
return false;
@@ -117,6 +117,12 @@
const res =
(await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0;
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");

if (res) {
await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
}

return res;
}

@@ -125,13 +131,19 @@ export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
const res =
(await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0;
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");

if (res) {
await redisConnection.sadd("crawl:" + id + ":visited_unique", ...urls);
await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
}

return res;
}

export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
const crawler = new WebCrawler({
jobId: id,
initialUrl: sc.originUrl,
initialUrl: sc.originUrl || "",
includes: sc.crawlerOptions?.includes ?? [],
excludes: sc.crawlerOptions?.excludes ?? [],
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
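The change above adds a second Redis set, crawl:{id}:visited_unique, populated only when a URL is newly added, and the crawl limit check in lockURL now counts that set instead of crawl:{id}:visited. A condensed sketch of the resulting flow, reusing the redisConnection import from this module (not the full implementation):

// Condensed sketch of the new locking flow in lockURL; redisConnection comes from ../services/queue-service.
async function tryLockUrl(id: string, url: string, limit?: number): Promise<boolean> {
  const visitedKey = `crawl:${id}:visited`;
  const uniqueKey = `crawl:${id}:visited_unique`;

  // The limit is now enforced against the unique-URL count only.
  if (typeof limit === "number" && (await redisConnection.scard(uniqueKey)) >= limit) {
    return false;
  }

  const isNew = (await redisConnection.sadd(visitedKey, url)) !== 0;  // 0 means the URL was already present
  await redisConnection.expire(visitedKey, 24 * 60 * 60, "NX");

  if (isNew) {
    await redisConnection.sadd(uniqueKey, url);
    await redisConnection.expire(uniqueKey, 24 * 60 * 60, "NX");
  }
  return isNew;
}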
2 changes: 1 addition & 1 deletion apps/api/src/lib/html-to-markdown.ts
@@ -85,7 +85,7 @@ export async function parseMarkdown(html: string): Promise<string> {

return markdownContent;
} catch (error) {
console.error("Error converting HTML to Markdown: ", error);
Logger.error("Error converting HTML to Markdown", { error: error.message });
return ""; // Optionally return an empty string or handle the error as needed
}
}