8,517 changes: 8,517 additions & 0 deletions apps/api/package-lock.json

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion apps/api/package.json
@@ -41,7 +41,7 @@
"@types/ws": "^8.5.12",
"async": "^3.2.5",
"async-mutex": "^0.5.0",
"axios": "^1.3.4",
"axios": "^1.12.2",
"bullmq": "^5.11.0",
"cacheable-lookup": "^6.1.0",
"cheerio": "^1.0.0-rc.12",
@@ -62,6 +62,7 @@
"turndown": "^7.1.3",
"turndown-plugin-gfm": "^1.0.2",
"uuid": "^10.0.0",
"winston": "^3.11.0",
"ws": "^8.18.0",
"xml2js": "^0.6.2",
"zod": "^3.23.8"
2,036 changes: 1,164 additions & 872 deletions apps/api/pnpm-lock.yaml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion apps/api/src/controllers/auth.ts
@@ -20,7 +20,7 @@ export async function authenticateUser(
}
function setTrace(team_id: string, api_key: string) {
try {
console.log("Setting trace attributes");
Logger.debug("Setting trace attributes");
} catch (error) {
Logger.error(`Error setting trace attributes: ${error.message}`);
}
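A note on the logging swap above: the PR replaces raw console calls with a shared Logger and adds winston to package.json, but the Logger module itself is outside this diff. A minimal sketch of what such a winston-backed wrapper could look like (the file path, log-level variable, and exact API surface are assumptions, not shown in the PR):

// Hypothetical src/lib/logger.ts backing the Logger.debug/Logger.error calls in this diff.
import winston from "winston";

const base = winston.createLogger({
  level: process.env.LOGGING_LEVEL ?? "debug",   // assumed env var, not part of this PR
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.json()                        // structured output so metadata objects survive intact
  ),
  transports: [new winston.transports.Console()],
});

export const Logger = {
  debug: (message: string, meta?: Record<string, unknown>) => base.debug(message, meta),
  error: (message: string, meta?: Record<string, unknown>) => base.error(message, meta),
};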
2 changes: 1 addition & 1 deletion apps/api/src/controllers/v0/crawl-status.ts
@@ -41,7 +41,7 @@ export async function crawlStatusController(req: Request, res: Response) {

const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
const jobStatus = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";

const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);

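The status derivation above (repeated in v0/status.ts below) now reports a cancelled crawl as "cancelled" instead of folding it into "failed". The same logic written out as a standalone helper, purely for illustration (the PR keeps it as an inline ternary):

// Illustration only: mirrors the inline expression in both v0 status controllers.
function deriveCrawlStatus(
  cancelled: boolean,
  jobStates: string[]
): "cancelled" | "completed" | "failed" | "active" {
  if (cancelled) return "cancelled";                              // previously reported as "failed"
  if (jobStates.every((s) => s === "completed")) return "completed";
  if (jobStates.some((s) => s === "failed")) return "failed";
  return "active";
}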
2 changes: 1 addition & 1 deletion apps/api/src/controllers/v0/scrape.ts
@@ -88,7 +88,7 @@ export async function scrapeHelper(
await job.remove();

if (!doc) {
console.error("!!! PANIC DOC IS", doc, job);
Logger.error("Panic: Document processing failed", { doc, job: job.id });
return {
success: true,
error: "No page found",
2 changes: 1 addition & 1 deletion apps/api/src/controllers/v0/status.ts
@@ -14,7 +14,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons

const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
const jobStatus = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";

const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);

98 changes: 98 additions & 0 deletions apps/api/src/controllers/v1/bulk-scrape.ts
@@ -0,0 +1,98 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
BulkScrapeRequest,
bulkScrapeRequestSchema,
CrawlResponse,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {
addCrawlJobs,
lockURLs,
saveCrawl,
StoredCrawl,
} from "../../lib/crawl-redis";
// import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";

export async function bulkScrapeController(
req: RequestWithAuth<{}, CrawlResponse, BulkScrapeRequest>,
res: Response<CrawlResponse>
) {
req.body = bulkScrapeRequestSchema.parse(req.body);

const id = uuidv4();

// await logCrawl(id, req.auth.team_id);

// Credit checking not available in firecrawl-simple
// let { remainingCredits } = req.account;
// const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
// if(!useDbAuthentication){
// remainingCredits = Infinity;
// }

const pageOptions = legacyScrapeOptions(req.body);

const sc: StoredCrawl = {
crawlerOptions: null,
pageOptions,
team_id: req.auth.team_id,
createdAt: Date.now(),
plan: req.auth.plan,
};

await saveCrawl(id, sc);

let jobPriority = 20;

// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if(req.body.urls.length > 1000){
// set base to 21
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
}

const jobs = req.body.urls.map((x) => {
const uuid = uuidv4();
return {
name: uuid,
data: {
url: x,
mode: "single_urls",
team_id: req.auth.team_id,
plan: req.auth.plan,
crawlerOptions: null,
pageOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
v1: true,
},
opts: {
jobId: uuid,
priority: 20,
},
};
});

await lockURLs(
id,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
);
await getScrapeQueue().addBulk(jobs);

const protocol = process.env.ENV === "local" ? req.protocol : "https";

return res.status(200).json({
success: true,
id,
url: `${protocol}://${req.get("host")}/v1/bulk/scrape/${id}`,
});
}
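For context on how the new controller would be called: the response URL it builds implies the route is mounted at /v1/bulk/scrape, so a request would look roughly like the sketch below. The host, auth header, and exact route registration are assumptions; only the request body shape comes from bulkScrapeRequestSchema.

// Hypothetical client call against the new bulk scrape endpoint.
const response = await fetch("https://<your-host>/v1/bulk/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer <API_KEY>",   // auth scheme assumed, not shown in this diff
  },
  body: JSON.stringify({
    urls: ["https://example.com/a", "https://example.com/b"],  // validated by the url schema in types.ts
    // other scrapeOptions fields are accepted as well, since the schema extends scrapeOptions
  }),
});
const { success, id, url } = await response.json();  // `url` points at /v1/bulk/scrape/{id} for status polling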
25 changes: 19 additions & 6 deletions apps/api/src/controllers/v1/crawl.ts
@@ -92,7 +92,11 @@ export async function crawlController(
) {
req.body = crawlRequestSchema.parse(req.body);

Logger.debug(`[Crawl] Request: ${JSON.stringify(req.body)}`);
Logger.debug(`[Crawl] Request received`, {
body: req.body,
teamId: req.auth.team_id,
plan: req.auth.plan
});

const id = uuidv4();

@@ -135,9 +139,8 @@
sc.robots = await crawler.getRobotsTxt();
} catch (e) {
Logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
e
)}`
`[Crawl] Failed to get robots.txt (this is probably fine!)`,
{ error: e.message, url: req.body.url }
);
}

@@ -149,18 +152,28 @@
: await crawler.tryGetSitemap();

if (sitemap !== null && sitemap.length > 0) {
// FIX: Respect the limit parameter when processing sitemap URLs
const limit = crawlerOptions.limit || 10000;
const limitedSitemap = sitemap.slice(0, limit);

Logger.debug(`[Crawl] Sitemap found with URLs, applying limit`, {
totalUrls: sitemap.length,
limit: limit,
limitedUrls: limitedSitemap.length
});

let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if (sitemap.length > 1000) {
if (limitedSitemap.length > 1000) {
// set base to 21
jobPriority = await getJobPriority({
plan: req.auth.plan,
team_id: req.auth.team_id,
basePriority: 21,
});
}
const jobs = sitemap.map((x) => {
const jobs = limitedSitemap.map((x) => {
const url = x.url;
const uuid = uuidv4();
return {
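To make the effect of the limit fix concrete, here is a small worked example with illustrative numbers: with a 12,000-URL sitemap and crawlerOptions.limit set to 500, only the first 500 URLs are enqueued, and the default cap of 10,000 applies when no limit is given.

// Illustrative values only: what the new slice does before the jobs array is built.
const crawlerOptions = { limit: 500 };
const sitemap = Array.from({ length: 12000 }, (_, i) => ({ url: `https://example.com/page/${i}` }));

const limit = crawlerOptions.limit || 10000;     // falls back to 10000 when no limit is set
const limitedSitemap = sitemap.slice(0, limit);  // 500 entries; before this fix all 12000 were queued

console.log(limitedSitemap.length);              // 500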
3 changes: 2 additions & 1 deletion apps/api/src/controllers/v1/scrape.ts
@@ -79,6 +79,7 @@ export async function scrapeController(
pageOptions,
origin: req.body.origin,
is_scrape: true,
crawl_id: jobId,
},
{},
jobId,
@@ -106,7 +107,7 @@
await job.remove();

if (!doc) {
console.error("!!! PANIC DOC IS", doc, job);
Logger.error("Panic: Document processing failed", { doc, job: job.id });
return res.status(200).json({
success: true,
warning: "No page found",
13 changes: 11 additions & 2 deletions apps/api/src/controllers/v1/types.ts
@@ -18,7 +18,7 @@ export const url = z.preprocess(
.url()
.regex(/^https?:\/\//, "URL uses unsupported protocol")
.refine(
(x) => /\.[a-z]{2,}(\/|$)/i.test(x),
(x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x),
"URL must have a valid top-level domain or be a valid path"
)
.refine((x) => {
@@ -78,6 +78,13 @@ export const scrapeRequestSchema = scrapeOptions

export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;

export const bulkScrapeRequestSchema = scrapeOptions.extend({
urls: url.array(),
origin: z.string().optional().default("api"),
}).strict(strictMessage);

export type BulkScrapeRequest = z.infer<typeof bulkScrapeRequestSchema>;

const crawlerOptions = z
.object({
includePaths: z.string().array().default([]),
@@ -157,6 +164,7 @@ export type Document = {
articleSection?: string;
sourceURL?: string;
statusCode?: number;
scrapeId?: string;
error?: string;
};
};
@@ -280,7 +288,7 @@ export function legacyDocumentConverter(doc: any): Document {

return {
markdown: doc.markdown,
links: doc.linksOnPage.filter((x: any) => x !== null),
links: doc.linksOnPage ? doc.linksOnPage.filter((x: any) => x !== null) : [],
rawHtml: doc.rawHtml,
html: doc.html,
extract: doc.llm_extraction,
@@ -291,6 +299,7 @@
pageStatusCode: undefined,
error: doc.metadata.pageError,
statusCode: doc.metadata.pageStatusCode,
scrapeId: doc.metadata.scrapeId,
},
};
}
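The relaxed TLD refinement in the url schema above is easiest to see with an example: the old pattern required a "/" or end-of-string immediately after the TLD, so URLs that go straight into a query string or fragment were rejected. A quick illustrative check:

// Illustration of the refinement change in the url schema.
const oldTld = /\.[a-z]{2,}(\/|$)/i;
const newTld = /\.[a-z]{2,}([\/?#]|$)/i;

console.log(oldTld.test("https://example.com?page=2"));   // false: rejected before this PR
console.log(newTld.test("https://example.com?page=2"));   // true:  a query string right after the TLD now passes
console.log(newTld.test("https://example.com#section"));  // true:  fragments pass as well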
18 changes: 15 additions & 3 deletions apps/api/src/lib/crawl-redis.ts
@@ -2,7 +2,7 @@ import { WebCrawler } from "../scraper/WebScraper/crawler";
import { redisConnection } from "../services/queue-service";

export type StoredCrawl = {
originUrl: string;
originUrl?: string;
crawlerOptions: any;
pageOptions: any;
team_id: string;
@@ -108,7 +108,7 @@ export async function lockURL(
): Promise<boolean> {
if (typeof sc.crawlerOptions?.limit === "number") {
if (
(await redisConnection.scard("crawl:" + id + ":visited")) >=
(await redisConnection.scard("crawl:" + id + ":visited_unique")) >=
sc.crawlerOptions.limit
) {
return false;
@@ -117,6 +117,12 @@
const res =
(await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0;
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");

if (res) {
await redisConnection.sadd("crawl:" + id + ":visited_unique", url);
await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
}

return res;
}

@@ -125,13 +131,19 @@ export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
const res =
(await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0;
await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");

if (res) {
await redisConnection.sadd("crawl:" + id + ":visited_unique", ...urls);
await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
}

return res;
}

export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler {
const crawler = new WebCrawler({
jobId: id,
initialUrl: sc.originUrl,
initialUrl: sc.originUrl || "",
includes: sc.crawlerOptions?.includes ?? [],
excludes: sc.crawlerOptions?.excludes ?? [],
maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000,
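The change above adds a second Redis set, crawl:{id}:visited_unique, populated only when a URL is newly added, and the crawl limit check in lockURL now counts that set instead of crawl:{id}:visited. A condensed sketch of the resulting flow, reusing the redisConnection import from this module (not the full implementation):

// Condensed sketch of the new locking flow in lockURL; redisConnection comes from ../services/queue-service.
async function tryLockUrl(id: string, url: string, limit?: number): Promise<boolean> {
  const visitedKey = `crawl:${id}:visited`;
  const uniqueKey = `crawl:${id}:visited_unique`;

  // The limit is now enforced against the unique-URL count only.
  if (typeof limit === "number" && (await redisConnection.scard(uniqueKey)) >= limit) {
    return false;
  }

  const isNew = (await redisConnection.sadd(visitedKey, url)) !== 0;  // 0 means the URL was already present
  await redisConnection.expire(visitedKey, 24 * 60 * 60, "NX");

  if (isNew) {
    await redisConnection.sadd(uniqueKey, url);
    await redisConnection.expire(uniqueKey, 24 * 60 * 60, "NX");
  }
  return isNew;
}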
2 changes: 1 addition & 1 deletion apps/api/src/lib/html-to-markdown.ts
@@ -85,7 +85,7 @@ export async function parseMarkdown(html: string): Promise<string> {

return markdownContent;
} catch (error) {
console.error("Error converting HTML to Markdown: ", error);
Logger.error("Error converting HTML to Markdown", { error: error.message });
return ""; // Optionally return an empty string or handle the error as needed
}
}