Skip to content

Commit d57d7a4

Browse files
authored
Merge pull request #693 from getmaxun/develop
chore: release v0.0.20
2 parents fc3d282 + 3597e55 commit d57d7a4

30 files changed

+4399
-1863
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ Maxun lets you train a robot in 2 minutes and scrape the web on auto-pilot. Web
2626
<a href="https://trendshift.io/repositories/12113" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12113" alt="getmaxun%2Fmaxun | Trendshift" style="width: 250px; height: 55px; margin-top: 10px;" width="250" height="55"/></a>
2727
</p>
2828

29-
![maxun_demo](https://github.com/user-attachments/assets/a61ba670-e56a-4ae1-9681-0b4bd6ba9cdc)
29+
![maxun_gif](https://github.com/user-attachments/assets/3e0b0cf8-9e52-44d2-a140-b26b7b481477)
3030

3131
<img src="https://static.scarf.sh/a.png?x-pxid=c12a77cc-855e-4602-8a0f-614b2d0da56a" />
3232

maxun-core/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "maxun-core",
3-
"version": "0.0.19",
3+
"version": "0.0.20",
44
"description": "Core package for Maxun, responsible for data extraction",
55
"main": "build/index.js",
66
"typings": "build/index.d.ts",

maxun-core/src/browserSide/scraper.js

Lines changed: 201 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -424,26 +424,214 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
424424
*/
425425
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
426426
// XPath evaluation functions
427-
const evaluateXPath = (rootElement, xpath) => {
427+
const queryInsideContext = (context, part) => {
428428
try {
429-
const ownerDoc =
430-
rootElement.nodeType === Node.DOCUMENT_NODE
431-
? rootElement
432-
: rootElement.ownerDocument;
429+
const { tagName, conditions } = parseXPathPart(part);
433430

434-
if (!ownerDoc) return null;
431+
const candidateElements = Array.from(context.querySelectorAll(tagName));
432+
if (candidateElements.length === 0) {
433+
return [];
434+
}
435435

436-
const result = ownerDoc.evaluate(
436+
const matchingElements = candidateElements.filter((el) => {
437+
return elementMatchesConditions(el, conditions);
438+
});
439+
440+
return matchingElements;
441+
} catch (err) {
442+
console.error("Error in queryInsideContext:", err);
443+
return [];
444+
}
445+
};
446+
447+
// Helper function to parse XPath part
448+
/**
 * Splits one XPath step (e.g. `div[@id="x"][2]`) into its tag name and the
 * raw text of each `[...]` predicate.
 *
 * The predicates are located with a quote-aware scan so that a `]` inside a
 * quoted value (`[@title="a]b"]`) does not end the predicate early.
 *
 * @param {string} part - A single XPath step without the leading slash(es).
 * @returns {{ tagName: string, conditions: string[] }} The leading tag name
 *   (`"*"` when the step does not start with one) and the predicate bodies
 *   without their surrounding brackets, in source order.
 */
const parseXPathPart = (part) => {
  const tagMatch = part.match(/^([a-zA-Z0-9-]+)/);
  const tagName = tagMatch ? tagMatch[1] : "*";

  const conditions = [];
  let bodyStart = -1; // index just past the "[" of the predicate being read, -1 when outside one
  let openQuote = null; // the quote character of the string literal being read, if any

  for (let i = 0; i < part.length; i++) {
    const ch = part[i];

    if (bodyStart === -1) {
      if (ch === "[") {
        bodyStart = i + 1;
      }
      continue;
    }

    if (openQuote !== null) {
      // Inside a string literal everything is data until the matching quote.
      if (ch === openQuote) {
        openQuote = null;
      }
      continue;
    }

    if (ch === '"' || ch === "'") {
      openQuote = ch;
    } else if (ch === "]") {
      // Empty predicates ("[]") carry no condition and are skipped.
      if (i > bodyStart) {
        conditions.push(part.slice(bodyStart, i));
      }
      bodyStart = -1;
    }
  }

  return { tagName, conditions };
};
459+
460+
// Helper function to check if element matches all conditions
461+
/**
 * Reports whether an element satisfies every predicate in a list.
 *
 * @param {Element} element - The element under test.
 * @param {string[]} conditions - Predicate bodies (without brackets).
 * @returns {boolean} `true` when no predicate rejects the element; an empty
 *   list therefore yields `true`.
 */
const elementMatchesConditions = (element, conditions) =>
  conditions.every((predicate) => elementMatchesCondition(element, predicate));
469+
470+
// Helper function to check if element matches a single condition
471+
/**
 * Checks a single XPath predicate body (the text between `[` and `]`)
 * against an element.
 *
 * Recognised forms:
 *   - `N`                          positional; never rejects here
 *   - `@attr="value"`              exact attribute match
 *   - `contains(@class, "name")`   class-token match
 *   - `contains(@attr, "value")`   attribute substring match
 *   - `text()="value"`             trimmed textContent equality
 *   - `contains(text(), "value")`  trimmed textContent substring match
 *   - `count(*)=N`                 number of element children
 *
 * Quoted values may be empty and may contain the other quote character
 * (`@title="it's"`); the value ends at the quote that opened it.
 *
 * @param {Element} element - The element under test.
 * @param {string} condition - Predicate body without the brackets.
 * @returns {boolean} `false` only when a recognised predicate rejects the
 *   element. Unrecognised predicates are treated as matching.
 */
const elementMatchesCondition = (element, condition) => {
  const predicate = condition.trim();

  // A bare number is a position, which this function does not evaluate,
  // so it never rejects the element.
  if (/^\d+$/.test(predicate)) {
    return true;
  }

  // In the patterns below, (["']) captures the opening quote and
  // (?:(?!\N)[\s\S])* consumes the value up to that same quote.

  // Handle @attribute="value"
  const attrMatch = predicate.match(/^@([^=]+)=(["'])((?:(?!\2)[\s\S])*)\2$/);
  if (attrMatch) {
    const [, attr, , value] = attrMatch;
    return element.getAttribute(attr) === value;
  }

  // Handle contains(@class, 'value')
  // NOTE(review): this is a whole-token match, whereas XPath contains() is a
  // substring test on the attribute string — confirm the stricter behaviour
  // is intended.
  const classContainsMatch = predicate.match(
    /^contains\(@class,\s*(["'])((?:(?!\1)[\s\S])*)\1\)$/
  );
  if (classContainsMatch) {
    const className = classContainsMatch[2];
    // An empty needle is contained in every string, so it cannot reject.
    return className === "" || element.classList.contains(className);
  }

  // Handle contains(@attribute, 'value')
  const attrContainsMatch = predicate.match(
    /^contains\(@([^,]+),\s*(["'])((?:(?!\2)[\s\S])*)\2\)$/
  );
  if (attrContainsMatch) {
    const [, attr, , value] = attrContainsMatch;
    return (element.getAttribute(attr) ?? "").includes(value);
  }

  // Handle text()="value"
  const textMatch = predicate.match(/^text\(\)=(["'])((?:(?!\1)[\s\S])*)\1$/);
  if (textMatch) {
    return (element.textContent ?? "").trim() === textMatch[2];
  }

  // Handle contains(text(), 'value')
  const textContainsMatch = predicate.match(
    /^contains\(text\(\),\s*(["'])((?:(?!\1)[\s\S])*)\1\)$/
  );
  if (textContainsMatch) {
    return (element.textContent ?? "").trim().includes(textContainsMatch[2]);
  }

  // Handle count(*)=N (N = 0 means the element has no element children)
  const countMatch = predicate.match(/^count\(\*\)=(\d+)$/);
  if (countMatch) {
    return element.children.length === Number.parseInt(countMatch[1], 10);
  }

  // Anything not understood is deliberately permissive.
  return true;
};
537+
538+
const evaluateXPath = (document, xpath, isShadow = false) => {
539+
try {
540+
const result = document.evaluate(
437541
xpath,
438-
rootElement,
542+
document,
439543
null,
440544
XPathResult.FIRST_ORDERED_NODE_TYPE,
441545
null
442-
);
546+
).singleNodeValue;
443547

444-
return result.singleNodeValue;
445-
} catch (error) {
446-
console.warn("XPath evaluation failed:", xpath, error);
548+
if (!isShadow) {
549+
if (result === null) {
550+
return null;
551+
}
552+
return result;
553+
}
554+
555+
let cleanPath = xpath;
556+
let isIndexed = false;
557+
558+
const indexedMatch = xpath.match(/^\((.*?)\)\[(\d+)\](.*)$/);
559+
if (indexedMatch) {
560+
cleanPath = indexedMatch[1] + indexedMatch[3];
561+
isIndexed = true;
562+
}
563+
564+
const pathParts = cleanPath
565+
.replace(/^\/\//, "")
566+
.split("/")
567+
.map((p) => p.trim())
568+
.filter((p) => p.length > 0);
569+
570+
let currentContexts = [document];
571+
572+
for (let i = 0; i < pathParts.length; i++) {
573+
const part = pathParts[i];
574+
const nextContexts = [];
575+
576+
for (const ctx of currentContexts) {
577+
const positionalMatch = part.match(/^([^[]+)\[(\d+)\]$/);
578+
let partWithoutPosition = part;
579+
let requestedPosition = null;
580+
581+
if (positionalMatch) {
582+
partWithoutPosition = positionalMatch[1];
583+
requestedPosition = parseInt(positionalMatch[2]);
584+
}
585+
586+
const matched = queryInsideContext(ctx, partWithoutPosition);
587+
588+
let elementsToAdd = matched;
589+
if (requestedPosition !== null) {
590+
const index = requestedPosition - 1; // XPath is 1-based, arrays are 0-based
591+
if (index >= 0 && index < matched.length) {
592+
elementsToAdd = [matched[index]];
593+
} else {
594+
console.warn(
595+
`Position ${requestedPosition} out of range (${matched.length} elements found)`
596+
);
597+
elementsToAdd = [];
598+
}
599+
}
600+
601+
elementsToAdd.forEach((el) => {
602+
nextContexts.push(el);
603+
if (el.shadowRoot) {
604+
nextContexts.push(el.shadowRoot);
605+
}
606+
});
607+
}
608+
609+
if (nextContexts.length === 0) {
610+
return null;
611+
}
612+
613+
currentContexts = nextContexts;
614+
}
615+
616+
if (currentContexts.length > 0) {
617+
if (isIndexed && indexedMatch) {
618+
const requestedIndex = parseInt(indexedMatch[2]) - 1;
619+
if (requestedIndex >= 0 && requestedIndex < currentContexts.length) {
620+
return currentContexts[requestedIndex];
621+
} else {
622+
console.warn(
623+
`Requested index ${requestedIndex + 1} out of range (${currentContexts.length} elements found)`
624+
);
625+
return null;
626+
}
627+
}
628+
629+
return currentContexts[0];
630+
}
631+
632+
return null;
633+
} catch (err) {
634+
console.error("Critical XPath failure:", xpath, err);
447635
return null;
448636
}
449637
};
@@ -1018,7 +1206,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
10181206
listSelector,
10191207
containerIndex + 1
10201208
);
1021-
element = evaluateXPath(document, indexedSelector);
1209+
element = evaluateXPath(document, indexedSelector, field.isShadow);
10221210
} else {
10231211
// Fallback for CSS selectors within XPath containers
10241212
const container = containers[containerIndex];

maxun-core/src/interpret.ts

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -950,9 +950,9 @@ export default class Interpreter extends EventEmitter {
950950
if (checkLimit()) return allResults;
951951

952952
let loadMoreCounter = 0;
953-
let previousResultCount = allResults.length;
954-
let noNewItemsCounter = 0;
955-
const MAX_NO_NEW_ITEMS = 2;
953+
// let previousResultCount = allResults.length;
954+
// let noNewItemsCounter = 0;
955+
// const MAX_NO_NEW_ITEMS = 2;
956956

957957
while (true) {
958958
// Find working button with retry mechanism
@@ -1019,21 +1019,21 @@ export default class Interpreter extends EventEmitter {
10191019

10201020
await scrapeCurrentPage();
10211021

1022-
const currentResultCount = allResults.length;
1023-
const newItemsAdded = currentResultCount > previousResultCount;
1022+
// const currentResultCount = allResults.length;
1023+
// const newItemsAdded = currentResultCount > previousResultCount;
10241024

1025-
if (!newItemsAdded) {
1026-
noNewItemsCounter++;
1027-
debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
1025+
// if (!newItemsAdded) {
1026+
// noNewItemsCounter++;
1027+
// debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
10281028

1029-
if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
1030-
debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
1031-
return allResults;
1032-
}
1033-
} else {
1034-
noNewItemsCounter = 0;
1035-
previousResultCount = currentResultCount;
1036-
}
1029+
// if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
1030+
// debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
1031+
// return allResults;
1032+
// }
1033+
// } else {
1034+
// noNewItemsCounter = 0;
1035+
// previousResultCount = currentResultCount;
1036+
// }
10371037

10381038
if (checkLimit()) return allResults;
10391039

nginx.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ server {
1111
}
1212

1313
# Proxy for backend
14-
location ^/(auth|storage|record|workflow|robot|proxy|api-docs|api)(/|$) {
14+
location ~ ^/(auth|storage|record|workflow|robot|proxy|api-docs|api|webhook)(/|$) {
1515
proxy_pass http://localhost:8080; # change as per your setup
1616
proxy_http_version 1.1;
1717
proxy_set_header Upgrade $http_upgrade;

package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "maxun",
3-
"version": "0.0.18",
3+
"version": "0.0.20",
44
"author": "Maxun",
55
"license": "AGPL-3.0-or-later",
66
"dependencies": {
@@ -50,7 +50,7 @@
5050
"lodash": "^4.17.21",
5151
"loglevel": "^1.8.0",
5252
"loglevel-plugin-remote": "^0.6.8",
53-
"maxun-core": "^0.0.19",
53+
"maxun-core": "^0.0.20",
5454
"minio": "^8.0.1",
5555
"moment-timezone": "^0.5.45",
5656
"node-cron": "^3.0.3",

public/locales/de.json

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,11 @@
141141
"no_key_message": "Sie haben noch keinen API-Schlüssel generiert.",
142142
"generate_button": "API-Schlüssel generieren",
143143
"notifications": {
144-
"fetch_error": "API-Schlüssel konnte nicht abgerufen werden - ${error}",
144+
"fetch_error": "API-Schlüssel konnte nicht abgerufen werden - {{error}}",
145145
"generate_success": "API-Schlüssel erfolgreich generiert",
146-
"generate_error": "API-Schlüssel konnte nicht generiert werden - ${error}",
146+
"generate_error": "API-Schlüssel konnte nicht generiert werden - {{error}}",
147147
"delete_success": "API-Schlüssel erfolgreich gelöscht",
148-
"delete_error": "API-Schlüssel konnte nicht gelöscht werden - ${error}",
148+
"delete_error": "API-Schlüssel konnte nicht gelöscht werden - {{error}}",
149149
"copy_success": "API-Schlüssel erfolgreich kopiert"
150150
}
151151
},
@@ -305,7 +305,7 @@
305305
},
306306
"recording_page": {
307307
"loader": {
308-
"browser_startup": "Browser wird gestartet...Navigation zu {{url}}"
308+
"browser_startup": "Browser wird gestartet...Bitte warten"
309309
}
310310
},
311311
"integration_settings": {
@@ -600,6 +600,7 @@
600600
"es": "Spanisch",
601601
"ja": "Japanisch",
602602
"zh": "Chinesisch",
603-
"de": "Deutsch"
603+
"de": "Deutsch",
604+
"tr": "Türkisch"
604605
}
605606
}

public/locales/en.json

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -148,11 +148,11 @@
148148
"no_key_message": "You haven't generated an API key yet.",
149149
"generate_button": "Generate API Key",
150150
"notifications": {
151-
"fetch_error": "Failed to fetch API Key - ${error}",
151+
"fetch_error": "Failed to fetch API Key - {{error}}",
152152
"generate_success": "Generated API Key successfully",
153-
"generate_error": "Failed to generate API Key - ${error}",
153+
"generate_error": "Failed to generate API Key - {{error}}",
154154
"delete_success": "API Key deleted successfully",
155-
"delete_error": "Failed to delete API Key - ${error}",
155+
"delete_error": "Failed to delete API Key - {{error}}",
156156
"copy_success": "Copied API Key successfully"
157157
}
158158
},
@@ -318,7 +318,7 @@
318318
},
319319
"recording_page": {
320320
"loader": {
321-
"browser_startup": "Spinning up a browser...Navigating to {{url}}"
321+
"browser_startup": "Spinning up a browser...Hold tight"
322322
}
323323
},
324324
"integration_settings": {
@@ -613,6 +613,7 @@
613613
"es": "Spanish",
614614
"ja": "Japanese",
615615
"zh": "Chinese",
616-
"de": "German"
616+
"de": "German",
617+
"tr": "Turkish"
617618
}
618619
}

0 commit comments

Comments
 (0)