Skip to content

Commit e2d4073

Browse files
authored
Merge pull request #694 from getmaxun/child-shadow
feat(maxun-core): child extraction + deep shadow dom
2 parents fd7d0df + bc94690 commit e2d4073

File tree

7 files changed

+2291
-1159
lines changed

7 files changed

+2291
-1159
lines changed

maxun-core/src/browserSide/scraper.js

Lines changed: 201 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -424,26 +424,214 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
424424
*/
425425
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
426426
// XPath evaluation functions
427-
const evaluateXPath = (rootElement, xpath) => {
427+
/**
 * Collects the descendants of `context` that satisfy a single XPath step.
 *
 * The step is broken into a tag name and bracketed predicates by
 * `parseXPathPart`; candidates are fetched with
 * `context.querySelectorAll(tagName)` and kept only when
 * `elementMatchesConditions` accepts them.
 *
 * @param {Document|ShadowRoot|Element} context - Node whose descendants are searched.
 * @param {string} part - One XPath step, e.g. `div[@class="card"]`.
 * @returns {Element[]} Matching elements; an empty array when nothing matches
 *   or when any error is thrown (the error is logged, not rethrown).
 */
const queryInsideContext = (context, part) => {
  try {
    const parsed = parseXPathPart(part);
    const survivors = [];

    for (const candidate of context.querySelectorAll(parsed.tagName)) {
      if (elementMatchesConditions(candidate, parsed.conditions)) {
        survivors.push(candidate);
      }
    }

    return survivors;
  } catch (err) {
    console.error("Error in queryInsideContext:", err);
    return [];
  }
};
446+
447+
/**
 * Splits one XPath step into its tag name and its bracketed predicates.
 *
 * @param {string} part - A single step such as `div[@class="card"][2]`.
 * @returns {{ tagName: string, conditions: string[] }} `tagName` is the
 *   leading name (letters, digits, `-`, `_`) or `"*"` when the step does not
 *   start with one; `conditions` holds the text inside each `[...]`, in order,
 *   without the brackets.
 */
const parseXPathPart = (part) => {
  // `_` is allowed so names such as `my_widget` are not truncated at the
  // underscore.
  const tagMatch = part.match(/^([a-zA-Z0-9_-]+)/);
  const tagName = tagMatch ? tagMatch[1] : "*";

  // A predicate body is a run of quoted strings and ordinary characters.
  // Matching quoted strings as a unit keeps a `]` inside a value (for example
  // `[@data-k="a]b"]`) from ending the predicate early. The trailing `["']`
  // alternative lets a stray, unpaired quote be consumed as a plain character.
  const predicatePattern = /\[((?:"[^"]*"|'[^']*'|[^\]"']|["'])+)\]/g;
  const conditions = Array.from(
    part.matchAll(predicatePattern),
    (match) => match[1]
  );

  return { tagName, conditions };
};
459+
460+
/**
 * Reports whether `element` satisfies every predicate in `conditions`.
 *
 * Evaluation stops at the first predicate that `elementMatchesCondition`
 * rejects. An empty list is satisfied by any element.
 *
 * @param {Element} element - Element under test.
 * @param {Iterable<string>} conditions - Predicate bodies (bracket contents).
 * @returns {boolean} `true` when no predicate rejects the element.
 */
const elementMatchesConditions = (element, conditions) =>
  Array.from(conditions).every((predicate) =>
    elementMatchesCondition(element, predicate)
  );
469+
470+
/**
 * Checks one XPath predicate body (the text between `[` and `]`) against an
 * element.
 *
 * Recognised forms:
 *   - `N` (digits only)              -> always `true`; this function does not
 *                                       apply positions
 *   - `@attr="v"`                     -> exact attribute match
 *   - `contains(@class, "v")`         -> `classList.contains(v)`, i.e. a whole
 *                                       class token, not a substring
 *   - `contains(@attr, "v")`          -> substring of the attribute value
 *   - `text()="v"`                    -> trimmed `textContent` equals `v`
 *   - `contains(text(), "v")`         -> trimmed `textContent` includes `v`
 *   - `count(*)=N`                    -> element has exactly N child elements
 *
 * String values may be wrapped in single or double quotes, may be empty, and
 * may contain the other quote character (`@title="it's"`). Any predicate that
 * fits none of the forms above (for example one joined with `and`) is treated
 * as satisfied.
 *
 * @param {Element} element - Element under test.
 * @param {string} condition - Predicate body without the surrounding brackets.
 * @returns {boolean} Whether the element satisfies the predicate.
 */
const elementMatchesCondition = (element, condition) => {
  const predicate = condition.trim();

  // Bare positional index: always accepted here.
  if (/^\d+$/.test(predicate)) {
    return true;
  }

  // In the patterns below, `(["'])((?:(?!\N)[\s\S])*)\N` captures a quoted
  // value that is closed by the same quote that opened it and may be empty.

  // @attribute="value"
  const attrEquals = predicate.match(/^@([^=]+)=(["'])((?:(?!\2)[\s\S])*)\2$/);
  if (attrEquals) {
    return element.getAttribute(attrEquals[1]) === attrEquals[3];
  }

  // contains(@class, 'value') - compared as a class token via classList.
  const classContains = predicate.match(
    /^contains\(@class,\s*["']([^"']+)["']\)$/
  );
  if (classContains) {
    return element.classList.contains(classContains[1]);
  }

  // contains(@attribute, 'value') - substring match; a missing attribute is
  // treated as the empty string.
  const attrContains = predicate.match(
    /^contains\(@([^,]+),\s*(["'])((?:(?!\2)[\s\S])*)\2\)$/
  );
  if (attrContains) {
    const actual = element.getAttribute(attrContains[1]) ?? "";
    return actual.includes(attrContains[3]);
  }

  // text()="value"
  const textEquals = predicate.match(/^text\(\)=(["'])((?:(?!\1)[\s\S])*)\1$/);
  if (textEquals) {
    const actualText = element.textContent?.trim() || "";
    return actualText === textEquals[2];
  }

  // contains(text(), 'value')
  const textContains = predicate.match(
    /^contains\(text\(\),\s*(["'])((?:(?!\1)[\s\S])*)\1\)$/
  );
  if (textContains) {
    const actualText = element.textContent?.trim() || "";
    return actualText.includes(textContains[2]);
  }

  // count(*)=N - N = 0 means the element has no child elements.
  const childCount = predicate.match(/^count\(\*\)=(\d+)$/);
  if (childCount) {
    return element.children.length === Number.parseInt(childCount[1], 10);
  }

  // Unsupported predicate: accept rather than reject.
  return true;
};
537+
538+
/**
 * Resolves an XPath expression to a single node.
 *
 * Without `isShadow`, the expression is handed to the native
 * `document.evaluate` and the first node in document order is returned.
 *
 * With `isShadow`, the expression is walked step by step using
 * `queryInsideContext`, so that matching can continue into shadow roots
 * exposed through `element.shadowRoot`:
 *   - an optional outer `(path)[n]suffix` wrapper is unwrapped to
 *     `path + suffix`, and `n` (1-based) is applied to the elements matched
 *     by the last step;
 *   - the path is split on `/` separators that lie outside `[...]`
 *     predicates; every step is a descendant search from each current
 *     context, so `/` and `//` behave the same;
 *   - a trailing `[n]` on a step selects the n-th match (1-based) within each
 *     context;
 *   - after each step, the matched elements and their shadow roots become the
 *     contexts for the next step.
 *
 * @param {Document} document - Document to evaluate against. Note that this
 *   parameter shadows the global `document` inside the function.
 * @param {string} xpath - XPath expression.
 * @param {boolean} [isShadow=false] - Use the shadow-DOM-aware traversal.
 * @returns {Node|null} The matched node, or `null` when nothing matches, an
 *   index is out of range, or an error is thrown (the error is logged).
 */
const evaluateXPath = (document, xpath, isShadow = false) => {
  // Splits a path into steps on "/" characters that are outside predicates.
  // Quotes are tracked inside predicates so brackets in a quoted value do not
  // change the nesting depth.
  const splitSteps = (path) => {
    const steps = [];
    let current = "";
    let depth = 0;
    let quote = null;

    for (const ch of path) {
      if (depth === 0 && ch === "/") {
        steps.push(current);
        current = "";
        continue;
      }

      if (quote !== null) {
        if (ch === quote) quote = null;
      } else if (depth > 0 && (ch === '"' || ch === "'")) {
        quote = ch;
      } else if (ch === "[") {
        depth += 1;
      } else if (ch === "]" && depth > 0) {
        depth -= 1;
      }

      current += ch;
    }
    steps.push(current);

    return steps.map((step) => step.trim()).filter((step) => step.length > 0);
  };

  try {
    if (!isShadow) {
      // Native evaluation is only needed (and only attempted) in this mode.
      return document.evaluate(
        xpath,
        document,
        null,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null
      ).singleNodeValue;
    }

    const indexedMatch = xpath.match(/^\((.*?)\)\[(\d+)\](.*)$/);
    const cleanPath = indexedMatch
      ? indexedMatch[1] + indexedMatch[3]
      : xpath;

    const pathParts = splitSteps(cleanPath);

    let currentContexts = [document];
    // Elements matched by the most recent step, without shadow roots or
    // duplicates. Starts as the document so an empty path resolves to it.
    let matchedElements = [document];

    for (const part of pathParts) {
      // A trailing [n] is positional even when other predicates precede it.
      const positionalMatch = part.match(/^([\s\S]+)\[(\d+)\]$/);
      const partWithoutPosition = positionalMatch ? positionalMatch[1] : part;
      const requestedPosition = positionalMatch
        ? Number.parseInt(positionalMatch[2], 10)
        : null;

      // A Set drops elements reached through more than one context.
      const stepElements = new Set();

      for (const ctx of currentContexts) {
        const matched = queryInsideContext(ctx, partWithoutPosition);

        let elementsToAdd = matched;
        if (requestedPosition !== null) {
          const index = requestedPosition - 1; // XPath is 1-based, arrays are 0-based
          if (index >= 0 && index < matched.length) {
            elementsToAdd = [matched[index]];
          } else {
            console.warn(
              `Position ${requestedPosition} out of range (${matched.length} elements found)`
            );
            elementsToAdd = [];
          }
        }

        for (const el of elementsToAdd) {
          stepElements.add(el);
        }
      }

      if (stepElements.size === 0) {
        return null;
      }

      matchedElements = [...stepElements];
      // The next step searches each element and, when present, its shadow root.
      currentContexts = matchedElements.flatMap((el) =>
        el.shadowRoot ? [el, el.shadowRoot] : [el]
      );
    }

    if (indexedMatch) {
      const requestedIndex = Number.parseInt(indexedMatch[2], 10) - 1;
      if (requestedIndex >= 0 && requestedIndex < matchedElements.length) {
        return matchedElements[requestedIndex];
      }
      console.warn(
        `Requested index ${requestedIndex + 1} out of range (${matchedElements.length} elements found)`
      );
      return null;
    }

    return matchedElements[0];
  } catch (err) {
    console.error("Critical XPath failure:", xpath, err);
    return null;
  }
};
@@ -1018,7 +1206,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
10181206
listSelector,
10191207
containerIndex + 1
10201208
);
1021-
element = evaluateXPath(document, indexedSelector);
1209+
element = evaluateXPath(document, indexedSelector, field.isShadow);
10221210
} else {
10231211
// Fallback for CSS selectors within XPath containers
10241212
const container = containers[containerIndex];

0 commit comments

Comments
 (0)