@@ -424,26 +424,214 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
424
424
*/
425
425
window . scrapeList = async function ( { listSelector, fields, limit = 10 } ) {
426
426
// XPath evaluation functions
427
- const evaluateXPath = ( rootElement , xpath ) => {
427
/**
 * Finds the elements under `context` that satisfy one XPath step.
 *
 * The step is parsed with `parseXPathPart`; every element returned by
 * `context.querySelectorAll(tagName)` is then kept only if
 * `elementMatchesConditions` accepts it.
 *
 * @param {Document|Element|ShadowRoot} context - Any node exposing `querySelectorAll`.
 * @param {string} part - A single XPath step, e.g. `div[@class="item"]`.
 * @returns {Element[]} Matching elements; an empty array when nothing matches
 *   or when any error is thrown (the error is logged, not rethrown).
 */
const queryInsideContext = (context, part) => {
  try {
    const parsedStep = parseXPathPart(part);
    const survivors = [];

    for (const node of Array.from(context.querySelectorAll(parsedStep.tagName))) {
      if (elementMatchesConditions(node, parsedStep.conditions)) {
        survivors.push(node);
      }
    }

    return survivors;
  } catch (err) {
    // Best effort: a failing step contributes no elements instead of aborting the caller.
    console.error("Error in queryInsideContext:", err);
    return [];
  }
};
446
+
447
/**
 * Splits one XPath step into its tag name and the raw text of each
 * `[...]` predicate.
 *
 * Predicates are scanned quote-aware, so a `]` inside a quoted value
 * (e.g. `a[@href="x[1]"]`) no longer truncates the predicate.
 *
 * @param {string} part - A single XPath step such as `div[@id="a"][2]`.
 * @returns {{tagName: string, conditions: string[]}} `tagName` is the leading
 *   run of letters, digits, `_` and `-`, or `"*"` when the step does not start
 *   with one; `conditions` holds each predicate without its surrounding brackets.
 */
const parseXPathPart = (part) => {
  const tagMatch = part.match(/^([a-zA-Z0-9_-]+)/);
  const tagName = tagMatch ? tagMatch[1] : "*";

  // A predicate body is any mix of non-quote/non-bracket characters and
  // complete quoted strings; the alternatives are mutually exclusive on their
  // first character, so matching stays linear.
  const predicatePattern = /\[((?:[^\]"']|"[^"]*"|'[^']*')+)\]/g;
  const conditions = Array.from(part.matchAll(predicatePattern), (m) => m[1]);

  return { tagName, conditions };
};
459
+
460
/**
 * Reports whether `element` passes every predicate in `conditions`.
 *
 * Each predicate is delegated to `elementMatchesCondition`; evaluation stops
 * at the first predicate that fails. An empty list is trivially satisfied.
 *
 * @param {Element} element - Element under test.
 * @param {string[]} conditions - Predicate texts without surrounding brackets.
 * @returns {boolean} `true` when no predicate rejects the element.
 */
const elementMatchesConditions = (element, conditions) =>
  conditions.every((predicate) => elementMatchesCondition(element, predicate));
469
+
470
/**
 * Evaluates a single XPath predicate (text between `[` and `]`) against an
 * element, using a small hand-written subset of XPath.
 *
 * Recognised forms:
 *  - `N` (digits only)            -> always `true`; this function does not apply positions
 *  - `@attr="v"` / `@attr='v'`    -> attribute equals `v` (empty `v` allowed)
 *  - `contains(@class, 'v')`      -> `classList.contains(v)`, i.e. a whole
 *                                    class-token match, not a substring match
 *                                    (NOTE(review): differs from native XPath
 *                                    `contains`; confirm this is intended)
 *  - `contains(@attr, 'v')`       -> attribute value includes `v`
 *  - `text()="v"`                 -> trimmed `textContent` equals `v`
 *  - `contains(text(), 'v')`      -> trimmed `textContent` includes `v`
 *  - `count(*)=N`                 -> element has exactly N element children
 *
 * Quoted values are delimited by the same quote character that opened them,
 * so `@title="it's"` is parsed correctly instead of falling through to the
 * permissive default.
 *
 * @param {Element} element - Element under test.
 * @param {string} condition - Predicate text without surrounding brackets.
 * @returns {boolean} Result of the predicate; any unrecognised predicate
 *   yields `true` (permissive best effort).
 */
const elementMatchesCondition = (element, condition) => {
  const predicate = condition.trim();

  // Purely numeric predicate: a position, not a filter.
  if (/^\d+$/.test(predicate)) {
    return true;
  }

  // @attribute="value" — value may be empty and may contain the other quote.
  const attrEquals = predicate.match(/^@([^=]+)=(["'])((?:(?!\2)[\s\S])*)\2$/);
  if (attrEquals) {
    const [, attr, , value] = attrEquals;
    return element.getAttribute(attr.trim()) === value;
  }

  // contains(@class, 'value') — must be tested before the generic attribute form.
  const classContains = predicate.match(
    /^contains\(@class,\s*(["'])((?:(?!\1)[\s\S])+)\1\)$/
  );
  if (classContains) {
    return element.classList.contains(classContains[2]);
  }

  // contains(@attribute, 'value')
  const attrContains = predicate.match(
    /^contains\(@([^,]+),\s*(["'])((?:(?!\2)[\s\S])+)\2\)$/
  );
  if (attrContains) {
    const [, attr, , value] = attrContains;
    const elementValue = element.getAttribute(attr.trim()) ?? "";
    return elementValue.includes(value);
  }

  // text()="value"
  const textEquals = predicate.match(/^text\(\)=(["'])((?:(?!\1)[\s\S])+)\1$/);
  if (textEquals) {
    const elementText = element.textContent?.trim() || "";
    return elementText === textEquals[2];
  }

  // contains(text(), 'value')
  const textContains = predicate.match(
    /^contains\(text\(\),\s*(["'])((?:(?!\1)[\s\S])+)\1\)$/
  );
  if (textContains) {
    const elementText = element.textContent?.trim() || "";
    return elementText.includes(textContains[2]);
  }

  // count(*)=N — covers the "no children" case (N = 0) as well.
  const countMatch = predicate.match(/^count\(\*\)=(\d+)$/);
  if (countMatch) {
    return element.children.length === Number.parseInt(countMatch[1], 10);
  }

  // Unknown predicate: do not filter the element out.
  return true;
};
537
+
538
/**
 * Resolves an XPath expression to a single node.
 *
 * Without `isShadow`, the expression is handed to the native
 * `document.evaluate` and the first ordered node is returned.
 *
 * With `isShadow`, native evaluation is skipped and the expression is walked
 * step by step with `queryInsideContext`, descending into `shadowRoot`s of
 * matched elements. Notes on the shadow walk:
 *  - every `/` or `//` separator is treated as "any descendant" (each step is
 *    resolved via `queryInsideContext` on the current contexts);
 *  - steps are split on `/` only outside quotes and `[...]`, so values such as
 *    `@href="/a/b"` stay in one step;
 *  - a leading `(expr)[n]` selects the n-th element matched by `expr` (shadow
 *    roots and duplicates are not counted) and the remaining path is resolved
 *    inside that element;
 *  - a step of the exact form `tag[n]` picks the n-th match per context.
 *    NOTE(review): a position that follows other predicates
 *    (`tag[@a="b"][2]`) is not applied here — confirm whether that is wanted.
 *
 * @param {Document} document - Document (or document-like root) to search.
 * @param {string} xpath - XPath expression.
 * @param {boolean} [isShadow=false] - Use the shadow-DOM aware walk.
 * @returns {Node|null} The resolved node, or `null` when nothing matches or an
 *   error is thrown (the error is logged).
 */
const evaluateXPath = (document, xpath, isShadow = false) => {
  // Splits a path into steps, ignoring "/" inside quotes or [...] predicates.
  const splitSteps = (path) => {
    const steps = [];
    let current = "";
    let quote = null;
    let depth = 0;

    for (const ch of path) {
      if (quote !== null) {
        if (ch === quote) {
          quote = null;
        }
      } else if (ch === '"' || ch === "'") {
        quote = ch;
      } else if (ch === "[") {
        depth += 1;
      } else if (ch === "]") {
        depth = Math.max(0, depth - 1);
      } else if (ch === "/" && depth === 0) {
        steps.push(current);
        current = "";
        continue;
      }
      current += ch;
    }
    steps.push(current);

    return steps.map((s) => s.trim()).filter((s) => s.length > 0);
  };

  // Search roots for the next step: each element plus its shadow root, if any.
  const withShadowRoots = (elements) =>
    elements.flatMap((el) => (el.shadowRoot ? [el, el.shadowRoot] : [el]));

  // Resolves `steps` starting from `startContexts`; returns the unique
  // elements matched by the last step (or `startContexts` when there are no steps).
  const walkSteps = (startContexts, steps) => {
    let contexts = startContexts;
    let elements = startContexts;

    for (const step of steps) {
      const positionalMatch = step.match(/^([^[]+)\[(\d+)\]$/);
      const stepWithoutPosition = positionalMatch ? positionalMatch[1] : step;
      const requestedPosition = positionalMatch
        ? Number.parseInt(positionalMatch[2], 10)
        : null;

      const found = new Set();
      for (const ctx of contexts) {
        const matched = queryInsideContext(ctx, stepWithoutPosition);

        if (requestedPosition === null) {
          matched.forEach((el) => found.add(el));
          continue;
        }

        const index = requestedPosition - 1; // XPath is 1-based, arrays are 0-based
        if (index >= 0 && index < matched.length) {
          found.add(matched[index]);
        } else {
          console.warn(
            `Position ${requestedPosition} out of range (${matched.length} elements found)`
          );
        }
      }

      if (found.size === 0) {
        return [];
      }

      elements = [...found];
      contexts = withShadowRoots(elements);
    }

    return elements;
  };

  try {
    if (!isShadow) {
      return document.evaluate(
        xpath,
        document,
        null,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null
      ).singleNodeValue;
    }

    const indexedMatch = xpath.match(/^\((.*?)\)\[(\d+)\](.*)$/);
    if (!indexedMatch) {
      const [first = null] = walkSteps([document], splitSteps(xpath));
      return first;
    }

    const [, innerPath, indexText, suffixPath] = indexedMatch;
    const candidates = walkSteps([document], splitSteps(innerPath));
    if (candidates.length === 0) {
      return null;
    }

    const requestedIndex = Number.parseInt(indexText, 10) - 1;
    if (requestedIndex < 0 || requestedIndex >= candidates.length) {
      console.warn(
        `Requested index ${requestedIndex + 1} out of range (${candidates.length} elements found)`
      );
      return null;
    }

    const selected = candidates[requestedIndex];
    const suffixSteps = splitSteps(suffixPath);
    if (suffixSteps.length === 0) {
      return selected;
    }

    const [first = null] = walkSteps(withShadowRoots([selected]), suffixSteps);
    return first;
  } catch (err) {
    console.error("Critical XPath failure:", xpath, err);
    return null;
  }
};
@@ -1018,7 +1206,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
1018
1206
listSelector ,
1019
1207
containerIndex + 1
1020
1208
) ;
1021
- element = evaluateXPath ( document , indexedSelector ) ;
1209
+ element = evaluateXPath ( document , indexedSelector , field . isShadow ) ;
1022
1210
} else {
1023
1211
// Fallback for CSS selectors within XPath containers
1024
1212
const container = containers [ containerIndex ] ;
0 commit comments