 # -*- coding: utf-8 -*-
-import json
 import logging
-from datetime import datetime, timedelta
+import uuid
+from datetime import timedelta
 from time import time
 
-from cassandra.cqlengine import columns
-from cassandra.cqlengine.models import Model
+from cassandra.cqlengine.management import sync_table
+from cassandra.cqlengine.query import BatchQuery
+from w3lib.util import to_native_str
 
 from frontera import Request
+from frontera.contrib.backends import CommonRevisitingStorageBackendMixin
 from frontera.contrib.backends.cassandra import CassandraBackend
+from frontera.contrib.backends.cassandra.models import RevisitingQueueModel
 from frontera.contrib.backends.partitioners import Crc32NamePartitioner
 from frontera.core.components import Queue as BaseQueue
 from frontera.core.components import States
-from frontera.utils.misc import get_crc32
+from frontera.utils.misc import get_crc32, utcnow_timestamp
 from frontera.utils.url import parse_domain_from_url_fast
 
 
-class RevisitingQueueModel(Model):
-    __table_name__ = 'revisiting_queue'
-
-    crawl_at = columns.DateTime(required=True, default=datetime.now(), index=True)
-
-
 class RevisitingQueue(BaseQueue):
-    def __init__(self, session, queue_cls, partitions):
-        self.session = session()
+    def __init__(self, queue_cls, partitions):
         self.queue_model = queue_cls
-        self.logger = logging.getLogger("frontera.contrib.backends.sqlalchemy.revisiting.RevisitingQueue")
+        self.logger = logging.getLogger("frontera.contrib.backends.cassandra.revisiting.RevisitingQueue")
         self.partitions = [i for i in range(0, partitions)]
         self.partitioner = Crc32NamePartitioner(self.partitions)
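+        # queue mutations below are collected into one cqlengine batch and only
+        # hit Cassandra when batch.execute() is called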
+        self.batch = BatchQuery()
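+        # create (or sync) the backing queue table before it is used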
+        sync_table(queue_cls)
 
     def frontier_stop(self):
         pass
 
     def get_next_requests(self, max_n_requests, partition_id, **kwargs):
         results = []
         try:
-            for item in self.queue_model.objects.filter(crawl_at=datetime.utcnow(), partition_id=partition_id).\
-                    limit(max_n_requests):
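+            # only pick requests whose scheduled revisit time has already passed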
+            for item in self.queue_model.objects.filter(partition_id=partition_id,
+                                                        crawl_at__lte=utcnow_timestamp()).limit(max_n_requests):
                 method = 'GET' if not item.method else item.method
                 results.append(Request(item.url, method=method, meta=item.meta, headers=item.headers,
                                        cookies=item.cookies))
-                item.delete()
+                item.batch(self.batch).delete()
+            self.batch.execute()
         except Exception as exc:
             self.logger.exception(exc)
         return results
 
     def schedule(self, batch):
-        for fprint, score, request, schedule_at in batch:
-            if schedule_at:
+        for fprint, score, request, schedule in batch:
+            if schedule:
                 _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
                 if not hostname:
                     self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint))
@@ -57,65 +56,46 @@ def schedule(self, batch):
                 else:
                     partition_id = self.partitioner.partition(hostname, self.partitions)
                     host_crc32 = get_crc32(hostname)
-                created_at = time()*1E+6
-                q = self._create_queue(request, fprint, score, partition_id, host_crc32, created_at)
-
-                q.save()
-                request.meta['state'] = States.QUEUED
-
-    def _create_queue(self, obj, fingerprint, score, partition_id, host_crc32, created_at):
-        db_queue = self.queue_model()
-        db_queue.fingerprint = fingerprint
-        db_queue.score = score
-        db_queue.partition_id = partition_id
-        db_queue.host_crc32 = host_crc32
-        db_queue.url = obj.url
-        db_queue.created_at = created_at
-
-        new_dict = {}
-        for kmeta, vmeta in obj.meta.iteritems():
-            if type(vmeta) is dict:
-                new_dict[kmeta] = json.dumps(vmeta)
-            else:
-                new_dict[kmeta] = str(vmeta)
-
-        db_queue.meta = new_dict
-        db_queue.depth = 0
-
-        db_queue.headers = obj.headers
-        db_queue.method = obj.method
-        db_queue.cookies = obj.cookies
-
-        return db_queue
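+                # an explicit b'crawl_at' in request.meta wins; otherwise revisit "now"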
+                schedule_at = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else utcnow_timestamp()
+                q = self._create_queue_obj(fprint, score, request, partition_id, host_crc32, schedule_at)
+                q.batch(self.batch).save()
+                request.meta[b'state'] = States.QUEUED
+        self.batch.execute()
+
+    def _create_queue_obj(self, fprint, score, request, partition_id, host_crc32, schedule_at):
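+        # created_at is stored as microseconds since the epoch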
+        q = self.queue_model(id=uuid.uuid4(),
+                             fingerprint=to_native_str(fprint),
+                             score=score,
+                             url=request.url,
+                             meta=request.meta,
+                             headers=request.headers,
+                             cookies=request.cookies,
+                             method=to_native_str(request.method),
+                             partition_id=partition_id,
+                             host_crc32=host_crc32,
+                             created_at=time() * 1E+6,
+                             crawl_at=schedule_at)
+        return q
 
     def count(self):
-        return self.session.query(self.queue_model).count()
+        return self.queue_model.all().count()
 
 
-class Backend(CassandraBackend):
+class Backend(CommonRevisitingStorageBackendMixin, CassandraBackend):
 
     def _create_queue(self, settings):
-        self.interval = settings.get("SQLALCHEMYBACKEND_REVISIT_INTERVAL")
+        self.interval = settings.get("CASSANDRABACKEND_REVISIT_INTERVAL")
         assert isinstance(self.interval, timedelta)
-        return RevisitingQueue(self.session, RevisitingQueueModel, settings.get('SPIDER_FEED_PARTITIONS'))
-
-    def _schedule(self, requests):
-        batch = []
-        queue_incr = 0
-        for request in requests:
-            if request.meta['state'] in [States.NOT_CRAWLED, None]:
-                schedule_at = datetime.utcnow()
-            elif request.meta['state'] in [States.CRAWLED, States.ERROR]:
-                schedule_at = datetime.utcnow() + self.interval
-            else:  # QUEUED
-                schedule_at = None
-            batch.append((request.meta['fingerprint'], self._get_score(request), request, schedule_at))
-            if schedule_at:
-                queue_incr += 1
-        self.queue.schedule(batch)
-        self.metadata.update_score(batch)
-        self.queue_size += queue_incr
-
-    def page_crawled(self, response, links):
-        super(Backend, self).page_crawled(response, links)
-        self._schedule([response.request])
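+        # scheduling now works with plain epoch timestamps, so keep the interval as seconds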
+        self.interval = self.interval.total_seconds()
+        return RevisitingQueue(RevisitingQueueModel, settings.get('SPIDER_FEED_PARTITIONS'))
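
A minimal sketch of the settings this backend expects; the setting names come from the code above, the values are illustrative assumptions only:

    # settings.py (illustrative values, not part of the commit)
    from datetime import timedelta

    BACKEND = 'frontera.contrib.backends.cassandra.revisiting.Backend'
    CASSANDRABACKEND_REVISIT_INTERVAL = timedelta(days=1)  # must be a timedelta, per the assert above
    SPIDER_FEED_PARTITIONS = 2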