from cachetools import LRUCache
from cassandra import (OperationTimedOut, ReadFailure, ReadTimeout,
                       WriteFailure, WriteTimeout)
-from cassandra.concurrent import execute_concurrent_with_args
from cassandra.cqlengine.query import BatchQuery
-from w3lib.util import to_bytes, to_native_str

from frontera.contrib.backends import CreateOrModifyPageMixin
from frontera.contrib.backends.memory import MemoryStates
from frontera.utils.misc import chunks, get_crc32
from frontera.utils.url import parse_domain_from_url_fast
+from w3lib.util import to_bytes, to_native_str
+


def _retry(func):
    def func_wrapper(self, *args, **kwargs):
@@ -122,21 +122,26 @@ def flush(self, force_clear=False):


class Queue(BaseQueue):
-    def __init__(self, session, queue_cls, partitions, crawl_id, generate_stats, ordering='default'):
+
+    def __init__(self, session, queue_cls, partitions, ordering='default'):
        self.session = session
        self.queue_model = queue_cls
        self.logger = logging.getLogger("frontera.contrib.backends.cassandra.components.Queue")
        self.partitions = [i for i in range(0, partitions)]
        self.partitioner = Crc32NamePartitioner(self.partitions)
        self.ordering = ordering
+        self.batch = BatchQuery()

    def frontier_stop(self):
        pass

-    def _order_by(self):
+    def _order_by(self, query):
        if self.ordering == 'created':
-            return "created_at"
-        return "created_at"
+            return query.order_by('created_at')
+        if self.ordering == 'created_desc':
+            return query.order_by('-created_at')
+        return query.order_by('score', 'created_at')  # TODO: remove second parameter,
+        # it's not necessary for proper crawling, but needed for tests
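A note on the _order_by rework: the old version returned the column name "created_at" on both branches, so the ordering setting had no effect; the new version chains cqlengine's order_by() onto the queryset it receives and returns the result. A minimal editor's sketch of how the call sites compose, assuming a hypothetical QueueModel whose partition key is partition_id and whose clustering columns include score and created_at (cqlengine can only order by clustering columns):

    # Editor's sketch, not part of the commit; QueueModel and process() are placeholders.
    query = QueueModel.filter(partition_id=0).allow_filtering()
    query = query.order_by('score', 'created_at')  # what _order_by() returns for 'default'
    for item in query.limit(128):
        process(item)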

    def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
@@ -148,53 +153,19 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        """
        results = []
        try:
-            dequeued_urls = 0
-            cql_ditems = []
-            d_query = self.session.prepare("DELETE FROM queue WHERE crawl = ? AND fingerprint = ? AND partition_id = ? "
-                                           "AND score = ? AND created_at = ?")
-            for item in self.queue_model.objects.filter(crawl=self.crawl_id, partition_id=partition_id).\
-                    order_by("partition_id", "score", self._order_by()).limit(max_n_requests):
-                method = 'GET' if not item.method else item.method
-
-                meta_dict2 = dict((name, getattr(item.meta, name)) for name in dir(item.meta)
-                                  if not name.startswith('__'))
-                # TODO: how can the result be a dict and not an object? Objects raise an error while encoding for the message bus.
-                # Passing meta_dict2 directly to Request raises the same error.
-
-                meta_dict = dict()
-                meta_dict["fingerprint"] = meta_dict2["fingerprint"]
-                meta_dict["domain"] = meta_dict2["domain"]
-                meta_dict["origin_is_frontier"] = meta_dict2["origin_is_frontier"]
-                meta_dict["scrapy_callback"] = meta_dict2["scrapy_callback"]
-                meta_dict["scrapy_errback"] = meta_dict2["scrapy_errback"]
-                meta_dict["scrapy_meta"] = meta_dict2["scrapy_meta"]
-                meta_dict["score"] = meta_dict2["score"]
-                meta_dict["jid"] = meta_dict2["jid"]
-
-                r = Request(item.url, method=method, meta=meta_dict, headers=item.headers, cookies=item.cookies)
-                r.meta['fingerprint'] = item.fingerprint
-                r.meta['score'] = item.score
+            for item in self._order_by(self.queue_model.filter(partition_id=partition_id).allow_filtering()).limit(max_n_requests):
+                method = item.method or b'GET'
+                r = Request(item.url, method=method, meta=item.meta, headers=item.headers, cookies=item.cookies)
+                r.meta[b'fingerprint'] = to_bytes(item.fingerprint)
+                r.meta[b'score'] = item.score
                results.append(r)
-
-                cql_d = (item.crawl, item.fingerprint, item.partition_id, item.score, item.created_at)
-                cql_ditems.append(cql_d)
-                dequeued_urls += 1
-
-            if dequeued_urls > 0:
-                execute_concurrent_with_args(self.session, d_query, cql_ditems, concurrency=200)
-
-            self.counter_cls.cass_count({"dequeued_urls": dequeued_urls})
-
+                item.batch(self.batch).delete()
+            self.batch.execute()
        except Exception as exc:
            self.logger.exception(exc)
-
        return results
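The hunk above replaces the hand-built prepared DELETE plus execute_concurrent_with_args with cqlengine's BatchQuery: each item.batch(self.batch).delete() only queues a statement, and nothing reaches Cassandra until self.batch.execute() runs. A minimal sketch of the pattern, under the same assumption of a hypothetical QueueModel:

    from cassandra.cqlengine.query import BatchQuery

    b = BatchQuery()
    for item in QueueModel.filter(partition_id=0).allow_filtering().limit(10):
        item.batch(b).delete()  # queued in the batch, not yet executed
    b.execute()                 # one batched round-trip for all deletes

Note that the commit reuses a single BatchQuery created in __init__ rather than one per call, so execute() flushes whatever has accumulated on that shared instance.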

    def schedule(self, batch):
-        query = self.session.prepare("INSERT INTO queue (id, fingerprint, score, partition_id, host_crc32, url, "
-                                     "created_at, meta, depth, headers, method, cookies) "
-                                     "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
-        cql_items = []
        for fprint, score, request, schedule in batch:
            if schedule:
                _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
@@ -205,35 +176,23 @@ def schedule(self, batch):
                else:
                    partition_id = self.partitioner.partition(hostname, self.partitions)
                    host_crc32 = get_crc32(hostname)
-                created_at = time() * 1E+6
-
-                if "domain" not in request.meta:
-                    request.meta["domain"] = {}
-                if "origin_is_frontier" not in request.meta:
-                    request.meta["origin_is_frontier"] = ''
-                if "scrapy_callback" not in request.meta:
-                    request.meta["scrapy_callback"] = None
-                if "scrapy_errback" not in request.meta:
-                    request.meta["scrapy_errback"] = None
-                if "scrapy_meta" not in request.meta:
-                    request.meta["scrapy_meta"] = {}
-                if "score" not in request.meta:
-                    request.meta["score"] = 0
-                if "jid" not in request.meta:
-                    request.meta["jid"] = 0
-
-                cql_i = (uuid.uuid4(), fprint, score, partition_id, host_crc32, request.url, created_at,
-                         request.meta, 0, request.headers, request.method, request.cookies)
-                cql_items.append(cql_i)
-
-                request.meta['state'] = States.QUEUED
-
-        execute_concurrent_with_args(self.session, query, cql_items, concurrency=400)
-        self.counter_cls.cass_count({"queued_urls": len(cql_items)})
+                q = self.queue_model(id=uuid.uuid4(),
+                                     fingerprint=to_native_str(fprint),
+                                     score=score,
+                                     url=request.url,
+                                     meta=request.meta,
+                                     headers=request.headers,
+                                     cookies=request.cookies,
+                                     method=to_native_str(request.method),
+                                     partition_id=partition_id,
+                                     host_crc32=host_crc32,
+                                     created_at=time() * 1E+6)
+                q.batch(self.batch).save()
+                request.meta[b'state'] = States.QUEUED
+        self.batch.execute()
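schedule() now stores fingerprint and method as native strings via w3lib's to_native_str, while get_next_requests() converts fingerprints back with to_bytes, matching the b'fingerprint'/b'state' bytes keys frontera uses in request.meta. A small illustration of that round-trip (the fingerprint value is made up):

    from w3lib.util import to_bytes, to_native_str

    fprint = b'1d5920f4b44b27a802bd77c4f0536f5a'  # made-up fingerprint
    stored = to_native_str(fprint)  # native str for the Cassandra text column
    restored = to_bytes(stored)     # back to bytes for request.meta
    assert restored == fprint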

    def count(self):
-        count = self.queue_model.objects.filter().count()
-        return count
+        return self.queue_model.all().count()


class BroadCrawlingQueue(Queue):
@@ -265,12 +224,11 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs):
        while tries < self.GET_RETRIES:
            tries += 1
            limit *= 5.5 if tries > 1 else 1.0
-            self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d" %
-                              (tries, limit, count, len(queue.keys())))
+            self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d",
+                              tries, limit, count, len(queue.keys()))
            queue.clear()
            count = 0
-            for item in self.queue_model.objects.filter(crawl=self.crawl_id, partition_id=partition_id).\
-                    order_by("crawl", "score", self._order_by()).limit(limit):
+            for item in self._order_by(self.queue_model.filter(partition_id=partition_id)).limit(max_n_requests):
                if item.host_crc32 not in queue:
                    queue[item.host_crc32] = []
                if max_requests_per_host is not None and len(queue[item.host_crc32]) > max_requests_per_host:
@@ -284,13 +242,14 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs):
            if min_requests is not None and count < min_requests:
                continue
            break
-        self.logger.debug("Finished: tries %d, hosts %d, requests %d" % (tries, len(queue.keys()), count))
+        self.logger.debug("Finished: tries %d, hosts %d, requests %d", tries, len(queue.keys()), count)

        results = []
-        for items in queue.itervalues():
+        for items in six.itervalues(queue):
            for item in items:
-                method = 'GET' if not item.method else item.method
-                results.append(Request(item.url, method=method, meta=item.meta, headers=item.headers,
-                                       cookies=item.cookies))
-                item.delete()
+                method = item.method or b'GET'
+                results.append(Request(item.url, method=method,
+                                       meta=item.meta, headers=item.headers, cookies=item.cookies))
+                item.batch(self.batch).delete()
+        self.batch.execute()
        return results
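Two smaller cleanups in this class follow the rest of the commit: logger calls now pass format arguments lazily instead of pre-formatting with %, and the Python 2-only queue.itervalues() becomes six.itervalues(queue). A self-contained sketch of both:

    import logging
    import six

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)
    queue = {2130706433: ['item-a'], 3232235521: ['item-b']}

    # Lazy formatting: the string is interpolated only if DEBUG is enabled.
    logger.debug("hosts %d", len(queue))

    # six.itervalues maps to dict.itervalues() on Python 2 and dict.values() on Python 3.
    for items in six.itervalues(queue):
        print(items)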