@@ -1,5 +1,5 @@
 import re
-from time import time, sleep
+from time import time
 
 from grequests import AsyncRequest, get as grequests_get, map as grequests_map
 
@@ -14,19 +14,21 @@
 from crawlfrontier import Settings
 
 SETTINGS = Settings()
-SETTINGS.BACKEND = 'crawlfrontier.contrib.backends.memory.MemoryDFSOverusedBackend'
+SETTINGS.BACKEND = 'crawlfrontier.contrib.backends.memory.MemoryRandomOverusedBackend'
 SETTINGS.LOGGING_MANAGER_ENABLED = True
 SETTINGS.LOGGING_BACKEND_ENABLED = False
 SETTINGS.MAX_REQUESTS = 0
 SETTINGS.MAX_NEXT_REQUESTS = 40
 
 SEEDS = [
     'http://www.imdb.com',
+    'http://www.bbc.com/',
+    'http://www.amazon.com/'
 ]
 
 LINK_RE = re.compile(r'href="(.*?)"')
 
-class GRequestConverter(BaseRequestConverter):
+class GRequestsConverter(BaseRequestConverter):
     """Converts between crawlfrontier and grequests request objects"""
     @classmethod
     def to_frontier(cls, request):
@@ -41,7 +43,7 @@ def from_frontier(cls, request):
 
 
 class GRequestsFrontierManager(FrontierManagerWrapper):
-    request_converter_class = GRequestConverter
+    request_converter_class = GRequestsConverter
     response_converter_class = ResponseConverter
 
 
@@ -53,8 +55,7 @@ def on_request(self, request):
         key = get_slot_key(request, 'domain')
         self.stats[key] = time()
 
-    def get_overused_keys(self):
-        overused = []
+    def collect_overused_keys(self, overused):
         ts = time()
         for key, timestamp in self.stats.iteritems():
             if ts - timestamp < 5.0:  # querying each hostname with at least 5 seconds delay
@@ -65,6 +66,15 @@ def get_overused_keys(self):
 def extract_page_links(response):
     return [urljoin(response.url, link) for link in LINK_RE.findall(response.text)]
 
+
+"""
+The idea is to send requests to each domain with at least 5 seconds of delay. grequests only allows us to limit
+the number of simultaneous requests, so instead we perform a check on every frontier iteration and limit the
+contents of each new frontier batch by passing the overused keys in the DownloaderInfo. This yields roughly
+5-second delays per batch.
+"""
+
+
 if __name__ == '__main__':
 
     frontier = GRequestsFrontierManager(SETTINGS)
@@ -80,10 +90,9 @@ def callback(response, **kwargs):
         frontier.page_crawled(response=response, links=links)
 
         dl_info = DownloaderInfo()
-        dl_info._overused_keys = stats.get_overused_keys()
+        stats.collect_overused_keys(dl_info.overused_keys)
         next_requests = frontier.get_next_requests(downloader_info=dl_info)
         if not next_requests:
-            sleep(5)
             continue
 
         for r in next_requests:
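
For reference, below is a minimal, self-contained sketch of the per-host throttling idea described in the docstring added above. The HostnameStatistics here is a simplified stand-in reconstructed from the hunks in this diff rather than the example's full class, and it uses .items() so it runs on both Python 2 and 3 (the diff itself uses the Python 2 iteritems()):

from time import time


class HostnameStatistics(object):
    """Tracks the last request time per host key."""

    def __init__(self):
        self.stats = {}

    def on_request(self, key):
        # Remember when this host was last queried.
        self.stats[key] = time()

    def collect_overused_keys(self, overused):
        # Any host queried less than 5 seconds ago is still "overused",
        # so the frontier should keep it out of the next batch.
        ts = time()
        for key, timestamp in self.stats.items():
            if ts - timestamp < 5.0:
                overused.append(key)


stats = HostnameStatistics()
stats.on_request('www.imdb.com')

overused = []  # stands in for dl_info.overused_keys
stats.collect_overused_keys(overused)
print(overused)  # ['www.imdb.com'] while inside the 5-second window

Note the signature change the diff makes: instead of building and returning a fresh list from get_overused_keys() and assigning it to the private dl_info._overused_keys, collect_overused_keys(self, overused) fills a list the caller supplies, so the main loop can pass dl_info.overused_keys directly without touching a private attribute.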