Skip to content

Commit dcbfbb1

Browse files
committed
Tweaks and annotation.
1 parent 79fb1bd commit dcbfbb1

File tree

1 file changed

+17
-8
lines changed

1 file changed

+17
-8
lines changed

examples/grequests/links_follower.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import re
2-
from time import time, sleep
2+
from time import time
33

44
from grequests import AsyncRequest, get as grequests_get, map as grequests_map
55

@@ -14,19 +14,21 @@
1414
from crawlfrontier import Settings
1515

1616
SETTINGS = Settings()
17-
SETTINGS.BACKEND = 'crawlfrontier.contrib.backends.memory.MemoryDFSOverusedBackend'
17+
SETTINGS.BACKEND = 'crawlfrontier.contrib.backends.memory.MemoryRandomOverusedBackend'
1818
SETTINGS.LOGGING_MANAGER_ENABLED = True
1919
SETTINGS.LOGGING_BACKEND_ENABLED = False
2020
SETTINGS.MAX_REQUESTS = 0
2121
SETTINGS.MAX_NEXT_REQUESTS = 40
2222

2323
SEEDS = [
2424
'http://www.imdb.com',
25+
'http://www.bbc.com/',
26+
'http://www.amazon.com/'
2527
]
2628

2729
LINK_RE = re.compile(r'href="(.*?)"')
2830

29-
class GRequestConverter(BaseRequestConverter):
31+
class GRequestsConverter(BaseRequestConverter):
3032
"""Converts between crawlfrontier and grequests request objects"""
3133
@classmethod
3234
def to_frontier(cls, request):
@@ -41,7 +43,7 @@ def from_frontier(cls, request):
4143

4244

4345
class GRequestsFrontierManager(FrontierManagerWrapper):
44-
request_converter_class = GRequestConverter
46+
request_converter_class = GRequestsConverter
4547
response_converter_class = ResponseConverter
4648

4749

@@ -53,8 +55,7 @@ def on_request(self, request):
5355
key = get_slot_key(request, 'domain')
5456
self.stats[key] = time()
5557

56-
def get_overused_keys(self):
57-
overused = []
58+
def collect_overused_keys(self, overused):
5859
ts = time()
5960
for key, timestamp in self.stats.iteritems():
6061
if ts - timestamp < 5.0: # querying each hostname with at least 5 seconds delay
@@ -65,6 +66,15 @@ def get_overused_keys(self):
6566
def extract_page_links(response):
6667
return [urljoin(response.url, link) for link in LINK_RE.findall(response.text)]
6768

69+
70+
"""
71+
The idea is to send requests to each domain with at least 5 seconds of delay. grequests only allows us to limit the
72+
number of simultaneous requests. So, we are basically performing checks every frontier iteration and limiting the contents
73+
of new frontier batch by sending overused keys in DownloaderInfo. Therefore, we end up with at least 5 seconds of delay per
74+
batch.
75+
"""
76+
77+
6878
if __name__ == '__main__':
6979

7080
frontier = GRequestsFrontierManager(SETTINGS)
@@ -80,10 +90,9 @@ def callback(response, **kwargs):
8090
frontier.page_crawled(response=response, links=links)
8191

8292
dl_info = DownloaderInfo()
83-
dl_info._overused_keys = stats.get_overused_keys()
93+
stats.collect_overused_keys(dl_info.overused_keys)
8494
next_requests = frontier.get_next_requests(downloader_info=dl_info)
8595
if not next_requests:
86-
sleep(5)
8796
continue
8897

8998
for r in next_requests:

0 commit comments

Comments
 (0)