Skip to content

Commit 51159a2

Browse files
committed
Resolve the domain to its second level before checking the blacklist
1 parent a26a0a9 commit 51159a2

File tree

1 file changed

+5
-2
lines changed

1 file changed

+5
-2
lines changed

frontera/worker/components/batch_generator.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from collections import defaultdict
77
from logging import DEBUG
88

9+
from bigbot_common.fingerprint import suffix_list
10+
911
from frontera.exceptions import NotConfigured
1012
from frontera.utils.url import parse_domain_from_url_fast
1113
from . import DBWorkerThreadComponent
@@ -103,8 +105,9 @@ def _is_domain_blacklisted(self, request):
103105
_, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url)
104106
if hostname:
105107
hostname = hostname.lower()
106-
if hostname in self.domains_blacklist:
107-
self.logger.debug("Dropping black-listed hostname, URL %s", request.url)
108+
second_level = suffix_list.get_public_suffix(hostname)
109+
if second_level in self.domains_blacklist:
110+
self.logger.debug("Dropping black-listed URL %s", request.url)
108111
return True
109112
return False
110113

0 commit comments

Comments
 (0)