From 51159a20236566ed5df78345845bc52e195c2770 Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 24 Apr 2018 10:40:53 +0200 Subject: [PATCH 1/2] resolving domain to sec level before checking black list --- frontera/worker/components/batch_generator.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py index 78f97747a..7e84fa379 100644 --- a/frontera/worker/components/batch_generator.py +++ b/frontera/worker/components/batch_generator.py @@ -6,6 +6,8 @@ from collections import defaultdict from logging import DEBUG +from bigbot_common.fingerprint import suffix_list + from frontera.exceptions import NotConfigured from frontera.utils.url import parse_domain_from_url_fast from . import DBWorkerThreadComponent @@ -103,8 +105,9 @@ def _is_domain_blacklisted(self, request): _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) if hostname: hostname = hostname.lower() - if hostname in self.domains_blacklist: - self.logger.debug("Dropping black-listed hostname, URL %s", request.url) + second_level = suffix_list.get_public_suffix(hostname) + if second_level in self.domains_blacklist: + self.logger.debug("Dropping black-listed URL %s", request.url) return True return False From 205faf86c52b97ca5723269b0f8cc4483339bfdd Mon Sep 17 00:00:00 2001 From: Alexander Sibiryakov Date: Tue, 24 Apr 2018 11:01:24 +0200 Subject: [PATCH 2/2] checking hostname too --- frontera/worker/components/batch_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py index 7e84fa379..8474c1b69 100644 --- a/frontera/worker/components/batch_generator.py +++ b/frontera/worker/components/batch_generator.py @@ -106,7 +106,7 @@ def _is_domain_blacklisted(self, request): if hostname: hostname = hostname.lower() second_level = suffix_list.get_public_suffix(hostname) - if second_level in self.domains_blacklist: + if second_level in self.domains_blacklist or hostname in self.domains_blacklist: self.logger.debug("Dropping black-listed URL %s", request.url) return True return False