diff --git a/requirements.txt b/requirements.txt
index bcc49e2..30f5d09 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,4 +16,5 @@ dynaconf
 pypdf
 numpy==1.24
 wrapt>=1.14,<1.15
-scrapy
\ No newline at end of file
+scrapy
+twisted
\ No newline at end of file
diff --git a/web_scraper/web_scraper/spiders/generic.py b/web_scraper/web_scraper/spiders/generic.py
index 6cfc93b..013015d 100644
--- a/web_scraper/web_scraper/spiders/generic.py
+++ b/web_scraper/web_scraper/spiders/generic.py
@@ -3,6 +3,7 @@
 from scrapy.linkextractors import LinkExtractor
 from scrapy.settings import Settings
 from scrapy.spiders import CrawlSpider, Spider, Rule
+from scrapy.exporters import JsonLinesItemExporter
 from scrapy.http import Response
 import os
 from twisted.internet.asyncioreactor import install
@@ -22,16 +23,33 @@ class ScrapedPage(scrapy.Item):
     text = scrapy.Field()
     links = scrapy.Field()
+
 
 class GenericSpider(scrapy.Spider):
     name = "generic"
     allowed_domains = ["savantly.net"]
-    start_urls = ["https://savantly.net"]
+
+    custom_settings = {
+        'FEED_FORMAT': 'jsonlines',
+        'FEED_URI': f"{DOCS_PATH}/output.jsonl",
+        'MEDIA_ALLOW_REDIRECTS': True
+    }
+
+    def __init__(self, start_url=None, *args, **kwargs):
+        super(GenericSpider, self).__init__(*args, **kwargs)
+        if start_url:
+            self.start_urls = [start_url]
 
     def parse(self, response):
-        item = {
-            'url': response.url,
-            'status': response.status,
-            'text': '\n'.join(response.xpath('//body//text()').getall()),
-        }
-        yield item
-        
\ No newline at end of file
+        scraped_page = ScrapedPage()
+        scraped_page['url'] = response.url
+        scraped_page['status'] = response.status
+        scraped_page['headers'] = response.headers
+        scraped_page['text'] = '\n'.join(response.xpath('//body//text()').getall())
+        scraped_page['links'] = [link.url for link in LinkExtractor(allow_domains=self.allowed_domains).extract_links(response)]
+        yield scraped_page
+
+        # Save HTML response to a file
+        domain_name = response.url.split('//')[-1].split('/')[0]
+        filename = f"{DOCS_PATH}/{domain_name}.html"
+        with open(filename, 'wb') as file:
+            file.write(response.body)
diff --git a/web_scraper/web_scraper/spiders/savantlynet.py b/web_scraper/web_scraper/spiders/savantlynet.py
index ee20d46..9bedcd0 100644
--- a/web_scraper/web_scraper/spiders/savantlynet.py
+++ b/web_scraper/web_scraper/spiders/savantlynet.py
@@ -118,4 +118,4 @@ def extract_contact_section_data(self, section):
 
         data['contact_info'] = extracted_data
 
-    return data
+    return data
\ No newline at end of file