crawler.py
from urllib.parse import urlparse  # Python 3 home of the old urlparse module
import re
import json

from bs4 import BeautifulSoup
import scrapy
base_url = 'http://home.iitk.ac.in/~'
allusersFile = 'allUsers.txt'      # one IITK username per line
resultsFile = 'scrapedData.json'   # output: one JSON object per line

# Truncate any previous results file, then reopen it for appending.
outFile = open(resultsFile, 'w')
outFile.close()
outFile = open(resultsFile, 'a')
auf = open(allusersFile, 'r')

# do_not_visit_these = ['^#','facebook','linkedin','instagram','quora','twitter','google','youtube','void','oars','jpg','png','gif','pdf','mailto:','zip$','github.com','yahoo','flipkart','amazon','microsoft']
# DNVreg = r'' + '|'.join(do_not_visit_these)

## extend the allowed domains here
allowed_domains_defined = ['home.iitk.ac.in', 'www.iitk.ac.in', 'github.io']
def pageData(soup):
    """Return the visible text of a page (not used below; kept for reference)."""
    texts = soup.find_all(string=True)

    def visible(element):
        # Drop strings that live inside non-rendered tags or HTML comments.
        if element.parent.name in ['style', 'script', '[document]', 'head']:
            return False
        if re.match('<!--.*-->', str(element)):
            return False
        return True

    visible_texts = filter(visible, texts)
    return ' '.join(' '.join(list(visible_texts)).splitlines())
def htmlText(soup):
    """Drop <script> and <style> tags, then return the page text with whitespace collapsed."""
    for script in soup.find_all('script'):
        script.extract()
    for style in soup.find_all('style'):
        style.extract()
    data = ' '.join(soup.get_text().strip().split())
    return data
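# For example (illustrative input, not taken from the crawl itself):
#   htmlText(BeautifulSoup('<p>Hello <script>x()</script> world </p>', 'html.parser'))
# returns 'Hello world'.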
class IITKSpider(scrapy.Spider):
    name = 'iitk_spider'
    # Seed the crawl with every user's homepage: http://home.iitk.ac.in/~<username>
    start_urls = [(base_url + '{0}').format(i.strip()) for i in auf.readlines()]
    # start_urls += ['http://www.iitk.ac.in/']

    def parse(self, response):
        # Save the visible text of this page as one JSON object per line.
        try:
            allText = ' '.join(response.xpath("//html").extract()).strip()
            soup = BeautifulSoup(allText, 'html.parser')
            visibleText = htmlText(soup)
            outFile.write(json.dumps({"url": str(response.url), "pageData": str(visibleText)}) + '\n')
        except Exception:
            print("Unable to parse", response.url)

        # Follow every link that stays inside an allowed domain (subdomains such as
        # <user>.github.io included) and does not point at a PDF.
        NEXT_PAGE_SELECTOR = 'a ::attr(href)'
        next_page = response.css(NEXT_PAGE_SELECTOR).extract()
        for np in next_page:
            url = response.urljoin(np)
            domainName = urlparse(url).netloc
            if any(domainName == d or domainName.endswith('.' + d)
                   for d in allowed_domains_defined) and 'pdf' not in url:
                try:
                    yield scrapy.Request(url, callback=self.parse)
                except Exception:
                    print("Unable to crawl", url)