-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScrape.js
61 lines (56 loc) · 1.51 KB
/
Scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
// Scrape and download images from a website
const request = require('request')
const cheerio = require('cheerio')
const fs = require('fs')
const path = require('path')
// Start page of the crawl; this is the URL handed to scrapeWebsite at the bottom of the file.
const base_url = 'https://krosarchive.es/FR/profile/'
// Directory downloaded images are written into. It is a relative path, so it
// resolves against the working directory the script is started from.
const images_dir = './images'
/**
 * Download one image to disk after a delay, unless the target file already exists.
 *
 * The transfer starts `delay` milliseconds after the call. Nothing is returned
 * and failures are only logged, so one bad image never stops the others.
 *
 * @param {string} url - Absolute URL of the image to fetch.
 * @param {string} filepath - Destination path on disk.
 * @param {number} delay - Milliseconds to wait before starting the download.
 */
function downloadImage(url, filepath, delay) {
  setTimeout(() => {
    if (fs.existsSync(filepath)) {
      console.log('Skipping file ' + filepath + ' (already exists)')
      return
    }

    // Opening a write stream inside a missing directory fails with ENOENT,
    // so make sure the destination directory is there first.
    try {
      fs.mkdirSync(path.dirname(filepath), { recursive: true })
    } catch (err) {
      console.error(err)
      return
    }

    // Log the failure and drop whatever was written so far: a truncated file
    // left behind would be treated as "already exists" on the next run.
    const discardPartial = (err) => {
      console.error(err)
      // The file may not have been created yet, so a failed unlink is expected
      // and deliberately ignored.
      fs.unlink(filepath, () => {})
    }

    const download = request.get(url)
    download.on('error', discardPartial)
    download.on('response', (response) => {
      // Only a successful response carries image data; saving an error page
      // under the image's name would also block later retries.
      if (response.statusCode !== 200) {
        console.error(`Skipping ${url} (HTTP ${response.statusCode})`)
        download.abort()
        return
      }
      const output = fs.createWriteStream(filepath)
      output.on('error', discardPartial)
      download.pipe(output)
    })
  }, delay)
}
/**
 * Fetch one page and queue a download for every image it references.
 *
 * Each `<img src>` is resolved against the page's own URL, so absolute,
 * protocol-relative, root-relative and page-relative sources all work.
 * Downloads are spaced one second apart by the image's position on the page.
 * Errors are logged; nothing is returned.
 *
 * @param {string} url - Absolute URL of the page to scan for images.
 */
function scrapePage(url) {
  request(url, (error, response, body) => {
    if (error) {
      console.error(error)
      return
    }

    const $ = cheerio.load(body)
    // Image URLs already queued from this page, so a repeated <img> is fetched once.
    const queued = new Set()

    // NOTE: the callback must never return false, since cheerio's each() treats
    // that as "stop iterating"; every early exit below is a bare return.
    $('img').each((i, el) => {
      const src = $(el).attr('src')
      if (!src) {
        return
      }

      let imageUrl
      try {
        imageUrl = new URL(src, url)
      } catch (err) {
        console.error(`Skipping unparsable image source ${src}`)
        return
      }

      // Inline data: URIs and other non-HTTP schemes cannot be downloaded.
      if (imageUrl.protocol !== 'http:' && imageUrl.protocol !== 'https:') {
        return
      }

      // Name the file after the URL path only, so a query string or fragment
      // never ends up in the file name.
      let filename = path.basename(imageUrl.pathname)
      try {
        // Undo percent-encoding (e.g. %20); basename again in case the decoded
        // text contains a path separator.
        filename = path.basename(decodeURIComponent(filename))
      } catch (err) {
        // Malformed escape sequence: keep the still-encoded name.
      }

      if (!filename || queued.has(imageUrl.href)) {
        return
      }
      queued.add(imageUrl.href)

      const filepath = path.join(images_dir, filename)
      const delay = i * 1000
      downloadImage(imageUrl.href, filepath, delay)
    })
  })
}
/**
 * Fetch an index page, list every link on it, and scrape each linked page for images.
 *
 * Links are resolved against the page they were found on. Unparsable links and
 * non-HTTP(S) schemes are skipped, fragments are dropped, and each distinct
 * page is scraped once. Errors are logged; nothing is returned.
 *
 * @param {string} url - Absolute URL of the index page to start from.
 */
function scrapeWebsite(url) {
  request(url, (error, response, body) => {
    if (error) {
      console.error(error)
      return
    }

    const $ = cheerio.load(body)
    const links = $('a')
      .map((i, el) => $(el).attr('href'))
      .get()
    console.log('Links on ' + url + ':')
    console.log(links)

    // Distinct pages to visit, in the order they were first seen.
    const pages = new Set()
    for (const link of links) {
      let target
      try {
        target = new URL(link, url)
      } catch (err) {
        // One malformed href must not abort the whole crawl.
        console.error(`Skipping unparsable link ${link}`)
        continue
      }

      // mailto:, javascript:, tel: and similar links are not pages.
      if (target.protocol !== 'http:' && target.protocol !== 'https:') {
        continue
      }

      // A fragment only addresses a position inside a page; dropping it lets
      // "#section" variants collapse into the page they point at.
      target.hash = ''
      pages.add(target.href)
    }

    pages.forEach((page) => scrapePage(page))
  })
}
// Entry point: start the crawl from the configured start page as soon as the script is loaded.
scrapeWebsite(base_url)