index.js
const scrape = require('website-scraper')
const axios = require('axios')
const moment = require('moment')
const cheerio = require('cheerio')
const fs = require('fs-extra')
const URL = require('url').URL
const path = require('path')
require('colors')
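/*
 * Usage:
 *   node index.js <startUrl> [projectNamespace]
 * Example (hypothetical URL):
 *   node index.js https://example.com example-mirror
 * Pages up to three links deep are crawled and mirrored into
 * ./output/<projectNamespace>/.
 */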
/**
 * The start URL to crawl, taken from the first CLI argument.
 * @type {string}
 */
const startUrl = process.argv[2] || ''
if (!startUrl) {
  console.error('Usage: node index.js <startUrl> [projectNamespace]'.red)
  process.exit(1)
}
/** The project namespace, used as the output folder name (defaults to the start URL's hostname). */
const projectNamespace = process.argv[3] || (new URL(startUrl)).hostname
const visitedUrls = new Set() // Every page URL discovered during the crawl.
const crawlPromises = [] // Pending crawl promises for discovered links.
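// The script runs in two phases: crawlWebsite() collects same-origin page URLs
// into visitedUrls, then cloneWebsite() hands that list to website-scraper.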
/**
 * A website-scraper plugin that logs cloning lifecycle events:
 * start/finish, reference resolution, saved resources, and errors.
 */
class LoggerClone {
apply(registerAction) {
registerAction('beforeStart', async ({ options }) => {
console.log(`[${moment().format('HH:mm:ss')}] Starting Website Cloning...`.bold)
return options
})
registerAction('afterFinish', async () => {
console.log(`[${moment().format('HH:mm:ss')}] Finished Website Cloning...`.bold)
})
registerAction('getReference', async ({ resource }) => {
console.log(` - Get Reference: ${resource.url}`.cyan)
return resource
})
registerAction('onResourceSaved', async ({ resource }) => {
console.log(`[${moment().format('HH:mm:ss')}] Save Resource: ${resource.url}`.green)
return resource
})
registerAction('error', async ({ error }) => {
console.error(`[${moment().format('HH:mm:ss')}] Error: ${error.message}`.red)
})
}
}
/**
 * Clones the pages collected during the crawl, saving them and their assets to the output directory.
 * @returns {Promise<boolean>} A promise that resolves to true if the website cloning is successful, or rejects with an error if it fails.
 */
const cloneWebsite = async () => {
  const outputDirectory = path.resolve(__dirname, `output/${projectNamespace}`)
  // Start each run from a clean output directory.
  if (await fs.pathExists(outputDirectory)) {
    await fs.remove(outputDirectory)
  }
  const normalizedUrls = Array.from(visitedUrls).map(url => {
    // Pages without a file extension are saved as HTML; other resources keep their filename.
    const filename = url.split('/').pop() || 'index'
    const hasExtension = filename.includes('.')
    return { url, filename: hasExtension ? filename : `${filename}.html` }
  })
  const options = {
    urls: normalizedUrls,
    directory: outputDirectory,
    filenameGenerator: 'bySiteStructure',
    maxDepth: 3,
    recursive: true,
    requestConcurrency: Infinity,
    maxRecursiveDepth: 3,
    prettifyUrls: true,
    ignoreErrors: true,
    // Only download resources whose URL starts with the start URL.
    urlFilter: (url) => url.startsWith(startUrl),
    plugins: [new LoggerClone()]
  }
  await scrape(options)
  return true
}
/**
 * Recursively collects same-origin page links starting from a URL, up to three levels deep.
 * @param {string} url - The URL of the page to crawl.
 * @param {number} [depth=0] - The current crawl depth.
 * @returns {Promise<void>} - A promise that resolves when the crawling is complete.
 */
const crawlWebsite = async (url, depth = 0) => {
  if (depth > 3 || visitedUrls.has(url)) {
    return
  }
  visitedUrls.add(url)
  try {
    const response = await axios.get(url)
    const $ = cheerio.load(response.data)
    const links = $('a').map((i, link) => $(link).attr('href')).get()
    links.forEach(link => {
      try {
        // Resolve the link against the current page and drop any fragment.
        const absoluteLink = new URL(link, url).href.split('#')[0]
        const sameOrigin = (new URL(absoluteLink)).origin === (new URL(url)).origin
        const notImage = !(absoluteLink.match(/\.(jpeg|jpg|gif|png|svg)$/))
        if (sameOrigin && notImage && !visitedUrls.has(absoluteLink)) {
          crawlPromises.push(crawlWebsite(absoluteLink, depth + 1))
        }
      } catch (error) {
        // Ignore hrefs that cannot be resolved to a valid URL.
      }
    })
    console.log(` - Crawled URL: ${url}`.yellow)
  } catch (error) {
    // Skip pages that fail to load; they simply will not be crawled further.
  }
}
/**
* Runs the website crawler app.
* @param {string} url - The URL of the website to crawl.
* @returns {Promise<void>} - A promise that resolves when the crawling and cloning process is complete.
*/
const run = async (url) => {
try {
console.log(`[${moment().format('HH:mm:ss')}] Starting Website Crawling...`.bold)
await crawlWebsite(url)
await Promise.all(crawlPromises)
await cloneWebsite()
} catch (error) {
console.error(`Runtime Error: ${error.message}`.red)
}
}
/**
 * Start the crawl-and-clone process with the URL supplied on the command line.
 */
run(startUrl)