-
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathindex.js
116 lines (90 loc) · 3.18 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
const puppeteer = require('puppeteer')
const axios = require('axios')
const cheerio = require('cheerio')
const async = require('async')
const fs = require('fs')
const slugify = require('slugify')
const { DateTime } = require('luxon')
/**
 * Helpers for archiving myLot discussions as PDF files.
 */
class Mylot {
  /**
   * Renders one discussion page to `out-<username>/<date>-<slug>.pdf`.
   *
   * The file name is built from the title (`#discTit`) and date (`#discDat`)
   * read from the loaded page, so the page is always opened before the
   * "already exists" check can run.
   *
   * @param {{url: string}} article - Article descriptor; only `url` is read.
   * @param {string} username - Used to name the output directory.
   * @param {boolean} [force=false] - Re-render even if the PDF already exists.
   * @returns {Promise<void>} Resolves once the PDF is written or skipped.
   */
  static async generatePagePDF(article, username, force = false) {
    const browser = await puppeteer.launch({ headless: true })

    // The browser must be closed on every path (early return, thrown error,
    // success); otherwise each skipped or failed article leaks a browser.
    try {
      const page = await browser.newPage()
      await page.goto(article.url, { waitUntil: 'networkidle0' })

      const name = await page.$eval('#discTit', el => el.innerText)
      const slug = slugify(name)

      const dateRaw = await page.$eval('#discDat', el => el.innerText)
      // NOTE(review): only a ' CST' suffix is stripped and the format string is
      // fixed — confirm both against what the site actually renders.
      const dateFormatted = DateTime.fromFormat(dateRaw.replace(' CST', ''), 'MMMM d, yyyy h:ma').toISODate()

      const dir = `out-${username}`
      // `recursive: true` makes this a no-op when the directory already exists.
      fs.mkdirSync(dir, { recursive: true })

      const filename = `${dir}/${dateFormatted}-${slug}.pdf`
      if (fs.existsSync(filename) && !force) {
        console.info(`PDF for article ${article.url} already exists, skipping`)
        return
      }

      // Cheat the ajax comment loading to load everything in one go.
      // The count has to be raised BEFORE the loader is invoked; these must be
      // two statements (`a = 1000 && f()` would call f() first and then assign
      // f()'s return value to the count).
      await page.evaluate(() => {
        window.discussionResponseCount = 1000
        window.getDiscussionFull(window.discussionId, window.responseId, window.commentId, window.responseStartRow)
      })

      // Hide annoying UI bits. This is cosmetic, so a missing element is
      // skipped instead of aborting the whole PDF.
      await page.evaluate(() => {
        for (const selector of ['#top-container', '#discSoc']) {
          const el = document.querySelector(selector)
          if (el) {
            el.style.display = 'none'
          }
        }
      })

      // Fixed pause after triggering the loader, before rendering.
      await page.waitForTimeout(3000)

      const pdfConfig = {
        path: filename,
        format: 'A4',
        printBackground: true,
        margin: {
          top: '1cm',
          bottom: '1cm',
          left: '1cm',
          right: '1cm',
        },
      }

      await page.emulateMediaType('screen')
      await page.pdf(pdfConfig)
    } finally {
      await browser.close()
    }
  }
}
/**
 * Collects every discussion link matched by `.atvDiscTit a` in a loaded page.
 *
 * @param {Function} r$ - Cheerio-style query function for the page.
 * @returns {{url: string, name: string}[]} One entry per matched anchor: the
 *   href prefixed with the site origin, and the trimmed link text.
 */
const extractArticles = r$ => {
  const found = []
  r$('.atvDiscTit a').each((_index, anchor) => {
    const link = r$(anchor)
    const url = `https://www.mylot.com${link.attr('href')}`
    const name = link.text().trim()
    found.push({ url, name })
  })
  return found
}
/**
 * Finds the `startActionId = "<digits>"` assignment embedded in a page's raw
 * markup.
 *
 * @param {string} html - Raw response body to search.
 * @returns {string|null} The digits as a string, or null when the assignment
 *   is not present.
 */
const getStartAction = html => {
  const found = html.match(/startActionId = "(\d+)"/)
  return found ? found[1] : null
}
// Entry point: `node index.js <username>`.
// Collects every discussion URL for the user (first page, then the paginated
// "more" endpoint), then renders each to PDF with at most 5 in flight.
// The leading `;` keeps this line from being parsed as a call on whatever
// expression precedes it.
;(async () => {
  const [username] = process.argv.slice(2)
  if (!username) {
    // Without this the script would request `/undefined/posts`.
    console.error('Usage: node index.js <username>')
    process.exitCode = 1
    return
  }

  const firstPage = await axios.get(`https://www.mylot.com/${username}/posts`)
  let articles = extractArticles(cheerio.load(firstPage.data))
  let startAction = getStartAction(firstPage.data)

  // `while`, not `do…while`: when the first page carries no startActionId
  // there is nothing more to fetch, so no `startActionId=null` request is sent.
  while (startAction) {
    const url = `https://www.mylot.com/atv/more?activityTypeId=103&startActionId=${startAction}&tagname=&username=${username}&_=${Date.now()}`
    console.log(`Getting URL ${url}, Articles Count: ${articles.length}`)
    const res = await axios.get(url)
    articles = articles.concat(extractArticles(cheerio.load(res.data)))

    const nextAction = getStartAction(res.data)
    if (nextAction === startAction) {
      // A cursor that does not advance would loop forever on the same page.
      console.warn(`startActionId ${startAction} did not advance, stopping pagination`)
      break
    }
    startAction = nextAction
  }

  await async.mapLimit(articles, 5, async article => {
    console.info(`PDFing URL ${article.url}`)
    try {
      await Mylot.generatePagePDF(article, username)
    } catch (e) {
      // Best effort: one failed article must not abort the rest.
      console.error(`Error PDFing with URL ${article.url}, ${e.toString()}`)
    }
  })
})().catch(err => {
  // Listing failures (network errors, unexpected responses) land here instead
  // of becoming an unhandled rejection.
  console.error(err)
  process.exitCode = 1
})