-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdownload.js
48 lines (41 loc) · 1.41 KB
/
download.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
var fs = require("fs");
var path = require("path");
const { URL } = require("url");
const FormData = require("form-data");
const fetch = require("node-fetch");
const { promisify } = require("util");
const { pipeline } = require("stream");
// Promise-returning form of stream.pipeline, so a pipe can be awaited.
const streamPipeline = promisify(pipeline);
// Endpoint that scrapePageToTextFile POSTs to; the record offset is added as a `start` query parameter.
const URL_INPUT = "http://www2.scc.rutgers.edu/memdb/download_file_spuf.php";
/**
 * @desc Builds a scraper for one page of Rutgers' search-result PHP endpoint.
 * Nothing is fetched until the returned async function is called; that call
 * POSTs the request and streams the raw response body into a new file in `dstDir`.
 *
 * @param {Object} [options]
 * @param {number} [options.offset=1] - which record to start at (sent as the `start` query param)
 * @param {string} [options.rutgersTableName="spuf"] - value for Rutgers' 'table' form field
 * @param {string} [options.dstDir=__dirname] - directory the raw text file is written to
 *
 * @returns {function(): Promise<{textFile: string}>} async function that resolves
 * with the path of the file it wrote
 *
 * @throws {Error} the returned function rejects when the response status is not 2xx;
 * failures from the awaited fetch and stream pipeline propagate as rejections too
 */
module.exports.scrapePageToTextFile = ({
  offset = 1,
  rutgersTableName = "spuf",
  dstDir = __dirname,
} = {}) => async () => {
  const url = new URL(URL_INPUT);
  url.searchParams.append("start", offset);

  const form = new FormData();
  form.append("table", rutgersTableName);

  const fetchResults = await fetch(url, {
    body: form,
    method: "POST",
  });
  // fetch resolves on HTTP error statuses, so without this check an error
  // page would be saved to disk as if it were scraped data.
  if (!fetchResults.ok) {
    throw new Error(
      `Unexpected response from ${url}: ${fetchResults.status} ${fetchResults.statusText}`
    );
  }

  // TODO: backup collection to file instead of piping new results to file before upload
  const today = new Date();
  // getMonth() is zero-based; add 1 so the stamp shows the calendar month.
  const dateString = `${today.getFullYear()}-${
    today.getMonth() + 1
  }-${today.getDate()}_${today.getTime()}`;
  const textFile = path.join(
    dstDir,
    `raw_${rutgersTableName}_${String(offset).padStart(5, "0")}_${dateString}`
  );

  await streamPipeline(fetchResults.body, fs.createWriteStream(textFile));
  return { textFile };
};