extract functionality and basic cli

balaclark · Nov 19, 2013 · f03d0ca · f03d0ca
commit f03d0ca
Show file tree

Hide file tree

Showing 14 changed files with 10,353 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,19 @@
+# Created by http://gitignore.io
+
+### Node ###
+lib-cov
+*.seed
+*.log
+*.csv
+*.dat
+*.out
+*.pid
+*.gz
+
+pids
+logs
+results
+
+npm-debug.log
+node_modules
+coverage
diff --git a/.npmignore b/.npmignore
@@ -0,0 +1 @@
+test/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,20 @@
+The MIT License (MIT)
+
+Copyright (c) 2013 Bala Clark
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,14 @@
+
+# Run the mocha specs found in the test directory, using the "spec" reporter.
+test:
+	@./node_modules/mocha/bin/mocha --reporter spec test
+
+# Install the npm dependencies.
+setup:
+	@npm install
+
+# Run the same specs through istanbul's "cover" command; the options after
+# "--" are handed to _mocha.
+cover:
+	@./node_modules/istanbul/lib/cli.js cover ./node_modules/mocha/bin/_mocha test -- --ui bdd -t 5000
+
+# Remove the coverage directory.
+clean:
+	@rm -rf coverage
+
+# All targets are commands rather than files to build.
+.PHONY: test setup cover clean
diff --git a/README.md b/README.md
@@ -0,0 +1,22 @@
+MHTML
+-----
+
+Extract MHTML files to HTML.
+
+TODO
+----
+
+  * make callbacks optional
+  * return more errors where possible
+  * test with many different mhtml files
+  * update image srcs in css to relative when needed
+  * save assets to a subfolder
+  * allow mhtml files to be created from a folder
+  * convert to PDF
+  * fully support spec: http://www.ietf.org/rfc/rfc2110.txt
+
+Example images ©
+----------------
+
+  * Ben Fredericson (http://www.flickr.com/photos/xjrlokix/7670504246/sizes/m/in/photostream/, http://www.flickr.com/photos/xjrlokix/7670507852/sizes/m/in/photostream/)
+  * Major Clanger (http://www.flickr.com/photos/major_clanger/4850772/sizes/m/in/photostream/)
diff --git a/cli.js b/cli.js
@@ -0,0 +1,43 @@
+'use strict';
+
+var fs = require('fs');
+var path = require('path');
+var program = require('commander');
+var glob = require('glob');
+var mhtml = require(__dirname + '/mhtml');
+
+program
+  .version(require(__dirname + '/package').version)
+  .option('-e, --extract [value]', 'Extract MTHML archive / folder of MHTML archives')
+  .option('-o, --output [value]', 'Output destination (defaults to same folder as the source file)')
+  .option('-f, --force', 'Delete the existing output folder before extracting')
+  // .option('-v, --verbose', 'Verbose mode')
+  .parse(process.argv);
+
+function getdir(file, base) {
+  return path.join(base || path.dirname(file), path.basename(file, path.extname(file)));
+}
+
+if (program.extract) {
+
+  var output = program.output || getdir(program.extract);
+
+  fs.stat(program.extract, function (err, stats) {
+
+    if (stats.isDirectory()) {
+
+      glob(path.join(program.extract, '*.{mht,mhtml}'), function (err, files) {
+        files.forEach(function (file) {
+          mhtml.extract(file, getdir(file, program.output), function (err) {
+            if (err) console.error(err);
+          }, program.force);
+        });
+      });
+
+    } else {
+      mhtml.extract(program.extract, output, function (err) {
+        if (err) console.error(err);
+      }, program.force);
+    }
+  });
+}
diff --git a/lib/extract.js b/lib/extract.js
@@ -0,0 +1,139 @@
+'use strict';
+
+var fs = require('fs');
+var readline = require('readline');
+var stream = require('stream');
+var mkdir = require('mkdirp');
+var rmdir = require('rimraf');
+var path = require('path');
+var EOL = require('os').EOL;
+var Part = require(__dirname + '/part');
+
+module.exports = extractMHTML;
+
+function MHTMLExtractor(source, opts) {
+  this.boundary = null;
+  this.source = source;
+  this.parts = [];
+  return this;
+}
+
+MHTMLExtractor.prototype = {
+
+  getParts: function (cb) {
+
+    var instream = fs.createReadStream(this.source);
+    var outstream = new stream;
+    var rl = readline.createInterface(instream, outstream);
+
+    var lineno = 0;
+    var lines = [];
+    var boundary;
+    var validMhtml = false;
+    var part;
+    var meta;
+
+    var readMeta = false;
+    var readContent = false;
+
+    rl.on('line', function readLine(line) {
+
+      lines.push(line);
+
+      boundary = line.match(/boundary=["'](.+?)["']/);
+
+      if (!this.boundry && line.match('Content-Type: multipart/related')) {
+        validMhtml = true;
+      }
+
+      if (!this.boundary && boundary) {
+        this.boundary = boundary[1];
+      }
+
+      if (line.match(new RegExp('^--' + this.boundary))) {
+        readMeta = true;
+        readContent = false;
+        part = new Part();
+      }
+
+      if (readMeta === true) {
+
+        // a newline after the meta block signifies the start of the content block
+        if (line.match(/^$/)) {
+          readMeta = false;
+          readContent = true;
+          this.parts.push(part);
+        } else {
+          meta = line.match(/^(Content-[A-Za-z-]+):(?:\s+)?(.*)/i);
+          if (meta) {
+            part.meta[meta[1].toLowerCase()] = meta[2];
+          }
+        }
+      }
+
+      if (readContent === true && !line.match(/^$/)) {
+        part.content += line + EOL;
+      }
+
+      lineno++;
+
+    }.bind(this));
+
+    rl.on('close', function endRead() {
+      if (!validMhtml) cb(new Error('Invalid MHTML file'));
+      else cb(null);
+    }.bind(this));
+  },
+
+  extractParts: function (dest, cb, force) {
+
+    // TODO: optionally delete existing, otherwise return error
+    // TODO: allow mode to be passed as an argument
+    // TODO: save assets in subfolder?
+
+    var noParts = this.parts.length;
+    var done = 0;
+
+    var extract = function (err) {
+
+      mkdir(dest, function (err) {
+
+        if (err) return cb(err);
+
+        this.parts.forEach(function (part) {
+
+          var filePath = path.join(dest, part.filename());
+
+          part.decoded(function (err, content) {
+
+            fs.writeFile(filePath, content, function (err) {
+              if (err) return cb(err);
+              done++;
+              if (done == noParts) return cb(null);
+            });
+          });
+        });
+      }.bind(this));
+    }.bind(this);
+
+    if (force) {
+      rmdir(dest, extract);
+    } else {
+      extract();
+    }
+  },
+
+  extract: function (dest, cb, force) {
+    var force = (typeof force === 'undefined') ? false : force;
+    this.getParts(function readFile(err) {
+      if (err) return cb(err);
+      this.extractParts(dest, function extractParts(err) {
+        cb(err);
+      }, force);
+    }.bind(this));
+  }
+};
+
+function extractMHTML(source, destination, cb, force) {
+  new MHTMLExtractor(source).extract(destination, cb, force);
+}
diff --git a/lib/part.js b/lib/part.js
@@ -0,0 +1,63 @@
+'use strict';
+
+var path = require('path');
+var mimelib = require('mimelib')
+var cheerio = require('cheerio');
+
+module.exports = Part;
+
+function Part() {
+  this.meta = {};
+  this.content = '';
+}
+
+Part.prototype = {
+
+  decoder: {
+    base64: function (encoded) {
+      return new Buffer(encoded, 'base64');
+    },
+    'quoted-printable': mimelib.decodeQuotedPrintable
+  },
+
+  filename: function () {
+
+    if (this.meta.hasOwnProperty('content-location')) {
+      return path.basename(this.meta['content-location'].replace(/\\/g, '/'));
+    }
+
+    // TODO: use content-id if present?
+    var ext = (mimelib.contentTypesReversed.hasOwnProperty(this.meta['content-type']))
+      ? '.' + mimelib.contentTypesReversed[this.meta['content-type']] : '';
+
+    return Math.floor(Math.random() * 5000000000) + ext;
+  },
+
+  decoded: function (cb) {
+
+    var decoded = this.decoder[this.meta['content-transfer-encoding']](this.content);
+
+    // TODO: make this optional
+    if (this.meta['content-type'].match('text/html')) {
+
+      var $ = cheerio.load(decoded);
+      var attrs = $('[href]');
+      var todo = attrs.length;
+
+      attrs.each(function () {
+
+        var attr = $(this).attr('href');
+        if (attr.match(/^file:/)) {
+          decoded = decoded.replace(new RegExp(attr, 'g'), attr.match(/[^/]*$/)[0]);
+        }
+
+        todo--;
+      });
+
+      if (!todo) cb(null, decoded);
+
+    } else {
+      cb(null, decoded)
+    }
+  }
+};
diff --git a/mhtml.js b/mhtml.js
@@ -0,0 +1,2 @@
+
+module.exports.extract = require(__dirname + '/lib/extract');
diff --git a/package.json b/package.json
@@ -0,0 +1,37 @@
+{
+  "name": "mhtml",
+  "version": "0.0.1",
+  "description": "Convert MHTML files to HTML.",
+  "main": "mhtml.js",
+  "directories": {
+    "example": "example"
+  },
+  "scripts": {
+    "test": "make test"
+  },
+  "repository": {
+    "type": "git",
+    "url": "tba"
+  },
+  "keywords": [
+    "convert",
+    "mhtml",
+    "mhtml"
+  ],
+  "author": "Bala Clark <balaclark@gmail.com>",
+  "license": "MIT",
+  "dependencies": {
+    "mimelib": "~0.2.14",
+    "mkdirp": "~0.3.5",
+    "cheerio": "~0.12.4",
+    "commander": "~2.0.0",
+    "glob": "~3.2.7",
+    "rimraf": "~2.2.2"
+  },
+  "devDependencies": {
+    "mocha": "~1.14.0",
+    "should": "~2.1.0",
+    "fs-extra": "~0.8.1",
+    "istanbul": "~0.1.45"
+  }
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@

		module.exports.extract = require(__dirname + '/lib/extract');