Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor Hash Property Parsing for DSM Variants #334

Merged
merged 7 commits into from
Feb 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 148 additions & 0 deletions packages/task-runner/src/shopify/classes/parsers/dsm/dsmParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
const SpecialParser = require('../specialParser');
const { ParseType, matchKeywords } = require('../../utils/parse');
const ErrorCodes = require('../../utils/constants').ErrorCodes.Parser;

/**
* Base Special Parser for all DSM Sites
*/
/**
 * Common special parser shared by every DSM site parser.
 *
 * On top of SpecialParser it scrapes product links from the initial page,
 * reads the product JSON embedded in a product page, and keeps track of the
 * "hash" values that subclasses may parse from either page.
 */
class DsmParser extends SpecialParser {
  /**
   * @param {*} request - forwarded unchanged to SpecialParser
   * @param {*} task - forwarded unchanged to SpecialParser
   * @param {*} proxy - forwarded unchanged to SpecialParser
   * @param {*} logger - forwarded unchanged to SpecialParser
   * @param {string} [name='DsmParser'] - parser name, used as the log prefix
   */
  constructor(request, task, proxy, logger, name = 'DsmParser') {
    super(request, task, proxy, logger, name);

    /**
     * Hashes collected while parsing. Product specific hashes are stored
     * under the product id; a hash that applies to every product is stored
     * under the `__default__` key.
     *
     * The base class never produces a hash on its own. Subclasses opt in by
     * overriding one (or both) of these hooks:
     *
     * 1. parseInitialPageForHash($) - receives the initial page loaded with
     *    cheerio; a truthy return value becomes the `__default__` hash.
     * 2. parseProductInfoPageForHash($, site) - receives a product page
     *    loaded with cheerio (and the task's site); a truthy return value is
     *    stored for the id of the product parsed from that page.
     *
     * run() prefers the hash stored for the matched product's id and falls
     * back to the `__default__` hash. Any further fallback (the original
     * notes mention a backup hash) is not handled in this class.
     */
    this._hashIds = {};
  }

  /**
   * Always false for this parser.
   * NOTE(review): presumably tells SpecialParser that the initial page only
   * links to products rather than embedding them - confirm in SpecialParser.
   */
  // eslint-disable-next-line class-methods-use-this
  get initialPageContainsProducts() {
    return false;
  }

  /**
   * Collects product page urls from the initial page and gives subclasses a
   * chance to parse a default hash from the same page.
   *
   * @param {Function} $ - initial page loaded with cheerio
   * @returns {Promise<string[]>} absolute product urls
   * @throws {Error} 'No Items Found' with `status` set to
   *   ErrorCodes.ProductNotFound when no (matching) item is left
   */
  async parseInitialPageForUrls($) {
    // Every `.grid-view-item` that has both a link and a title is a candidate
    const candidates = [];
    $('.grid-view-item').each((index, element) => {
      const link = $('.grid-view-item__link', element).attr('href');
      const title = $('.grid-view-item__title', element).text();

      if (link && title) {
        candidates.push({ link, title });
      }
    });

    // For keyword tasks, match on the title first so fewer pages are searched
    let matches = candidates;
    if (this._type === ParseType.Keywords && candidates.length > 0) {
      const { product } = this._task;
      const keywords = { pos: product.pos_keywords, neg: product.neg_keywords };
      matches = matchKeywords(candidates, keywords, null, null, true) || [];
    }
    this._logger.silly('%s: parsing initial page, found %d items', this._name, matches.length);

    if (!matches.length) {
      // The special status lets the caller stop the task instead of retrying
      // TODO: Maybe replace with a custom error object?
      throw Object.assign(new Error('No Items Found'), {
        status: ErrorCodes.ProductNotFound,
      });
    }

    // Resolve every link against the site url to get a full url
    const productUrls = matches.map(item => new URL(item.link, this._task.site.url).href);

    // Let the subclass hook look for a hash that applies to all products
    const defaultHash = await this.parseInitialPageForHash($);
    if (defaultHash) {
      this._hashIds.__default__ = defaultHash;
      this._logger.silly('%s: Set default hash property to %s', this._name, defaultHash);
    }

    return productUrls;
  }

  /**
   * Hook for subclasses: parse a default hash from the initial page.
   * The base implementation only logs that it is unsupported.
   *
   * @returns {null} always null in the base class
   */
  parseInitialPageForHash() {
    this._logger.silly(
      "%s: this parser doesn't support parsing initial page for hash, skipping...",
      this._name,
    );
    return null;
  }

  /**
   * Reads the product JSON embedded in a product page and gives subclasses a
   * chance to parse a product specific hash from the same page.
   *
   * @param {Function} $ - product page loaded with cheerio
   * @returns {Promise<Object>} the parsed product JSON
   * @throws {Error} 'No Items Found' with `status` set to
   *   ErrorCodes.ProductNotFound when the JSON script tag is missing
   * @throws {SyntaxError} when the script tag content is not valid JSON
   */
  async parseProductInfoPageForProduct($) {
    this._logger.silly('%s: Parsing product info page for product data...', this._name);

    // The product json lives in a dedicated `application/json` script tag
    const scriptTag = $('script#ProductJson-product-template');
    const hasProductJson = scriptTag && scriptTag.attr('type') === 'application/json';
    if (!hasProductJson) {
      this._logger.silly('%s: No Items found in product script!', this._name);
      // The special status lets the caller stop the task instead of retrying
      // TODO: Maybe replace with a custom error object?
      throw Object.assign(new Error('No Items Found'), {
        status: ErrorCodes.ProductNotFound,
      });
    }

    this._logger.silly('%s: Product script found, returning parsed output', this._name);

    const productJson = JSON.parse(scriptTag.html());

    // Calculate and remember the hash for this specific product
    const productHash = await this.parseProductInfoPageForHash($, this._task.site);
    if (productHash) {
      const { id } = productJson;
      this._hashIds[id] = productHash;
      this._logger.silly('%s: Set hash property to %s for id %s', this._name, productHash, id);
    }

    return productJson;
  }

  /**
   * Hook for subclasses: parse a product specific hash from a product page.
   * The base implementation only logs that it is unsupported.
   *
   * @returns {null} always null in the base class
   */
  parseProductInfoPageForHash() {
    this._logger.silly(
      "%s: this parser doesn't support parsing product info pages for hash, skiping...",
      this._name,
    );
    return null;
  }

  /**
   * Runs the inherited parser and, when a hash is known for the matched
   * product (or a default hash exists), stores it on the task's product.
   *
   * @returns {Promise<Object>} the product resolved by SpecialParser#run
   */
  async run() {
    const matched = await super.run();

    // Product specific hash wins over the default one
    const storedHash = this._hashIds[matched.id] || this._hashIds.__default__;
    if (storedHash) {
      this._logger.debug(
        '%s, Found hash %s for matched product %s, storing on task...',
        this._name,
        storedHash,
        matched.title,
      );
      this._task.product.hash = storedHash;
    }

    return matched;
  }
}

module.exports = DsmParser;
135 changes: 135 additions & 0 deletions packages/task-runner/src/shopify/classes/parsers/dsm/dsmUkParser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
const cheerio = require('cheerio');

const DsmParser = require('./dsmParser');
const { formatProxy, userAgent } = require('../../utils');

/**
 * DSM UK parser.
 *
 * The hash for this site is injected by a `custom.js` script referenced from
 * the page `<head>`. This parser locates that script, downloads it and reads
 * the hash from the hidden `<input>` tag the script appends to the product
 * form.
 */
class DsmUkParser extends DsmParser {
  constructor(request, task, proxy, logger) {
    super(request, task, proxy, logger, 'DsmUkParser');
  }

  /**
   * Finds the url of the `custom.js` script referenced in the page head.
   *
   * @param {Function} $ - page loaded with cheerio
   * @returns {string} the first matching script url
   * @throws {Error} when no matching script tag exists
   */
  _parseForCustomJsLink($) {
    // Search for all script tags in the <head> element that:
    // 1. have a src attribute
    // 2. contain `custom.js` in the src
    const customJsLinks = [];
    $('script', 'head').each((_, e) => {
      const srcAttr = $(e).attr('src');
      if (!srcAttr || !/custom\.js/.test(srcAttr)) {
        return;
      }
      // Protocol-relative urls are rewritten so the request always uses https
      customJsLinks.push(srcAttr.startsWith('//') ? `https://${srcAttr.slice(2)}` : srcAttr);
    });
    this._logger.debug(
      '%s: Found %d custom.js links',
      this._name,
      customJsLinks.length,
      customJsLinks,
    );
    const [customJsLink] = customJsLinks;
    if (!customJsLink) {
      throw new Error('no custom js links found!');
    }
    if (customJsLinks.length > 1) {
      this._logger.silly(
        '%s: More than 1 custom links found! using the first one: %s',
        this._name,
        customJsLink,
      );
    }

    return customJsLink;
  }

  /**
   * Requests the custom.js file.
   *
   * @param {string} uri - url of the custom.js file
   * @returns {*} whatever `this._request` returns for the GET request
   *   (awaited by the callers and then treated as the script text)
   */
  _getCustomJsContent(uri) {
    this._logger.silly('%s: Requesting custom js from %s ...', this._name, uri);
    return this._request({
      method: 'GET',
      uri,
      proxy: formatProxy(this._proxy) || undefined,
      // NOTE(review): TLS certificate validation is disabled for this request.
      // Presumably needed for some proxies - confirm it is still required.
      rejectUnauthorized: false,
      resolveWithFullResponse: false,
      json: false,
      simple: true,
      gzip: true,
      headers: {
        'User-Agent': userAgent,
      },
    });
  }

  /**
   * Extracts the hash value from the content of custom.js.
   *
   * @param {string} content - text of the custom.js file
   * @returns {string} the `value` attribute of the injected input tag
   * @throws {Error} when the append call is not found, the input has an
   *   unexpected name, or the input has no value
   */
  _parseCustomJsContent(content) {
    // Parse for the specific code in question, capturing the
    // `input` tag that is being inserted.
    // Example:
    // $('form.product-form').append('<input type="hidden" value="ee3e8f7a9322eaa382e04f8539a7474c11555" name="properties[_hash]" />');
    //
    // The capture is lazy so that it stops at the end of this append call
    // even when more code follows on the same line (e.g. minified output).
    const regex = /\$\(\s*'form\.product-form'\s*\)\s*\.\s*append\(\s*'(.*?)'\s*\)/;
    const matches = regex.exec(content);
    if (!matches) {
      throw new Error("Couldn't find input tag in response!");
    }

    this._logger.silly('%s: Found matching element, parsing now...', this._name, matches[1]);

    // Load the input tag into cheerio to easily get the name and value attributes
    // (cheerio is used so we don't have to worry about the order of attributes)
    const tag = cheerio.load(matches[1]);
    const input = tag('input');
    const name = input.attr('name');
    const value = input.attr('value');

    // Check for correct name
    if (name !== 'properties[_hash]') {
      throw new Error(
        `Invalid name property was used ("${name}" , but was expecting "properties[_hash]").`,
      );
    }
    // Check for valid value
    if (!value) {
      throw new Error('No hash value was given!');
    }
    return value;
  }

  /**
   * Locates custom.js on the given page, downloads it and parses the hash.
   * Shared by both hash hooks; every failure is propagated to the caller.
   *
   * @param {Function} $ - page loaded with cheerio
   * @returns {Promise<string>} the parsed hash
   */
  async _parseHashFromCustomJs($) {
    const customJsLink = this._parseForCustomJsLink($);
    const body = await this._getCustomJsContent(customJsLink);
    return this._parseCustomJsContent(body);
  }

  /**
   * Tries to parse the hash from the initial page.
   *
   * @param {Function} $ - initial page loaded with cheerio
   * @returns {Promise<string|null>} the hash, or null when parsing failed
   *   (failures are logged, never thrown)
   */
  async parseInitialPageForHash($) {
    this._logger.silly('%s: Parsing for hash on initial page...', this._name);

    try {
      return await this._parseHashFromCustomJs($);
    } catch (err) {
      this._logger.debug('%s: Error parsing custom.js! %s', this._name, err.message);
      this._logger.silly('%s: Hash parsing failed, will try again on product page...', this._name);
      return null;
    }
  }

  /**
   * Tries to parse the hash from a product page. Skipped entirely when a
   * default hash was already stored from the initial page.
   *
   * @param {Function} $ - product page loaded with cheerio
   * @returns {Promise<string|null>} the hash, or null when skipped or when
   *   parsing failed (failures are logged, never thrown)
   */
  async parseProductInfoPageForHash($) {
    if (this._hashIds.__default__) {
      this._logger.silly(
        '%s: Default hash id already parsed! Skipping product specific hash parsing...',
        this._name,
      );
      return null;
    }
    this._logger.silly('%s: Parsing for hash on product page...', this._name);

    try {
      return await this._parseHashFromCustomJs($);
    } catch (err) {
      this._logger.debug('%s: Error parsing custom.js! %s', this._name, err.message);
      this._logger.silly('%s: Hash parsing failed, will use backup hash', this._name);
      return null;
    }
  }
}

module.exports = DsmUkParser;
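67 changes: 67 additions & 0 deletions packages/task-runner/src/shopify/classes/parsers/dsm/dsmUsParser.js
Original file line number Diff line number Diff line change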
@@ -0,0 +1,67 @@
const DsmParser = require('./dsmParser');

/**
 * DSM US parser.
 *
 * The hash for this site is set by an inline script inside `#MainContent`
 * on the product page, so only the product page hook is implemented.
 */
class DsmUsParser extends DsmParser {
  constructor(request, task, proxy, logger) {
    super(request, task, proxy, logger, 'DsmUsParser');
  }

  /**
   * Parses the hash from the inline scripts of a product page.
   *
   * @param {Function} $ - product page loaded with cheerio
   * @returns {string|null} the first hash found, or null when none is found
   *   or parsing failed (failures are logged, never thrown)
   */
  parseProductInfoPageForHash($) {
    // Matches: $(atob('<base64>')).val('<hash>')
    // The base64 literal decodes to `<input type="hidden" name="properties[_HASH]" />`.
    // The capture is lazy so it stops at the end of the `.val(...)` call even
    // when further calls are chained on the same line.
    const regex = /\$\(\s*atob\(\s*'PGlucHV0IHR5cGU9ImhpZGRlbiIgbmFtZT0icHJvcGVydGllc1tfSEFTSF0iIC8\+'\s*\)\s*\)\s*\.val\(\s*'(.+?)'\s*\)/;
    try {
      const hashes = [];
      $('#MainContent > script').each((i, e) => {
        // should match only one, but just in case, let's loop over all possibilities
        this._logger.silly('%s: parsing script element %d for hash...', this._name, i);

        // Script tags without inline content (e.g. `<script src="...">`) have
        // no child node; skip them instead of failing the whole parse.
        const firstChild = e.children && e.children[0];
        if (!firstChild) {
          return;
        }

        // check to see if we can find the hash property
        const elements = regex.exec(firstChild.data);
        if (elements) {
          this._logger.debug('%s: Found match %s', this._name, elements[0]);
          hashes.push(elements[1]);
        } else {
          this._logger.debug('%s: No match found %s', this._name, firstChild.data);
        }
      });

      if (hashes.length === 0) {
        this._logger.debug('%s: No Hash Found, returning null...', this._name);
        return null;
      }

      const [hash] = hashes;
      if (hashes.length === 1) {
        this._logger.debug('%s: Found 1 Hash: %s, returning...', this._name, hash);
      } else {
        this._logger.debug(
          '%s: Found %d Hashes! using the first one: %s',
          this._name,
          hashes.length,
          hash,
        );
      }
      return hash;
    } catch (err) {
      this._logger.debug(
        '%s: ERROR parsing %s hash property: %s %s',
        this._name,
        this._task.site.name,
        err.statusCode || err.status,
        err.message,
      );
      return null;
    }
  }
}

module.exports = DsmUsParser;
9 changes: 9 additions & 0 deletions packages/task-runner/src/shopify/classes/parsers/dsm/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
const DsmParser = require('./dsmParser');
const DsmUkParser = require('./dsmUkParser');
const DsmUsParser = require('./dsmUsParser');

// Single entry point for the DSM parsers: re-exports the three parser
// classes required above so callers can import them from this directory.
module.exports = {
DsmParser,
DsmUkParser,
DsmUsParser,
};
Loading