lib/inspector.js
var util = require('util');
var EventEmitter = require('events').EventEmitter;
var fs = require('fs');
var parse = require('./parser').parse;
var Match = require('./match');
var NodeUtils = require('./nodeUtils');
var crypto = require('crypto');
var stable = require('stable');
/**
*
*/
class Inspector extends EventEmitter {
/**
* Creates a new Inspector, which extends EventEmitter. filePaths is expected
* to be an array of string paths. Also accepts an options object with any
* combination of the following: threshold, identifiers literals, and
* minInstances. Threshold indicates the minimum number of nodes to analyze.
* Identifiers indicates whether or not the nodes in a match should also have
* matching identifiers, and literals whether or not literal values should
* match. minInstances specifies the min number of instances for a match.
* An instance of Inspector emits the following events: start, match and end.
*
* @constructor
* @extends EventEmitter
*
* @param {string[]} filePaths The files on which to run the inspector
* @param {object} [opts] Options to set for the inspector
*/
constructor(filePaths, opts) {
super();
opts = opts || {};
this._filePaths = filePaths || [];
this._threshold = opts.threshold || 30;
this._identifiers = opts.identifiers;
this._literals = opts.literals;
this._minInstances = opts.minInstances || 2;
this._map = Object.create(null);
this._fileContents = {};
this._traversals = {};
this.numFiles = this._filePaths.length;
}
/**
* Runs the inspector on the given file paths, as provided in the constructor.
* Emits a start event, followed by a series of match events for any detected
* similarities, and an end event on completion.
*
* @fires Inspector#start
* @fires Inspector#match
* @fires Inspector#end
*/
run() {
this.emit('start');
// File contents are split to allow for specific line extraction
this._filePaths.forEach((filePath) => {
var src = fs.readFileSync(filePath, {encoding: 'utf8'});
this._fileContents[filePath] = src.split('\n');
try {
var syntaxTree = parse(src, filePath);
} catch (err) {
return console.error(err.message);
}
this._traversals[filePath] = NodeUtils.getDFSTraversal(syntaxTree);
this._walk(syntaxTree, (nodes) => this._insert(nodes));
});
this._analyze();
this.emit('end');
}
/**
* Walks a given node's AST, building up arrays of nodes that meet the
* inspector's threshold. When found, the callback is invoked and passed
* the array of nodes.
*
* @private
*
* @param {Node} node The node to traverse
* @param {function} fn The callback to invoke
*/
_walk(node, fn) {
NodeUtils.walkSubtrees(node, (node, parent, ancestors) => {
var state = ancestors.concat(node);
if (NodeUtils.isAMD(state) ||
NodeUtils.isCommonJS(state) ||
NodeUtils.isES6ModuleImport(state) ||
NodeUtils.isES6ClassBoilerplate(state)) {
return;
}
var nodes = NodeUtils.getDFSTraversal(node, this._threshold);
if (nodes.length === this._threshold) {
fn(nodes);
}
// TODO: Revisit logic
// Disabled for performance reasons
// this._walkSiblings(node, fn);
});
}
/**
* Walks sibling nodes under a parent, grouping their DFS traversals, and
* invoking the callback for those that wouldn't otherwise meet the threshold.
* Helpful for nodes like BlockStatements that hold a sequence. Note that
* this will generate overlapping instances, and so _omitOverlappingInstances
* helps cleanup the results.
*
* @private
*
* @param {Node} node The node to traverse
* @param {function} fn The callback to invoke
*/
_walkSiblings(parent, fn) {
// group siblings that wouldn't otherwise meet threshold
var children = NodeUtils.getChildren(parent);
var n = this._threshold;
for (let i = 0; i < children.length - 1; i++) {
let nodes = NodeUtils.getDFSTraversal(children[i], n);
if (nodes.length === n) continue;
for (let j = i + 1; j < children.length; j++) {
nodes = nodes.concat(NodeUtils.getDFSTraversal(children[j], n));
if (nodes.length >= n) {
fn(nodes.slice(0, n));
break;
}
}
}
}
/**
* Generates a key based on the combined types of each of the supplied nodes.
* Pushes the array to another array at the generated key in _map. Nodes
* are updated to keep a reference to all their occurrences in _map.
*
* @private
*
* @param {Node[]} nodes
*/
_insert(nodes) {
var key = this._getMapKey(nodes);
nodes.forEach(node => {
if (!node.occurrences) {
node.occurrences = {};
}
if (!node.occurrences[key]) {
node.occurrences[key] = [];
}
node.occurrences[key].push(nodes);
});
if (!this._map[key]) {
this._map[key] = [];
}
this._map[key].push(nodes);
}
/**
* Traverses the keys at which the various nodes are stored. A key containing
* an array of more than a single entry indicates a potential match. The nodes
* are then grouped if identifier matching is enabled. A match results in the
* relevant nodes being removed from any future results. This pruning ensures
* that we only include the greatest common parent in a set of matches.
*
* @private
*
* @fires Inspector#match
*/
_analyze() {
var keys = Object.keys(this._map)
.filter(key => this._map[key].length >= this._minInstances);
// Need to use a stable sort to ensure parent nodes are traversed
// before children when lengths are equal
var sortedKeys = stable(keys, (a, b) => {
return this._map[b].length - this._map[a].length;
});
for (let key of sortedKeys) {
if (!this._map[key] || this._map[key].length < this._minInstances) {
continue;
}
let nodeArrays = this._map[key].slice(0);
this._omitOverlappingInstances(nodeArrays);
// groups will be of type Node[][][]
let groups = [nodeArrays];
if (this._identifiers) {
groups = this._groupByMatchingIdentifiers(groups);
}
if (this._literals) {
groups = this._groupByMatchingLiterals(groups);
}
for (let i = 0; i < groups.length; i++) {
if (groups[i].length < this._minInstances) continue;
this._expand(groups[i]);
let match = new Match(groups[i]);
match.populateLines(this._fileContents);
this.emit('match', match);
this._prune(groups[i]);
}
}
}
/**
* Removes overlapping instances from a group of node arrays. That is,
* if one instance has nodes abcd, and another has bcde, then bcde will
* be removed from the array.
*
* @private
*
* @param {Node[][]} nodeArrays
*/
_omitOverlappingInstances(nodeArrays) {
var set = new Set();
var hasOverlap = (nodes) => {
return nodes.some(node => set.has(node));
};
var addNodes = (nodes) => {
nodes.forEach(node => set.add(node));
};
for (let i = 0; i < nodeArrays.length; i++) {
if (hasOverlap(nodeArrays[i])) {
nodeArrays.splice(i--, 1);
continue;
} else {
addNodes(nodeArrays[i]);
}
}
}
/**
* Iterates over the multi-dimensional array of nodes, and returns a new
* array grouping them based on matching identifiers.
*
* @private
*
* @param {Node[][][]} groups
* @returns {Node[][][]}
*/
_groupByMatchingIdentifiers(groups) {
return this._group(groups, (nodes) => {
return nodes
.filter(node => node.name)
.map(node => node.name)
.join(':');
});
}
/**
* Iterates over the multi-dimensional array of nodes, and returns a new
* array grouping them based on matching literals.
*
* @private
*
* @param {Node[][][]} groups
* @returns {Node[][][]}
*/
_groupByMatchingLiterals(groups) {
return this._group(groups, (nodes) => {
return nodes
.filter(node => node.type.includes('Literal'))
.map(node => node.value)
.join(':');
});
}
/**
* Expands each instance of a match to the largest common sequence of nodes
* with the same type, and optionally identifiers. Each array of nodes is
* modified in place.
*
* @private
*
* @param {Node[][]} nodeArrays
*/
_expand(nodeArrays) {
var traversals = nodeArrays.map(nodes => {
return this._traversals[nodes[0].loc.filename];
});
var headPositions = nodeArrays.map((nodes, i) => {
return traversals[i].indexOf(nodes[0]);
});
var tailPositions = nodeArrays.map((nodes, i) => {
var last = nodes[nodes.length - 1];
return traversals[i].indexOf(last);
});
var incr = (pos) => pos + 1;
var decr = (pos) => pos - 1;
var getNode = (pos, i) => traversals[i][pos];
var alreadyIncluded = (nodes) => {
return nodes.some(node => {
return nodeArrays.some(array => array.indexOf(node) !== -1)
});
};
var isComplete = (nodes) => {
return (!NodeUtils.typesMatch(nodes) || alreadyIncluded(nodes)) ||
(this._identifiers && !NodeUtils.identifiersMatch(nodes)) ||
(this._literals && !NodeUtils.literalsMatch(nodes));
};
while (true) {
headPositions = headPositions.map(decr);
let nodes = headPositions.map(getNode);
if (isComplete(nodes)) break;
nodeArrays.forEach((array, i) => array.unshift(nodes[i]));
}
while (true) {
tailPositions = tailPositions.map(incr);
let nodes = tailPositions.map(getNode);
if (isComplete(nodes)) break;
nodeArrays.forEach((array, i) => array.push(nodes[i]));
}
}
/**
* Removes the nodes from consideration in any additional matches.
*
* @private
*
* @param {Node[][]} nodeArrays
*/
_prune(nodeArrays) {
for (let i = 0; i < nodeArrays.length; i++) {
let nodes = nodeArrays[i];
for (let j = 0; j < nodes.length; j++) {
this._removeNode(nodes[j]);
}
}
}
/**
* Removes all occurrences of a given node.
*
* @private
*
* @param {Node} node The node to remove
*/
_removeNode(node) {
for (let key in node.occurrences) {
for (let i = 0; i < node.occurrences[key].length; i++) {
if (!this._map[key]) break;
let index = this._map[key].indexOf(node.occurrences[key][i]);
if (index > -1) {
this._map[key].splice(index, 1);
}
// Delete empty buckets
if (!this._map[key].length) {
delete this._map[key];
}
}
delete node.occurrences[key];
}
}
/**
* Generates a key based on the type of each of the passed nodes, returned
* as a base64-encoded sha1 hash.
*
* @private
*
* @param {Node[]} nodes The nodes for which to generate the key
* @returns {string}
*/
_getMapKey(nodes) {
var key = nodes[0].type;
var length = nodes.length;
// Significantly faster than a map & join
for (var i = 1; i < length; i++) {
key += ':' + nodes[i].type;
}
// Prefer shorter key lengths (base64 < hex)
return crypto.createHash('sha1').update(key).digest('base64');
}
/**
* Accepts a multi-dimensional array of nodes and groups them based on the
* supplied function, which is expected to return a string.
*
* @private
*
* @param {Node[][][]} groups The groups of nodes to further group
* @param {function} fn Synchronous function for generating group ids
* @returns {Node[][][]}
*/
_group(groups, fn) {
var res = [];
var map = Object.create(null);
for (let i = 0; i < groups.length; i++) {
for (let j = 0; j < groups[i].length; j++) {
let id = fn(groups[i][j]);
if (!map[id]) {
map[id] = [];
}
map[id].push(groups[i][j]);
}
for (let key in map) {
res.push(map[key]);
}
map = Object.create(null);
}
return res;
}
}
module.exports = Inspector;