You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

214 lines
6.1 KiB

/*
Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation
by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
*/
var EventEmitter = require('events').EventEmitter,
inherits = require('util').inherits;
/**
 * Element-wise comparison of two indexable byte sequences.
 *
 * @param {Buffer|Array} buf1 - First sequence.
 * @param {number} pos1 - Index in `buf1` where the comparison starts.
 * @param {Buffer|Array} buf2 - Second sequence.
 * @param {number} pos2 - Index in `buf2` where the comparison starts.
 * @param {number} num - How many consecutive elements to compare.
 * @returns {boolean} `true` when all `num` element pairs are strictly equal
 *   (trivially `true` when `num` is 0), `false` at the first mismatch.
 */
function jsmemcmp(buf1, pos1, buf2, pos2, num) {
  var offset = 0;
  while (offset < num) {
    if (buf1[pos1 + offset] !== buf2[pos2 + offset]) {
      return false;
    }
    offset += 1;
  }
  return true;
}
/**
 * Streaming Boyer-Moore-Horspool searcher.
 *
 * Builds the per-byte skip ("occurrence") table for `needle` and allocates a
 * lookbehind buffer, as long as the needle, that carries a partial match
 * across chunk boundaries.
 *
 * Instance state set up here:
 *   maxMatches        - `push` stops feeding once `matches` reaches this
 *                       value (defaults to Infinity).
 *   matches           - number of needle occurrences found so far.
 *   _occ              - 256-entry skip table indexed by byte value.
 *   _needle           - the needle as a Buffer.
 *   _lookbehind       - storage for the unresolved tail of earlier chunks.
 *   _lookbehind_size  - number of valid bytes currently in `_lookbehind`.
 *   _bufpos           - position in the current chunk where searching resumes.
 *
 * @param {string|Buffer} needle - Byte sequence to search for. A string is
 *   converted to a Buffer using the default (UTF-8) encoding.
 */
function SBMH(needle) {
  // Buffer.from/Buffer.alloc replace the deprecated `new Buffer(...)` forms.
  if (typeof needle === 'string')
    needle = Buffer.from(needle);

  var i, needle_len = needle.length;

  this.maxMatches = Infinity;
  this.matches = 0;

  this._occ = new Array(256);
  this._lookbehind_size = 0;
  this._needle = needle;
  this._bufpos = 0;

  this._lookbehind = Buffer.alloc(needle_len);

  // Default skip: a byte that does not occur in the needle lets the search
  // window advance by the full needle length.
  for (i = 0; i < 256; ++i)
    this._occ[i] = needle_len;

  // For every needle byte except the last, the skip is its distance from the
  // end of the needle; later occurrences overwrite earlier ones.
  for (i = 0; i < needle_len - 1; ++i)
    this._occ[needle[i]] = needle_len - 1 - i;
}
// SBMH instances are EventEmitters: search progress is reported through
// 'info' events emitted from _sbmh_feed.
inherits(SBMH, EventEmitter);
/**
 * Returns the searcher to its initial search state: the match counter, the
 * count of buffered lookbehind bytes and the chunk position are all zeroed.
 * The needle, the skip table and `maxMatches` are left untouched.
 */
SBMH.prototype.reset = function() {
  this._bufpos = this.matches = this._lookbehind_size = 0;
};
/**
 * Feeds the next chunk of the stream to the searcher.
 *
 * `_sbmh_feed` is called repeatedly until it reports that the whole chunk has
 * been consumed or until `matches` reaches `maxMatches`.
 *
 * @param {Buffer|string} chunk - Data to search. A non-Buffer value is
 *   converted with the 'binary' (latin1) encoding.
 * @param {number} [pos=0] - Index in `chunk` at which searching starts.
 * @returns {number|undefined} The value returned by the last `_sbmh_feed`
 *   call: `chunk.length` when the chunk was fully consumed, otherwise the
 *   index just past the last match. `undefined` when nothing was fed because
 *   `matches` had already reached `maxMatches`.
 */
SBMH.prototype.push = function(chunk, pos) {
  var r, chlen;

  // Buffer.from replaces the deprecated `new Buffer(string, encoding)`.
  if (!Buffer.isBuffer(chunk))
    chunk = Buffer.from(chunk, 'binary');

  chlen = chunk.length;
  this._bufpos = pos || 0;

  while (r !== chlen && this.matches < this.maxMatches)
    r = this._sbmh_feed(chunk);

  return r;
};
/**
 * Runs one search step over `data`, resuming at `this._bufpos` and taking the
 * bytes held in the lookbehind buffer (the unresolved tail of earlier chunks)
 * into account.
 *
 * Progress is reported through 'info' events:
 *   emit('info', isMatch, buffer, start, end)
 * where `buffer.slice(start, end)` is data known not to belong to a needle
 * occurrence and `isMatch` tells whether a needle occurrence directly follows
 * it. When a match has no preceding data the event is `emit('info', true)`.
 *
 * @param {Buffer} data - The current chunk.
 * @returns {number} The index in `data` just past a match when one was found,
 *   otherwise `data.length` (the chunk was fully consumed).
 */
SBMH.prototype._sbmh_feed = function(data) {
  var len = data.length,
      needle = this._needle,
      needle_len = needle.length,
      last_needle_char = needle[needle_len - 1],
      occ = this._occ,
      lookbehind = this._lookbehind,
      // Captured up front: the match branch below zeroes the instance field
      // but still needs the old size to report the bytes preceding the match.
      lookbehind_size = this._lookbehind_size,
      // Positive: points to a position in `data`
      //   pos == 3 points to data[3]
      // Negative: points to a position in the lookbehind buffer
      //   pos == -2 points to lookbehind[lookbehind_size - 2]
      pos = -lookbehind_size,
      ch;

  if (pos < 0) {
    // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool
    // search with character lookup code that considers both the
    // lookbehind buffer and the current round's haystack data.
    //
    // Loop until
    //   there is a match,
    // or until
    //   we've moved past the position that requires the
    //   lookbehind buffer (then we switch to the optimized loop),
    // or until
    //   the character to look at lies outside the haystack.
    while (pos < 0 && pos <= len - needle_len) {
      ch = this._sbmh_lookup_char(data, pos + needle_len - 1);

      if (ch === last_needle_char
          && this._sbmh_memcmp(data, pos, needle_len - 1)) {
        this._lookbehind_size = 0;
        ++this.matches;
        if (pos > -lookbehind_size) {
          // The match starts partway through the lookbehind buffer; the
          // bytes in front of it are non-matching data.
          this.emit('info', true, lookbehind, 0, lookbehind_size + pos);
        } else
          this.emit('info', true);

        this._bufpos = pos + needle_len;
        return pos + needle_len;
      }

      pos += occ[ch];
    }

    // No match.

    if (pos < 0) {
      // There is too little data for Boyer-Moore-Horspool to run,
      // so use a different algorithm to skip as much as we can.
      // Forward pos until
      //   the trailing part of lookbehind + data
      //   looks like the beginning of the needle
      // or until
      //   pos == 0
      while (pos < 0 && !this._sbmh_memcmp(data, pos, len - pos))
        pos++;
    }

    if (pos >= 0) {
      // Discard lookbehind buffer.
      this.emit('info', false, lookbehind, 0, this._lookbehind_size);
      this._lookbehind_size = 0;
    } else {
      // Cut off the part of the lookbehind buffer that has been
      // processed and append the entire haystack to it.
      var bytesToCutOff = this._lookbehind_size + pos;

      if (bytesToCutOff > 0) {
        // The cut off data is guaranteed not to contain the needle.
        this.emit('info', false, lookbehind, 0, bytesToCutOff);
      }

      // Shift the surviving bytes to the front. The last argument of
      // Buffer#copy is an end index (exclusive), not a length.
      lookbehind.copy(lookbehind, 0, bytesToCutOff, this._lookbehind_size);
      this._lookbehind_size -= bytesToCutOff;

      data.copy(lookbehind, this._lookbehind_size);
      this._lookbehind_size += len;

      this._bufpos = len;
      return len;
    }
  }

  // Resume where the previous step on this chunk stopped.
  if (pos >= 0)
    pos += this._bufpos;

  // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool
  // search with optimized character lookup code that only considers
  // the current round's haystack data.
  while (pos <= len - needle_len) {
    ch = data[pos + needle_len - 1];

    if (ch === last_needle_char
        && data[pos] === needle[0]
        && jsmemcmp(needle, 0, data, pos, needle_len - 1)) {
      ++this.matches;
      if (pos > 0)
        this.emit('info', true, data, this._bufpos, pos);
      else
        this.emit('info', true);

      this._bufpos = pos + needle_len;
      return pos + needle_len;
    }

    pos += occ[ch];
  }

  // There was no match. If there's trailing haystack data that we cannot
  // match yet using the Boyer-Moore-Horspool algorithm (because the trailing
  // data is shorter than the needle) then match using a modified algorithm
  // that starts matching from the beginning instead of the end. Whatever
  // trailing data is left after running this algorithm is added to the
  // lookbehind buffer.
  if (pos < len) {
    while (pos < len && (data[pos] !== needle[0]
                         || !jsmemcmp(data, pos, needle, 0, len - pos))) {
      ++pos;
    }
    if (pos < len) {
      data.copy(lookbehind, 0, pos, len);
      this._lookbehind_size = len - pos;
    }
  }

  // Everything until pos is guaranteed not to contain needle data.
  if (pos > 0)
    this.emit('info', false, data, this._bufpos, pos < len ? pos : len);

  this._bufpos = len;
  return len;
};
/**
 * Reads one byte from the virtual sequence "lookbehind buffer + data".
 *
 * @param {Buffer} data - The current chunk.
 * @param {number} pos - Non-negative: index into `data`. Negative: offset
 *   counted back from the end of the valid lookbehind bytes, so -1 is the
 *   last buffered byte.
 * @returns {number|undefined} The byte at that position, or `undefined` when
 *   the resolved index lies outside the buffer being read.
 */
SBMH.prototype._sbmh_lookup_char = function(data, pos) {
  return pos < 0
    ? this._lookbehind[this._lookbehind_size + pos]
    : data[pos];
};
/**
 * Checks whether the first `len` bytes of the needle equal the `len` bytes
 * that start at `pos` in the virtual sequence "lookbehind buffer + data".
 * Bytes are fetched through `_sbmh_lookup_char`, so `pos` may be negative.
 *
 * @param {Buffer} data - The current chunk.
 * @param {number} pos - Start position (negative values address the
 *   lookbehind buffer).
 * @param {number} len - Number of bytes to compare.
 * @returns {boolean} `true` when every compared byte matches (also when
 *   `len` is 0), `false` at the first difference.
 */
SBMH.prototype._sbmh_memcmp = function(data, pos, len) {
  for (var offset = 0; offset < len; ++offset) {
    if (this._sbmh_lookup_char(data, pos + offset) !== this._needle[offset]) {
      return false;
    }
  }
  return true;
};
// The SBMH constructor is this module's only export.
module.exports = SBMH;