diff --git a/lib/tokenize.js b/lib/tokenize.js index 6ac47a4..1c9a502 100644 --- a/lib/tokenize.js +++ b/lib/tokenize.js @@ -6,9 +6,19 @@ * @return {array} Array of tokens */ module.exports = function(input) { - return input + input = input .toLowerCase() .replace(/\n/g, ' ') .replace(/[.,\/#!$%\^&\*;:{}=_`\"~()]/g, '') .split(' '); + // remove single quotes surrounding words but preserve apostrophes + var i; + for (i = 0; i < input.length; i++) { + var word = input[i]; + if (word[0] === '\'' && word[word.length-1] === '\'') { + word = word.substr(1, word.length-2); + input[i] = word; + } + } + return input; }; diff --git a/test/unit/tokenize.js b/test/unit/tokenize.js index 4d8795f..81b0230 100644 --- a/test/unit/tokenize.js +++ b/test/unit/tokenize.js @@ -28,6 +28,18 @@ test('english', function (t) { tokenize('That\'ll cause problems for the farmer\'s pigs'), ['that\'ll', 'cause', 'problems', 'for', 'the', 'farmer\'s', 'pigs'] ); + t.deepEqual( + tokenize('Evan is \'wrong,\', says the more qualified governor'), + ['evan', 'is', 'wrong', 'says', 'the','more', 'qualified', 'governor'] + ); + t.deepEqual( + tokenize('Dad told me I can be the \'honorary\' chef!'), + ['dad', 'told', 'me', 'i', 'can', 'be', 'the','honorary', 'chef'] + ); + t.deepEqual( + tokenize('\'Complacent\' and \'undervalued\' is a bad combination'), + ['complacent', 'and', 'undervalued', 'is', 'a', 'bad', 'combination'] + ); t.end(); });