From 815e75721869636e46fababa7d0efb3bb61dee46 Mon Sep 17 00:00:00 2001 From: c2huc2hu Date: Sat, 9 Dec 2017 15:00:20 -0500 Subject: [PATCH] ran a test, showing that this model is slightly better, because it ranks a sentence with less common words higher --- model_test.js | 32 ++++++++++++++++++++++++++++++++ part2.js | 2 +- 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 model_test.js diff --git a/model_test.js b/model_test.js new file mode 100644 index 0000000..03b677f --- /dev/null +++ b/model_test.js @@ -0,0 +1,32 @@ +const _ = require('lodash'); +const part2 = require('./part2'); + +// sentences courtesy of http://thewessens.net/ClassroomApps/Main/gibberish.html?topic=probability&id=6 -- each entry below is a two-digit label followed by unspaced uppercase letters; the script scores every entry with part2.log_prob and prints them sorted by that score. NOTE(review): the array is assigned to "sentences" with no const/let/var declaration -- confirm that is intended. +sentences = [ +"01THISSENTENCEISEASYITSREALENGLISHWORDSSOITMATCHESFREQUENCYANDSTUFFTHEREARELOTSOF", +"02GAZELLESGRAZINGZEBRAQUIZZICALEQUALLYZESTRIVETQUICKZENAMAZINGZIGZAGPIZZAKUMQUATB", +"03CESTENFRANCAISCESTMOINSANGLAISTOUJOURSFRAISJESUISUNPIZZAQUESTCEQUECESTILFAUTPEN", +"04LOREMIPSUMDOLORSITAMETCONSECTETURADIPISICINGELITSEDDOEIUSMODTEMPORINCIDIDUNTUTL", +"05EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE", +"06WWGEVDOFGRREGHEWIOFSDIJIREAOFJEWIAOGHREIOSDJWESELOFJEWALFHWEUIFAIFHWEAIFUEWALFH", +"10INAHANDANDATHOUSANDTIMESALLHEMEANTTOTHEMERETHEONCEMOREANDIFISAWHERINYOUROWNWAYO", +"11ABOUTSAWBOYSHOULDANDACAMEANDBYANOTHERSCHAISEMOMENTINWRITINGSANKIANDINTERPOSEDMO", +"12BESOMETHINGTHATITSANDTOHAVEHAVISHAMWITHHERBERTANDPUTTOHADREASONTOGOTOMISSHANDSA", +"13FROMANDASTUMBLETHEMANDTOTHEBLEDISTENTOFBEARDIDISAIDTIMESSOINTHEFORFORTHEFELLAND", +"14DONBUTTOFROPANOTHERAYOUTEVINMAYFARETHATMUCHWOUGHTHEOVERSUCTGREMYHEANDEREDNTLAHO", +"15NOIVIOITANDIDGHHEADBAMISILINDPLLWSSOONEWACKITIDINTORFILAVEMENGEHHADRSKETARARAMI", +"16BIOEALFKEIFHSETUIYEUSTIEGDWRASASDRTOCEIKTIONVTRHLUTLDODHERTICEALEATOCMSPRLORESM", +"17OGTXEZMXGBKRHEFDESFXOGGGIIQDAOWOBVPKRFKQSNXVMQLLRQKRGYPVJVBWARZHMULUSUJGGIRYOHP", +]; + +const fs = require('fs'); +const lambda = 
[1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,0.9]; +let ptb = fs.readFileSync('./ptb.train.txt').toString(); + +ptb = ptb.replace(/ /g, '').replace(/N /g, ''); // get rid of added tokens -- NOTE(review): as written, the first replace strips every space, so the /N /g pattern (which needs a trailing space) cannot match afterwards; confirm the intended order +let ptb_counts = part2.get_counts(ptb, true); +let score_fcn = _.partialRight(part2.log_prob, ptb_counts, lambda); +console.log(_.sortBy(sentences, score_fcn).join('\n')); +sentences.forEach(sent => { + console.log(sent, score_fcn(sent)); +}); \ No newline at end of file diff --git a/part2.js b/part2.js index 8ebf727..0410a6b 100644 --- a/part2.js +++ b/part2.js @@ -80,7 +80,7 @@ function get_prob2(q, counts, lambda) { let next_count = curr_count[q.get(n)]; if (next_count && next_count.sum) { - total_prob += lambda[n] * next_count.sum // curr_count.sum; + total_prob += lambda[n] * next_count.sum / curr_count.sum; } else { break;