Skip to content

Commit

Permalink
ran a test, showing that this model is slightly better, because it ranks a sentence with less common words higher
Browse files Browse the repository at this point in the history
  • Loading branch information
c2huc2hu committed Dec 9, 2017
1 parent cedc53a commit 815e757
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 1 deletion.
32 changes: 32 additions & 0 deletions model_test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
const _ = require('lodash');
const part2 = require('./part2');

// sentences courtesy of http://thewessens.net/ClassroomApps/Main/gibberish.html?topic=probability&id=6
// Fixture strings that this script scores and ranks with the language model.
// Every entry is a two-digit label followed by a run of uppercase letters with
// no spaces or punctuation; the label is part of the string itself.
// Declared with `const` so the script does not rely on an implicit global,
// which would throw a ReferenceError under strict mode or in an ES module.
const sentences = [
"01THISSENTENCEISEASYITSREALENGLISHWORDSSOITMATCHESFREQUENCYANDSTUFFTHEREARELOTSOF",
"02GAZELLESGRAZINGZEBRAQUIZZICALEQUALLYZESTRIVETQUICKZENAMAZINGZIGZAGPIZZAKUMQUATB",
"03CESTENFRANCAISCESTMOINSANGLAISTOUJOURSFRAISJESUISUNPIZZAQUESTCEQUECESTILFAUTPEN",
"04LOREMIPSUMDOLORSITAMETCONSECTETURADIPISICINGELITSEDDOEIUSMODTEMPORINCIDIDUNTUTL",
"05EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE",
"06WWGEVDOFGRREGHEWIOFSDIJIREAOFJEWIAOGHREIOSDJWESELOFJEWALFHWEUIFAIFHWEAIFUEWALFH",
"10INAHANDANDATHOUSANDTIMESALLHEMEANTTOTHEMERETHEONCEMOREANDIFISAWHERINYOUROWNWAYO",
"11ABOUTSAWBOYSHOULDANDACAMEANDBYANOTHERSCHAISEMOMENTINWRITINGSANKIANDINTERPOSEDMO",
"12BESOMETHINGTHATITSANDTOHAVEHAVISHAMWITHHERBERTANDPUTTOHADREASONTOGOTOMISSHANDSA",
"13FROMANDASTUMBLETHEMANDTOTHEBLEDISTENTOFBEARDIDISAIDTIMESSOINTHEFORFORTHEFELLAND",
"14DONBUTTOFROPANOTHERAYOUTEVINMAYFARETHATMUCHWOUGHTHEOVERSUCTGREMYHEANDEREDNTLAHO",
"15NOIVIOITANDIDGHHEADBAMISILINDPLLWSSOONEWACKITIDINTORFILAVEMENGEHHADRSKETARARAMI",
"16BIOEALFKEIFHSETUIYEUSTIEGDWRASASDRTOCEIKTIONVTRHLUTLDODHERTICEALEATOCMSPRLORESM",
"17OGTXEZMXGBKRHEFDESFXOGGGIIQDAOWOBVPKRFKQSNXVMQLLRQKRGYPVJVBWARZHMULUSUJGGIRYOHP",
];

const fs = require('fs');
// Weights forwarded to part2.log_prob as its trailing argument.
// NOTE(review): presumably one interpolation weight per n-gram order — confirm against part2.js.
const lambda = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.9];

// Read the training corpus and strip the added tokens ("<unk> " and "N ")
// before counting.
const rawCorpus = fs.readFileSync('./ptb.train.txt').toString();
const ptb = rawCorpus.replace(/<unk> /g, '').replace(/N /g, '');

const ptb_counts = part2.get_counts(ptb, true);

// score_fcn(sentence) === part2.log_prob(sentence, ptb_counts, lambda)
const score_fcn = _.partialRight(part2.log_prob, ptb_counts, lambda);

// First print the sentences ordered by ascending score, one per line...
const ranked = _.sortBy(sentences, score_fcn);
console.log(ranked.join('\n'));

// ...then each sentence, in its original order, next to its score.
for (const sent of sentences) {
  console.log(sent, score_fcn(sent));
}
2 changes: 1 addition & 1 deletion part2.js
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ function get_prob2(q, counts, lambda) {
let next_count = curr_count[q.get(n)];

if (next_count && next_count.sum) {
total_prob += lambda[n] * next_count.sum // curr_count.sum;
total_prob += lambda[n] * next_count.sum / curr_count.sum;
}
else {
break;
Expand Down

0 comments on commit 815e757

Please sign in to comment.