ptwobrussell · min9079 · Apr 15, 2014 · Apr 16, 2014 · ptwobrussell · May 13, 2014
diff --git a/ipynb/Chapter 5 - Mining Web Pages.ipynb b/ipynb/Chapter 5 - Mining Web Pages.ipynb
@@ -412,7 +412,7 @@
       "            if score > max_cluster_score:\n",
       "                max_cluster_score = score\n",
       "\n",
-      "        scores.append((sentence_idx, score))\n",
+      "        scores.append((sentence_idx, max_cluster_score))\n",
       "\n",
       "    return scores\n",
       "\n",
@@ -569,22 +569,26 @@
       "    pos_tagged_tokens = [token for sent in pos_tagged_tokens for token in sent]\n",
       "\n",
       "    all_entity_chunks = []\n",
-      "    previous_pos = None\n",
+      "    current_status = 'None-NN'\n",
       "    current_entity_chunk = []\n",
       "    for (token, pos) in pos_tagged_tokens:\n",
       "\n",
-      "        if pos == previous_pos and pos.startswith('NN'):\n",
-      "            current_entity_chunk.append(token)\n",
-      "        elif pos.startswith('NN'):\n",
-      "            if current_entity_chunk != []:\n",
-      "\n",
+      "        if pos.startswith('NN'):\n",
+      "            if current_status == 'None-NN':\n",
+      "                current_status = 'NN'\n",
+      "                current_entity_chunk = [token]\n",
+      "            else:\n",
+      "                current_entity_chunk.append(token)\n",
+      "        else:\n",
+      "            if current_status == 'None-NN':\n",
+      "                pass\n",
+      "            else:\n",
+      "                current_status = 'None-NN'\n",
+      "        \n",
       "                # Note that current_entity_chunk could be a duplicate when appended,\n",
       "                # so frequency analysis again becomes a consideration\n",
       "\n",
       "                all_entity_chunks.append((' '.join(current_entity_chunk), pos))\n",
-      "            current_entity_chunk = [token]\n",
-      "\n",
-      "        previous_pos = pos\n",
       "\n",
       "    # Store the chunks as an index for the document\n",
       "    # and account for frequency while we're at it...\n",
@@ -633,20 +637,25 @@
       "    for sentence in pos_tagged_tokens:\n",
       "\n",
       "        all_entity_chunks = []\n",
-      "        previous_pos = None\n",
+      "        current_status = 'None-NN'\n",
       "        current_entity_chunk = []\n",
       "\n",
       "        for (token, pos) in sentence:\n",
       "\n",
-      "            if pos == previous_pos and pos.startswith('NN'):\n",
-      "                current_entity_chunk.append(token)\n",
-      "            elif pos.startswith('NN'):\n",
+      "            if pos.startswith('NN'):\n",
+      "                if current_status == 'None-NN':\n",
+      "                    current_status = 'NN'\n",
+      "                    current_entity_chunk = [token]\n",
+      "                else:\n",
+      "                    current_entity_chunk.append(token)\n",
+      "            else:\n",
+      "                if current_status == 'None-NN':\n",
+      "                    pass\n",
+      "                else:\n",
+      "                    current_status = 'None-NN'\n",
       "                if current_entity_chunk != []:\n",
       "                    all_entity_chunks.append((' '.join(current_entity_chunk),\n",
       "                            pos))\n",
-      "                current_entity_chunk = [token]\n",
-      "\n",
-      "            previous_pos = pos\n",
       "\n",
       "        if len(all_entity_chunks) > 1:\n",
       "            entity_interactions.append(all_entity_chunks)\n",