From c6b95bb7a97aadba1de8a531ac6791529d5af149 Mon Sep 17 00:00:00 2001 From: Jorge Balazs Date: Thu, 23 Mar 2017 09:19:34 +0900 Subject: [PATCH 1/5] Fix several paths and commands in data download script --- create_data_folder.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) mode change 100644 => 100755 create_data_folder.sh diff --git a/create_data_folder.sh b/create_data_folder.sh old mode 100644 new mode 100755 index 49a711e..dce8a07 --- a/create_data_folder.sh +++ b/create_data_folder.sh @@ -10,21 +10,21 @@ mkdir -p "$DATAPATH"/data/corpora/opinion/{semeval-absa-2014,youtube} # Liu Customer Review Dataset wget http://www.cs.uic.edu/~liub/FBS/CustomerReviewData.zip unzip ./CustomerReviewData.zip -mv ./customer_review_data/Apex AD2600 Progressive-scan DVD player.txt "$DATAPATH"/data/corpora/opinion/apex_dvd_player.txt -mv ./customer_review_data/Nikon coolpix 4300.txt "$DATAPATH"/data/corpora/opinion/nikon_camera.txt -mv ./customer_review_data/Canon G3.txt "$DATAPATH"/data/corpora/opinion/canon_camera.txt -mv ./customer_review_data/Nokia 6610.txt "$DATAPATH"/data/corpora/opinion/nokia_cellphone.txt -mv ./customer_review_data/Creative Labs Nomad Jukebox Zen Xtra 40GB.txt "$DATAPATH"/data/corpora/opinion/creative_mp3_player.txt +mv ./customer\ review\ data/Apex\ AD2600\ Progressive-scan\ DVD\ player.txt "$DATAPATH"/data/corpora/opinion/apex_dvd_player.txt +mv ./customer\ review\ data/Nikon\ coolpix\ 4300.txt "$DATAPATH"/data/corpora/opinion/nikon_camera.txt +mv ./customer\ review\ data/Canon\ G3.txt "$DATAPATH"/data/corpora/opinion/canon_camera.txt +mv ./customer\ review\ data/Nokia\ 6610.txt "$DATAPATH"/data/corpora/opinion/nokia_cellphone.txt +mv ./customer\ review\ data/Creative\ Labs\ Nomad\ Jukebox\ Zen\ Xtra\ 40GB.txt "$DATAPATH"/data/corpora/opinion/creative_mp3_player.txt # stanford stopwords wget https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt mv ./stopwords.txt 
"$DATAPATH"/data/stopwords/stanford_stopwords.txt # nltk stopwords -cp ~/nltk_data/stopwords.txt "$DATAPATH"/data/stopwords/nltk_stopwords.txt +# cp ~/nltk_data/stopwords.txt "$DATAPATH"/data/stopwords/nltk_stopwords.txt # opinion words wget http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar -unzip ./opinion-lexicon-English.rar -mv ./opinion-lexicon-English/positive_words.txt "$DATAPATH"/data/lexicon/liu/positive_words.txt -mv ./opinion-lexicon-English/negative_words.txt "$DATAPATH"/data/lexicon/liu/negative_words.txt +unrar e ./opinion-lexicon-English.rar +mv ./positive-words.txt "$DATAPATH"/data/lexicon/liu/positive_words.txt +mv ./negative-words.txt "$DATAPATH"/data/lexicon/liu/negative_words.txt From f527192f5033d307342cfc6e2d168c63f23702e5 Mon Sep 17 00:00:00 2001 From: Jorge Balazs Date: Thu, 23 Mar 2017 09:26:22 +0900 Subject: [PATCH 2/5] Clarify some points in the README --- README.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 75c2b7f..fded3d2 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,16 @@ Code for the paper "Replication issues in syntax-based aspect extraction for opinion mining" - Download and install the software needed - 1. download and install Senna, http://ronan.collobert.com/senna/ - 2. download and install CoreNLP 3.6, http://stanfordnlp.github.io/CoreNLP/history.html - 3. download and install pyFIM http://www.borgelt.net/pyfim.html - 4. `pip install scipy numpy Levenshtein` - + 1. Download and install Senna, http://ronan.collobert.com/senna/ + 2. Download and install CoreNLP 3.6, http://stanfordnlp.github.io/CoreNLP/history.html + 3. Download and install pyFIM http://www.borgelt.net/pyfim.html + 4. Run `pip install scipy numpy python-Levenshtein` + - Download data and create environment - 1. `create_data_folder.sh path_where_to_create_data_folder` - 2. modify ./enlp/settings.py accordingly - + 1. 
Make `create_data_folder.sh` executable: `chmod +x create_data_folder.sh` + 2. Run `./create_data_folder.sh path/to/data/folder` + 3. Modify `./enlp/settings.py` accordingly + - Pre-process datasets 1. `python process_corpus.py` @@ -17,4 +18,4 @@ Code for the paper "Replication issues in syntax-based aspect extraction for opi 1. `python run.py path_to_store_output_json_files` -Check `process_corpus.py --help` and `run.py --help` for more details on how run them. \ No newline at end of file +Check `process_corpus.py --help` and `run.py --help` for more details on how run them. From 6b1426e0900c6827d50989a4777471bc111f41c2 Mon Sep 17 00:00:00 2001 From: Jorge Balazs Date: Thu, 23 Mar 2017 09:51:04 +0900 Subject: [PATCH 3/5] Further fix paths in data download script --- README.md | 2 +- create_data_folder.sh | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index fded3d2..0b198ff 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Code for the paper "Replication issues in syntax-based aspect extraction for opi - Download data and create environment 1. Make `create_data_folder.sh` executable: `chmod +x create_data_folder.sh` - 2. Run `./create_data_folder.sh path/to/data/folder` + 2. Run `./create_data_folder.sh path/to/data/folder` (No trailing slash!) 3. 
Modify `./enlp/settings.py` accordingly - Pre-process datasets diff --git a/create_data_folder.sh b/create_data_folder.sh index dce8a07..36e0b49 100755 --- a/create_data_folder.sh +++ b/create_data_folder.sh @@ -5,16 +5,16 @@ DATAPATH=$1 mkdir -p "$DATAPATH"/data/{pickle,corpora} mkdir -p "$DATAPATH"/data/lexicon/liu mkdir -p "$DATAPATH"/data/stopwords -mkdir -p "$DATAPATH"/data/corpora/opinion/{semeval-absa-2014,youtube} +mkdir -p "$DATAPATH"/data/corpora/opinion/{liu,semeval-absa-2014,youtube} # Liu Customer Review Dataset wget http://www.cs.uic.edu/~liub/FBS/CustomerReviewData.zip unzip ./CustomerReviewData.zip -mv ./customer\ review\ data/Apex\ AD2600\ Progressive-scan\ DVD\ player.txt "$DATAPATH"/data/corpora/opinion/apex_dvd_player.txt -mv ./customer\ review\ data/Nikon\ coolpix\ 4300.txt "$DATAPATH"/data/corpora/opinion/nikon_camera.txt -mv ./customer\ review\ data/Canon\ G3.txt "$DATAPATH"/data/corpora/opinion/canon_camera.txt -mv ./customer\ review\ data/Nokia\ 6610.txt "$DATAPATH"/data/corpora/opinion/nokia_cellphone.txt -mv ./customer\ review\ data/Creative\ Labs\ Nomad\ Jukebox\ Zen\ Xtra\ 40GB.txt "$DATAPATH"/data/corpora/opinion/creative_mp3_player.txt +mv ./customer\ review\ data/Apex\ AD2600\ Progressive-scan\ DVD\ player.txt "$DATAPATH"/data/corpora/opinion/liu/apex_dvd_player.txt +mv ./customer\ review\ data/Nikon\ coolpix\ 4300.txt "$DATAPATH"/data/corpora/opinion/liu/nikon_camera.txt +mv ./customer\ review\ data/Canon\ G3.txt "$DATAPATH"/data/corpora/opinion/liu/canon_camera.txt +mv ./customer\ review\ data/Nokia\ 6610.txt "$DATAPATH"/data/corpora/opinion/liu/nokia_cellphone.txt +mv ./customer\ review\ data/Creative\ Labs\ Nomad\ Jukebox\ Zen\ Xtra\ 40GB.txt "$DATAPATH"/data/corpora/opinion/liu/creative_mp3_player.txt # stanford stopwords wget https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt From 46b332c03f676314c783ae5e93b3826e41ce35b3 Mon Sep 17 00:00:00 2001 From: Jorge Balazs 
Date: Thu, 23 Mar 2017 15:40:17 +0900 Subject: [PATCH 4/5] Add patch to liu corpora in data download script Also improve error handling in LiuCorpus class --- create_data_folder.sh | 12 +++++++++ enlp/corpus/liu.py | 61 ++++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 30 deletions(-) diff --git a/create_data_folder.sh b/create_data_folder.sh index 36e0b49..1f8cbaf 100755 --- a/create_data_folder.sh +++ b/create_data_folder.sh @@ -28,3 +28,15 @@ wget http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar unrar e ./opinion-lexicon-English.rar mv ./positive-words.txt "$DATAPATH"/data/lexicon/liu/positive_words.txt mv ./negative-words.txt "$DATAPATH"/data/lexicon/liu/negative_words.txt + +rm -r CustomerReviewData.zip opinion-lexicon-English.rar customer\ review\ data + +echo "Patching corpora" +cd "$DATAPATH"/data/corpora/opinion/liu + +sed -i '155s/feature/feature[+2]/' nokia_cellphone.txt +sed -i '480s/look{+1]/look[+1]/' nokia_cellphone.txt +sed -i '79s/^feature\[+2\]\,\s/feature[+2]/' canon_camera.txt +sed -i '157s/look##/look[+2]##/' creative_mp3_player.txt +sed -i '334s/setup\[2\]/setup[+2]/' creative_mp3_player.txt +sed -i '485s/#/##/' apex_dvd_player.txt diff --git a/enlp/corpus/liu.py b/enlp/corpus/liu.py index d06452d..b6cea36 100644 --- a/enlp/corpus/liu.py +++ b/enlp/corpus/liu.py @@ -15,6 +15,13 @@ # --- LIU CORPUS CLASSES ------------------------------------------------------ +class LiuCorpusError(Exception): + def __init__(self, line, filepath): + + msg = ('Inconsistency found on line {}, on file ' + '{}'.format(line, filepath)) + super(LiuCorpusError, self).__init__(msg) + class LiuCorpus(Corpus): """ Class to read Liu corpus. Sentences in the corpus are extracted as @@ -33,10 +40,8 @@ def __init__(self): the source file """ # self.name = filepath.split('/')[-1].replace('.txt', '') - if self._check(): - self._read() - else: - raise CorpusError("Corpus was not properly built. 
Check for consistency") + self._check() + self._read() def __repr__(self): return "".format(self.name) @@ -58,30 +63,25 @@ def _check(self): mfile = open(self.filepath, "r") for i in mfile.readlines(): linea = i.replace('\n', '') - if (self._is_liu_comment(linea) is False): - if (self._is_new_comment(linea) is False): - partes = linea.split('##') - if (len(partes) < 2): - print counter - return False - tags = partes[0] - real_sentence = partes[1].replace("\r", "").replace("\n", "").strip() - if (real_sentence == ""): - print counter - return False - if (tags != ""): - aspects_list = tags.strip().split(",") - for aspect_item in aspects_list: - aspect = self._extract_aspect(aspect_item) - if (aspect == ""): - print counter - return False - orientation = self._extract_orientation(aspect_item) - if (orientation is None): - print counter - return False + if (not self._is_liu_comment(linea) + and not self._is_new_comment(linea)): + partes = linea.split('##') + if (len(partes) < 2): + raise LiuCorpusError(counter, self.filepath) + tags = partes[0] + real_sentence = partes[1].replace("\r", "").replace("\n", "").strip() + if (real_sentence == ""): + raise LiuCorpusError(counter, self.filepath) + if (tags != ""): + aspects_list = tags.strip().split(",") + for aspect_item in aspects_list: + aspect = self._extract_aspect(aspect_item) + if (aspect == ""): + raise LiuCorpusError(counter, self.filepath) + orientation = self._extract_orientation(aspect_item) + if (orientation is None): + raise LiuCorpusError(counter, self.filepath) counter += 1 - return True def _read(self): self._comment_counter = 0 @@ -91,7 +91,8 @@ def _read(self): self._sentences = OrderedDict() for line in open(self.filepath, "r").readlines(): - self.parse_line(line) + if not self._is_liu_comment(line): + self.parse_line(line) def parse_line(self, string): if not self._is_new_comment(string): @@ -107,7 +108,7 @@ def parse_line(self, string): sentence = Sentence(string=string, id=id, document=review) 
sentence.aspects = [] self._sentences[id] = sentence - review.append(sentence) + review.append(sentence) if aspects_string: for aspect_string in aspects_string.strip().split(", "): term = self._extract_aspect(aspect_string) @@ -123,7 +124,7 @@ def parse_line(self, string): self._comment_counter += 1 def _is_liu_comment(self, string): - if (string == ("\n") or string == ("\r")): + if (string == ("\n") or string == ("\r") or string == ("\r\n")): return True else: pos = re.search("\A\*+", string) From 807eb5f102d0093fc640d63eb9c967c948b646b1 Mon Sep 17 00:00:00 2001 From: Jorge Balazs Date: Thu, 23 Mar 2017 17:06:14 +0900 Subject: [PATCH 5/5] Automate nltk_stopwords file download and improve README --- README.md | 6 ++++-- create_data_folder.sh | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0b198ff..326e88c 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ -Code for the paper "Replication issues in syntax-based aspect extraction for opinion mining" +# Code for the paper "Replication issues in syntax-based aspect extraction for opinion mining" - Download and install the software needed 1. Download and install Senna, http://ronan.collobert.com/senna/ 2. Download and install CoreNLP 3.6, http://stanfordnlp.github.io/CoreNLP/history.html + - Needs Java 8 3. Download and install pyFIM http://www.borgelt.net/pyfim.html + - Download `fim.so` and place it in python's `dist-packages` directory 4. Run `pip install scipy numpy python-Levenshtein` - Download data and create environment @@ -15,7 +17,7 @@ Code for the paper "Replication issues in syntax-based aspect extraction for opi 1. `python process_corpus.py` - Run (will take several hours depending on the number of cores available) - 1. `python run.py path_to_store_output_json_files` + 1. `python run.py path/to/store/output/json/files` Check `process_corpus.py --help` and `run.py --help` for more details on how run them. 
diff --git a/create_data_folder.sh b/create_data_folder.sh
index 1f8cbaf..0b868a3 100755
--- a/create_data_folder.sh
+++ b/create_data_folder.sh
@@ -4,7 +4,7 @@ DATAPATH=$1
 
 mkdir -p "$DATAPATH"/data/{pickle,corpora}
 mkdir -p "$DATAPATH"/data/lexicon/liu
-mkdir -p "$DATAPATH"/data/stopwords
+mkdir -p "$DATAPATH"/data/{stopwords,corpora/stopwords}
 mkdir -p "$DATAPATH"/data/corpora/opinion/{liu,semeval-absa-2014,youtube}
 
 # Liu Customer Review Dataset
@@ -21,7 +21,8 @@ wget https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanf
 mv ./stopwords.txt "$DATAPATH"/data/stopwords/stanford_stopwords.txt
 
 # nltk stopwords
-# cp ~/nltk_data/stopwords.txt "$DATAPATH"/data/stopwords/nltk_stopwords.txt
+wget https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
+unzip -p stopwords.zip stopwords/english >"$DATAPATH"/data/corpora/stopwords/nltk_stopwords.txt
 
 # opinion words
 wget http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar
@@ -30,6 +31,7 @@ mv ./positive-words.txt "$DATAPATH"/data/lexicon/liu/positive_words.txt
 mv ./negative-words.txt "$DATAPATH"/data/lexicon/liu/negative_words.txt
 
 rm -r CustomerReviewData.zip opinion-lexicon-English.rar customer\ review\ data
+rm stopwords.zip
 
 echo "Patching corpora"
 cd "$DATAPATH"/data/corpora/opinion/liu