Skip to content

Commit

Permalink
Merge pull request #1 from jabalazs/fixes
Browse files Browse the repository at this point in the history
Several minor fixes to the readme and download scripts. Many thanks to jabalazs!
  • Loading branch information
epochx authored Mar 23, 2017
2 parents 30de846 + 807eb5f commit 622693f
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 52 deletions.
25 changes: 14 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
Code for the paper "Replication issues in syntax-based aspect extraction for opinion mining"
# Code for the paper "Replication issues in syntax-based aspect extraction for opinion mining"

- Download and install the software needed
1. download and install Senna, http://ronan.collobert.com/senna/
2. download and install CoreNLP 3.6, http://stanfordnlp.github.io/CoreNLP/history.html
3. download and install pyFIM http://www.borgelt.net/pyfim.html
4. `pip install scipy numpy Levenshtein`

1. Download and install Senna, http://ronan.collobert.com/senna/
2. Download and install CoreNLP 3.6, http://stanfordnlp.github.io/CoreNLP/history.html
- Needs Java 8
3. Download and install pyFIM http://www.borgelt.net/pyfim.html
- Download `fim.so` and place it in python's `dist-packages` directory
4. Run `pip install scipy numpy python-Levenshtein`

- Download data and create environment
1. `create_data_folder.sh path_where_to_create_data_folder`
2. modify ./enlp/settings.py accordingly

1. Make `create_data_folder.sh` executable: `chmod +x create_data_folder.sh`
2. Run `./create_data_folder.sh path/to/data/folder` (No trailing slash!)
3. Modify `./enlp/settings.py` accordingly

- Pre-process datasets
1. `python process_corpus.py`

- Run (will take several hours depending on the number of cores available)
1. `python run.py path_to_store_output_json_files`
1. `python run.py path/to/store/output/json/files`


Check `process_corpus.py --help` and `run.py --help` for more details on how to run them.
Check `process_corpus.py --help` and `run.py --help` for more details on how to run them.
36 changes: 25 additions & 11 deletions create_data_folder.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,41 @@ DATAPATH=$1

mkdir -p "$DATAPATH"/data/{pickle,corpora}
mkdir -p "$DATAPATH"/data/lexicon/liu
mkdir -p "$DATAPATH"/data/stopwords
mkdir -p "$DATAPATH"/data/corpora/opinion/{semeval-absa-2014,youtube}
mkdir -p "$DATAPATH"/data/corpora/stopwords
mkdir -p "$DATAPATH"/data/corpora/opinion/{liu,semeval-absa-2014,youtube}

# Liu Customer Review Dataset
wget http://www.cs.uic.edu/~liub/FBS/CustomerReviewData.zip
unzip ./CustomerReviewData.zip
mv ./customer_review_data/Apex AD2600 Progressive-scan DVD player.txt "$DATAPATH"/data/corpora/opinion/apex_dvd_player.txt
mv ./customer_review_data/Nikon coolpix 4300.txt "$DATAPATH"/data/corpora/opinion/nikon_camera.txt
mv ./customer_review_data/Canon G3.txt "$DATAPATH"/data/corpora/opinion/canon_camera.txt
mv ./customer_review_data/Nokia 6610.txt "$DATAPATH"/data/corpora/opinion/nokia_cellphone.txt
mv ./customer_review_data/Creative Labs Nomad Jukebox Zen Xtra 40GB.txt "$DATAPATH"/data/corpora/opinion/creative_mp3_player.txt
mv ./customer\ review\ data/Apex\ AD2600\ Progressive-scan\ DVD\ player.txt "$DATAPATH"/data/corpora/opinion/liu/apex_dvd_player.txt
mv ./customer\ review\ data/Nikon\ coolpix\ 4300.txt "$DATAPATH"/data/corpora/opinion/liu/nikon_camera.txt
mv ./customer\ review\ data/Canon\ G3.txt "$DATAPATH"/data/corpora/opinion/liu/canon_camera.txt
mv ./customer\ review\ data/Nokia\ 6610.txt "$DATAPATH"/data/corpora/opinion/liu/nokia_cellphone.txt
mv ./customer\ review\ data/Creative\ Labs\ Nomad\ Jukebox\ Zen\ Xtra\ 40GB.txt "$DATAPATH"/data/corpora/opinion/liu/creative_mp3_player.txt

# stanford stopwords
wget https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt
mv ./stopwords.txt "$DATAPATH"/data/stopwords/stanford_stopwords.txt

# nltk stopwords
cp ~/nltk_data/stopwords.txt "$DATAPATH"/data/stopwords/nltk_stopwords.txt
wget https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
unzip -p stopwords.zip stopwords/english >./data/corpora/stopwords/nltk_stopwords.txt

# opinion words
wget http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar
unzip ./opinion-lexicon-English.rar
mv ./opinion-lexicon-English/positive_words.txt "$DATAPATH"/data/lexicon/liu/positive_words.txt
mv ./opinion-lexicon-English/negative_words.txt "$DATAPATH"/data/lexicon/liu/negative_words.txt
unrar e ./opinion-lexicon-English.rar
mv ./positive-words.txt "$DATAPATH"/data/lexicon/liu/positive_words.txt
mv ./negative-words.txt "$DATAPATH"/data/lexicon/liu/negative_words.txt

rm -r CustomerReviewData.zip opinion-lexicon-English.rar customer\ review\ data
rm stopwords.zip

echo "Patching corpora"
cd "$DATAPATH"/data/corpora/opinion/liu

sed -i '155s/feature/feature[+2]/' nokia_cellphone.txt
sed -i '480s/look{+1]/look[+1]/' nokia_cellphone.txt
sed -i '79s/^feature\[+2\]\,\s/feature[+2]/' canon_camera.txt
sed -i '157s/look##/look[+2]##/' creative_mp3_player.txt
sed -i '334s/setup\[2\]/setup[+2]/' creative_mp3_player.txt
sed -i '485s/#/##/' apex_dvd_player.txt
61 changes: 31 additions & 30 deletions enlp/corpus/liu.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@
# --- LIU CORPUS CLASSES ------------------------------------------------------


class LiuCorpusError(Exception):
    """Raised when a Liu corpus file contains a malformed line.

    Carries a message identifying the 0-based/1-based line counter value
    and the path of the file in which the inconsistency was detected.
    """

    def __init__(self, line, filepath):
        # Build the human-readable message before delegating to Exception,
        # so str(err) and err.args[0] both expose the full description.
        msg = 'Inconsistency found on line {}, on file {}'.format(
            line, filepath)
        super(LiuCorpusError, self).__init__(msg)

class LiuCorpus(Corpus):
"""
Class to read Liu corpus. Sentences in the corpus are extracted as
Expand All @@ -33,10 +40,8 @@ def __init__(self):
the source file
"""
# self.name = filepath.split('/')[-1].replace('.txt', '')
if self._check():
self._read()
else:
raise CorpusError("Corpus was not properly built. Check for consistency")
self._check()
self._read()

def __repr__(self):
return "<LiuCorpus {0}>".format(self.name)
Expand All @@ -58,30 +63,25 @@ def _check(self):
mfile = open(self.filepath, "r")
for i in mfile.readlines():
linea = i.replace('\n', '')
if (self._is_liu_comment(linea) is False):
if (self._is_new_comment(linea) is False):
partes = linea.split('##')
if (len(partes) < 2):
print counter
return False
tags = partes[0]
real_sentence = partes[1].replace("\r", "").replace("\n", "").strip()
if (real_sentence == ""):
print counter
return False
if (tags != ""):
aspects_list = tags.strip().split(",")
for aspect_item in aspects_list:
aspect = self._extract_aspect(aspect_item)
if (aspect == ""):
print counter
return False
orientation = self._extract_orientation(aspect_item)
if (orientation is None):
print counter
return False
if (not self._is_liu_comment(linea)
and not self._is_new_comment(linea)):
partes = linea.split('##')
if (len(partes) < 2):
raise LiuCorpusError(counter, self.filepath)
tags = partes[0]
real_sentence = partes[1].replace("\r", "").replace("\n", "").strip()
if (real_sentence == ""):
raise LiuCorpusError(counter, self.filepath)
if (tags != ""):
aspects_list = tags.strip().split(",")
for aspect_item in aspects_list:
aspect = self._extract_aspect(aspect_item)
if (aspect == ""):
raise LiuCorpusError(counter, self.filepath)
orientation = self._extract_orientation(aspect_item)
if (orientation is None):
raise LiuCorpusError(counter, self.filepath)
counter += 1
return True

def _read(self):
self._comment_counter = 0
Expand All @@ -91,7 +91,8 @@ def _read(self):
self._sentences = OrderedDict()

for line in open(self.filepath, "r").readlines():
self.parse_line(line)
if not self._is_liu_comment(line):
self.parse_line(line)

def parse_line(self, string):
if not self._is_new_comment(string):
Expand All @@ -107,7 +108,7 @@ def parse_line(self, string):
sentence = Sentence(string=string, id=id, document=review)
sentence.aspects = []
self._sentences[id] = sentence
review.append(sentence)
review.append(sentence)
if aspects_string:
for aspect_string in aspects_string.strip().split(", "):
term = self._extract_aspect(aspect_string)
Expand All @@ -123,7 +124,7 @@ def parse_line(self, string):
self._comment_counter += 1

def _is_liu_comment(self, string):
if (string == ("\n") or string == ("\r")):
if (string == ("\n") or string == ("\r") or string == ("\r\n")):
return True
else:
pos = re.search("\A\*+", string)
Expand Down

0 comments on commit 622693f

Please sign in to comment.