From c6b95bb7a97aadba1de8a531ac6791529d5af149 Mon Sep 17 00:00:00 2001 From: Jorge Balazs Date: Thu, 23 Mar 2017 09:19:34 +0900 Subject: [PATCH 1/5] Fix several paths and commands in data download script --- create_data_folder.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) mode change 100644 => 100755 create_data_folder.sh diff --git a/create_data_folder.sh b/create_data_folder.sh old mode 100644 new mode 100755 index 49a711e..dce8a07 --- a/create_data_folder.sh +++ b/create_data_folder.sh @@ -10,21 +10,21 @@ mkdir -p "$DATAPATH"/data/corpora/opinion/{semeval-absa-2014,youtube} # Liu Customer Review Dataset wget http://www.cs.uic.edu/~liub/FBS/CustomerReviewData.zip unzip ./CustomerReviewData.zip -mv ./customer_review_data/Apex AD2600 Progressive-scan DVD player.txt "$DATAPATH"/data/corpora/opinion/apex_dvd_player.txt -mv ./customer_review_data/Nikon coolpix 4300.txt "$DATAPATH"/data/corpora/opinion/nikon_camera.txt -mv ./customer_review_data/Canon G3.txt "$DATAPATH"/data/corpora/opinion/canon_camera.txt -mv ./customer_review_data/Nokia 6610.txt "$DATAPATH"/data/corpora/opinion/nokia_cellphone.txt -mv ./customer_review_data/Creative Labs Nomad Jukebox Zen Xtra 40GB.txt "$DATAPATH"/data/corpora/opinion/creative_mp3_player.txt +mv ./customer\ review\ data/Apex\ AD2600\ Progressive-scan\ DVD\ player.txt "$DATAPATH"/data/corpora/opinion/apex_dvd_player.txt +mv ./customer\ review\ data/Nikon\ coolpix\ 4300.txt "$DATAPATH"/data/corpora/opinion/nikon_camera.txt +mv ./customer\ review\ data/Canon\ G3.txt "$DATAPATH"/data/corpora/opinion/canon_camera.txt +mv ./customer\ review\ data/Nokia\ 6610.txt "$DATAPATH"/data/corpora/opinion/nokia_cellphone.txt +mv ./customer\ review\ data/Creative\ Labs\ Nomad\ Jukebox\ Zen\ Xtra\ 40GB.txt "$DATAPATH"/data/corpora/opinion/creative_mp3_player.txt # stanford stopwords wget https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt mv ./stopwords.txt 
"$DATAPATH"/data/stopwords/stanford_stopwords.txt # nltk stopwords -cp ~/nltk_data/stopwords.txt "$DATAPATH"/data/stopwords/nltk_stopwords.txt +# cp ~/nltk_data/stopwords.txt "$DATAPATH"/data/stopwords/nltk_stopwords.txt # opinion words wget http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar -unzip ./opinion-lexicon-English.rar -mv ./opinion-lexicon-English/positive_words.txt "$DATAPATH"/data/lexicon/liu/positive_words.txt -mv ./opinion-lexicon-English/negative_words.txt "$DATAPATH"/data/lexicon/liu/negative_words.txt +unrar e ./opinion-lexicon-English.rar +mv ./positive-words.txt "$DATAPATH"/data/lexicon/liu/positive_words.txt +mv ./negative-words.txt "$DATAPATH"/data/lexicon/liu/negative_words.txt From f527192f5033d307342cfc6e2d168c63f23702e5 Mon Sep 17 00:00:00 2001 From: Jorge Balazs Date: Thu, 23 Mar 2017 09:26:22 +0900 Subject: [PATCH 2/5] Clarify some points in the README --- README.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 75c2b7f..fded3d2 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,16 @@ Code for the paper "Replication issues in syntax-based aspect extraction for opinion mining" - Download and install the software needed - 1. download and install Senna, http://ronan.collobert.com/senna/ - 2. download and install CoreNLP 3.6, http://stanfordnlp.github.io/CoreNLP/history.html - 3. download and install pyFIM http://www.borgelt.net/pyfim.html - 4. `pip install scipy numpy Levenshtein` - + 1. Download and install Senna, http://ronan.collobert.com/senna/ + 2. Download and install CoreNLP 3.6, http://stanfordnlp.github.io/CoreNLP/history.html + 3. Download and install pyFIM http://www.borgelt.net/pyfim.html + 4. Run `pip install scipy numpy python-Levenshtein` + - Download data and create environment - 1. `create_data_folder.sh path_where_to_create_data_folder` - 2. modify ./enlp/settings.py accordingly - + 1. 
Make `create_data_folder.sh` executable: `chmod +x create_data_folder.sh` + 2. Run `./create_data_folder.sh path/to/data/folder` + 3. Modify `./enlp/settings.py` accordingly + - Pre-process datasets 1. `python process_corpus.py` @@ -17,4 +18,4 @@ Code for the paper "Replication issues in syntax-based aspect extraction for opi 1. `python run.py path_to_store_output_json_files` -Check `process_corpus.py --help` and `run.py --help` for more details on how run them. \ No newline at end of file +Check `process_corpus.py --help` and `run.py --help` for more details on how run them. From 6b1426e0900c6827d50989a4777471bc111f41c2 Mon Sep 17 00:00:00 2001 From: Jorge Balazs Date: Thu, 23 Mar 2017 09:51:04 +0900 Subject: [PATCH 3/5] Further fix paths in data download script --- README.md | 2 +- create_data_folder.sh | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index fded3d2..0b198ff 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Code for the paper "Replication issues in syntax-based aspect extraction for opi - Download data and create environment 1. Make `create_data_folder.sh` executable: `chmod +x create_data_folder.sh` - 2. Run `./create_data_folder.sh path/to/data/folder` + 2. Run `./create_data_folder.sh path/to/data/folder` (No trailing slash!) 3. 
Modify `./enlp/settings.py` accordingly - Pre-process datasets diff --git a/create_data_folder.sh b/create_data_folder.sh index dce8a07..36e0b49 100755 --- a/create_data_folder.sh +++ b/create_data_folder.sh @@ -5,16 +5,16 @@ DATAPATH=$1 mkdir -p "$DATAPATH"/data/{pickle,corpora} mkdir -p "$DATAPATH"/data/lexicon/liu mkdir -p "$DATAPATH"/data/stopwords -mkdir -p "$DATAPATH"/data/corpora/opinion/{semeval-absa-2014,youtube} +mkdir -p "$DATAPATH"/data/corpora/opinion/{liu,semeval-absa-2014,youtube} # Liu Customer Review Dataset wget http://www.cs.uic.edu/~liub/FBS/CustomerReviewData.zip unzip ./CustomerReviewData.zip -mv ./customer\ review\ data/Apex\ AD2600\ Progressive-scan\ DVD\ player.txt "$DATAPATH"/data/corpora/opinion/apex_dvd_player.txt -mv ./customer\ review\ data/Nikon\ coolpix\ 4300.txt "$DATAPATH"/data/corpora/opinion/nikon_camera.txt -mv ./customer\ review\ data/Canon\ G3.txt "$DATAPATH"/data/corpora/opinion/canon_camera.txt -mv ./customer\ review\ data/Nokia\ 6610.txt "$DATAPATH"/data/corpora/opinion/nokia_cellphone.txt -mv ./customer\ review\ data/Creative\ Labs\ Nomad\ Jukebox\ Zen\ Xtra\ 40GB.txt "$DATAPATH"/data/corpora/opinion/creative_mp3_player.txt +mv ./customer\ review\ data/Apex\ AD2600\ Progressive-scan\ DVD\ player.txt "$DATAPATH"/data/corpora/opinion/liu/apex_dvd_player.txt +mv ./customer\ review\ data/Nikon\ coolpix\ 4300.txt "$DATAPATH"/data/corpora/opinion/liu/nikon_camera.txt +mv ./customer\ review\ data/Canon\ G3.txt "$DATAPATH"/data/corpora/opinion/liu/canon_camera.txt +mv ./customer\ review\ data/Nokia\ 6610.txt "$DATAPATH"/data/corpora/opinion/liu/nokia_cellphone.txt +mv ./customer\ review\ data/Creative\ Labs\ Nomad\ Jukebox\ Zen\ Xtra\ 40GB.txt "$DATAPATH"/data/corpora/opinion/liu/creative_mp3_player.txt # stanford stopwords wget https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt From 46b332c03f676314c783ae5e93b3826e41ce35b3 Mon Sep 17 00:00:00 2001 From: Jorge Balazs 
Date: Thu, 23 Mar 2017 15:40:17 +0900 Subject: [PATCH 4/5] Add patch to liu corpora in data download script Also improve error handling in LiuCorpus class --- create_data_folder.sh | 12 +++++++++ enlp/corpus/liu.py | 61 ++++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 30 deletions(-) diff --git a/create_data_folder.sh b/create_data_folder.sh index 36e0b49..1f8cbaf 100755 --- a/create_data_folder.sh +++ b/create_data_folder.sh @@ -28,3 +28,15 @@ wget http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar unrar e ./opinion-lexicon-English.rar mv ./positive-words.txt "$DATAPATH"/data/lexicon/liu/positive_words.txt mv ./negative-words.txt "$DATAPATH"/data/lexicon/liu/negative_words.txt + +rm -r CustomerReviewData.zip opinion-lexicon-English.rar customer\ review\ data + +echo "Patching corpora" +cd "$DATAPATH"/data/corpora/opinion/liu + +sed -i '155s/feature/feature[+2]/' nokia_cellphone.txt +sed -i '480s/look{+1]/look[+1]/' nokia_cellphone.txt +sed -i '79s/^feature\[+2\]\,\s/feature[+2]/' canon_camera.txt +sed -i '157s/look##/look[+2]##/' creative_mp3_player.txt +sed -i '334s/setup\[2\]/setup[+2]/' creative_mp3_player.txt +sed -i '485s/#/##/' apex_dvd_player.txt diff --git a/enlp/corpus/liu.py b/enlp/corpus/liu.py index d06452d..b6cea36 100644 --- a/enlp/corpus/liu.py +++ b/enlp/corpus/liu.py @@ -15,6 +15,13 @@ # --- LIU CORPUS CLASSES ------------------------------------------------------ +class LiuCorpusError(Exception): + def __init__(self, line, filepath): + + msg = ('Inconsistency found on line {}, on file ' + '{}'.format(line, filepath)) + super(LiuCorpusError, self).__init__(msg) + class LiuCorpus(Corpus): """ Class to read Liu corpus. Sentences in the corpus are extracted as @@ -33,10 +40,8 @@ def __init__(self): the source file """ # self.name = filepath.split('/')[-1].replace('.txt', '') - if self._check(): - self._read() - else: - raise CorpusError("Corpus was not properly built. 
Check for consistency") + self._check() + self._read() def __repr__(self): return "".format(self.name) @@ -58,30 +63,25 @@ def _check(self): mfile = open(self.filepath, "r") for i in mfile.readlines(): linea = i.replace('\n', '') - if (self._is_liu_comment(linea) is False): - if (self._is_new_comment(linea) is False): - partes = linea.split('##') - if (len(partes) < 2): - print counter - return False - tags = partes[0] - real_sentence = partes[1].replace("\r", "").replace("\n", "").strip() - if (real_sentence == ""): - print counter - return False - if (tags != ""): - aspects_list = tags.strip().split(",") - for aspect_item in aspects_list: - aspect = self._extract_aspect(aspect_item) - if (aspect == ""): - print counter - return False - orientation = self._extract_orientation(aspect_item) - if (orientation is None): - print counter - return False + if (not self._is_liu_comment(linea) + and not self._is_new_comment(linea)): + partes = linea.split('##') + if (len(partes) < 2): + raise LiuCorpusError(counter, self.filepath) + tags = partes[0] + real_sentence = partes[1].replace("\r", "").replace("\n", "").strip() + if (real_sentence == ""): + raise LiuCorpusError(counter, self.filepath) + if (tags != ""): + aspects_list = tags.strip().split(",") + for aspect_item in aspects_list: + aspect = self._extract_aspect(aspect_item) + if (aspect == ""): + raise LiuCorpusError(counter, self.filepath) + orientation = self._extract_orientation(aspect_item) + if (orientation is None): + raise LiuCorpusError(counter, self.filepath) counter += 1 - return True def _read(self): self._comment_counter = 0 @@ -91,7 +91,8 @@ def _read(self): self._sentences = OrderedDict() for line in open(self.filepath, "r").readlines(): - self.parse_line(line) + if not self._is_liu_comment(line): + self.parse_line(line) def parse_line(self, string): if not self._is_new_comment(string): @@ -107,7 +108,7 @@ def parse_line(self, string): sentence = Sentence(string=string, id=id, document=review) 
sentence.aspects = [] self._sentences[id] = sentence - review.append(sentence) + review.append(sentence) if aspects_string: for aspect_string in aspects_string.strip().split(", "): term = self._extract_aspect(aspect_string) @@ -123,7 +124,7 @@ def parse_line(self, string): self._comment_counter += 1 def _is_liu_comment(self, string): - if (string == ("\n") or string == ("\r")): + if (string == ("\n") or string == ("\r") or string == ("\r\n")): return True else: pos = re.search("\A\*+", string) From 807eb5f102d0093fc640d63eb9c967c948b646b1 Mon Sep 17 00:00:00 2001 From: Jorge Balazs Date: Thu, 23 Mar 2017 17:06:14 +0900 Subject: [PATCH 5/5] Automate nltk_stopwords file download and improve README --- README.md | 6 ++++-- create_data_folder.sh | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0b198ff..326e88c 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ -Code for the paper "Replication issues in syntax-based aspect extraction for opinion mining" +# Code for the paper "Replication issues in syntax-based aspect extraction for opinion mining" - Download and install the software needed 1. Download and install Senna, http://ronan.collobert.com/senna/ 2. Download and install CoreNLP 3.6, http://stanfordnlp.github.io/CoreNLP/history.html + - Needs Java 8 3. Download and install pyFIM http://www.borgelt.net/pyfim.html + - Download `fim.so` and place it in python's `dist-packages` directory 4. Run `pip install scipy numpy python-Levenshtein` - Download data and create environment @@ -15,7 +17,7 @@ Code for the paper "Replication issues in syntax-based aspect extraction for opi 1. `python process_corpus.py` - Run (will take several hours depending on the number of cores available) - 1. `python run.py path_to_store_output_json_files` + 1. `python run.py path/to/store/output/json/files` Check `process_corpus.py --help` and `run.py --help` for more details on how run them. 
diff --git a/create_data_folder.sh b/create_data_folder.sh
index 1f8cbaf..0b868a3 100755
--- a/create_data_folder.sh
+++ b/create_data_folder.sh
@@ -4,7 +4,7 @@ DATAPATH=$1
 
 mkdir -p "$DATAPATH"/data/{pickle,corpora}
 mkdir -p "$DATAPATH"/data/lexicon/liu
-mkdir -p "$DATAPATH"/data/stopwords
+mkdir -p "$DATAPATH"/data/{stopwords,corpora/stopwords}
 mkdir -p "$DATAPATH"/data/corpora/opinion/{liu,semeval-absa-2014,youtube}
 
 # Liu Customer Review Dataset
@@ -21,7 +21,8 @@ wget https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanf
 mv ./stopwords.txt "$DATAPATH"/data/stopwords/stanford_stopwords.txt
 
 # nltk stopwords
-# cp ~/nltk_data/stopwords.txt "$DATAPATH"/data/stopwords/nltk_stopwords.txt
+wget https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
+unzip -p stopwords.zip stopwords/english >"$DATAPATH"/data/corpora/stopwords/nltk_stopwords.txt
 
 # opinion words
 wget http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar
@@ -30,6 +31,7 @@ mv ./positive-words.txt "$DATAPATH"/data/lexicon/liu/positive_words.txt
 mv ./negative-words.txt "$DATAPATH"/data/lexicon/liu/negative_words.txt
 
 rm -r CustomerReviewData.zip opinion-lexicon-English.rar customer\ review\ data
+rm stopwords.zip
 
 echo "Patching corpora"
 cd "$DATAPATH"/data/corpora/opinion/liu