Skip to content

Commit

Permalink
Merge pull request #1 from jabalazs/fixes
Browse files Browse the repository at this point in the history
Several minor fixes to the readme and download scripts. Many thanks to jabalazs!
  • Loading branch information
epochx authored Mar 23, 2017
2 parents 30de846 + 807eb5f commit 622693f
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 52 deletions.
25 changes: 14 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
Code for the paper "Replication issues in syntax-based aspect extraction for opinion mining"
# Code for the paper "Replication issues in syntax-based aspect extraction for opinion mining"

- Download and install the software needed
1. download and install Senna, http://ronan.collobert.com/senna/
2. download and install CoreNLP 3.6, http://stanfordnlp.github.io/CoreNLP/history.html
3. download and install pyFIM http://www.borgelt.net/pyfim.html
4. `pip install scipy numpy Levenshtein`

1. Download and install Senna, http://ronan.collobert.com/senna/
2. Download and install CoreNLP 3.6, http://stanfordnlp.github.io/CoreNLP/history.html
- Needs Java 8
3. Download and install pyFIM http://www.borgelt.net/pyfim.html
- Download `fim.so` and place it in python's `dist-packages` directory
4. Run `pip install scipy numpy python-Levenshtein`

- Download data and create environment
1. `create_data_folder.sh path_where_to_create_data_folder`
2. modify ./enlp/settings.py accordingly

1. Make `create_data_folder.sh` executable: `chmod +x create_data_folder.sh`
2. Run `./create_data_folder.sh path/to/data/folder` (No trailing slash!)
3. Modify `./enlp/settings.py` accordingly

- Pre-process datasets
1. `python process_corpus.py`

- Run (will take several hours depending on the number of cores available)
1. `python run.py path_to_store_output_json_files`
1. `python run.py path/to/store/output/json/files`


Check `process_corpus.py --help` and `run.py --help` for more details on how to run them.
Check `process_corpus.py --help` and `run.py --help` for more details on how to run them.
36 changes: 25 additions & 11 deletions create_data_folder.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,41 @@ DATAPATH=$1

mkdir -p "$DATAPATH"/data/{pickle,corpora}
mkdir -p "$DATAPATH"/data/lexicon/liu
mkdir -p "$DATAPATH"/data/stopwords
mkdir -p "$DATAPATH"/data/corpora/opinion/{semeval-absa-2014,youtube}
mkdir -p "$DATAPATH"/data/corpora/stopwords
mkdir -p "$DATAPATH"/data/corpora/opinion/{liu,semeval-absa-2014,youtube}

# Liu Customer Review Dataset
wget http://www.cs.uic.edu/~liub/FBS/CustomerReviewData.zip
unzip ./CustomerReviewData.zip
mv ./customer_review_data/Apex AD2600 Progressive-scan DVD player.txt "$DATAPATH"/data/corpora/opinion/apex_dvd_player.txt
mv ./customer_review_data/Nikon coolpix 4300.txt "$DATAPATH"/data/corpora/opinion/nikon_camera.txt
mv ./customer_review_data/Canon G3.txt "$DATAPATH"/data/corpora/opinion/canon_camera.txt
mv ./customer_review_data/Nokia 6610.txt "$DATAPATH"/data/corpora/opinion/nokia_cellphone.txt
mv ./customer_review_data/Creative Labs Nomad Jukebox Zen Xtra 40GB.txt "$DATAPATH"/data/corpora/opinion/creative_mp3_player.txt
mv ./customer\ review\ data/Apex\ AD2600\ Progressive-scan\ DVD\ player.txt "$DATAPATH"/data/corpora/opinion/liu/apex_dvd_player.txt
mv ./customer\ review\ data/Nikon\ coolpix\ 4300.txt "$DATAPATH"/data/corpora/opinion/liu/nikon_camera.txt
mv ./customer\ review\ data/Canon\ G3.txt "$DATAPATH"/data/corpora/opinion/liu/canon_camera.txt
mv ./customer\ review\ data/Nokia\ 6610.txt "$DATAPATH"/data/corpora/opinion/liu/nokia_cellphone.txt
mv ./customer\ review\ data/Creative\ Labs\ Nomad\ Jukebox\ Zen\ Xtra\ 40GB.txt "$DATAPATH"/data/corpora/opinion/liu/creative_mp3_player.txt

# stanford stopwords
wget https://raw.githubusercontent.com/stanfordnlp/CoreNLP/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt
mv ./stopwords.txt "$DATAPATH"/data/stopwords/stanford_stopwords.txt

# nltk stopwords
cp ~/nltk_data/stopwords.txt "$DATAPATH"/data/stopwords/nltk_stopwords.txt
wget https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip
unzip -p stopwords.zip stopwords/english >./data/corpora/stopwords/nltk_stopwords.txt

# opinion words
wget http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar
unzip ./opinion-lexicon-English.rar
mv ./opinion-lexicon-English/positive_words.txt "$DATAPATH"/data/lexicon/liu/positive_words.txt
mv ./opinion-lexicon-English/negative_words.txt "$DATAPATH"/data/lexicon/liu/negative_words.txt
unrar e ./opinion-lexicon-English.rar
mv ./positive-words.txt "$DATAPATH"/data/lexicon/liu/positive_words.txt
mv ./negative-words.txt "$DATAPATH"/data/lexicon/liu/negative_words.txt

rm -r CustomerReviewData.zip opinion-lexicon-English.rar customer\ review\ data
rm stopwords.zip

echo "Patching corpora"
cd "$DATAPATH"/data/corpora/opinion/liu

sed -i '155s/feature/feature[+2]/' nokia_cellphone.txt
sed -i '480s/look{+1]/look[+1]/' nokia_cellphone.txt
sed -i '79s/^feature\[+2\]\,\s/feature[+2]/' canon_camera.txt
sed -i '157s/look##/look[+2]##/' creative_mp3_player.txt
sed -i '334s/setup\[2\]/setup[+2]/' creative_mp3_player.txt
sed -i '485s/#/##/' apex_dvd_player.txt
61 changes: 31 additions & 30 deletions enlp/corpus/liu.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@
# --- LIU CORPUS CLASSES ------------------------------------------------------


class LiuCorpusError(Exception):
    """Raised when a Liu corpus file contains a malformed line.

    Carries a message identifying the 0-based/1-based line counter value
    and the path of the file in which the inconsistency was detected.
    """

    def __init__(self, line, filepath):
        # Build the human-readable message before delegating to Exception,
        # so str(err) and err.args[0] both expose the full description.
        msg = 'Inconsistency found on line {}, on file {}'.format(
            line, filepath)
        super(LiuCorpusError, self).__init__(msg)

class LiuCorpus(Corpus):
"""
Class to read Liu corpus. Sentences in the corpus are extracted as
Expand All @@ -33,10 +40,8 @@ def __init__(self):
the source file
"""
# self.name = filepath.split('/')[-1].replace('.txt', '')
if self._check():
self._read()
else:
raise CorpusError("Corpus was not properly built. Check for consistency")
self._check()
self._read()

def __repr__(self):
return "<LiuCorpus {0}>".format(self.name)
Expand All @@ -58,30 +63,25 @@ def _check(self):
mfile = open(self.filepath, "r")
for i in mfile.readlines():
linea = i.replace('\n', '')
if (self._is_liu_comment(linea) is False):
if (self._is_new_comment(linea) is False):
partes = linea.split('##')
if (len(partes) < 2):
print counter
return False
tags = partes[0]
real_sentence = partes[1].replace("\r", "").replace("\n", "").strip()
if (real_sentence == ""):
print counter
return False
if (tags != ""):
aspects_list = tags.strip().split(",")
for aspect_item in aspects_list:
aspect = self._extract_aspect(aspect_item)
if (aspect == ""):
print counter
return False
orientation = self._extract_orientation(aspect_item)
if (orientation is None):
print counter
return False
if (not self._is_liu_comment(linea)
and not self._is_new_comment(linea)):
partes = linea.split('##')
if (len(partes) < 2):
raise LiuCorpusError(counter, self.filepath)
tags = partes[0]
real_sentence = partes[1].replace("\r", "").replace("\n", "").strip()
if (real_sentence == ""):
raise LiuCorpusError(counter, self.filepath)
if (tags != ""):
aspects_list = tags.strip().split(",")
for aspect_item in aspects_list:
aspect = self._extract_aspect(aspect_item)
if (aspect == ""):
raise LiuCorpusError(counter, self.filepath)
orientation = self._extract_orientation(aspect_item)
if (orientation is None):
raise LiuCorpusError(counter, self.filepath)
counter += 1
return True

def _read(self):
self._comment_counter = 0
Expand All @@ -91,7 +91,8 @@ def _read(self):
self._sentences = OrderedDict()

for line in open(self.filepath, "r").readlines():
self.parse_line(line)
if not self._is_liu_comment(line):
self.parse_line(line)

def parse_line(self, string):
if not self._is_new_comment(string):
Expand All @@ -107,7 +108,7 @@ def parse_line(self, string):
sentence = Sentence(string=string, id=id, document=review)
sentence.aspects = []
self._sentences[id] = sentence
review.append(sentence)
review.append(sentence)
if aspects_string:
for aspect_string in aspects_string.strip().split(", "):
term = self._extract_aspect(aspect_string)
Expand All @@ -123,7 +124,7 @@ def parse_line(self, string):
self._comment_counter += 1

def _is_liu_comment(self, string):
if (string == ("\n") or string == ("\r")):
if (string == ("\n") or string == ("\r") or string == ("\r\n")):
return True
else:
pos = re.search("\A\*+", string)
Expand Down

0 comments on commit 622693f

Please sign in to comment.