[WIP] auto-tagging #437
Open
OriHoch wants to merge 16 commits into hasadna:master from otadmor:master
Changes from all commits (16 commits, all authored by otadmor):
* f91a4d4 added private proposal api. this will be used for future auto-tagging
* 8c2109d added parsing scripts for pps
* a216d1e added scikit-learn to learn multilabel data for the pps
* fee92d3 auto param cv
* 72cad98 middle changes for multilabel cv
* c452002 cross validation gives 0.99 accuracy. what does it means?
* 29e6ff9 should use prec and recall for scoring?
* 7259f52 better preprocessing, splitting on special characters, instead of rem…
* ddac7b5 removing unneeded data in preprocessing (previous posted pp). added P…
* 7140873 many changes - 1) parse pos tags is working 2) nicer html builder 3) t…
* 83b0f44 added per-class f1 scoring
* 68d3130 commit before merge - added more learners
* b7a7087 merged
* 5bde87c changed f1 score calculation to better infer there is a bad class;cha…
* 5aac86f added tag renaming for merging tags
* 0ca89ba added tag renaming - doesnt seems like its changes very much
@@ -0,0 +1,65 @@
.. _autotagging:

=======
Intro
=======

Auto-tagging creates tags automatically from the explanation text of a private proposal (PP).
It uses machine learning techniques.

======
Flow
======

* Reading - Read the data from the server using the API.
* Parsing - Parse the downloaded HTML files of the PPs.
* Building - Make histograms out of the parsed HTML.
* Training/Testing - Train a classifier on the histograms and evaluate it (see the sketch after this list).

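For orientation, the four steps can be chained into a single run. The driver below is
hypothetical, not a script from this branch; the script names come from the sections that
follow, and the ordering is assumed from the flow above:

.. code-block:: python

    # Hypothetical driver: run the auto-tagging pipeline end to end.
    import subprocess

    # Order assumed from the flow: read, parse, build, then train/test.
    for script in ("read_pp.py", "parse_htmls.py",
                   "make_histograms.py", "tags_autolearn_play.py"):
        subprocess.check_call(["python", script])  # abort on the first failure
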
Reading
=========

Run:

.. code-block:: sh

    python read_pp.py

Parsing
=========

Run:

.. code-block:: sh

    python parse_htmls.py

Building
==========

Run:

.. code-block:: sh

    python make_histograms.py

Training/Testing
==================

Auto-tagging is still in the testing phase, so no production auto-tagging script exists yet.
To execute the test, run:

.. code-block:: sh

    python tags_autolearn_play.py

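The test script is not shown in this diff, but the artifacts consumed by
``build_important_keyword_html.py`` below (a pickled label binarizer and a one-vs-rest
linear SVC saved with joblib) suggest roughly the following shape. This is an
illustrative sketch, not the actual script; the estimator, features and file layout
are assumptions:

.. code-block:: python

    # Sketch: train one linear SVM per tag over the keyword histograms and
    # persist the artifacts that build_important_keyword_html.py loads.
    import os
    import cPickle as pickle
    from sklearn.externals import joblib
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.svm import LinearSVC

    keywords, data = pickle.load(open('histograms_with_tags.pkl', 'rb'))

    lb = MultiLabelBinarizer()
    Y = lb.fit_transform([d['tags'] for d in data])  # one indicator column per tag
    # DictVectorizer sorts feature names by default, matching the sorted
    # keyword array the HTML builder relies on.
    X = DictVectorizer().fit_transform([d['histogram'] for d in data])

    clf = OneVsRestClassifier(Pipeline([('svc', LinearSVC())]))
    clf.fit(X, Y)

    if not os.path.isdir('classifier_data'):
        os.makedirs('classifier_data')
    pickle.dump(lb, open('classifier_data/label_binarizer.pkl', 'wb'))
    joblib.dump(clf, 'classifier_data/linear_svc_classifier.jlb')
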
Building HTML
===============

This step renders HTML pages that highlight the influential keywords, which helps in
finding errors in the tagging process; inspect the output and use your own judgement.
To build the pages, run:

.. code-block:: sh

    python build_important_keyword_html.py

build_important_keyword_html.py (filename inferred from the docs above)
@@ -0,0 +1,190 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os
import re
import codecs
from functools import partial
from collections import defaultdict

import numpy
import numpy as np
import cPickle as pickle
from sklearn.externals import joblib

lb = pickle.load(open("classifier_data/label_binarizer.pkl", "rb"))
keywords, data = pickle.load(open('histograms_with_tags.pkl', 'rb'))
pp_with_tags = pickle.load(open('pp_with_tags.pkl', "rb"))
trained_classifier = joblib.load('classifier_data/linear_svc_classifier.jlb')
keywords = numpy.array(sorted(list(keywords)))

coefs = np.vstack([e.steps[-1][1].coef_ for e in trained_classifier.estimators_])
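# coefs stacks one row per tag: trained_classifier is a one-vs-rest ensemble
# whose estimators are Pipelines ending in a linear model, so coefs[i, j] is
# the weight of keyword j for tag i. That the column order matches the sorted
# keywords array is assumed here, not verified.
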
try:
    os.makedirs("important_keywords/tags")
except OSError:
    pass
try:
    os.makedirs("important_keywords/keywords")
except OSError:
    pass

doc_in_class = defaultdict(list)

print "creating tag dicts" | ||
for a in pp_with_tags: | ||
for tag in a['tags']: | ||
doc_in_class[tag].append(a) | ||
|
||
|
||
|
||
print "creating keyword dict" | ||
keyword_dict = {} | ||
for i,w in enumerate(keywords): | ||
keyword_dict[w] = i | ||
|
||
|
||
print "splitting docs" | ||
pp_with_tags = pickle.load(open('pp_with_tags.pkl', "rb")) | ||
pp_dict = {} | ||
for pp in pp_with_tags: | ||
pp_dict[pp['id']] = pp['text'] | ||
|
||
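# The three helpers below render one word as an HTML <span> tinted by its
# weight for the current tag: the green channel encodes positive weights and
# the red channel negative ones, each normalized by the estimator's extreme
# coefficient. When hyperlinking is requested, sufficiently influential words
# also link to their cross-reference page (format_word links unconditionally).
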
def output_tag(class_file, w, estimator_max, estimator_min, weight, hyperlink=False):
    orig_weight = weight
    pre = post = ''
    if weight > 0 and estimator_max != 0:
        weight /= estimator_max
        if weight > 0.05 and hyperlink:
            pre = u"<a href='../tags/" + w.replace(u'/', u' ') + u".html' title='" + str(orig_weight) + u"'>"
            post = u"</a>"
        print >>class_file, pre + u"<span style='color:#00%02X00;'>" % (int(255 * weight),) + w + u"</span>" + post + " "
    elif weight < 0 and estimator_min != 0:
        weight /= estimator_min
        if weight > 0.05 and hyperlink:
            pre = u"<a href='../tags/" + w.replace(u'/', u' ') + u".html' title='" + str(orig_weight) + u"'>"
            post = u"</a>"
        print >>class_file, pre + u"<span style='color:#%02X0000;'>" % (int(255 * weight),) + w + u"</span>" + post + " "
    else:
        print >>class_file, u"<span style='color:#000000;'>" + w + u"</span> "

def output_word(class_file, w, estimator_max, estimator_min, coefs, keyword_dict, hyperlink=False):
    weight = coefs[keyword_dict[w]]
    pre = post = ''
    if weight > 0 and estimator_max != 0:
        weight /= estimator_max
        if weight > 0.05 and hyperlink:
            pre = u"<a href='../keywords/" + w.replace(u'/', u' ') + u".html' title='" + str(coefs[keyword_dict[w]]) + u"'>"
            post = u"</a>"
        print >>class_file, pre + u"<span style='color:#00%02X00;'>" % (int(255 * weight),) + w + u"</span>" + post + " "
    elif weight < 0 and estimator_min != 0:
        weight /= estimator_min
        if weight > 0.05 and hyperlink:
            pre = u"<a href='../keywords/" + w.replace(u'/', u' ') + u".html' title='" + str(coefs[keyword_dict[w]]) + u"'>"
            post = u"</a>"
        print >>class_file, pre + u"<span style='color:#%02X0000;'>" % (int(255 * weight),) + w + u"</span>" + post + " "
    else:
        print >>class_file, u"<span style='color:#000000;'>" + w + u"</span> "

def format_word(w, estimator_max, estimator_min, coefs, keyword_dict, hyperlink=False):
    orig_weight = weight = coefs[keyword_dict[w]]
    pre = post = ''
    if weight > 0 and estimator_max != 0:
        weight /= estimator_max
        if hyperlink:
            pre = u"<a href='../keywords/" + w.replace(u'/', u' ') + u".html' title='" + str(orig_weight) + u"'>"
            post = u"</a>"
        v = u"<span style='color:#00%02X00;'>" % (int(255 * weight),) + w + u"</span>"
    elif weight < 0 and estimator_min != 0:
        weight /= estimator_min
        if hyperlink:
            pre = u"<a href='../keywords/" + w.replace(u'/', u' ') + u".html' title='" + str(orig_weight) + u"'>"
            post = u"</a>"
        v = u"<span style='color:#%02X0000;'>" % (int(255 * weight),) + w + u"</span>"
    else:
        v = u"<span style='color:#000000;'>" + w + u"</span>"
    return u"%s%s%s" % (pre, v, post,)

ATTR_AMOUNT = 10
important_keywords = keywords[numpy.abs(coefs).argsort()[:, -ATTR_AMOUNT:]]
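# important_keywords[i] holds, for tag i, the ATTR_AMOUNT keywords with the
# largest absolute weights (argsort is ascending, so the last columns are the
# strongest coefficients in either direction).
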
MAX_RE_GROUP = 100
def create_re(estimator_max, estimator_min, coefs, keyword_dict):
    # return ("|".join(p) for p in zip(*(
    #     (
    #         u"(?P<pre_%d>\\b)%s(?P<post_%d>\\b)" % (i, re.escape(k), i, ),
    #         u"\\g<pre_%d>%s\\g<post_%d>" % (i, re.escape(format_word(k, estimator_max, estimator_min, estimator, keyword_dict, hyperlink = True)), i, ),
    #     )
    #     for i, k
    #     in enumerate(keywords[numpy.where(estimator.coef_[0] != 0)])
    # )))

    return {
        k: format_word(k, estimator_max, estimator_min, coefs, keyword_dict, hyperlink=True)
        for k
        in keywords[numpy.where(coefs != 0)]
    }

ALLOWED_PRELETTERS = 3
MIN_WORD_LEN = 2
def replace_match(replacements, match):
    match_str = match.group(0)
    if match_str in replacements:
        return replacements[match_str]

    for i in xrange(1, min(ALLOWED_PRELETTERS + 1, len(match_str) - MIN_WORD_LEN + 1)):
        pre, pre_match_str = match_str[:i], match_str[i:]
        if pre_match_str in replacements:
            return pre + replacements[pre_match_str]

    return u"[%s]" % (match_str,)

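# replace_match resolves a regex match back to a known keyword: the pattern
# built below tolerates up to ALLOWED_PRELETTERS leading \w characters
# (presumably Hebrew prefix letters), which are stripped one at a time until
# the remainder is a known replacement; if the lookup still fails, the match
# is bracketed so it stands out in the generated HTML.
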
print "creating files" | ||
for i, (class_name, important_keyword_for_class, estimator) in enumerate(zip(lb.classes_, important_keywords, trained_classifier.estimators_)): | ||
print "tag", str(i), "/", str(len(lb.classes_)) | ||
class_file = codecs.open("important_keywords/tags/" + class_name.replace("/", ' ') + ".html", "wt", encoding="utf-8") | ||
|
||
print >>class_file, u"<html dir='rtl'><head><title>Keyword Explanation for Class " + class_name + u"</title></head><body><meta http-equiv='Content-Type' content='text/html;charset=UTF-8'>" | ||
print >>class_file, u"<h1>" + class_name + u"</h1><br/>" | ||
print >>class_file, u"<h2>Most influencing words</h2>" | ||
estimator_min, estimator_max = coefs[i,:].min(), coefs[i,:].max() | ||
for keyword in important_keyword_for_class: | ||
output_word(class_file, keyword, estimator_max, estimator_min, coefs[i], keyword_dict) | ||
print >>class_file, u"(", str(coefs[i,keyword_dict[keyword]]), u")<br/>" | ||
|
||
# tag_pattern, tag_replacement_pattern = create_re(estimator_max, estimator_min, estimator, keyword_dict) | ||
# tag_re = re.compile(tag_pattern, re.MULTILINE | re.UNICODE) | ||
replacements = create_re(estimator_max, estimator_min, coefs[i], keyword_dict) | ||
replacements_re = re.compile('|'.join(('|'.join((u'\\b%s%s\\b' % (u'\\w' * i, p,) for p in replacements.iterkeys())) for i in xrange(ALLOWED_PRELETTERS + 1))), re.UNICODE) | ||
print >>class_file, u"<table border='1'>" | ||
for doc in doc_in_class[class_name]: | ||
|
||
print >>class_file, u"<tr><td valign='top'>" + unicode(doc['id']) + u"</td><td valign='top'>" | ||
print >>class_file, replacements_re.sub(partial(replace_match, replacements), pp_dict[doc['id']]) | ||
|
||
#import pdb; pdb.set_trace() | ||
# print >>class_file, reduce(lambda s,(o,r): re.sub(o,r,s, flags=re.UNICODE), replacements, pp_dict[doc['id']]) | ||
# print >>class_file, tag_re.sub(tag_replacement_pattern, pp_dict[doc['id']]) | ||
print >>class_file, u"</td></tr>" | ||
print >>class_file, u"</table></body></html>" | ||
class_file.close() | ||
|
||
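# Second pass: one page per keyword, listing the tags it influences most.
# Keywords are visited in descending order of their total absolute weight
# across all tags.
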
coef_abs = numpy.abs(coefs)
sorted_keywords = coef_abs.argsort(axis=0)
min_weight, max_weight = coefs.min(axis=1), coefs.max(axis=1)
for keyword_count, i in enumerate(coef_abs.sum(axis=0).argsort()[::-1]):
    keyword = keywords[i]
    print "keyword", str(keyword_count), "/", str(len(keywords)), "(", str(i), ")"
    keyword_file = codecs.open("important_keywords/keywords/" + keyword.replace("/", ' ') + ".html", "wt", encoding="utf-8")
    print >>keyword_file, u"<html dir='rtl'><head><title>Tag Explanation for Keyword " + keyword + u"</title></head><body><meta http-equiv='Content-Type' content='text/html;charset=UTF-8'>"
    print >>keyword_file, u"<h1>" + keyword + u"</h1><br/>"
    print >>keyword_file, u"<h2>Most influenced tags</h2>"
    weights = coefs[:, i]
    class_keywords = sorted_keywords[::-1, i]

    for class_name, weight, max_weight_, min_weight_ in zip(lb.classes_[class_keywords], weights[class_keywords], max_weight[class_keywords], min_weight[class_keywords]):
        if weight != 0:
            output_tag(keyword_file, class_name, max_weight_, min_weight_, weight, hyperlink=True)
            print >>keyword_file, u"<br/>"
    print >>keyword_file, u"</body></html>"
    keyword_file.close()
make_histograms.py (filename inferred from the docs above)
@@ -0,0 +1,62 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import cPickle as pickle
from collections import defaultdict

#REMOVE = ['"', "'", '/', '\\', ',', '.', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '=', '_', '+', '\n', '[', ']', '{', '}', '|', '<', '>', '?', '~', ":", ';']
REMOVE = [u'"', u"'", u'/', u'\\', u',', u'.', u'!', u'@', u'#', u'$', u'%', u'^', u'&', u'*', u'(', u')', u'-', u'=', u'_', u'+', u'\n', u'[', u']', u'{', u'}', u'|', u'<', u'>', u'?', u'~', u":", u';', u'–', u'”', u'`']
REMOVE.extend([unicode(i) for i in xrange(10)])
REMOVE.extend([str(i) for i in xrange(10)])

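# REMOVE lists the separator characters (punctuation and digits) that are
# replaced with spaces before the text is split into words, so tokens are
# split on special characters rather than having them stripped in place.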
def make_histogram(t):
    dd = defaultdict(int)
    # for w in t.split():
    #     dd[reduce(lambda o, s: o.replace(s, ' '), REMOVE, w)] += 1
    for w in reduce(lambda o, s: o.replace(s, u' '), REMOVE, t).split():
        dd[w] += 1
    return dict(dd)
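# For example, make_histogram(u'word1, word2 word1!') returns
# {u'word1': 2, u'word2': 1}: separators become spaces, then words are counted.
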
ACCEPTED_POS = ['NN', 'JJ', 'VB', 'NNT', 'BN', 'NNP', 'TTL', ]
def make_pos_histogram(t):
    dd = defaultdict(int)
    for w in [ppp.split('\t')[1] for ppp in t.splitlines() if ppp.count('\t') > 5 and (ppp.split('\t')[3] in ACCEPTED_POS or ppp.split('\t')[4] in ACCEPTED_POS)]:
        dd[w] += 1
    return dict(dd)

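# Assumed input format for make_pos_histogram: one token per line, tab
# separated, CoNLL style as produced by the BGU Hebrew parser linked below;
# field 1 is taken as the word and fields 3/4 as POS tags. This reading of
# the columns is inferred from the code, not documented anywhere.
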
# needed phrases: NN, JJ, VB, NNT, BN
# Use this:
# http://www.cs.bgu.ac.il/~yoavg/depparse/gparse
# or this:
# http://www.cs.bgu.ac.il/~yoavg/constparse/gparse
# or this:
# http://www.cs.bgu.ac.il/~nlpproj/demo/
# from here:
# http://www.cs.bgu.ac.il/~nlpproj/
# seems like:
# http://www.cs.bgu.ac.il/~yoavg/software/hebparsers/hebdepparser/hebdepparser.tgz
# works well enough

pp_with_tags = pickle.load(open('pp_pos_with_tags.pkl', "rb"))

keywords = set()
data = []
for i, pp in enumerate(pp_with_tags):
    pp_pos, pp_tag, pp_id = pp['pos'], pp['tags'], pp['id']

    # print str(i), "/", str(len(pp_with_tags))
    try:
        histogram = make_pos_histogram(pp_pos)
        # histogram = make_histogram(pp_text)
    except Exception:
        import traceback; traceback.print_exc()
        print "at pp", i
        import pdb; pdb.set_trace()
    data.append({'histogram': histogram, 'tags': pp_tag, 'id': pp_id})
    keywords |= set(histogram.keys())  # collect every keyword seen so far

pickle.dump((keywords, data), open('histograms_with_tags.pkl', 'wb'))
Review comment: typo: should be PrivateProposalResource