-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,10 +21,24 @@ | |
# Nishchith Shetty <[email protected]> | ||
# | ||
|
||
import datetime | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
inishchith
Author
Owner
|
||
import logging | ||
from pprint import pprint | ||
|
||
import dateutil.tz | ||
from dateutil.relativedelta import relativedelta | ||
|
||
from elasticsearch import Elasticsearch as ES, RequestsHttpConnection | ||
|
||
from .enrich import (Enrich, | ||
metadata) | ||
from .graal_utils import (get_unique_repository, | ||
get_last_study_date, | ||
get_first_enriched_date, | ||
get_files_at_time) | ||
|
||
from .enrich import Enrich, metadata | ||
from grimoirelab_toolkit.datetime import str_to_datetime | ||
from grimoire_elk.elastic import ElasticSearch | ||
|
||
MAX_SIZE_BULK_ENRICHED_ITEMS = 200 | ||
|
||
|
@@ -33,6 +47,14 @@ | |
|
||
class CocomEnrich(Enrich): | ||
|
||
def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None,
             db_user='', db_password='', db_host=''):
    """Initialize the CoCom enricher and register its studies.

    All parameters are forwarded untouched to the base `Enrich` class.
    """
    super().__init__(db_sortinghat, db_projects_map, json_projects_map,
                     db_user, db_password, db_host)

    # Studies offered by this enricher: the per-repository
    # code-complexity evolution analysis.
    self.studies = [self.enrich_repo_analysis]
|
||
def get_identities(self, item): | ||
""" Return the identities from an item """ | ||
identities = [] | ||
|
@@ -98,7 +120,6 @@ def get_rich_items(self, item): | |
eitem['commit_sha'] = entry['commit'] | ||
eitem['author'] = entry['Author'] | ||
eitem['committer'] = entry['Commit'] | ||
eitem['commit'] = entry['commit'] | ||
eitem['message'] = entry['message'] | ||
eitem['author_date'] = self.__fix_field_date(entry['AuthorDate']) | ||
eitem['commit_date'] = self.__fix_field_date(entry['CommitDate']) | ||
|
@@ -158,6 +179,107 @@ def enrich_items(self, ocean_backend, events=False): | |
|
||
return num_items | ||
|
||
def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
                         out_index="cocom_enrich_graal_repo", interval_months=3,
                         date_field="grimoire_creation_date"):
    """Study: compute per-repository code-complexity evolution over time.

    For every unique repository origin found in the enriched index, sample
    the latest known state of each file at points in time spaced
    `interval_months` apart, sum the file-level metrics (ccn, num_funs,
    tokens, loc, comments, blanks) and bulk-upload one "evolution" document
    per sample point into `out_index`.

    :param ocean_backend: raw items backend (unused; kept for the study API)
    :param enrich_backend: backend whose enriched index is analyzed
    :param no_incremental: unused; kept for the study API
    :param out_index: index where the study results are written
    :param interval_months: sampling interval, in months
    :param date_field: unused; kept for the study API
    """
    logger.info("Doing enrich_repository_analysis study for index {}"
                .format(self.elastic.anonymize_url(self.elastic.index_url)))

    es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
               verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)

    unique_repos = es_in.search(
        index=enrich_backend.elastic.index,
        body=get_unique_repository)

    repos = [repo['key'] for repo in unique_repos['aggregations']['unique_repos']['buckets']]
    metrics = ["ccn", "num_funs", "tokens", "loc", "comments", "blanks"]
    num_items = 0
    ins_items = 0

    for repo in repos:
        to_date = None

        # Incremental behaviour: resume from the last study date already
        # stored in the output index for this repository, if any.
        if es_in.indices.exists(index=out_index):
            last_study_date = es_in.search(
                index=out_index,
                body=get_last_study_date % repo)["aggregations"]["1"]

            if last_study_date["value"] is not None:
                to_date = str_to_datetime(last_study_date["value_as_string"])

        # Otherwise start from the first enriched item of the repository.
        if to_date is None:
            hits = es_in.search(
                index=enrich_backend.elastic.index,
                body=get_first_enriched_date % repo)["aggregations"]["1"]["hits"]["hits"]
            if not hits:
                # No enriched data for this repository; nothing to study.
                continue
            to_date = str_to_datetime(hits[0]["_source"]["metadata__updated_on"])

        es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
        evolution_items = []

        # TODO: Incorporate full month logic
        to_date = to_date.replace(day=1)

        while to_date <= datetime.datetime.now(dateutil.tz.tzutc()):
            files_at_time = es_in.search(
                index=enrich_backend.elastic.index,
                body=get_files_at_time % (repo, to_date.isoformat()))

            buckets = files_at_time['aggregations']['file_stats']['buckets']
            if not buckets:
                to_date += relativedelta(months=+interval_months)
                continue

            evolution_item = {
                "id": to_date.strftime('%Y-%m-%dT%H:%M:%S') + "_study_cocom",
                "origin": repo,
                "to_date": to_date.isoformat()
            }
            # Pre-seed every total with 0 so that metrics that are None
            # for all files cannot cause a KeyError when computing the
            # ratios below.
            for metric in metrics:
                evolution_item["total_" + metric] = 0

            for file_ in buckets:
                # Latest known enriched document for this file path.
                file_details = file_["1"]["hits"]["hits"][0]["_source"]
                for metric in metrics:
                    if file_details.get(metric) is not None:
                        evolution_item["total_" + metric] += file_details[metric]

            if evolution_item["total_loc"]:
                total_lines = (evolution_item["total_comments"]
                               + evolution_item["total_blanks"]
                               + evolution_item["total_loc"])
                evolution_item["total_comments_ratio"] = evolution_item["total_comments"] / total_lines
                evolution_item["total_blanks_ratio"] = evolution_item["total_blanks"] / total_lines
            else:
                # Without any loc, the ratio is left as the raw count
                # (always 0 here) to avoid a division by zero.
                evolution_item["total_comments_ratio"] = evolution_item["total_comments"]
                evolution_item["total_blanks_ratio"] = evolution_item["total_blanks"]

            evolution_items.append(evolution_item)

            if len(evolution_items) >= self.elastic.max_items_bulk:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
                evolution_items = []

            to_date += relativedelta(months=+interval_months)

        # Flush whatever is left for this repository.
        if evolution_items:
            num_items += len(evolution_items)
            ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())

    # num_items/ins_items are cumulative across repositories, so the
    # bookkeeping check is reported once, after all repos are processed.
    if num_items != ins_items:
        missing = num_items - ins_items
        logger.error("%s/%s missing items for Study Enricher", str(missing), str(num_items))
    else:
        logger.info("%s items inserted for Study Enricher", str(num_items))
|
||
|
||
def __fix_field_date(self, date_value): | ||
"""Fix possible errors in the field date""" | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2015-2019 Bitergia | ||
# | ||
# This program is free software; you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation; either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with this program; if not, write to the Free Software | ||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
# | ||
# Authors: | ||
# Valerio Cosentino <[email protected]> | ||
# Nishchith Shetty <[email protected]> | ||
# | ||
|
||
# CONTAINS QUERIES | ||
|
||
# ES query: bucket all documents by their "origin" field to obtain the
# set of unique repositories present in the enriched index.
get_unique_repository = """
{
    "size": 0,
    "aggs": {
        "unique_repos": {
            "terms": {
                "field": "origin"
            }
        }
    }
}
"""
|
||
# ES query template (one %s: repository origin): max "to_date" among the
# study documents of a given repository, i.e. the date the study last
# reached for that repo.
# NOTE(review): this query filters on "origin.keyword" while the other
# queries in this module use "origin" — presumably the study index maps
# origin as text with a keyword sub-field; confirm the mappings agree.
get_last_study_date = """
{
    "size": 0,
    "aggs": {
        "1": {
            "max": {
                "field": "to_date"
            }
        }
    },
    "query": {
        "bool": {
            "filter": [{
                "term": {
                    "origin.keyword": "%s"
                }
            }]
        }
    }
}
"""
|
||
# ES query template (one %s: repository origin): fetch the single oldest
# enriched item of a repository (sorted ascending by commit_date) and
# return its "metadata__updated_on" value — used as the study start date.
get_first_enriched_date = """
{
    "size": 0,
    "aggs": {
        "1": {
            "top_hits": {
                "docvalue_fields": [
                    "metadata__updated_on"
                ],
                "_source": "metadata__updated_on",
                "size": 1,
                "sort": [{
                    "commit_date": {
                        "order": "asc"
                    }
                }]
            }
        }
    },
    "query": {
        "bool": {
            "filter": [{
                "term": {
                    "origin": "%s"
                }
            }]
        }
    }
}
"""
|
||
# ES query template (two %s: repository origin, ISO date): for every file
# path in the repository, return the latest enriched document whose
# "metadata__updated_on" is on or before the given date — i.e. a snapshot
# of the repository's files as known at that point in time.
# NOTE(review): "size": 2147483647 asks for every file-path bucket in one
# response; fine for small repos, but may be heavy on large ones.
get_files_at_time = """
{
    "size": 0,
    "aggs": {
        "file_stats": {
            "terms": {
                "field": "file_path",
                "size": 2147483647,
                "order": {
                    "_key": "desc"
                }
            },
            "aggs": {
                "1": {
                    "top_hits": {
                        "size": 1,
                        "sort": [{
                            "metadata__updated_on": {
                                "order": "desc"
                            }
                        }]
                    }
                }
            }
        }
    },
    "query": {
        "bool": {
            "filter": [{
                "term": {
                    "origin": "%s"
                }
            }, {
                "range": {
                    "metadata__updated_on": {
                        "lte": "%s"
                    }
                }
            }]
        }
    }
}
"""
Thank you @inishchith for your work. It looks nice! I left some minor comments.
What do you think should be improved? Do you see possible limitations with this implementation?