Commit cb01d31

[study] Add repository analysis study

Signed-off-by: inishchith <[email protected]>
inishchith committed Jul 26, 2019
1 parent 30ea904 commit cb01d31

Showing 2 changed files with 295 additions and 8 deletions.
115 changes: 107 additions & 8 deletions grimoire_elk/enriched/cocom.py
@@ -22,9 +22,21 @@
#

import logging
+from pprint import pprint

-from .enrich import Enrich, metadata
-from grimoirelab_toolkit.datetime import str_to_datetime
+from dateutil.relativedelta import relativedelta
+
+from elasticsearch import Elasticsearch as ES, RequestsHttpConnection
+
+from .enrich import (Enrich,
+                     metadata)
+from .graal_study_evolution import (get_to_date,
+                                    get_unique_repository,
+                                    get_files_at_time)
+
+from grimoirelab_toolkit.datetime import (str_to_datetime,
+                                          datetime_utcnow)
+from grimoire_elk.elastic import ElasticSearch

MAX_SIZE_BULK_ENRICHED_ITEMS = 200

@@ -33,6 +45,14 @@

class CocomEnrich(Enrich):

+    def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None,
+                 db_user='', db_password='', db_host=''):
+        super().__init__(db_sortinghat, db_projects_map, json_projects_map,
+                         db_user, db_password, db_host)
+
+        self.studies = []
+        self.studies.append(self.enrich_repo_analysis)
+
    def get_identities(self, item):
        """ Return the identities from an item """
        identities = []
@@ -98,7 +118,6 @@ def get_rich_items(self, item):
            eitem['commit_sha'] = entry['commit']
            eitem['author'] = entry['Author']
            eitem['committer'] = entry['Commit']
-            eitem['commit'] = entry['commit']
            eitem['message'] = entry['message']
            eitem['author_date'] = self.__fix_field_date(entry['AuthorDate'])
            eitem['commit_date'] = self.__fix_field_date(entry['CommitDate'])
@@ -120,13 +139,15 @@ def get_rich_items(self, item):

    def __add_derived_metrics(self, file_analysis, eitem):
        """ Add derived metrics fields """
-        if eitem['loc']:
+
+        # TODO: Fix Logic: None rather than 1
+        if eitem["loc"]:
            total_lines = eitem['loc'] + eitem['comments'] + eitem['blanks']
-            eitem["comments_ratio"] = eitem['comments'] / total_lines
-            eitem["blanks_ratio"] = eitem['blanks'] / total_lines
+            eitem["lines_per_comment_lines"] = total_lines / max(eitem["comments"], 1)
+            eitem["lines_per_blank_lines"] = total_lines / max(eitem["blanks"], 1)
        else:
-            eitem["comments_ratio"] = eitem['comments']
-            eitem["blanks_ratio"] = eitem['blanks']
+            eitem["lines_per_comment_lines"] = 0
+            eitem["lines_per_blank_lines"] = 0

        return eitem
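A quick worked example of the derived metrics added here, as a standalone sketch with made-up input values:

# Standalone sketch of the derived-metric computation, with made-up inputs.
eitem = {"loc": 120, "comments": 30, "blanks": 10}

total_lines = eitem["loc"] + eitem["comments"] + eitem["blanks"]  # 160
# max(..., 1) guards against division by zero for files with no comment
# or blank lines (the TODO above notes the None case still needs work).
print(total_lines / max(eitem["comments"], 1))  # 5.33... lines per comment line
print(total_lines / max(eitem["blanks"], 1))    # 16.0 lines per blank line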

@@ -158,6 +179,84 @@ def enrich_items(self, ocean_backend, events=False):

        return num_items

+    def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
+                             out_index="cocom_enrich_graal_repo", interval_months=3,
+                             date_field="grimoire_creation_date"):
+
+        logger.info("Doing enrich_repository_analysis study for index {}"
+                    .format(self.elastic.anonymize_url(self.elastic.index_url)))
+
+        es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
+                   verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
+        in_index = enrich_backend.elastic.index
+
+        unique_repos = es_in.search(
+            index=in_index,
+            body=get_unique_repository())
+
+        repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos']['buckets']]
+        metrics = ["ccn", "num_funs", "tokens", "loc", "comments", "blanks"]
+        num_items = 0
+        ins_items = 0
+
+        for repository_url in repositories:
+            es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
+            evolution_items = []
+
+            to_date = get_to_date(es_in, in_index, out_index, repository_url)
+            to_date = to_date.replace(day=1, hour=0, minute=0, second=0)
+            today_date = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
+
+            while to_date < today_date:
+                files_at_time = es_in.search(
+                    index=in_index,
+                    body=get_files_at_time(repository_url, to_date.isoformat()))
+
+                if not len(files_at_time['aggregations']['file_stats']['buckets']):
+                    to_date = to_date + relativedelta(months=+interval_months)
+                    continue
+
+                repository_name = repository_url.split("/")[-1]
+                evolution_item = {
+                    "id": to_date.strftime('%Y-%m-%dT%H:%M:%S') + "_" + repository_name + "_" + str(interval_months),
+                    "origin": repository_url,
+                    "interval_months": interval_months,
+                    "study_creation_date": to_date.isoformat(),
+                    "total_files": len(files_at_time['aggregations']['file_stats']['buckets'])
+                }
+
+                for file_ in files_at_time['aggregations']['file_stats']['buckets']:
+                    file_details = file_["1"]["hits"]["hits"][0]["_source"]
+
+                    for metric in metrics:
+                        if file_details[metric] is not None:
+                            evolution_item["total_" + metric] = evolution_item.get("total_" + metric, 0) + file_details[metric]
+
+                total_lines = evolution_item["total_comments"] + evolution_item["total_blanks"] + evolution_item["total_loc"]
+                # TODO: Fix Logic: None rather than 1
+                evolution_item["total_lines_per_comment_lines"] = total_lines / max(evolution_item["total_comments"], 1)
+                evolution_item["total_lines_per_blank_lines"] = total_lines / max(evolution_item["total_blanks"], 1)
+
+                pprint(evolution_item)
+                evolution_items.append(evolution_item)
+
+                if len(evolution_items) >= self.elastic.max_items_bulk:
+                    num_items += len(evolution_items)
+                    ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
+                    evolution_items = []
+
+                to_date = to_date + relativedelta(months=+interval_months)
+
+            if len(evolution_items) > 0:
+                num_items += len(evolution_items)
+                ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
+
+        if num_items != ins_items:
+            missing = num_items - ins_items
+            logger.error("%s/%s missing items for Study Enricher", str(missing), str(num_items))
+        else:
+            logger.info("%s items inserted for Study Enricher", str(num_items))
+
    def __fix_field_date(self, date_value):
        """Fix possible errors in the field date"""

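The study above walks each repository's history in month-aligned steps of interval_months using relativedelta. A standalone sketch of that bucketing loop (the dates and repository name are made up):

from datetime import datetime

from dateutil.relativedelta import relativedelta

# Illustrative stand-ins for get_to_date() and datetime_utcnow().
to_date = datetime(2019, 1, 17).replace(day=1, hour=0, minute=0, second=0)
today_date = datetime(2019, 7, 26).replace(day=1, hour=0, minute=0, second=0)
interval_months = 3

while to_date < today_date:
    # One study document id per repository per interval, as in the study above.
    print(to_date.strftime('%Y-%m-%dT%H:%M:%S') + "_grimoirelab-elk_" + str(interval_months))
    to_date = to_date + relativedelta(months=+interval_months)

# Prints the 2019-01-01 and 2019-04-01 buckets, then stops at 2019-07-01.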
188 changes: 188 additions & 0 deletions grimoire_elk/enriched/graal_study_evolution.py
@@ -0,0 +1,188 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# Authors:
# Valerio Cosentino <[email protected]>
# Nishchith Shetty <[email protected]>
#

from grimoirelab_toolkit.datetime import str_to_datetime


def get_unique_repository():
    """ Retrieve all the repository names from the index. """

    query_unique_repository = """
    {
        "size": 0,
        "aggs": {
            "unique_repos": {
                "terms": {
                    "field": "origin"
                }
            }
        }
    }
    """

    return query_unique_repository


def get_last_study_date(repository_url):
    """ Retrieve the last study_creation_date of the item corresponding
    to the given repository from the study index.
    """

    query_last_study_date = """
    {
        "size": 0,
        "aggs": {
            "1": {
                "max": {
                    "field": "study_creation_date"
                }
            }
        },
        "query": {
            "bool": {
                "filter": [{
                    "term": {
                        "origin.keyword": "%s"
                    }
                }]
            }
        }
    }
    """ % (repository_url)

    return query_last_study_date


def get_first_enriched_date(repository_url):
    """ Retrieve the first/oldest metadata__updated_on of the item
    corresponding to the given repository.
    """

    query_first_enriched_date = """
    {
        "size": 0,
        "aggs": {
            "1": {
                "top_hits": {
                    "docvalue_fields": [
                        "metadata__updated_on"
                    ],
                    "_source": "metadata__updated_on",
                    "size": 1,
                    "sort": [{
                        "commit_date": {
                            "order": "asc"
                        }
                    }]
                }
            }
        },
        "query": {
            "bool": {
                "filter": [{
                    "term": {
                        "origin": "%s"
                    }
                }]
            }
        }
    }
    """ % (repository_url)

    return query_first_enriched_date


def get_files_at_time(repository_url, to_date):
    """ Retrieve the latest change of each file up to to_date
    for the given repository.
    """

    query_files_at_time = """
    {
        "size": 0,
        "aggs": {
            "file_stats": {
                "terms": {
                    "field": "file_path",
                    "size": 2147483647,
                    "order": {
                        "_key": "desc"
                    }
                },
                "aggs": {
                    "1": {
                        "top_hits": {
                            "size": 1,
                            "sort": [{
                                "metadata__updated_on": {
                                    "order": "desc"
                                }
                            }]
                        }
                    }
                }
            }
        },
        "query": {
            "bool": {
                "filter": [{
                    "term": {
                        "origin": "%s"
                    }
                },
                {
                    "range": {
                        "metadata__updated_on": {
                            "lte": "%s"
                        }
                    }
                }]
            }
        }
    }
    """ % (repository_url, to_date)

    return query_files_at_time


def get_to_date(es_in, in_index, out_index, repository_url):
    """ Get the appropriate to_date value for incremental insertion. """
    study_data_available = False

    if es_in.indices.exists(index=out_index):
        last_study_date = es_in.search(
            index=out_index,
            body=get_last_study_date(repository_url))["aggregations"]["1"]

        if last_study_date["value"] is not None:
            study_data_available = True
            to_date = str_to_datetime(last_study_date["value_as_string"])

    if not study_data_available:
        first_item_date = es_in.search(
            index=in_index,
            body=get_first_enriched_date(repository_url))["aggregations"]["1"]["hits"]["hits"][0]["_source"]

        to_date = str_to_datetime(first_item_date["metadata__updated_on"])

    return to_date
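The query builders return plain strings, so their output can be inspected without a live Elasticsearch; json.loads doubles as a cheap structural check. A minimal sketch, assuming the module is importable as grimoire_elk.enriched.graal_study_evolution (the repository URL and date are made up):

import json

from grimoire_elk.enriched.graal_study_evolution import (get_files_at_time,
                                                         get_unique_repository)

# Made-up inputs; any origin URL and ISO date work here.
query = get_files_at_time("https://github.com/chaoss/grimoirelab-elk",
                          "2019-07-01T00:00:00")

# The %-substituted string is plain JSON, so it can be validated locally.
parsed = json.loads(query)
print(parsed["query"]["bool"]["filter"][1]["range"]["metadata__updated_on"])
# {'lte': '2019-07-01T00:00:00'}
print(json.loads(get_unique_repository())["aggs"]["unique_repos"]["terms"])
# {'field': 'origin'}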
