[graal:tests] Add appropriate tests for Graal integration (WIP)

Signed-off-by: inishchith <[email protected]>
chaoss · Jul 31, 2019 · 20eb820 · 20eb820
1 parent 1308300
commit 20eb820
Show file tree

Hide file tree

Showing 8 changed files with 758 additions and 113 deletions.
diff --git a/grimoire_elk/enriched/cocom.py b/grimoire_elk/enriched/cocom.py
@@ -138,12 +138,14 @@ def __add_derived_metrics(self, file_analysis, eitem):
         """ Add derived metrics fields """
 
         # TODO: Fix Logic: None rather than 1
-        if None not in [eitem["loc"], eitem["comments"], eitem["num_funs"]]:
-            eitem["loc_per_comment_lines"] = eitem["loc"] / max(eitem["comments"], 1)
-            eitem["loc_per_blank_lines"] = eitem["loc"] / max(eitem["blanks"], 1)
-            eitem["loc_per_function"] = eitem["loc"] / max(eitem["num_funs"], 1)
+        if eitem["loc"] is not None and eitem["comments"] is not None and eitem["num_funs"] is not None:
+            eitem["comments_per_loc"] = round(eitem["comments"] / max(eitem["loc"], 1), 2)
+            eitem["blanks_per_loc"] = round(eitem["blanks"] / max(eitem["loc"], 1), 2)
+            eitem["loc_per_function"] = round(eitem["loc"] / max(eitem["num_funs"], 1), 2)
         else:
-            eitem["loc_per_comment_lines"] = eitem["loc_per_blank_lines"] = eitem["loc_per_function"] = None
+            eitem["comments_per_loc"] = None
+            eitem["blanks_per_loc"] = None
+            eitem["loc_per_function"] = None
 
         return eitem
 
@@ -176,7 +178,7 @@ def enrich_items(self, ocean_backend, events=False):
         return num_items
 
     def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
-                             out_index="cocom_enrich_graal_repo", interval_months=3,
+                             out_index="cocom_enrich_graal_repo", interval_months=[3],
                              date_field="grimoire_creation_date"):
 
         logger.info("Doing enrich_repository_analysis study for index {}"
@@ -185,71 +187,76 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal
         es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
                    verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
         in_index = enrich_backend.elastic.index
+        interval_months = list(map(int, interval_months))
 
         unique_repos = es_in.search(
             index=in_index,
             body=get_unique_repository())
 
         repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])]
+        current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
         num_items = 0
         ins_items = 0
 
         for repository_url in repositories:
             es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
             evolution_items = []
 
-            to_month = get_to_date(es_in, in_index, out_index, repository_url)
-            to_month = to_month.replace(day=1, hour=0, minute=0, second=0)
-            current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
-
-            while to_month < current_month:
-                files_at_time = es_in.search(
-                    index=in_index,
-                    body=get_files_at_time(repository_url, to_month.isoformat())
-                )['aggregations']['file_stats'].get("buckets", [])
-
-                if not len(files_at_time):
-                    to_month = to_month + relativedelta(months=+interval_months)
-                    continue
-
-                repository_name = repository_url.split("/")[-1]
-                evolution_item = {
-                    "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval_months),
-                    "origin": repository_url,
-                    "interval_months": interval_months,
-                    "study_creation_date": to_month.isoformat(),
-                    "total_files": len(files_at_time)
-                }
-
-                for file_ in files_at_time:
-                    file_details = file_["1"]["hits"]["hits"][0]["_source"]
-
-                    for metric in self.metrics:
-                        total_metric = "total_" + metric
-                        evolution_item[total_metric] = evolution_item.get(total_metric, 0)
-                        evolution_item[total_metric] += file_details[metric] if file_details[metric] is not None else 0
-
-                # TODO: Fix Logic: None rather than 1
-                evolution_item["total_loc_per_comment_lines"] = evolution_item["total_loc"] / \
-                    max(evolution_item["total_comments"], 1)
-                evolution_item["total_loc_per_blank_lines"] = evolution_item["total_loc"] / max(evolution_item["total_blanks"], 1)
-                evolution_item["total_loc_per_function"] = evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1)
-
-                evolution_items.append(evolution_item)
-
-                if len(evolution_items) >= self.elastic.max_items_bulk:
-                    num_items += len(evolution_items)
-                    ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
-                    evolution_items = []
+            for interval in interval_months:
 
-                to_month = to_month + relativedelta(months=+interval_months)
+                to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
+                to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0)
 
-            if len(evolution_items) > 0:
-                num_items += len(evolution_items)
-                ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
+                while to_month < current_month:
+                    files_at_time = es_in.search(
+                        index=in_index,
+                        body=get_files_at_time(repository_url, to_month.isoformat())
+                    )['aggregations']['file_stats'].get("buckets", [])
 
-            if num_items != ins_items:
-                missing = num_items - ins_items
-                logger.error("%s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items))
-            else:
-                logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items))
+                    if not len(files_at_time):
+                        to_month = to_month + relativedelta(months=+interval)
+                        continue
+
+                    repository_name = repository_url.split("/")[-1]
+                    evolution_item = {
+                        "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval),
+                        "origin": repository_url,
+                        "interval_months": interval,
+                        "study_creation_date": to_month.isoformat(),
+                        "total_files": len(files_at_time)
+                    }
+
+                    for file_ in files_at_time:
+                        file_details = file_["1"]["hits"]["hits"][0]["_source"]
+
+                        for metric in self.metrics:
+                            total_metric = "total_" + metric
+                            evolution_item[total_metric] = evolution_item.get(total_metric, 0)
+                            evolution_item[total_metric] += file_details[metric] if file_details[metric] is not None else 0
+
+                    # TODO: Fix logic: store None when a divisor is 0, rather than clamping the divisor to 1 via max(..., 1)
+                    evolution_item["total_comments_per_loc"] = round(
+                        evolution_item["total_comments"] / max(evolution_item["total_loc"], 1), 2)
+                    evolution_item["total_blanks_per_loc"] = round(
+                        evolution_item["total_blanks"] / max(evolution_item["total_loc"], 1), 2)
+                    evolution_item["total_loc_per_function"] = round(
+                        evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1), 2)
+
+                    evolution_items.append(evolution_item)
+
+                    if len(evolution_items) >= self.elastic.max_items_bulk:
+                        num_items += len(evolution_items)
+                        ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
+                        evolution_items = []
+
+                    to_month = to_month + relativedelta(months=+interval)
+
+                if len(evolution_items) > 0:
+                    num_items += len(evolution_items)
+                    ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
+
+                if num_items != ins_items:
+                    missing = num_items - ins_items
+                    logger.error("%s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items))
+                else:
+                    logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items))
diff --git a/grimoire_elk/enriched/colic.py b/grimoire_elk/enriched/colic.py
@@ -63,7 +63,7 @@ def has_identities(self):
     def get_field_unique_id(self):
         return "id"
 
-    def get_licensed_files(repository_url, to_date):
+    def get_licensed_files(self, repository_url, to_date):
         """ Retrieve all the licensed files until the to_date, corresponding
         to the given repository.
         """
@@ -108,7 +108,7 @@ def get_licensed_files(repository_url, to_date):
 
         return query_licensed_files
 
-    def get_copyrighted_files(repository_url, to_date):
+    def get_copyrighted_files(self, repository_url, to_date):
         """ Retrieve all the copyrighted files until the to_date, corresponding
         to the given repository.
         """
@@ -260,7 +260,7 @@ def enrich_items(self, ocean_backend, events=False):
         return num_items
 
     def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
-                              out_index="colic_enrich_graal_repo", interval_months=3,
+                              out_index="colic_enrich_graal_repo", interval_months=[3],
                               date_field="grimoire_creation_date"):
 
         logger.info("Doing enrich_colic_analysis study for index {}"
@@ -269,66 +269,74 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
         es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
                    verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
         in_index = enrich_backend.elastic.index
+        interval_months = list(map(int, interval_months))
 
         unique_repos = es_in.search(
             index=in_index,
             body=get_unique_repository())
 
         repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])]
+        current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
         num_items = 0
         ins_items = 0
 
         for repository_url in repositories:
             es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
             evolution_items = []
 
-            to_month = get_to_date(es_in, in_index, out_index, repository_url)
-            to_month = to_month.replace(day=1, hour=0, minute=0, second=0)
-            current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
-
-            while to_month < current_month:
-                copyrighted_files_at_time = es_in.search(
-                    index=in_index,
-                    body=self.get_copyrighted_files(repository_url, to_month.isoformat()))
-
-                licensed_files_at_time = es_in.search(
-                    index=in_index,
-                    body=self.get_licensed_files(repository_url, to_month.isoformat()))
-
-                files_at_time = es_in.search(
-                    index=in_index,
-                    body=get_files_at_time(repository_url, to_month.isoformat()))
-
-                licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"])
-                copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"])
-                total_files = int(files_at_time["aggregations"]["1"]["value"])
-
-                repository_name = repository_url.split("/")[-1]
-                evolution_item = {
-                    "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval_months),
-                    "origin": repository_url,
-                    "interval_months": interval_months,
-                    "study_creation_date": to_month.isoformat(),
-                    "licensed_files": licensed_files,
-                    "copyrighted_files": copyrighted_files,
-                    "total_files": total_files
-                }
+            for interval in interval_months:
+
+                to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
+                to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0)
+
+                while to_month < current_month:
+                    copyrighted_files_at_time = es_in.search(
+                        index=in_index,
+                        body=self.get_copyrighted_files(repository_url, to_month.isoformat()))
+
+                    licensed_files_at_time = es_in.search(
+                        index=in_index,
+                        body=self.get_licensed_files(repository_url, to_month.isoformat()))
+
+                    files_at_time = es_in.search(
+                        index=in_index,
+                        body=get_files_at_time(repository_url, to_month.isoformat()))
+
+                    licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"])
+                    copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"])
+                    # TODO: Fix - need a more efficient query (only the number of buckets is used below)
+                    total_files = len(files_at_time['aggregations']['file_stats'].get("buckets", []))
+
+                    if not total_files:
+                        to_month = to_month + relativedelta(months=+interval)
+                        continue
+
+                    repository_name = repository_url.split("/")[-1]
+                    evolution_item = {
+                        "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval),
+                        "origin": repository_url,
+                        "interval_months": interval,
+                        "study_creation_date": to_month.isoformat(),
+                        "licensed_files": licensed_files,
+                        "copyrighted_files": copyrighted_files,
+                        "total_files": total_files
+                    }
 
-                evolution_items.append(evolution_item)
+                    evolution_items.append(evolution_item)
 
-                if len(evolution_items) >= self.elastic.max_items_bulk:
-                    num_items += len(evolution_items)
-                    ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
-                    evolution_items = []
+                    if len(evolution_items) >= self.elastic.max_items_bulk:
+                        num_items += len(evolution_items)
+                        ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
+                        evolution_items = []
 
-                to_month = to_month + relativedelta(months=+interval_months)
+                    to_month = to_month + relativedelta(months=+interval)
 
-            if len(evolution_items) > 0:
-                num_items += len(evolution_items)
-                ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
+                if len(evolution_items) > 0:
+                    num_items += len(evolution_items)
+                    ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
 
-            if num_items != ins_items:
-                missing = num_items - ins_items
-                logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items))
-            else:
-                logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items))
+                if num_items != ins_items:
+                    missing = num_items - ins_items
+                    logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items))
+                else:
+                    logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items))
diff --git a/grimoire_elk/enriched/graal_study_evolution.py b/grimoire_elk/enriched/graal_study_evolution.py
@@ -43,7 +43,7 @@ def get_unique_repository():
     return query_unique_repository
 
 
-def get_last_study_date(repository_url):
+def get_last_study_date(repository_url, interval):
     """ Retrieve the last study_creation_date of the item corresponding
     to given repository from the study index.
     """
@@ -64,11 +64,15 @@ def get_last_study_date(repository_url):
                     "term": {
                         "origin.keyword": "%s"
                     }
+                },{
+                    "term":{
+                        "interval_months": "%s"
+                    }
                 }]
             }
         }
     }
-    """ % (repository_url)
+    """ % (repository_url, interval)
 
     return query_last_study_date
 
@@ -117,8 +121,6 @@ def get_files_at_time(repository_url, to_date):
     corresponding to the given repository.
     """
 
-    # TODO: Fix for interval month matching
-
     query_files_at_time = """
     {
         "size": 0,
@@ -167,14 +169,14 @@ def get_files_at_time(repository_url, to_date):
     return query_files_at_time
 
 
-def get_to_date(es_in, in_index, out_index, repository_url):
+def get_to_date(es_in, in_index, out_index, repository_url, interval):
     """ Get the appropriate to_date value for incremental insertion. """
     study_data_available = False
 
     if es_in.indices.exists(index=out_index):
         last_study_date = es_in.search(
             index=out_index,
-            body=get_last_study_date(repository_url))["aggregations"]["1"]
+            body=get_last_study_date(repository_url, interval))["aggregations"]["1"]
 
         if last_study_date["value"] is not None:
             study_data_available = True

diff --git a/grimoire_elk/raw/graal.py b/grimoire_elk/raw/graal.py
@@ -56,7 +56,7 @@ def get_elastic_mappings(es_major):
 
 
 class GraalOcean(ElasticOcean):
-    """CoLic Ocean feeder"""
+    """Graal Ocean feeder"""
 
     mapping = Mapping