From 7afa5c5d89d59dbc4c70a5a2a4af87d173ca5539 Mon Sep 17 00:00:00 2001 From: inishchith Date: Mon, 29 Jul 2019 12:43:29 +0530 Subject: [PATCH 1/8] [requirements.txt] Add Graal module dependency for integration Signed-off-by: inishchith --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 484e0a24d..5b3d433f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ pandas==0.18.1 -e git+https://github.com/chaoss/grimoirelab-toolkit/#egg=grimoirelab-toolkit -e git+https://github.com/chaoss/grimoirelab-cereslib/#egg=grimoirelab-cereslib -e git+https://github.com/chaoss/grimoirelab-kingarthur/#egg=grimoirelab-kingarthur +-e git+https://github.com/chaoss/grimoirelab-graal/#egg=grimoirelab-graal -e git+https://github.com/chaoss/grimoirelab-perceval/#egg=grimoirelab-perceval -e git+https://github.com/chaoss/grimoirelab-perceval-mozilla/#egg=grimoirelab-perceval-mozilla -e git+https://github.com/chaoss/grimoirelab-perceval-opnfv/#egg=grimoirelab-perceval-opnfv From 82b2a11ce53140d7c0479801ead2a351cc0ccc06 Mon Sep 17 00:00:00 2001 From: inishchith Date: Mon, 29 Jul 2019 12:45:55 +0530 Subject: [PATCH 2/8] [graal] Add CoCom & CoLic enricher along with study implementation This commit adds support for Graal's CoCom & CoLic Backend data enricher along with their study implementation Signed-off-by: inishchith --- grimoire_elk/enriched/cocom.py | 255 +++++++++++++++++++++++++ grimoire_elk/enriched/colic.py | 334 +++++++++++++++++++++++++++++++++ 2 files changed, 589 insertions(+) create mode 100644 grimoire_elk/enriched/cocom.py create mode 100644 grimoire_elk/enriched/colic.py diff --git a/grimoire_elk/enriched/cocom.py b/grimoire_elk/enriched/cocom.py new file mode 100644 index 000000000..92ff46525 --- /dev/null +++ b/grimoire_elk/enriched/cocom.py @@ -0,0 +1,255 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# 
it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# Authors: +# Valerio Cosentino +# Nishchith Shetty +# + +import logging +from dateutil.relativedelta import relativedelta + +from elasticsearch import Elasticsearch as ES, RequestsHttpConnection + +from .enrich import (Enrich, + metadata) +from .graal_study_evolution import (get_to_date, + get_unique_repository, + get_files_at_time) +from .utils import fix_field_date + +from grimoirelab_toolkit.datetime import datetime_utcnow +from grimoire_elk.elastic import ElasticSearch + +MAX_SIZE_BULK_ENRICHED_ITEMS = 200 + +logger = logging.getLogger(__name__) + + +class CocomEnrich(Enrich): + metrics = ["ccn", "num_funs", "tokens", "loc", "comments", "blanks"] + + def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None, + db_user='', db_password='', db_host=''): + super().__init__(db_sortinghat, db_projects_map, json_projects_map, + db_user, db_password, db_host) + + self.studies = [] + self.studies.append(self.enrich_repo_analysis) + + def get_identities(self, item): + """ Return the identities from an item """ + identities = [] + + return identities + + def has_identities(self): + """ Return whether the enriched items contains identities """ + + return False + + def get_field_unique_id(self): + return "id" + + def extract_modules(self, file_path): + """ Extracts module path from the given file path """ + 
path_chunks = file_path.split('/') + + modules = [] + for idx in range(len(path_chunks)): + sub_path = '/'.join(path_chunks[:idx]) + + if sub_path: + modules.append(sub_path) + + return modules + + @metadata + def get_rich_item(self, file_analysis): + + eitem = {} + for metric in self.metrics: + if file_analysis.get(metric, None) is not None: + eitem[metric] = file_analysis[metric] + else: + eitem[metric] = None + + eitem["file_path"] = file_analysis.get("file_path", None) + eitem["ext"] = file_analysis.get("ext", None) + eitem['modules'] = self.extract_modules(eitem['file_path']) + eitem = self.__add_derived_metrics(file_analysis, eitem) + + return eitem + + def get_rich_items(self, item): + # The real data + entry = item['data'] + + enriched_items = [] + + for file_analysis in entry["analysis"]: + eitem = self.get_rich_item(file_analysis) + + for f in self.RAW_FIELDS_COPY: + if f in item: + eitem[f] = item[f] + else: + eitem[f] = None + + # common attributes + eitem['commit_sha'] = entry['commit'] + eitem['author'] = entry['Author'] + eitem['committer'] = entry['Commit'] + eitem['message'] = entry['message'] + eitem['author_date'] = fix_field_date(entry['AuthorDate']) + eitem['commit_date'] = fix_field_date(entry['CommitDate']) + + if self.prjs_map: + eitem.update(self.get_item_project(eitem)) + + # uuid + eitem['id'] = "{}_{}".format(eitem['commit_sha'], eitem['file_path']) + + eitem.update(self.get_grimoire_fields(entry["AuthorDate"], "file")) + + self.add_repository_labels(eitem) + self.add_metadata_filter_raw(eitem) + + enriched_items.append(eitem) + + return enriched_items + + def __add_derived_metrics(self, file_analysis, eitem): + """ Add derived metrics fields """ + + # TODO: Fix Logic: None rather than 1 + if None not in [eitem["loc"], eitem["comments"], eitem["num_funs"]]: + eitem["loc_per_comment_lines"] = eitem["loc"] / max(eitem["comments"], 1) + eitem["loc_per_blank_lines"] = eitem["loc"] / max(eitem["blanks"], 1) + eitem["loc_per_function"] = 
eitem["loc"] / max(eitem["num_funs"], 1) + else: + eitem["loc_per_comment_lines"] = eitem["loc_per_blank_lines"] = eitem["loc_per_function"] = None + + return eitem + + def enrich_items(self, ocean_backend, events=False): + items_to_enrich = [] + num_items = 0 + ins_items = 0 + + for item in ocean_backend.fetch(): + rich_items = self.get_rich_items(item) + + items_to_enrich.extend(rich_items) + if len(items_to_enrich) < MAX_SIZE_BULK_ENRICHED_ITEMS: + continue + + num_items += len(items_to_enrich) + ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id()) + items_to_enrich = [] + + if len(items_to_enrich) > 0: + num_items += len(items_to_enrich) + ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id()) + + if num_items != ins_items: + missing = num_items - ins_items + logger.error("%s/%s missing items for Cocom", str(missing), str(num_items)) + else: + logger.info("%s items inserted for Cocom", str(num_items)) + + return num_items + + def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False, + out_index="cocom_enrich_graal_repo", interval_months=3, + date_field="grimoire_creation_date"): + + logger.info("Doing enrich_repository_analysis study for index {}" + .format(self.elastic.anonymize_url(self.elastic.index_url))) + + es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100, + verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection) + in_index = enrich_backend.elastic.index + + unique_repos = es_in.search( + index=in_index, + body=get_unique_repository()) + + repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])] + num_items = 0 + ins_items = 0 + + for repository_url in repositories: + es_out = ElasticSearch(enrich_backend.elastic.url, out_index) + evolution_items = [] + + to_month = get_to_date(es_in, in_index, out_index, repository_url) + to_month = to_month.replace(day=1, hour=0, 
minute=0, second=0) + current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0) + + while to_month < current_month: + files_at_time = es_in.search( + index=in_index, + body=get_files_at_time(repository_url, to_month.isoformat()) + )['aggregations']['file_stats'].get("buckets", []) + + if not len(files_at_time): + to_month = to_month + relativedelta(months=+interval_months) + continue + + repository_name = repository_url.split("/")[-1] + evolution_item = { + "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval_months), + "origin": repository_url, + "interval_months": interval_months, + "study_creation_date": to_month.isoformat(), + "total_files": len(files_at_time) + } + + for file_ in files_at_time: + file_details = file_["1"]["hits"]["hits"][0]["_source"] + + for metric in self.metrics: + total_metric = "total_" + metric + evolution_item[total_metric] = evolution_item.get(total_metric, 0) + evolution_item[total_metric] += file_details[metric] if file_details[metric] is not None else 0 + + # TODO: Fix Logic: None rather than 1 + evolution_item["total_loc_per_comment_lines"] = evolution_item["total_loc"] / \ + max(evolution_item["total_comments"], 1) + evolution_item["total_loc_per_blank_lines"] = evolution_item["total_loc"] / max(evolution_item["total_blanks"], 1) + evolution_item["total_loc_per_function"] = evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1) + + evolution_items.append(evolution_item) + + if len(evolution_items) >= self.elastic.max_items_bulk: + num_items += len(evolution_items) + ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + evolution_items = [] + + to_month = to_month + relativedelta(months=+interval_months) + + if len(evolution_items) > 0: + num_items += len(evolution_items) + ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + + if num_items != ins_items: + missing = num_items - ins_items + logger.error("%s/%s missing items for 
Graal CoCom Analysis Study", str(missing), str(num_items)) + else: + logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items)) diff --git a/grimoire_elk/enriched/colic.py b/grimoire_elk/enriched/colic.py new file mode 100644 index 000000000..1e9469d41 --- /dev/null +++ b/grimoire_elk/enriched/colic.py @@ -0,0 +1,334 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+# +# Authors: +# Nishchith Shetty +# + +import logging +from dateutil.relativedelta import relativedelta + +from elasticsearch import Elasticsearch as ES, RequestsHttpConnection +from .enrich import (Enrich, + metadata) +from .graal_study_evolution import (get_to_date, + get_unique_repository, + get_files_at_time) +from .utils import fix_field_date + +from grimoirelab_toolkit.datetime import datetime_utcnow +from grimoire_elk.elastic import ElasticSearch + +MAX_SIZE_BULK_ENRICHED_ITEMS = 200 + +logger = logging.getLogger(__name__) + + +class ColicEnrich(Enrich): + + def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None, + db_user='', db_password='', db_host=''): + super().__init__(db_sortinghat, db_projects_map, json_projects_map, + db_user, db_password, db_host) + + self.studies = [] + self.studies.append(self.enrich_colic_analysis) + + def get_identities(self, item): + """ Return the identities from an item """ + identities = [] + + return identities + + def has_identities(self): + """ Return whether the enriched items contains identities """ + + return False + + def get_field_unique_id(self): + return "id" + + def get_licensed_files(repository_url, to_date): + """ Retrieve all the licensed files until the to_date, corresponding + to the given repository. + """ + + query_licensed_files = """ + { + "size": 0, + "aggs": { + "1": { + "cardinality": { + "field": "file_path" + } + } + }, + "query": { + "bool": { + "must": [{ + "match_phrase": { + "has_license": { + "query": 1 + } + } + }, + { + "match_phrase": { + "origin": { + "query": "%s" + } + } + }, + { + "range": { + "metadata__updated_on": { + "lte": "%s" + } + } + }] + } + } + } + """ % (repository_url, to_date) + + return query_licensed_files + + def get_copyrighted_files(repository_url, to_date): + """ Retrieve all the copyrighted files until the to_date, corresponding + to the given repository. 
+ """ + + query_copyrighted_files = """ + { + "size": 0, + "aggs": { + "1": { + "cardinality": { + "field": "file_path" + } + } + }, + "query": { + "bool": { + "must": [{ + "match_phrase": { + "has_copyright": { + "query": 1 + } + } + }, + { + "match_phrase": { + "origin": { + "query": "%s" + } + } + }, + { + "range": { + "metadata__updated_on": { + "lte": "%s" + } + } + }] + } + } + } + """ % (repository_url, to_date) + + return query_copyrighted_files + + def extract_modules(self, file_path): + """ Extracts module path from the given file path """ + path_chunks = file_path.split('/') + + modules = [] + for idx in range(len(path_chunks)): + sub_path = '/'.join(path_chunks[:idx]) + + if sub_path: + modules.append(sub_path) + + return modules + + @metadata + def get_rich_item(self, file_analysis): + # TODO: requires adjustments regarding category of backend used + + eitem = {} + eitem["file_path"] = file_analysis["file_path"] + eitem["modules"] = self.extract_modules(eitem["file_path"]) + eitem["copyrights"] = [] + eitem["licenses"] = [] + eitem["license_name"] = [] + eitem["has_license"] = 0 + eitem["has_copyright"] = 0 + + if file_analysis.get("licenses", False): + eitem["has_license"] = 1 + for _license in file_analysis["licenses"]: + eitem["licenses"].extend(_license["matched_rule"]["licenses"]) + eitem["license_name"].append(_license["name"]) + + if file_analysis.get("copyrights", False): + eitem["has_copyright"] = 1 + for _copyright in file_analysis["copyrights"]: + eitem["copyrights"].append(_copyright["value"]) + + return eitem + + def get_rich_items(self, item): + # The real data + entry = item['data'] + + enriched_items = [] + + for file_analysis in entry["analysis"]: + eitem = self.get_rich_item(file_analysis) + + for f in self.RAW_FIELDS_COPY: + if f in item: + eitem[f] = item[f] + else: + eitem[f] = None + + # common attributes + eitem['commit_sha'] = entry['commit'] + eitem['author'] = entry['Author'] + eitem['committer'] = entry['Commit'] + 
eitem['commit'] = entry['commit'] + eitem['message'] = entry['message'] + eitem['author_date'] = fix_field_date(entry['AuthorDate']) + eitem['commit_date'] = fix_field_date(entry['CommitDate']) + + if self.prjs_map: + eitem.update(self.get_item_project(eitem)) + + # uuid + eitem['id'] = "{}_{}".format(eitem['commit_sha'], eitem['file_path']) + + eitem.update(self.get_grimoire_fields(entry["AuthorDate"], "file")) + + self.add_repository_labels(eitem) + self.add_metadata_filter_raw(eitem) + + enriched_items.append(eitem) + + return enriched_items + + def enrich_items(self, ocean_backend, events=False): + items_to_enrich = [] + num_items = 0 + ins_items = 0 + + for item in ocean_backend.fetch(): + rich_items = self.get_rich_items(item) + + items_to_enrich.extend(rich_items) + if len(items_to_enrich) < MAX_SIZE_BULK_ENRICHED_ITEMS: + continue + + num_items += len(items_to_enrich) + ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id()) + items_to_enrich = [] + + if len(items_to_enrich) > 0: + num_items += len(items_to_enrich) + ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id()) + + if num_items != ins_items: + missing = num_items - ins_items + logger.error("%s/%s missing items for CoLic", str(missing), str(num_items)) + else: + logger.info("%s items inserted for CoLic", str(num_items)) + + return num_items + + def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=False, + out_index="colic_enrich_graal_repo", interval_months=3, + date_field="grimoire_creation_date"): + + logger.info("Doing enrich_colic_analysis study for index {}" + .format(self.elastic.anonymize_url(self.elastic.index_url))) + + es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100, + verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection) + in_index = enrich_backend.elastic.index + + unique_repos = es_in.search( + index=in_index, + body=get_unique_repository()) + + 
repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])] + num_items = 0 + ins_items = 0 + + for repository_url in repositories: + es_out = ElasticSearch(enrich_backend.elastic.url, out_index) + evolution_items = [] + + to_month = get_to_date(es_in, in_index, out_index, repository_url) + to_month = to_month.replace(day=1, hour=0, minute=0, second=0) + current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0) + + while to_month < current_month: + copyrighted_files_at_time = es_in.search( + index=in_index, + body=self.get_copyrighted_files(repository_url, to_month.isoformat())) + + licensed_files_at_time = es_in.search( + index=in_index, + body=self.get_licensed_files(repository_url, to_month.isoformat())) + + files_at_time = es_in.search( + index=in_index, + body=get_files_at_time(repository_url, to_month.isoformat())) + + licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"]) + copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"]) + total_files = int(files_at_time["aggregations"]["1"]["value"]) + + repository_name = repository_url.split("/")[-1] + evolution_item = { + "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval_months), + "origin": repository_url, + "interval_months": interval_months, + "study_creation_date": to_month.isoformat(), + "licensed_files": licensed_files, + "copyrighted_files": copyrighted_files, + "total_files": total_files + } + + evolution_items.append(evolution_item) + + if len(evolution_items) >= self.elastic.max_items_bulk: + num_items += len(evolution_items) + ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + evolution_items = [] + + to_month = to_month + relativedelta(months=+interval_months) + + if len(evolution_items) > 0: + num_items += len(evolution_items) + ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + + if num_items != ins_items: + missing 
= num_items - ins_items + logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items)) + else: + logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items)) From 13083004c653c9ae05408d667d5af9e22962df02 Mon Sep 17 00:00:00 2001 From: inishchith Date: Mon, 29 Jul 2019 12:49:19 +0530 Subject: [PATCH 3/8] [graal] Add connector for Graal Backends and Segregate common methods This commit adds connectors for Graal's CoCom & CoLic Backend. Segregates common methods into a separate utility file. Signed-off-by: inishchith --- .../enriched/graal_study_evolution.py | 190 ++++++++++++++++++ grimoire_elk/enriched/utils.py | 16 +- grimoire_elk/raw/graal.py | 70 +++++++ grimoire_elk/utils.py | 8 + 4 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 grimoire_elk/enriched/graal_study_evolution.py create mode 100644 grimoire_elk/raw/graal.py diff --git a/grimoire_elk/enriched/graal_study_evolution.py b/grimoire_elk/enriched/graal_study_evolution.py new file mode 100644 index 000000000..0df72facd --- /dev/null +++ b/grimoire_elk/enriched/graal_study_evolution.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+# +# Authors: +# Valerio Cosentino +# Nishchith Shetty +# + +from grimoirelab_toolkit.datetime import str_to_datetime + + +def get_unique_repository(): + """ Retrieve all the repository names from the index. """ + + query_unique_repository = """ + { + "size": 0, + "aggs": { + "unique_repos": { + "terms": { + "field": "origin" + } + } + } + } + """ + + return query_unique_repository + + +def get_last_study_date(repository_url): + """ Retrieve the last study_creation_date of the item corresponding + to given repository from the study index. + """ + + query_last_study_date = """ + { + "size": 0, + "aggs": { + "1": { + "max": { + "field": "study_creation_date" + } + } + }, + "query": { + "bool": { + "filter": [{ + "term": { + "origin.keyword": "%s" + } + }] + } + } + } + """ % (repository_url) + + return query_last_study_date + + +def get_first_enriched_date(repository_url): + """ Retrieve the first/oldest metadata__updated_on of the item + corresponding to given repository. + """ + + query_first_enriched_date = """ + { + "size": 0, + "aggs": { + "1": { + "top_hits": { + "docvalue_fields": [ + "metadata__updated_on" + ], + "_source": "metadata__updated_on", + "size": 1, + "sort": [{ + "commit_date": { + "order": "asc" + } + }] + } + } + }, + "query": { + "bool": { + "filter": [{ + "term": { + "origin": "%s" + } + }] + } + } + } + """ % (repository_url) + + return query_first_enriched_date + + +def get_files_at_time(repository_url, to_date): + """ Retrieve all the latest changes wrt files until the to_date, + corresponding to the given repository. 
+ """ + + # TODO: Fix for interval month matching + + query_files_at_time = """ + { + "size": 0, + "aggs": { + "file_stats": { + "terms": { + "field": "file_path", + "size": 2147483647, + "order": { + "_key": "desc" + } + }, + "aggs": { + "1": { + "top_hits": { + "size": 1, + "sort": [{ + "metadata__updated_on": { + "order": "desc" + } + }] + } + } + } + } + }, + "query": { + "bool": { + "filter": [{ + "term": { + "origin": "%s" + } + }, + { + "range": { + "metadata__updated_on": { + "lte": "%s" + } + } + }] + } + } + } + """ % (repository_url, to_date) + + return query_files_at_time + + +def get_to_date(es_in, in_index, out_index, repository_url): + """ Get the appropriate to_date value for incremental insertion. """ + study_data_available = False + + if es_in.indices.exists(index=out_index): + last_study_date = es_in.search( + index=out_index, + body=get_last_study_date(repository_url))["aggregations"]["1"] + + if last_study_date["value"] is not None: + study_data_available = True + to_date = str_to_datetime(last_study_date["value_as_string"]) + + if not study_data_available: + first_item_date = es_in.search( + index=in_index, + body=get_first_enriched_date(repository_url))["aggregations"]["1"]["hits"]["hits"][0]["_source"] + + to_date = str_to_datetime(first_item_date["metadata__updated_on"]) + + return to_date diff --git a/grimoire_elk/enriched/utils.py b/grimoire_elk/enriched/utils.py index 7b9f5cf78..8de4a8ef0 100755 --- a/grimoire_elk/enriched/utils.py +++ b/grimoire_elk/enriched/utils.py @@ -29,7 +29,8 @@ import urllib3 -from grimoirelab_toolkit.datetime import datetime_utcnow +from grimoirelab_toolkit.datetime import (datetime_utcnow, + str_to_datetime) BACKOFF_FACTOR = 0.2 @@ -212,3 +213,16 @@ def get_diff_current_date(days=0, hours=0, minutes=0): before_date = datetime_utcnow() - datetime.timedelta(days=days, hours=hours, minutes=minutes) return before_date + + +def fix_field_date(date_value): + """Fix possible errors in the field date""" + + field_date 
= str_to_datetime(date_value) + + try: + _ = int(field_date.strftime("%z")[0:3]) + except ValueError: + field_date = field_date.replace(tzinfo=None) + + return field_date.isoformat() diff --git a/grimoire_elk/raw/graal.py b/grimoire_elk/raw/graal.py new file mode 100644 index 000000000..750aa80be --- /dev/null +++ b/grimoire_elk/raw/graal.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# Authors: +# Nishchith Shetty +# + +from .elastic import ElasticOcean +from ..elastic_mapping import Mapping as BaseMapping + + +class Mapping(BaseMapping): + + @staticmethod + def get_elastic_mappings(es_major): + """Get Elasticsearch mapping. 
+ + Ensure data.message is string, since it can be very large + + :param es_major: major version of Elasticsearch, as string + :returns: dictionary with a key, 'items', with the mapping + """ + + mapping = ''' + { + "dynamic":true, + "properties": { + "data": { + "properties": { + "message": { + "type": "text", + "index": true + } + } + } + } + } + ''' + + return {"items": mapping} + + +class GraalOcean(ElasticOcean): + """CoLic Ocean feeder""" + + mapping = Mapping + + @classmethod + def get_perceval_params_from_url(cls, url): + params = [] + tokens = url.split(' ', 1) # Just split the URL not the filter + url = tokens[0] + params.append(url) + + return params diff --git a/grimoire_elk/utils.py b/grimoire_elk/utils.py index 8aba1ab02..72b487082 100755 --- a/grimoire_elk/utils.py +++ b/grimoire_elk/utils.py @@ -28,6 +28,9 @@ from grimoire_elk.elastic import ElasticConnectException from grimoire_elk.elastic import ElasticSearch +# Connectors for Graal +from graal.backends.core.cocom import CoCom, CoComCommand +from graal.backends.core.colic import CoLic, CoLicCommand # Connectors for Perceval from grimoire_elk.raw.hyperkitty import HyperKittyOcean from perceval.backends.core.askbot import Askbot, AskbotCommand @@ -70,6 +73,8 @@ from .enriched.askbot import AskbotEnrich from .enriched.bugzilla import BugzillaEnrich from .enriched.bugzillarest import BugzillaRESTEnrich +from .enriched.cocom import CocomEnrich +from .enriched.colic import ColicEnrich from .enriched.confluence import ConfluenceEnrich from .enriched.crates import CratesEnrich from .enriched.discourse import DiscourseEnrich @@ -119,6 +124,7 @@ from .raw.github import GitHubOcean from .raw.gitlab import GitLabOcean from .raw.google_hits import GoogleHitsOcean +from .raw.graal import GraalOcean from .raw.groupsio import GroupsioOcean from .raw.jenkins import JenkinsOcean from .raw.jira import JiraOcean @@ -199,6 +205,8 @@ def get_connectors(): return {"askbot": [Askbot, AskbotOcean, AskbotEnrich, 
AskbotCommand], "bugzilla": [Bugzilla, BugzillaOcean, BugzillaEnrich, BugzillaCommand], "bugzillarest": [BugzillaREST, BugzillaRESTOcean, BugzillaRESTEnrich, BugzillaRESTCommand], + "cocom": [CoCom, GraalOcean, CocomEnrich, CoComCommand], + "colic": [CoLic, GraalOcean, ColicEnrich, CoLicCommand], "confluence": [Confluence, ConfluenceOcean, ConfluenceEnrich, ConfluenceCommand], "crates": [Crates, CratesOcean, CratesEnrich, CratesCommand], "discourse": [Discourse, DiscourseOcean, DiscourseEnrich, DiscourseCommand], From 7e7973491b031cc853a51644033bea702d2eec9f Mon Sep 17 00:00:00 2001 From: inishchith Date: Mon, 29 Jul 2019 12:50:59 +0530 Subject: [PATCH 4/8] [graal:tests] Add appropriate tests for Graal integration (WIP) Signed-off-by: inishchith --- grimoire_elk/enriched/cocom.py | 123 +++++----- grimoire_elk/enriched/colic.py | 104 +++++---- .../enriched/graal_study_evolution.py | 14 +- grimoire_elk/raw/graal.py | 2 +- tests/data/cocom.json | 221 ++++++++++++++++++ tests/data/colic.json | 189 +++++++++++++++ tests/test_cocom.py | 129 ++++++++++ tests/test_colic.py | 89 +++++++ 8 files changed, 758 insertions(+), 113 deletions(-) create mode 100644 tests/data/cocom.json create mode 100644 tests/data/colic.json create mode 100644 tests/test_cocom.py create mode 100644 tests/test_colic.py diff --git a/grimoire_elk/enriched/cocom.py b/grimoire_elk/enriched/cocom.py index 92ff46525..97752d140 100644 --- a/grimoire_elk/enriched/cocom.py +++ b/grimoire_elk/enriched/cocom.py @@ -138,12 +138,14 @@ def __add_derived_metrics(self, file_analysis, eitem): """ Add derived metrics fields """ # TODO: Fix Logic: None rather than 1 - if None not in [eitem["loc"], eitem["comments"], eitem["num_funs"]]: - eitem["loc_per_comment_lines"] = eitem["loc"] / max(eitem["comments"], 1) - eitem["loc_per_blank_lines"] = eitem["loc"] / max(eitem["blanks"], 1) - eitem["loc_per_function"] = eitem["loc"] / max(eitem["num_funs"], 1) + if eitem["loc"] is not None and eitem["comments"] is not None 
and eitem["num_funs"] is not None: + eitem["comments_per_loc"] = round(eitem["comments"] / max(eitem["loc"], 1), 2) + eitem["blanks_per_loc"] = round(eitem["blanks"] / max(eitem["loc"], 1), 2) + eitem["loc_per_function"] = round(eitem["loc"] / max(eitem["num_funs"], 1), 2) else: - eitem["loc_per_comment_lines"] = eitem["loc_per_blank_lines"] = eitem["loc_per_function"] = None + eitem["comments_per_loc"] = None + eitem["blanks_per_loc"] = None + eitem["loc_per_function"] = None return eitem @@ -176,7 +178,7 @@ def enrich_items(self, ocean_backend, events=False): return num_items def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False, - out_index="cocom_enrich_graal_repo", interval_months=3, + out_index="cocom_enrich_graal_repo", interval_months=[3], date_field="grimoire_creation_date"): logger.info("Doing enrich_repository_analysis study for index {}" @@ -185,12 +187,14 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100, verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection) in_index = enrich_backend.elastic.index + interval_months = list(map(int, interval_months)) unique_repos = es_in.search( index=in_index, body=get_unique_repository()) repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])] + current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0) num_items = 0 ins_items = 0 @@ -198,58 +202,61 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal es_out = ElasticSearch(enrich_backend.elastic.url, out_index) evolution_items = [] - to_month = get_to_date(es_in, in_index, out_index, repository_url) - to_month = to_month.replace(day=1, hour=0, minute=0, second=0) - current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0) - - while to_month < current_month: - files_at_time = 
es_in.search( - index=in_index, - body=get_files_at_time(repository_url, to_month.isoformat()) - )['aggregations']['file_stats'].get("buckets", []) - - if not len(files_at_time): - to_month = to_month + relativedelta(months=+interval_months) - continue - - repository_name = repository_url.split("/")[-1] - evolution_item = { - "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval_months), - "origin": repository_url, - "interval_months": interval_months, - "study_creation_date": to_month.isoformat(), - "total_files": len(files_at_time) - } - - for file_ in files_at_time: - file_details = file_["1"]["hits"]["hits"][0]["_source"] - - for metric in self.metrics: - total_metric = "total_" + metric - evolution_item[total_metric] = evolution_item.get(total_metric, 0) - evolution_item[total_metric] += file_details[metric] if file_details[metric] is not None else 0 - - # TODO: Fix Logic: None rather than 1 - evolution_item["total_loc_per_comment_lines"] = evolution_item["total_loc"] / \ - max(evolution_item["total_comments"], 1) - evolution_item["total_loc_per_blank_lines"] = evolution_item["total_loc"] / max(evolution_item["total_blanks"], 1) - evolution_item["total_loc_per_function"] = evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1) - - evolution_items.append(evolution_item) - - if len(evolution_items) >= self.elastic.max_items_bulk: - num_items += len(evolution_items) - ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) - evolution_items = [] + for interval in interval_months: - to_month = to_month + relativedelta(months=+interval_months) + to_month = get_to_date(es_in, in_index, out_index, repository_url, interval) + to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0) - if len(evolution_items) > 0: - num_items += len(evolution_items) - ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + while to_month < current_month: + files_at_time = 
es_in.search( + index=in_index, + body=get_files_at_time(repository_url, to_month.isoformat()) + )['aggregations']['file_stats'].get("buckets", []) - if num_items != ins_items: - missing = num_items - ins_items - logger.error("%s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items)) - else: - logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items)) + if not len(files_at_time): + to_month = to_month + relativedelta(months=+interval) + continue + + repository_name = repository_url.split("/")[-1] + evolution_item = { + "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval), + "origin": repository_url, + "interval_months": interval, + "study_creation_date": to_month.isoformat(), + "total_files": len(files_at_time) + } + + for file_ in files_at_time: + file_details = file_["1"]["hits"]["hits"][0]["_source"] + + for metric in self.metrics: + total_metric = "total_" + metric + evolution_item[total_metric] = evolution_item.get(total_metric, 0) + evolution_item[total_metric] += file_details[metric] if file_details[metric] is not None else 0 + + # TODO: Fix Logic: None rather than 1 + evolution_item["total_comments_per_loc"] = round( + evolution_item["total_comments"] / max(evolution_item["total_loc"], 1), 2) + evolution_item["total_blanks_per_loc"] = round( + evolution_item["total_blanks"] / max(evolution_item["total_loc"], 1), 2) + evolution_item["total_loc_per_function"] = round( + evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1), 2) + + evolution_items.append(evolution_item) + + if len(evolution_items) >= self.elastic.max_items_bulk: + num_items += len(evolution_items) + ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + evolution_items = [] + + to_month = to_month + relativedelta(months=+interval) + + if len(evolution_items) > 0: + num_items += len(evolution_items) + ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + + if 
num_items != ins_items: + missing = num_items - ins_items + logger.error("%s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items)) + else: + logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items)) diff --git a/grimoire_elk/enriched/colic.py b/grimoire_elk/enriched/colic.py index 1e9469d41..589eb2e2a 100644 --- a/grimoire_elk/enriched/colic.py +++ b/grimoire_elk/enriched/colic.py @@ -63,7 +63,7 @@ def has_identities(self): def get_field_unique_id(self): return "id" - def get_licensed_files(repository_url, to_date): + def get_licensed_files(self, repository_url, to_date): """ Retrieve all the licensed files until the to_date, corresponding to the given repository. """ @@ -108,7 +108,7 @@ def get_licensed_files(repository_url, to_date): return query_licensed_files - def get_copyrighted_files(repository_url, to_date): + def get_copyrighted_files(self, repository_url, to_date): """ Retrieve all the copyrighted files until the to_date, corresponding to the given repository. 
""" @@ -260,7 +260,7 @@ def enrich_items(self, ocean_backend, events=False): return num_items def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=False, - out_index="colic_enrich_graal_repo", interval_months=3, + out_index="colic_enrich_graal_repo", interval_months=[3], date_field="grimoire_creation_date"): logger.info("Doing enrich_colic_analysis study for index {}" @@ -269,12 +269,14 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100, verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection) in_index = enrich_backend.elastic.index + interval_months = list(map(int, interval_months)) unique_repos = es_in.search( index=in_index, body=get_unique_repository()) repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])] + current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0) num_items = 0 ins_items = 0 @@ -282,53 +284,59 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa es_out = ElasticSearch(enrich_backend.elastic.url, out_index) evolution_items = [] - to_month = get_to_date(es_in, in_index, out_index, repository_url) - to_month = to_month.replace(day=1, hour=0, minute=0, second=0) - current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0) - - while to_month < current_month: - copyrighted_files_at_time = es_in.search( - index=in_index, - body=self.get_copyrighted_files(repository_url, to_month.isoformat())) - - licensed_files_at_time = es_in.search( - index=in_index, - body=self.get_licensed_files(repository_url, to_month.isoformat())) - - files_at_time = es_in.search( - index=in_index, - body=get_files_at_time(repository_url, to_month.isoformat())) - - licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"]) - copyrighted_files = 
int(copyrighted_files_at_time["aggregations"]["1"]["value"]) - total_files = int(files_at_time["aggregations"]["1"]["value"]) - - repository_name = repository_url.split("/")[-1] - evolution_item = { - "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval_months), - "origin": repository_url, - "interval_months": interval_months, - "study_creation_date": to_month.isoformat(), - "licensed_files": licensed_files, - "copyrighted_files": copyrighted_files, - "total_files": total_files - } + for interval in interval_months: + + to_month = get_to_date(es_in, in_index, out_index, repository_url, interval) + to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0) + + while to_month < current_month: + copyrighted_files_at_time = es_in.search( + index=in_index, + body=self.get_copyrighted_files(repository_url, to_month.isoformat())) + + licensed_files_at_time = es_in.search( + index=in_index, + body=self.get_licensed_files(repository_url, to_month.isoformat())) + + files_at_time = es_in.search( + index=in_index, + body=get_files_at_time(repository_url, to_month.isoformat())) + + licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"]) + copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"]) + # TODO: Fix - need more efficient query + total_files = len(files_at_time['aggregations']['file_stats'].get("buckets", [])) + + if not total_files: + to_month = to_month + relativedelta(months=+interval) + continue + + repository_name = repository_url.split("/")[-1] + evolution_item = { + "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval), + "origin": repository_url, + "interval_months": interval, + "study_creation_date": to_month.isoformat(), + "licensed_files": licensed_files, + "copyrighted_files": copyrighted_files, + "total_files": total_files + } - evolution_items.append(evolution_item) + evolution_items.append(evolution_item) - if len(evolution_items) >= 
self.elastic.max_items_bulk: - num_items += len(evolution_items) - ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) - evolution_items = [] + if len(evolution_items) >= self.elastic.max_items_bulk: + num_items += len(evolution_items) + ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + evolution_items = [] - to_month = to_month + relativedelta(months=+interval_months) + to_month = to_month + relativedelta(months=+interval) - if len(evolution_items) > 0: - num_items += len(evolution_items) - ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + if len(evolution_items) > 0: + num_items += len(evolution_items) + ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) - if num_items != ins_items: - missing = num_items - ins_items - logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items)) - else: - logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items)) + if num_items != ins_items: + missing = num_items - ins_items + logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items)) + else: + logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items)) diff --git a/grimoire_elk/enriched/graal_study_evolution.py b/grimoire_elk/enriched/graal_study_evolution.py index 0df72facd..a37eb96f4 100644 --- a/grimoire_elk/enriched/graal_study_evolution.py +++ b/grimoire_elk/enriched/graal_study_evolution.py @@ -43,7 +43,7 @@ def get_unique_repository(): return query_unique_repository -def get_last_study_date(repository_url): +def get_last_study_date(repository_url, interval): """ Retrieve the last study_creation_date of the item corresponding to given repository from the study index. 
""" @@ -64,11 +64,15 @@ def get_last_study_date(repository_url): "term": { "origin.keyword": "%s" } + },{ + "term":{ + "interval_months": "%s" + } }] } } } - """ % (repository_url) + """ % (repository_url, interval) return query_last_study_date @@ -117,8 +121,6 @@ def get_files_at_time(repository_url, to_date): corresponding to the given repository. """ - # TODO: Fix for interval month matching - query_files_at_time = """ { "size": 0, @@ -167,14 +169,14 @@ def get_files_at_time(repository_url, to_date): return query_files_at_time -def get_to_date(es_in, in_index, out_index, repository_url): +def get_to_date(es_in, in_index, out_index, repository_url, interval): """ Get the appropriate to_date value for incremental insertion. """ study_data_available = False if es_in.indices.exists(index=out_index): last_study_date = es_in.search( index=out_index, - body=get_last_study_date(repository_url))["aggregations"]["1"] + body=get_last_study_date(repository_url, interval))["aggregations"]["1"] if last_study_date["value"] is not None: study_data_available = True diff --git a/grimoire_elk/raw/graal.py b/grimoire_elk/raw/graal.py index 750aa80be..f3b5a6963 100644 --- a/grimoire_elk/raw/graal.py +++ b/grimoire_elk/raw/graal.py @@ -56,7 +56,7 @@ def get_elastic_mappings(es_major): class GraalOcean(ElasticOcean): - """CoLic Ocean feeder""" + """Graal Ocean feeder""" mapping = Mapping diff --git a/tests/data/cocom.json b/tests/data/cocom.json new file mode 100644 index 000000000..2d65c6700 --- /dev/null +++ b/tests/data/cocom.json @@ -0,0 +1,221 @@ +[{ + "backend_name": "CoCom", + "backend_version": "0.2.5", + "category": "code_complexity_lizard_file", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Sun Jun 2 18:34:23 2019 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Sun Jun 2 18:34:23 2019 +0200", + "Signed-off-by": ["Valerio Cosentino "], + "analysis": [{ + "avg_ccn": 2.4193548387096775, + "avg_loc": 8.419354838709678, + "avg_tokens": 
60.96774193548387, + "blanks": 158, + "ccn": 75, + "comments": 193, + "ext": "py", + "file_path": "graal/graal.py", + "loc": 372, + "num_funs": 31, + "tokens": 2207 + }], + "analyzer": "lizard_file", + "commit": "692ed86f888d2e7a5ce81a5b8a90f47d05cc5588", + "message": "[graal] Derive `git_path` from `uri`\n \n This code derives the `git_path` of a target repository\n based on its `uri`. This change is needed to allow the\n execution from mordred/ELK, as done with Perceval.\n\n Signed-off-by: Valerio Cosentino " + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1562053790.544543, + "updated_on": 1559493263.0, + "uuid": "f86b37d493386ec7467976ff5a707d9c72c54cf9" + }, + { + "backend_name": "CoCom", + "backend_version": "0.2.5", + "category": "code_complexity_lizard_file", + "data": { + "Author": "inishchith ", + "AuthorDate": "Mon Jun 3 22:44:15 2019 +0530", + "Commit": "inishchith ", + "CommitDate": "Mon Jun 3 22:47:27 2019 +0530", + "Signed-off-by": ["inishchith "], + "analysis": [{ + "avg_ccn": 1.3461538461538463, + "avg_loc": 8.826923076923077, + "avg_tokens": 81.92307692307692, + "blanks": 204, + "ccn": 70, + "comments": 77, + "ext": "py", + "file_path": "tests/test_graal.py", + "loc": 527, + "num_funs": 52, + "tokens": 4623 + }], + "analyzer": "lizard_file", + "commit": "41f207a9349ae497055ac03157d9915ae81031e0", + "message": "[tests] Add test for deriving `git_path` from `uri`\n \n Signed-off-by: inishchith " + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1562053790.902134, + "updated_on": 1559582247.0, + "uuid": "fc17ad9f41767d66c4d2aed6d4b0ba5d072c9980" + }, + { + "backend_name": "CoCom", + "backend_version": "0.2.5", + "category": "code_complexity_lizard_file", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Thu Jun 27 
09:25:50 2019 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Thu Jun 27 09:25:50 2019 +0200", + "Merge": "5a526a6 26921fe", + "analysis": [{ + "avg_ccn": 2.6666666666666665, + "avg_loc": 19.333333333333332, + "avg_tokens": 129.66666666666666, + "blanks": 26, + "ccn": 8, + "comments": 63, + "ext": "py", + "file_path": "graal/backends/core/analyzers/lizard.py", + "loc": 80, + "num_funs": 3, + "tokens": 421 + }, + { + "avg_ccn": 2.6363636363636362, + "avg_loc": 8.818181818181818, + "avg_tokens": 57.63636363636363, + "blanks": 58, + "ccn": 29, + "comments": 107, + "ext": "py", + "file_path": "graal/backends/core/cocom.py", + "loc": 178, + "num_funs": 11, + "tokens": 938 + }, + { + "avg_ccn": 1.4, + "avg_loc": 13.533333333333333, + "avg_tokens": 127.26666666666667, + "blanks": 71, + "ccn": 21, + "comments": 39, + "ext": "py", + "file_path": "tests/test_cocom.py", + "loc": 234, + "num_funs": 15, + "tokens": 2056 + }, + { + "avg_ccn": 1.3333333333333333, + "avg_loc": 26.666666666666668, + "avg_tokens": 269.3333333333333, + "blanks": 17, + "ccn": 4, + "comments": 25, + "ext": "py", + "file_path": "tests/test_lizard.py", + "loc": 89, + "num_funs": 3, + "tokens": 852 + } + ], + "analyzer": "lizard_file", + "commit": "bfe91c3f9ca046084143f15e117bdd691e0fe12f", + "message": "Merge branch repository_level_cocom_lizard of https: //github.com/inishchith/graal\n \nMerges #39" + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1562053800.728394, + "updated_on": 1561620350.0, + "uuid": "49a416e4ab44e6f3b02eb96b08a026abdb6afa96" + }, + { + "backend_name": "CoCom", + "backend_version": "0.2.5", + "category": "code_complexity_lizard_file", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Thu May 17 17:26:14 2018 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Thu May 17 17:26:14 2018 +0200", + "analysis": [{ + "avg_ccn": null, + "avg_loc": 
null, + "avg_tokens": null, + "blanks": null, + "ccn": null, + "comments": null, + "file_path": "tests/data/analyzers/sample_code.py", + "loc": null, + "num_funs": null, + "tokens": null + }, + { + "avg_ccn": 2.259259259259259, + "avg_loc": 7.851851851851852, + "avg_tokens": 55.81481481481482, + "blanks": 135, + "ccn": 61, + "comments": 169, + "ext": "py", + "file_path": "graal/graal.py", + "loc": 315, + "num_funs": 27, + "tokens": 1837 + }, + { + "avg_ccn": 2.3333333333333335, + "avg_loc": 6.555555555555555, + "avg_tokens": 55.0, + "blanks": 27, + "ccn": 21, + "comments": 31, + "ext": "py", + "file_path": "tests/data/sample_code.py", + "loc": 72, + "num_funs": 9, + "tokens": 535 + }, + { + "avg_ccn": null, + "avg_loc": null, + "avg_tokens": null, + "blanks": null, + "ccn": null, + "comments": null, + "file_path": "tests/data/graal/graaltest.zip", + "loc": null, + "num_funs": null, + "tokens": null + }, + { + "blanks": 62, + "comments": 39, + "ext": "zip", + "file_path": "tests/data/graaltest.zip", + "loc": 145 + } + ], + "analyzer": "lizard_file", + "commit": "f858376fdb3232417c8de196e04ce9db0e05c3e4", + "message": "[graal] Modify git_path parameterThis code replaces the parameter `git_path` to `gitpath` to ease\nthe integration with arthur. Thus git and graal tasks share somecommon parameters." 
+ }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1564575285.20279, + "updated_on": 1526570774.0, + "uuid": "0387fc9162b87ae8ad06f626be921d796e32c687" + } +] diff --git a/tests/data/colic.json b/tests/data/colic.json new file mode 100644 index 000000000..bbbc60f6d --- /dev/null +++ b/tests/data/colic.json @@ -0,0 +1,189 @@ +[{ + "backend_name": "CoLic", + "backend_version": "0.5.0", + "category": "code_license_scancode_cli", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Sun May 6 13:11:43 2018 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Sun May 6 13:11:43 2018 +0200", + "analysis": [{ + "authors": [], + "base_name": "LICENSE", + "copyrights": [{ + "end_line": 6, + "start_line": 4, + "value": "Copyright (c) 2007 Free Software Foundation, Inc. " + }], + "date": "2019-07-08", + "dirs_count": 0, + "extension": "", + "file_path": "LICENSE", + "file_type": "ASCII text", + "files_count": 0, + "holders": [{ + "end_line": 6, + "start_line": 4, + "value": "Free Software Foundation, Inc." + }], + "is_archive": false, + "is_binary": false, + "is_media": false, + "is_script": false, + "is_source": false, + "is_text": true, + "license_expressions": [ + "gpl-3.0" + ], + "licenses": [{ + "category": "Copyleft", + "end_line": 674, + "homepage_url": "http://www.gnu.org/licenses/gpl-3.0.html", + "is_exception": false, + "key": "gpl-3.0", + "matched_rule": { + "identifier": "gpl-3.0.LICENSE", + "is_license_notice": false, + "is_license_reference": false, + "is_license_tag": false, + "is_license_text": true, + "license_expression": "gpl-3.0", + "licenses": [ + "gpl-3.0" + ], + "match_coverage": 100.0, + "matched_length": 5700, + "matcher": "1-hash", + "rule_length": 5700, + "rule_relevance": 100 + }, + "matched_text": "GNU GENERAL PUBLIC LICENSE\n Version 3, 29 June 2007\n\n Copyright (C) 2007 Free Software Foundation, Inc. 
\n Everyone is permitted to copy and distribute verbatim copies\n of this license document, but changing it is not allowed.\n\n", + "name": "GNU General Public License 3.0", + "owner": "Free Software Foundation (FSF)", + "reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:gpl-3.0", + "score": 100.0, + "short_name": "GPL 3.0", + "spdx_license_key": "GPL-3.0-only", + "spdx_url": "https://spdx.org/licenses/GPL-3.0-only", + "start_line": 1, + "text_url": "http://www.gnu.org/licenses/gpl-3.0-standalone.html" + }], + "md5": "d32239bcb673463ab874e80d47fae504", + "mime_type": "text/plain", + "name": "LICENSE", + "path": "LICENSE", + "programming_language": null, + "scan_errors": [], + "sha1": "8624bcdae55baeef00cd11d5dfcfa60f68710a02", + "size": 35147, + "size_count": 0, + "type": "file" + }], + "analyzer": "scancode_cli", + "commit": "2fb9a49363021922eb0fcc9874baabfc252a827c", + "message": "[graal] Initial commit" + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1562563540.615095, + "updated_on": 1525605103.0, + "uuid": "29d7a294d2316825de824f1084a783f8479073e0" + }, + { + "backend_name": "CoLic", + "backend_version": "0.5.0", + "category": "code_license_scancode_cli", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Sun May 6 13:56:51 2018 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Sun May 6 13:56:51 2018 +0200", + "analysis": [{ + "authors": [{ + "end_line": 20, + "start_line": 19, + "value": "Valerio Cosentino " + }], + "base_name": "codecomplexity", + "copyrights": [{ + "end_line": 3, + "start_line": 3, + "value": "Copyright (c) 2015-2018 Bitergia" + }], + "date": "2019-07-08", + "dirs_count": 0, + "extension": ".py", + "file_path": "graal/codecomplexity.py", + "file_type": "Python script, ASCII text executable", + "files_count": 0, + "holders": [{ + "end_line": 3, + "start_line": 3, + "value": "Bitergia" + 
}], + "is_archive": false, + "is_binary": false, + "is_media": false, + "is_script": true, + "is_source": true, + "is_text": true, + "license_expressions": [ + "gpl-3.0-plus" + ], + "licenses": [{ + "category": "Copyleft", + "end_line": 17, + "homepage_url": "http://www.gnu.org/licenses/gpl-3.0-standalone.html", + "is_exception": false, + "key": "gpl-3.0-plus", + "matched_rule": { + "identifier": "gpl-3.0-plus_12.RULE", + "is_license_notice": true, + "is_license_reference": false, + "is_license_tag": false, + "is_license_text": false, + "license_expression": "gpl-3.0-plus", + "licenses": [ + "gpl-3.0-plus" + ], + "match_coverage": 98.2, + "matched_length": 109, + "matcher": "3-seq", + "rule_length": 111, + "rule_relevance": 100 + }, + "matched_text": "This program is free software; you can redistribute it and/or modify\n# it under the terms of the GNU General Public License as published by\n# the Free Software Foundation; either version 3 [of] [the] [License], or\n# (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the\n# GNU General Public License for more details.\n#\n# You should have received a copy of the GNU General Public License\n# along with this program; if not, write to the Free Software\n# Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-", + "name": "GNU General Public License 3.0 or later", + "owner": "Free Software Foundation (FSF)", + "reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:gpl-3.0-plus", + "score": 98.2, + "short_name": "GPL 3.0 or later", + "spdx_license_key": "GPL-3.0-or-later", + "spdx_url": "https://spdx.org/licenses/GPL-3.0-or-later", + "start_line": 5, + "text_url": "http://www.gnu.org/licenses/gpl-3.0-standalone.html" + }], + "md5": "aa66e700b06ead2a28c2dc29633ebc00", + "mime_type": "text/x-python", + "name": "codecomplexity.py", + "path": "codecomplexity.py", + "programming_language": "Python", + "scan_errors": [], + "sha1": "124e07ae6c850eb232aaf07f43cdb2b2ad2a1db1", + "size": 7817, + "size_count": 0, + "type": "file" + }], + "analyzer": "scancode_cli", + "commit": "a957488c9bd95e3b72a30611edc61496ee152430", + "message": "[codecomplexity] Enable analysis with no file filtering\n\nThis patch allows to handle analysis without file filtering." + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1562563562.34835, + "updated_on": 1525607811.0, + "uuid": "ce7c47568fd87100aff497dd7677b0736d85db1e" + } +] diff --git a/tests/test_cocom.py b/tests/test_cocom.py new file mode 100644 index 000000000..8838b30d3 --- /dev/null +++ b/tests/test_cocom.py @@ -0,0 +1,129 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# Authors: +# Nishchith Shetty +# +import logging +import unittest + +from base import TestBaseBackend + + +HEADER_JSON = {"Content-Type": "application/json"} + + +class TestCoCom(TestBaseBackend): + """Test CoCom backend""" + + connector = "cocom" + ocean_index = "test_" + connector + enrich_index = "test_" + connector + "_enrich" + + def test_has_identites(self): + """Test value of has_identities method""" + + enrich_backend = self.connectors[self.connector][2]() + self.assertFalse(enrich_backend.has_identities()) + + def test_items_to_raw(self): + """Test whether JSON items are properly inserted into ES""" + + result = self._test_items_to_raw() + + self.assertGreater(result['items'], 0) + self.assertGreater(result['raw'], 0) + self.assertGreaterEqual(result['items'], result['raw']) + + def test_raw_to_enrich(self): + """Test whether the raw index is properly enriched""" + + result = self._test_raw_to_enrich() + + self.assertGreater(result['raw'], 0) + self.assertGreater(result['enrich'], 0) + self.assertGreaterEqual(result['enrich'], result['raw']) + + enrich_backend = self.connectors[self.connector][2]() + + item = self.items[0] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['ccn'], 75) + self.assertEqual(eitem['num_funs'], 31) + self.assertEqual(eitem['tokens'], 2207) + self.assertEqual(eitem['loc'], 372) + self.assertEqual(eitem['ext'], "py") + self.assertEqual(eitem['blanks'], 158) + self.assertEqual(eitem['comments'], 193) + self.assertEqual(eitem['file_path'], 
"graal/graal.py") + self.assertEqual(eitem['modules'], ["graal"]) + self.assertEqual(eitem["comments_per_loc"], 0.52) + self.assertEqual(eitem["blanks_per_loc"], 0.42) + self.assertEqual(eitem["loc_per_function"], 12.0) + + item = self.items[1] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['ccn'], 70) + self.assertEqual(eitem['num_funs'], 52) + self.assertEqual(eitem['tokens'], 4623) + self.assertEqual(eitem['loc'], 527) + self.assertEqual(eitem['ext'], "py") + self.assertEqual(eitem['blanks'], 204) + self.assertEqual(eitem['comments'], 77) + self.assertEqual(eitem['file_path'], "tests/test_graal.py") + self.assertEqual(eitem['modules'], ["tests"]) + self.assertEqual(eitem["comments_per_loc"], 0.15) + self.assertEqual(eitem["blanks_per_loc"], 0.39) + self.assertEqual(eitem["loc_per_function"], 10.13) + + item = self.items[2] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['ccn'], 8) + self.assertEqual(eitem['num_funs'], 3) + self.assertEqual(eitem['tokens'], 421) + self.assertEqual(eitem['loc'], 80) + self.assertEqual(eitem['ext'], "py") + self.assertEqual(eitem['blanks'], 26) + self.assertEqual(eitem['comments'], 63) + self.assertEqual(eitem['file_path'], "graal/backends/core/analyzers/lizard.py") + self.assertEqual(eitem['modules'], ["graal", "graal/backends", "graal/backends/core", "graal/backends/core/analyzers"]) + self.assertEqual(eitem["comments_per_loc"], 0.79) + self.assertEqual(eitem["blanks_per_loc"], 0.33) + self.assertEqual(eitem["loc_per_function"], 26.67) + + item = self.items[3] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['ccn'], None) + self.assertEqual(eitem['num_funs'], None) + self.assertEqual(eitem['tokens'], None) + self.assertEqual(eitem['loc'], None) + self.assertEqual(eitem['ext'], None) + self.assertEqual(eitem['blanks'], None) + self.assertEqual(eitem['comments'], None) + self.assertEqual(eitem['file_path'], "tests/data/analyzers/sample_code.py") + 
self.assertEqual(eitem['modules'], ["tests", "tests/data", "tests/data/analyzers"]) + self.assertEqual(eitem["comments_per_loc"], None) + self.assertEqual(eitem["blanks_per_loc"], None) + self.assertEqual(eitem["loc_per_function"], None) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + logging.getLogger("urllib3").setLevel(logging.WARNING) + logging.getLogger("requests").setLevel(logging.WARNING) + unittest.main(warnings='ignore') diff --git a/tests/test_colic.py b/tests/test_colic.py new file mode 100644 index 000000000..a376ddbea --- /dev/null +++ b/tests/test_colic.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+# +# Authors: +# Nishchith Shetty +# +import logging +import unittest + +from base import TestBaseBackend + + +HEADER_JSON = {"Content-Type": "application/json"} + + +class TestCoLic(TestBaseBackend): + """Test CoLic backend""" + + connector = "colic" + ocean_index = "test_" + connector + enrich_index = "test_" + connector + "_enrich" + + def test_has_identites(self): + """Test value of has_identities method""" + + enrich_backend = self.connectors[self.connector][2]() + self.assertFalse(enrich_backend.has_identities()) + + def test_items_to_raw(self): + """Test whether JSON items are properly inserted into ES""" + + result = self._test_items_to_raw() + + self.assertGreater(result['items'], 0) + self.assertGreater(result['raw'], 0) + self.assertGreaterEqual(result['items'], result['raw']) + + def test_raw_to_enrich(self): + """Test whether the raw index is properly enriched""" + + result = self._test_raw_to_enrich() + + self.assertGreater(result['raw'], 0) + self.assertGreater(result['enrich'], 0) + self.assertGreaterEqual(result['enrich'], result['raw']) + + enrich_backend = self.connectors[self.connector][2]() + + item = self.items[0] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['licenses'], ["gpl-3.0"]) + self.assertEqual(eitem['has_license'], 1) + self.assertEqual(eitem['license_name'], ["GNU General Public License 3.0"]) + self.assertEqual(eitem['copyrights'], ["Copyright (c) 2007 Free Software Foundation, Inc. 
"]) + self.assertEqual(eitem['has_copyright'], 1) + self.assertEqual(eitem['modules'], []) + self.assertEqual(eitem['file_path'], "LICENSE") + + item = self.items[1] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['licenses'], ["gpl-3.0-plus"]) + self.assertEqual(eitem['has_license'], 1) + self.assertEqual(eitem['license_name'], ["GNU General Public License 3.0 or later"]) + self.assertEqual(eitem['copyrights'], ["Copyright (c) 2015-2018 Bitergia"]) + self.assertEqual(eitem['has_copyright'], 1) + self.assertEqual(eitem['modules'], ["graal"]) + self.assertEqual(eitem['file_path'], "graal/codecomplexity.py") + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + logging.getLogger("urllib3").setLevel(logging.WARNING) + logging.getLogger("requests").setLevel(logging.WARNING) + unittest.main(warnings='ignore') From 4b4e0886711468f38b6dca083b483012662e31ca Mon Sep 17 00:00:00 2001 From: inishchith Date: Thu, 1 Aug 2019 01:19:07 +0530 Subject: [PATCH 5/8] [schema] Add CoCom and CoLic schema Signed-off-by: inishchith --- schema/graal_cocom.csv | 31 +++++++++++++++++++++++++++++++ schema/graal_colic.csv | 26 ++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 schema/graal_cocom.csv create mode 100644 schema/graal_colic.csv diff --git a/schema/graal_cocom.csv b/schema/graal_cocom.csv new file mode 100644 index 000000000..b4e2acb9a --- /dev/null +++ b/schema/graal_cocom.csv @@ -0,0 +1,31 @@ +name,type,aggregatable,description +author,string,true,"Author name." +author_date,date,true,"Author date (when the original author made the commit)." +blanks,number,true,"Number of blank lines in a file." +blanks_per_loc,number,true,"Number of blank lines per line of code." +ccn,number,true,"Code Complexity of a file." +comments,number,true,"Number of comments in a file." +comments_per_loc,number,true,"Number of comment lines per line of code." 
+commit_date,date,true,"Date when committer made this commit." +commit_sha,string,true,"Commit hash." +committer,string,true,"Committer name." +ext,string,true,"File extension" +file_path,string,true,"File Path" +grimoire_creation_date,date,true,"Commit date (when the original author made the commit)." +id,string,true,"Graal Item Id." +loc,number,true,"Lines of code in a file." +loc_per_function,number,true,"Number of lines of code per function definition." +message,string,true,"Commit message as a single String." +metadata__enriched_on,date,true,"Date when the item was enriched." +metadata__gelk_backend_name,keyword,true,"Name of the backend used to enrich information." +metadata__gelk_version,keyword,true,"Version of the backend used to enrich information." +metadata__timestamp,date,true,"Date when the item was stored in RAW index." +metadata__updated_on,date,true,"Date when the item was updated in its original data source." +modules,string,true,"Modules which the file is part of" +num_funs,number,true,"Number of function definitions in the file" +origin,keyword,true,"Original URL where the repository was retrieved from." +project_1,keyword,true,"Project (if more than one level is allowed in project hierarchy)" +project,keyword,true,"Project." +tag,keyword,true,"Graal tag." +tokens,number,true,"Number of tokens in a file" +uuid,keyword,true,"Graal UUID." diff --git a/schema/graal_colic.csv b/schema/graal_colic.csv new file mode 100644 index 000000000..676844644 --- /dev/null +++ b/schema/graal_colic.csv @@ -0,0 +1,26 @@ +name,type,aggregatable,description +author,string,true,"Author name." +author_date,date,true,"Author date (when the original author made the commit)." +commit_date,date,true,"Date when committer made this commit." +commit_sha,string,true,"Commit hash." +committer,string,true,"Committer name." +copyrights,string,true,"Copyright definitions found in file by analyzer."
+file_path,string,true,"File Path" +grimoire_creation_date,date,true,"Commit date (when the original author made the commit)." +has_copyright,number,true,"1 if copyright definition exists else 0" +has_license,number,true,"1 if license definition exists else 0" +id,string,true,"Graal Item Id." +license_name,string,true,"License definitions found in file by analyzer" +licenses,string,true,"License keyword/tag." +message,string,true,"Commit message as a single String." +metadata__enriched_on,date,true,"Date when the item was enriched." +metadata__gelk_backend_name,keyword,true,"Name of the backend used to enrich information." +metadata__gelk_version,keyword,true,"Version of the backend used to enrich information." +metadata__timestamp,date,true,"Date when the item was stored in RAW index." +metadata__updated_on,date,true,"Date when the item was updated in its original data source." +modules,string,true,"Modules which the file is part of." +origin,keyword,true,"Original URL where the repository was retrieved from." +project_1,keyword,true,"Project (if more than one level is allowed in project hierarchy)" +project,keyword,true,"Project." +tag,keyword,true,"Graal tag." +uuid,keyword,true,"Graal UUID." 
From e0430d1f21258f85f7bdcd6fade2fdede4455d02 Mon Sep 17 00:00:00 2001 From: inishchith Date: Thu, 1 Aug 2019 23:29:29 +0530 Subject: [PATCH 6/8] [tests:study] Add tests for CoCom & CoLic study implementation Signed-off-by: inishchith --- grimoire_elk/enriched/cocom.py | 5 +++-- grimoire_elk/enriched/colic.py | 5 +++-- tests/test_cocom.py | 15 +++++++++++++++ tests/test_colic.py | 15 +++++++++++++++ 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/grimoire_elk/enriched/cocom.py b/grimoire_elk/enriched/cocom.py index 97752d140..3761adf39 100644 --- a/grimoire_elk/enriched/cocom.py +++ b/grimoire_elk/enriched/cocom.py @@ -181,8 +181,7 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal out_index="cocom_enrich_graal_repo", interval_months=[3], date_field="grimoire_creation_date"): - logger.info("Doing enrich_repository_analysis study for index {}" - .format(self.elastic.anonymize_url(self.elastic.index_url))) + logger.info("[cocom] Starting enrich_repository_analysis study") es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100, verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection) @@ -260,3 +259,5 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal logger.error("%s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items)) else: logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items)) + + logger.info("[cocom] Ending enrich_repository_analysis study") diff --git a/grimoire_elk/enriched/colic.py b/grimoire_elk/enriched/colic.py index 589eb2e2a..fad933fdf 100644 --- a/grimoire_elk/enriched/colic.py +++ b/grimoire_elk/enriched/colic.py @@ -263,8 +263,7 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa out_index="colic_enrich_graal_repo", interval_months=[3], date_field="grimoire_creation_date"): - logger.info("Doing enrich_colic_analysis study for index {}" - 
.format(self.elastic.anonymize_url(self.elastic.index_url))) + logger.info("[colic] Starting enrich_colic_analysis study") es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100, verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection) @@ -340,3 +339,5 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items)) else: logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items)) + + logger.info("[colic] Ending enrich_colic_analysis study") diff --git a/tests/test_cocom.py b/tests/test_cocom.py index 8838b30d3..121c5ca75 100644 --- a/tests/test_cocom.py +++ b/tests/test_cocom.py @@ -23,6 +23,7 @@ import unittest from base import TestBaseBackend +from grimoire_elk.enriched.cocom import logger HEADER_JSON = {"Content-Type": "application/json"} @@ -121,6 +122,20 @@ def test_raw_to_enrich(self): self.assertEqual(eitem["blanks_per_loc"], None) self.assertEqual(eitem["loc_per_function"], None) + def test_cocom_analysis_study(self): + """ Test that the cocom analysis study works correctly """ + + study, ocean_backend, enrich_backend = self._test_study('enrich_repo_analysis') + + with self.assertLogs(logger, level='INFO') as cm: + + if study.__name__ == "enrich_repo_analysis": + study(ocean_backend, enrich_backend) + self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.cocom:[cocom] Starting ' + 'enrich_repository_analysis study') + self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.cocom:[cocom] Ending ' + 'enrich_repository_analysis study') + if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') diff --git a/tests/test_colic.py b/tests/test_colic.py index a376ddbea..db0c441f2 100644 --- a/tests/test_colic.py +++ b/tests/test_colic.py @@ -23,6 +23,7 @@ import unittest from base import TestBaseBackend +from 
grimoire_elk.enriched.colic import logger HEADER_JSON = {"Content-Type": "application/json"} @@ -81,6 +82,20 @@ def test_raw_to_enrich(self): self.assertEqual(eitem['modules'], ["graal"]) self.assertEqual(eitem['file_path'], "graal/codecomplexity.py") + def test_colic_analysis_study(self): + """ Test that the colic analysis study works correctly """ + + study, ocean_backend, enrich_backend = self._test_study('enrich_colic_analysis') + + with self.assertLogs(logger, level='INFO') as cm: + + if study.__name__ == "enrich_colic_analysis": + study(ocean_backend, enrich_backend) + self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.colic:[colic] Starting ' + 'enrich_colic_analysis study') + self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.colic:[colic] Ending ' + 'enrich_colic_analysis study') + if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') From 400a53d4008d6b721c665fddd8797ffc84aa94cb Mon Sep 17 00:00:00 2001 From: inishchith Date: Mon, 5 Aug 2019 20:08:47 +0530 Subject: [PATCH 7/8] [colic] Add category reference implementation and corresponding tests Signed-off-by: inishchith --- grimoire_elk/enriched/cocom.py | 71 +++++++++- grimoire_elk/enriched/colic.py | 99 +++++++++++-- .../enriched/graal_study_evolution.py | 6 +- grimoire_elk/raw/graal.py | 9 +- tests/data/colic.json | 130 ++++++++++++++++++ tests/test_cocom.py | 3 +- tests/test_colic.py | 33 ++++- 7 files changed, 326 insertions(+), 25 deletions(-) diff --git a/grimoire_elk/enriched/cocom.py b/grimoire_elk/enriched/cocom.py index 3761adf39..0e229433a 100644 --- a/grimoire_elk/enriched/cocom.py +++ b/grimoire_elk/enriched/cocom.py @@ -13,8 +13,7 @@ # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# along with this program. 
If not, see . # # Authors: # Valerio Cosentino @@ -32,6 +31,7 @@ get_unique_repository, get_files_at_time) from .utils import fix_field_date +from ..elastic_mapping import Mapping as BaseMapping from grimoirelab_toolkit.datetime import datetime_utcnow from grimoire_elk.elastic import ElasticSearch @@ -41,6 +41,71 @@ logger = logging.getLogger(__name__) +class Mapping(BaseMapping): + + @staticmethod + def get_elastic_mappings(es_major): + """Get Elasticsearch mapping. + + Ensure data.message is string, since it can be very large + + :param es_major: major version of Elasticsearch, as string + :returns: dictionary with a key, 'items', with the mapping + """ + + mapping = ''' + { + "dynamic":true, + "properties": { + "id" : { + "type" : "keyword" + }, + "interval_months" : { + "type" : "long" + }, + "origin" : { + "type" : "keyword" + }, + "study_creation_date" : { + "type" : "date" + }, + "total_blanks" : { + "type" : "long" + }, + "total_blanks_per_loc" : { + "type" : "float" + }, + "total_ccn" : { + "type" : "long" + }, + "total_comments" : { + "type" : "long" + }, + "total_comments_per_loc" : { + "type" : "float" + }, + "total_files" : { + "type" : "long" + }, + "total_loc" : { + "type" : "long" + }, + "total_loc_per_function" : { + "type" : "float" + }, + "total_num_funs" : { + "type" : "long" + }, + "total_tokens" : { + "type" : "long" + } + } + } + ''' + + return {"items": mapping} + + class CocomEnrich(Enrich): metrics = ["ccn", "num_funs", "tokens", "loc", "comments", "blanks"] @@ -198,7 +263,7 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal ins_items = 0 for repository_url in repositories: - es_out = ElasticSearch(enrich_backend.elastic.url, out_index) + es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping) evolution_items = [] for interval in interval_months: diff --git a/grimoire_elk/enriched/colic.py b/grimoire_elk/enriched/colic.py index fad933fdf..89036ef36 100644 --- 
a/grimoire_elk/enriched/colic.py +++ b/grimoire_elk/enriched/colic.py @@ -13,8 +13,7 @@ # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# along with this program. If not, see . # # Authors: # Nishchith Shetty @@ -30,6 +29,7 @@ get_unique_repository, get_files_at_time) from .utils import fix_field_date +from ..elastic_mapping import Mapping as BaseMapping from grimoirelab_toolkit.datetime import datetime_utcnow from grimoire_elk.elastic import ElasticSearch @@ -39,6 +39,50 @@ logger = logging.getLogger(__name__) +class Mapping(BaseMapping): + + @staticmethod + def get_elastic_mappings(es_major): + """Get Elasticsearch mapping. + + Ensure data.message is string, since it can be very large + + :param es_major: major version of Elasticsearch, as string + :returns: dictionary with a key, 'items', with the mapping + """ + + mapping = ''' + { + "dynamic":true, + "properties": { + "id" : { + "type" : "keyword" + }, + "interval_months" : { + "type" : "long" + }, + "origin" : { + "type" : "keyword" + }, + "study_creation_date" : { + "type" : "date" + }, + "total_files": { + "type": "long" + }, + "licensed_files": { + "type": "long" + }, + "copyrighted_files": { + "type": "long" + } + } + } + ''' + + return {"items": mapping} + + class ColicEnrich(Enrich): def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None, @@ -167,8 +211,8 @@ def extract_modules(self, file_path): return modules @metadata - def get_rich_item(self, file_analysis): - # TODO: requires adjustments regarding category of backend used + def __get_rich_scancode(self, file_analysis): + # Scancode and Scancode-CLI Implementation eitem = {} eitem["file_path"] = file_analysis["file_path"] @@ -192,14 +236,44 @@ def get_rich_item(self, file_analysis): return eitem + @metadata + def 
__get_rich_nomossa(self, file_analysis): + # NOMOS analyzer implementation + + eitem = {} + eitem["file_path"] = file_analysis["file_path"] + eitem["modules"] = self.extract_modules(eitem["file_path"]) + eitem["licenses"] = [] + eitem["license_name"] = [] + eitem["has_license"] = 0 + + if file_analysis["licenses"] != "No_license_found": + eitem["has_license"] = 1 + for _license in file_analysis["licenses"]: + eitem["licenses"].append(_license) + eitem["license_name"].append(_license) + + # NOMOS doesn't provide copyright information. + eitem["copyrights"] = [] + eitem["has_copyright"] = 0 + + return eitem + def get_rich_items(self, item): - # The real data - entry = item['data'] + """ + :category: code_license_scancode_cli(default) + """ + if item["category"] == "code_license_nomos": + get_rich_item = self.__get_rich_nomossa + else: + get_rich_item = self.__get_rich_scancode + + entry = item['data'] enriched_items = [] for file_analysis in entry["analysis"]: - eitem = self.get_rich_item(file_analysis) + eitem = get_rich_item(file_analysis) for f in self.RAW_FIELDS_COPY: if f in item: @@ -208,13 +282,14 @@ def get_rich_items(self, item): eitem[f] = None # common attributes - eitem['commit_sha'] = entry['commit'] eitem['author'] = entry['Author'] - eitem['committer'] = entry['Commit'] - eitem['commit'] = entry['commit'] - eitem['message'] = entry['message'] eitem['author_date'] = fix_field_date(entry['AuthorDate']) + eitem["category"] = item["category"] + eitem['commit'] = entry['commit'] + eitem['committer'] = entry['Commit'] eitem['commit_date'] = fix_field_date(entry['CommitDate']) + eitem['commit_sha'] = entry['commit'] + eitem['message'] = entry['message'] if self.prjs_map: eitem.update(self.get_item_project(eitem)) @@ -280,7 +355,7 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa ins_items = 0 for repository_url in repositories: - es_out = ElasticSearch(enrich_backend.elastic.url, out_index) + es_out = 
ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping) evolution_items = [] for interval in interval_months: diff --git a/grimoire_elk/enriched/graal_study_evolution.py b/grimoire_elk/enriched/graal_study_evolution.py index a37eb96f4..c840a15f5 100644 --- a/grimoire_elk/enriched/graal_study_evolution.py +++ b/grimoire_elk/enriched/graal_study_evolution.py @@ -13,8 +13,7 @@ # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# along with this program. If not, see <http://www.gnu.org/licenses/>. # # Authors: # Valerio Cosentino @@ -33,7 +32,8 @@ def get_unique_repository(): "aggs": { "unique_repos": { "terms": { - "field": "origin" + "field": "origin", + "size": 5000 } } } diff --git a/grimoire_elk/raw/graal.py b/grimoire_elk/raw/graal.py index f3b5a6963..df998de38 100644 --- a/grimoire_elk/raw/graal.py +++ b/grimoire_elk/raw/graal.py @@ -13,8 +13,7 @@ # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# along with this program. If not, see <http://www.gnu.org/licenses/>.
# # Authors: # Nishchith Shetty @@ -37,7 +36,7 @@ def get_elastic_mappings(es_major): """ mapping = ''' - { + { "dynamic":true, "properties": { "data": { @@ -45,6 +44,10 @@ def get_elastic_mappings(es_major): "message": { "type": "text", "index": true + }, + "analysis": { + "dynamic":false, + "properties": {} } } } diff --git a/tests/data/colic.json b/tests/data/colic.json index bbbc60f6d..f559075de 100644 --- a/tests/data/colic.json +++ b/tests/data/colic.json @@ -185,5 +185,135 @@ "timestamp": 1562563562.34835, "updated_on": 1525607811.0, "uuid": "ce7c47568fd87100aff497dd7677b0736d85db1e" + }, + { + "backend_name": "CoLic", + "backend_version": "0.5.0", + "category": "code_license_scancode", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Sun May 6 13:56:51 2018 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Sun May 6 13:56:51 2018 +0200", + "analysis": [{ + "copyrights": [{ + "end_line": 3, + "start_line": 3, + "value": "Copyright (c) 2015-2018 Bitergia" + }], + "file_path": "graal/codecomplexity.py", + "licenses": [{ + "category": "Copyleft", + "end_line": 17, + "homepage_url": "http://www.gnu.org/licenses/gpl-3.0-standalone.html", + "is_exception": false, + "key": "gpl-3.0-plus", + "matched_rule": { + "identifier": "gpl-3.0-plus_117.RULE", + "is_license_notice": true, + "is_license_reference": false, + "is_license_tag": false, + "is_license_text": false, + "license_expression": "gpl-3.0-plus", + "licenses": [ + "gpl-3.0-plus" + ], + "match_coverage": 97.35, + "matched_length": 110, + "matcher": "3-seq", + "rule_length": 113, + "rule_relevance": 100.0 + }, + "name": "GNU General Public License 3.0 or later", + "owner": "Free Software Foundation (FSF)", + "reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:gpl-3.0-plus", + "score": 97.35, + "short_name": "GPL 3.0 or later", + "spdx_license_key": "GPL-3.0-or-later", + "spdx_url": "https://spdx.org/licenses/GPL-3.0-or-later", + "start_line": 5, + "text_url": 
"http://www.gnu.org/licenses/gpl-3.0-standalone.html" + }] + }], + "analyzer": "scancode", + "commit": "a957488c9bd95e3b72a30611edc61496ee152430", + "message": "[codecomplexity] Enable analysis with no file filtering\n\nThis patch allows to handle analysis without file filtering." + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1565108843.015344, + "updated_on": 1525607811.0, + "uuid": "ce7c47568fd87100aff497dd7677b0736d85db1e" + }, + { + "backend_name": "CoLic", + "backend_version": "0.5.0", + "category": "code_license_scancode", + "data": { + "Author": "valerio ", + "AuthorDate": "Sun May 6 14:02:36 2018 +0200", + "Commit": "GitHub ", + "CommitDate": "Sun May 6 14:02:36 2018 +0200", + "analysis": [{ + "copyrights": [], + "file_path": "README.md", + "licenses": [] + }], + "analyzer": "scancode", + "commit": "8aedf09e36008fee19192985c0eb51879c6c61e4", + "message": "Create README.md" + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1565108866.965087, + "updated_on": 1525608156.0, + "uuid": "856beb87d6b324b136e718295cc6ad69343a1066" + }, + { + "backend_name": "CoLic", + "backend_version": "0.5.0", + "category": "code_license_nomos", + "data": { + "Author": "inishchith ", + "AuthorDate": "Mon Feb 25 21:44:23 2019 +0530", + "Commit": "Valerio Cosentino ", + "CommitDate": "Tue Feb 26 16:24:43 2019 +0100", + "analysis": [{ + "file_path": "tests/test_colic.py", + "licenses": [ + "GPL-3.0" + ] + }, + { + "file_path": "tests/test_nomos.py", + "licenses": [ + "GPL-3.0" + ] + }, + { + "file_path": "tests/test_scancode.py", + "licenses": [ + "GPL-3.0" + ] + }, + { + "file_path": "tests/utils.py", + "licenses": [ + "GPL-3.0" + ] + } + ], + "analyzer": "nomos", + "commit": "dda651a12eb05b2d604522b4fbdbf07d3e213eff", + "message": "[tests] Move executable 
path to utils.py\n\nMove executable paths - NOMOS_PATH and SCANCODE_PATH to utils.py" + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1565204679.302685, + "updated_on": 1551194683.0, + "uuid": "79a561015d5d49c3ec6754a05db24735f957814e" } ] diff --git a/tests/test_cocom.py b/tests/test_cocom.py index 121c5ca75..ec9f98f21 100644 --- a/tests/test_cocom.py +++ b/tests/test_cocom.py @@ -13,8 +13,7 @@ # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# along with this program. If not, see . # # Authors: # Nishchith Shetty diff --git a/tests/test_colic.py b/tests/test_colic.py index db0c441f2..6a5725b64 100644 --- a/tests/test_colic.py +++ b/tests/test_colic.py @@ -13,8 +13,7 @@ # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# along with this program. If not, see . 
# # Authors: # Nishchith Shetty @@ -82,6 +81,36 @@ def test_raw_to_enrich(self): self.assertEqual(eitem['modules'], ["graal"]) self.assertEqual(eitem['file_path'], "graal/codecomplexity.py") + item = self.items[2] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['licenses'], ["gpl-3.0-plus"]) + self.assertEqual(eitem['has_license'], 1) + self.assertEqual(eitem['license_name'], ["GNU General Public License 3.0 or later"]) + self.assertEqual(eitem['copyrights'], ["Copyright (c) 2015-2018 Bitergia"]) + self.assertEqual(eitem['has_copyright'], 1) + self.assertEqual(eitem['modules'], ["graal"]) + self.assertEqual(eitem['file_path'], "graal/codecomplexity.py") + + item = self.items[3] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['licenses'], []) + self.assertEqual(eitem['has_license'], 0) + self.assertEqual(eitem['license_name'], []) + self.assertEqual(eitem['copyrights'], []) + self.assertEqual(eitem['has_copyright'], 0) + self.assertEqual(eitem['modules'], []) + self.assertEqual(eitem['file_path'], "README.md") + + item = self.items[4] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['licenses'], ["GPL-3.0"]) + self.assertEqual(eitem['has_license'], 1) + self.assertEqual(eitem['license_name'], ["GPL-3.0"]) + self.assertEqual(eitem['copyrights'], []) + self.assertEqual(eitem['has_copyright'], 0) + self.assertEqual(eitem['modules'], ["tests"]) + self.assertEqual(eitem['file_path'], "tests/test_colic.py") + def test_colic_analysis_study(self): """ Test that the colic analysis study works correctly """ From effc887b89f3b62460b148ef78376b7b0f675866 Mon Sep 17 00:00:00 2001 From: inishchith Date: Wed, 14 Aug 2019 01:11:22 +0530 Subject: [PATCH 8/8] [logger] Add logs for study and Fix CoLic query Add appropriate logs for the enrichers Update CoCom Study method name Fix Tests Signed-off-by: inishchith --- grimoire_elk/enriched/cocom.py | 28 ++++++--- grimoire_elk/enriched/colic.py | 100 
++++++++++++++++++++++----------- tests/test_cocom.py | 12 ++-- tests/test_colic.py | 4 +- 4 files changed, 95 insertions(+), 49 deletions(-) diff --git a/grimoire_elk/enriched/cocom.py b/grimoire_elk/enriched/cocom.py index 0e229433a..79a7883a6 100644 --- a/grimoire_elk/enriched/cocom.py +++ b/grimoire_elk/enriched/cocom.py @@ -115,7 +115,7 @@ def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=N db_user, db_password, db_host) self.studies = [] - self.studies.append(self.enrich_repo_analysis) + self.studies.append(self.enrich_cocom_analysis) def get_identities(self, item): """ Return the identities from an item """ @@ -242,11 +242,11 @@ def enrich_items(self, ocean_backend, events=False): return num_items - def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False, - out_index="cocom_enrich_graal_repo", interval_months=[3], - date_field="grimoire_creation_date"): + def enrich_cocom_analysis(self, ocean_backend, enrich_backend, no_incremental=False, + out_index="cocom_enrich_graal_repo", interval_months=[3], + date_field="grimoire_creation_date"): - logger.info("[cocom] Starting enrich_repository_analysis study") + logger.info("[enrich-cocom-analysis] Start enrich_cocom_analysis study") es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100, verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection) @@ -259,11 +259,16 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])] current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0) + + logger.info("[enrich-cocom-analysis] {} repositories to process".format(len(repositories))) + es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping) + es_out.add_alias("cocom_study") + num_items = 0 ins_items = 0 for repository_url in repositories: - es_out = 
ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping) + logger.info("[enrich-cocom-analysis] Start analysis for {}".format(repository_url)) evolution_items = [] for interval in interval_months: @@ -306,6 +311,7 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal evolution_item["total_loc_per_function"] = round( evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1), 2) + evolution_item.update(self.get_grimoire_fields(evolution_item["study_creation_date"], "stats")) evolution_items.append(evolution_item) if len(evolution_items) >= self.elastic.max_items_bulk: @@ -321,8 +327,12 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal if num_items != ins_items: missing = num_items - ins_items - logger.error("%s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items)) + logger.error( + "[enrich-cocom-analysis] %s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items) + ) else: - logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items)) + logger.info("[enrich-cocom-analysis] %s items inserted for Graal CoCom Analysis Study", str(num_items)) + + logger.info("[enrich-cocom-analysis] End analysis for {} with month interval".format(repository_url, interval)) - logger.info("[cocom] Ending enrich_repository_analysis study") + logger.info("[enrich-cocom-analysis] End enrich_cocom_analysis study") diff --git a/grimoire_elk/enriched/colic.py b/grimoire_elk/enriched/colic.py index 89036ef36..68234e575 100644 --- a/grimoire_elk/enriched/colic.py +++ b/grimoire_elk/enriched/colic.py @@ -26,8 +26,7 @@ from .enrich import (Enrich, metadata) from .graal_study_evolution import (get_to_date, - get_unique_repository, - get_files_at_time) + get_unique_repository) from .utils import fix_field_date from ..elastic_mapping import Mapping as BaseMapping @@ -107,7 +106,43 @@ def has_identities(self): def get_field_unique_id(self): 
return "id" - def get_licensed_files(self, repository_url, to_date): + def __get_total_files(self, repository_url, to_date): + """ Retrieve total number for files until to_date, corresponding + to the given repository + """ + + query_total_files = """ + { + "size": 0, + "aggs": { + "1": { + "cardinality": { + "field": "file_path" + } + } + }, + "query": { + "bool": { + "filter": [{ + "term": { + "origin": "%s" + } + }, + { + "range": { + "metadata__updated_on": { + "lte": "%s" + } + } + }] + } + } + } + """ % (repository_url, to_date) + + return query_total_files + + def __get_licensed_files(self, repository_url, to_date): """ Retrieve all the licensed files until the to_date, corresponding to the given repository. """ @@ -124,18 +159,14 @@ def get_licensed_files(self, repository_url, to_date): }, "query": { "bool": { - "must": [{ - "match_phrase": { - "has_license": { - "query": 1 - } + "filter": [{ + "term": { + "has_license": 1 } }, { - "match_phrase": { - "origin": { - "query": "%s" - } + "term": { + "origin": "%s" } }, { @@ -152,7 +183,7 @@ def get_licensed_files(self, repository_url, to_date): return query_licensed_files - def get_copyrighted_files(self, repository_url, to_date): + def __get_copyrighted_files(self, repository_url, to_date): """ Retrieve all the copyrighted files until the to_date, corresponding to the given repository. 
""" @@ -169,18 +200,14 @@ def get_copyrighted_files(self, repository_url, to_date): }, "query": { "bool": { - "must": [{ - "match_phrase": { - "has_copyright": { - "query": 1 - } + "filter": [{ + "term": { + "has_copyright": 1 } }, { - "match_phrase": { - "origin": { - "query": "%s" - } + "term": { + "origin": "%s" } }, { @@ -338,7 +365,7 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa out_index="colic_enrich_graal_repo", interval_months=[3], date_field="grimoire_creation_date"): - logger.info("[colic] Starting enrich_colic_analysis study") + logger.info("[enrich-colic-analysis] Start enrich_colic_analysis study") es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100, verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection) @@ -350,12 +377,17 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa body=get_unique_repository()) repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])] + + logger.info("[enrich-colic-analysis] {} repositories to process".format(len(repositories))) + es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping) + es_out.add_alias("colic_study") + current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0) num_items = 0 ins_items = 0 for repository_url in repositories: - es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping) + logger.info("[enrich-colic-analysis] Start analysis for {}".format(repository_url)) evolution_items = [] for interval in interval_months: @@ -366,20 +398,19 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa while to_month < current_month: copyrighted_files_at_time = es_in.search( index=in_index, - body=self.get_copyrighted_files(repository_url, to_month.isoformat())) + body=self.__get_copyrighted_files(repository_url, to_month.isoformat())) licensed_files_at_time = 
es_in.search( index=in_index, - body=self.get_licensed_files(repository_url, to_month.isoformat())) + body=self.__get_licensed_files(repository_url, to_month.isoformat())) files_at_time = es_in.search( index=in_index, - body=get_files_at_time(repository_url, to_month.isoformat())) + body=self.__get_total_files(repository_url, to_month.isoformat())) licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"]) copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"]) - # TODO: Fix - need more efficient query - total_files = len(files_at_time['aggregations']['file_stats'].get("buckets", [])) + total_files = int(files_at_time["aggregations"]["1"]["value"]) if not total_files: to_month = to_month + relativedelta(months=+interval) @@ -396,6 +427,7 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa "total_files": total_files } + evolution_item.update(self.get_grimoire_fields(evolution_item["study_creation_date"], "stats")) evolution_items.append(evolution_item) if len(evolution_items) >= self.elastic.max_items_bulk: @@ -411,8 +443,12 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa if num_items != ins_items: missing = num_items - ins_items - logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items)) + logger.error( + "[enrich-colic-analysis] %s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items) + ) else: - logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items)) + logger.info("[enrich-colic-analysis] %s items inserted for Graal CoLic Analysis Study", str(num_items)) + + logger.info("[enrich-colic-analysis] End analysis for {} with month interval".format(repository_url, interval)) - logger.info("[colic] Ending enrich_colic_analysis study") + logger.info("[enrich-colic-analysis] End enrich_colic_analysis study") diff --git a/tests/test_cocom.py b/tests/test_cocom.py index 
ec9f98f21..16f4129d9 100644 --- a/tests/test_cocom.py +++ b/tests/test_cocom.py @@ -124,16 +124,16 @@ def test_raw_to_enrich(self): def test_cocom_analysis_study(self): """ Test that the cocom analysis study works correctly """ - study, ocean_backend, enrich_backend = self._test_study('enrich_repo_analysis') + study, ocean_backend, enrich_backend = self._test_study('enrich_cocom_analysis') with self.assertLogs(logger, level='INFO') as cm: - if study.__name__ == "enrich_repo_analysis": + if study.__name__ == "enrich_cocom_analysis": study(ocean_backend, enrich_backend) - self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.cocom:[cocom] Starting ' - 'enrich_repository_analysis study') - self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.cocom:[cocom] Ending ' - 'enrich_repository_analysis study') + self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.cocom:[enrich-cocom-analysis] Start ' + 'enrich_cocom_analysis study') + self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.cocom:[enrich-cocom-analysis] End ' + 'enrich_cocom_analysis study') if __name__ == "__main__": diff --git a/tests/test_colic.py b/tests/test_colic.py index 6a5725b64..b7c81dfa0 100644 --- a/tests/test_colic.py +++ b/tests/test_colic.py @@ -120,9 +120,9 @@ def test_colic_analysis_study(self): if study.__name__ == "enrich_colic_analysis": study(ocean_backend, enrich_backend) - self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.colic:[colic] Starting ' + self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.colic:[enrich-colic-analysis] Start ' 'enrich_colic_analysis study') - self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.colic:[colic] Ending ' + self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.colic:[enrich-colic-analysis] End ' 'enrich_colic_analysis study')