-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,10 +21,24 @@ | |
# Nishchith Shetty <[email protected]> | ||
# | ||
|
||
import datetime | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
inishchith
Author
Owner
|
||
import logging | ||
from pprint import pprint | ||
|
||
import dateutil.tz | ||
from dateutil.relativedelta import relativedelta | ||
|
||
from elasticsearch import Elasticsearch as ES, RequestsHttpConnection | ||
|
||
from .enrich import (Enrich, | ||
metadata) | ||
from .graal_utils import (get_unique_repository, | ||
get_last_study_date, | ||
get_first_enriched_date, | ||
get_files_at_time) | ||
|
||
from .enrich import Enrich, metadata | ||
from grimoirelab_toolkit.datetime import str_to_datetime | ||
from grimoire_elk.elastic import ElasticSearch | ||
|
||
MAX_SIZE_BULK_ENRICHED_ITEMS = 200 | ||
|
||
|
@@ -33,6 +47,14 @@ | |
|
||
class CocomEnrich(Enrich): | ||
|
||
def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None,
             db_user='', db_password='', db_host=''):
    """Initialize the CoCom enricher and register its studies.

    All parameters are forwarded untouched to the base `Enrich` class.
    """
    super().__init__(db_sortinghat, db_projects_map, json_projects_map,
                     db_user, db_password, db_host)

    # Studies offered by this enricher: the per-repository
    # code-complexity evolution analysis.
    self.studies = [self.enrich_repo_analysis]
|
||
def get_identities(self, item): | ||
""" Return the identities from an item """ | ||
identities = [] | ||
|
@@ -98,7 +120,6 @@ def get_rich_items(self, item): | |
eitem['commit_sha'] = entry['commit'] | ||
eitem['author'] = entry['Author'] | ||
eitem['committer'] = entry['Commit'] | ||
eitem['commit'] = entry['commit'] | ||
eitem['message'] = entry['message'] | ||
eitem['author_date'] = self.__fix_field_date(entry['AuthorDate']) | ||
eitem['commit_date'] = self.__fix_field_date(entry['CommitDate']) | ||
|
@@ -158,6 +179,107 @@ def enrich_items(self, ocean_backend, events=False): | |
|
||
return num_items | ||
|
||
def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
                         out_index="cocom_enrich_graal_repo", interval_months=3,
                         date_field="grimoire_creation_date"):
    """Study: compute per-repository code-complexity evolution over time.

    For every unique repository origin found in the enriched index, sample
    the latest known state of each file at points in time spaced
    `interval_months` apart, sum the file-level metrics (ccn, num_funs,
    tokens, loc, comments, blanks) and bulk-upload one "evolution" document
    per sample point into `out_index`.

    :param ocean_backend: raw items backend (unused; kept for the study API)
    :param enrich_backend: backend whose enriched index is analyzed
    :param no_incremental: unused; kept for the study API
    :param out_index: index where the study results are written
    :param interval_months: sampling interval, in months
    :param date_field: unused; kept for the study API
    """
    logger.info("Doing enrich_repository_analysis study for index {}"
                .format(self.elastic.anonymize_url(self.elastic.index_url)))

    es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
               verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)

    unique_repos = es_in.search(
        index=enrich_backend.elastic.index,
        body=get_unique_repository)

    repos = [repo['key'] for repo in unique_repos['aggregations']['unique_repos']['buckets']]
    metrics = ["ccn", "num_funs", "tokens", "loc", "comments", "blanks"]
    num_items = 0
    ins_items = 0

    for repo in repos:
        to_date = None

        # Incremental behaviour: resume from the last study date already
        # stored in the output index for this repository, if any.
        if es_in.indices.exists(index=out_index):
            last_study_date = es_in.search(
                index=out_index,
                body=get_last_study_date % repo)["aggregations"]["1"]

            if last_study_date["value"] is not None:
                to_date = str_to_datetime(last_study_date["value_as_string"])

        # Otherwise start from the first enriched item of the repository.
        if to_date is None:
            hits = es_in.search(
                index=enrich_backend.elastic.index,
                body=get_first_enriched_date % repo)["aggregations"]["1"]["hits"]["hits"]
            if not hits:
                # No enriched data for this repository; nothing to study.
                continue
            to_date = str_to_datetime(hits[0]["_source"]["metadata__updated_on"])

        es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
        evolution_items = []

        # TODO: Incorporate full month logic
        to_date = to_date.replace(day=1)

        while to_date <= datetime.datetime.now(dateutil.tz.tzutc()):
            files_at_time = es_in.search(
                index=enrich_backend.elastic.index,
                body=get_files_at_time % (repo, to_date.isoformat()))

            buckets = files_at_time['aggregations']['file_stats']['buckets']
            if not buckets:
                to_date += relativedelta(months=+interval_months)
                continue

            evolution_item = {
                "id": to_date.strftime('%Y-%m-%dT%H:%M:%S') + "_study_cocom",
                "origin": repo,
                "to_date": to_date.isoformat()
            }
            # Pre-seed every total with 0 so that metrics that are None
            # for all files cannot cause a KeyError when computing the
            # ratios below.
            for metric in metrics:
                evolution_item["total_" + metric] = 0

            for file_ in buckets:
                # Latest known enriched document for this file path.
                file_details = file_["1"]["hits"]["hits"][0]["_source"]
                for metric in metrics:
                    if file_details.get(metric) is not None:
                        evolution_item["total_" + metric] += file_details[metric]

            if evolution_item["total_loc"]:
                total_lines = (evolution_item["total_comments"]
                               + evolution_item["total_blanks"]
                               + evolution_item["total_loc"])
                evolution_item["total_comments_ratio"] = evolution_item["total_comments"] / total_lines
                evolution_item["total_blanks_ratio"] = evolution_item["total_blanks"] / total_lines
            else:
                # Without any loc, the ratio is left as the raw count
                # (always 0 here) to avoid a division by zero.
                evolution_item["total_comments_ratio"] = evolution_item["total_comments"]
                evolution_item["total_blanks_ratio"] = evolution_item["total_blanks"]

            evolution_items.append(evolution_item)

            if len(evolution_items) >= self.elastic.max_items_bulk:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
                evolution_items = []

            to_date += relativedelta(months=+interval_months)

        # Flush whatever is left for this repository.
        if evolution_items:
            num_items += len(evolution_items)
            ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())

    # num_items/ins_items are cumulative across repositories, so the
    # bookkeeping check is reported once, after all repos are processed.
    if num_items != ins_items:
        missing = num_items - ins_items
        logger.error("%s/%s missing items for Study Enricher", str(missing), str(num_items))
    else:
        logger.info("%s items inserted for Study Enricher", str(num_items))
|
||
|
||
def __fix_field_date(self, date_value): | ||
"""Fix possible errors in the field date""" | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2015-2019 Bitergia | ||
# | ||
# This program is free software; you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation; either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with this program; if not, write to the Free Software | ||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
# | ||
# Authors: | ||
# Valerio Cosentino <[email protected]> | ||
# Nishchith Shetty <[email protected]> | ||
# | ||
|
||
# CONTAINS QUERIES | ||
|
||
# ES query: bucket all documents by their "origin" field to obtain the
# set of unique repositories present in the enriched index.
get_unique_repository = """
{
    "size": 0,
    "aggs": {
        "unique_repos": {
            "terms": {
                "field": "origin"
            }
        }
    }
}
"""
|
||
# ES query template (one %s: repository origin): max "to_date" among the
# study documents of a given repository, i.e. the date the study last
# reached for that repo.
# NOTE(review): this query filters on "origin.keyword" while the other
# queries in this module use "origin" — presumably the study index maps
# origin as text with a keyword sub-field; confirm the mappings agree.
get_last_study_date = """
{
    "size": 0,
    "aggs": {
        "1": {
            "max": {
                "field": "to_date"
            }
        }
    },
    "query": {
        "bool": {
            "filter": [{
                "term": {
                    "origin.keyword": "%s"
                }
            }]
        }
    }
}
"""
|
||
# ES query template (one %s: repository origin): fetch the single oldest
# enriched item of a repository (sorted ascending by commit_date) and
# return its "metadata__updated_on" value — used as the study start date.
get_first_enriched_date = """
{
    "size": 0,
    "aggs": {
        "1": {
            "top_hits": {
                "docvalue_fields": [
                    "metadata__updated_on"
                ],
                "_source": "metadata__updated_on",
                "size": 1,
                "sort": [{
                    "commit_date": {
                        "order": "asc"
                    }
                }]
            }
        }
    },
    "query": {
        "bool": {
            "filter": [{
                "term": {
                    "origin": "%s"
                }
            }]
        }
    }
}
"""
|
||
# ES query template (two %s: repository origin, ISO date): for every file
# path in the repository, return the latest enriched document whose
# "metadata__updated_on" is on or before the given date — i.e. a snapshot
# of the repository's files as known at that point in time.
# NOTE(review): "size": 2147483647 asks for every file-path bucket in one
# response; fine for small repos, but may be heavy on large ones.
get_files_at_time = """
{
    "size": 0,
    "aggs": {
        "file_stats": {
            "terms": {
                "field": "file_path",
                "size": 2147483647,
                "order": {
                    "_key": "desc"
                }
            },
            "aggs": {
                "1": {
                    "top_hits": {
                        "size": 1,
                        "sort": [{
                            "metadata__updated_on": {
                                "order": "desc"
                            }
                        }]
                    }
                }
            }
        }
    },
    "query": {
        "bool": {
            "filter": [{
                "term": {
                    "origin": "%s"
                }
            }, {
                "range": {
                    "metadata__updated_on": {
                        "lte": "%s"
                    }
                }
            }]
        }
    }
}
"""
Thank you @inishchith for your work. It looks nice! I left some minor comments.
What do you think should be improved? Do you see possible limitations with this implementation?