Commit cb01d31

[study] Add repository analysis study

Signed-off-by: inishchith <[email protected]>
inishchith committed Jul 26, 2019
1 parent 30ea904 commit cb01d31

Showing 2 changed files with 295 additions and 8 deletions.
115 changes: 107 additions & 8 deletions grimoire_elk/enriched/cocom.py
@@ -22,9 +22,21 @@
#

import logging
+from pprint import pprint

-from .enrich import Enrich, metadata
-from grimoirelab_toolkit.datetime import str_to_datetime
+from dateutil.relativedelta import relativedelta
+
+from elasticsearch import Elasticsearch as ES, RequestsHttpConnection
+
+from .enrich import (Enrich,
+                     metadata)
+from .graal_study_evolution import (get_to_date,
+                                    get_unique_repository,
+                                    get_files_at_time)
+
+from grimoirelab_toolkit.datetime import (str_to_datetime,
+                                          datetime_utcnow)
+from grimoire_elk.elastic import ElasticSearch

MAX_SIZE_BULK_ENRICHED_ITEMS = 200

@@ -33,6 +45,14 @@

class CocomEnrich(Enrich):

+    def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None,
+                 db_user='', db_password='', db_host=''):
+        super().__init__(db_sortinghat, db_projects_map, json_projects_map,
+                         db_user, db_password, db_host)
+
+        self.studies = []
+        self.studies.append(self.enrich_repo_analysis)
+
    def get_identities(self, item):
        """ Return the identities from an item """
        identities = []
@@ -98,7 +118,6 @@ def get_rich_items(self, item):
            eitem['commit_sha'] = entry['commit']
            eitem['author'] = entry['Author']
            eitem['committer'] = entry['Commit']
-            eitem['commit'] = entry['commit']
            eitem['message'] = entry['message']
            eitem['author_date'] = self.__fix_field_date(entry['AuthorDate'])
            eitem['commit_date'] = self.__fix_field_date(entry['CommitDate'])
@@ -120,13 +139,15 @@ def get_rich_items(self, item):

    def __add_derived_metrics(self, file_analysis, eitem):
        """ Add derived metrics fields """
-        if eitem['loc']:
+
+        # TODO: Fix Logic: None rather than 1
+        if eitem["loc"]:
            total_lines = eitem['loc'] + eitem['comments'] + eitem['blanks']
-            eitem["comments_ratio"] = eitem['comments'] / total_lines
-            eitem["blanks_ratio"] = eitem['blanks'] / total_lines
+            eitem["lines_per_comment_lines"] = total_lines / max(eitem["comments"], 1)
+            eitem["lines_per_blank_lines"] = total_lines / max(eitem["blanks"], 1)
        else:
-            eitem["comments_ratio"] = eitem['comments']
-            eitem["blanks_ratio"] = eitem['blanks']
+            eitem["lines_per_comment_lines"] = 0
+            eitem["lines_per_blank_lines"] = 0

        return eitem
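A quick worked example of the derived metrics added here, as a standalone sketch with made-up input values:

# Standalone sketch of the derived-metric computation, with made-up inputs.
eitem = {"loc": 120, "comments": 30, "blanks": 10}

total_lines = eitem["loc"] + eitem["comments"] + eitem["blanks"]  # 160
# max(..., 1) guards against division by zero for files with no comment
# or blank lines (the TODO above notes the None case still needs work).
print(total_lines / max(eitem["comments"], 1))  # 5.33... lines per comment line
print(total_lines / max(eitem["blanks"], 1))    # 16.0 lines per blank line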

@@ -158,6 +179,84 @@ def enrich_items(self, ocean_backend, events=False):

        return num_items

+    def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
+                             out_index="cocom_enrich_graal_repo", interval_months=3,
+                             date_field="grimoire_creation_date"):
+
+        logger.info("Doing enrich_repository_analysis study for index {}"
+                    .format(self.elastic.anonymize_url(self.elastic.index_url)))
+
+        es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
+                   verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
+        in_index = enrich_backend.elastic.index
+
+        unique_repos = es_in.search(
+            index=in_index,
+            body=get_unique_repository())
+
+        repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos']['buckets']]
+        metrics = ["ccn", "num_funs", "tokens", "loc", "comments", "blanks"]
+        num_items = 0
+        ins_items = 0
+
+        for repository_url in repositories:
+            es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
+            evolution_items = []
+
+            to_date = get_to_date(es_in, in_index, out_index, repository_url)
+            to_date = to_date.replace(day=1, hour=0, minute=0, second=0)
+            today_date = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
+
+            while to_date < today_date:
+                files_at_time = es_in.search(
+                    index=in_index,
+                    body=get_files_at_time(repository_url, to_date.isoformat()))
+
+                if not len(files_at_time['aggregations']['file_stats']['buckets']):
+                    to_date = to_date + relativedelta(months=+interval_months)
+                    continue
+
+                repository_name = repository_url.split("/")[-1]
+                evolution_item = {
+                    "id": to_date.strftime('%Y-%m-%dT%H:%M:%S') + "_" + repository_name + "_" + str(interval_months),
+                    "origin": repository_url,
+                    "interval_months": interval_months,
+                    "study_creation_date": to_date.isoformat(),
+                    "total_files": len(files_at_time['aggregations']['file_stats']['buckets'])
+                }
+
+                for file_ in files_at_time['aggregations']['file_stats']['buckets']:
+                    file_details = file_["1"]["hits"]["hits"][0]["_source"]
+
+                    for metric in metrics:
+                        if file_details[metric] is not None:
+                            evolution_item["total_" + metric] = evolution_item.get("total_" + metric, 0) + file_details[metric]
+
+                total_lines = evolution_item["total_comments"] + evolution_item["total_blanks"] + evolution_item["total_loc"]
+                # TODO: Fix Logic: None rather than 1
+                evolution_item["total_lines_per_comment_lines"] = total_lines / max(evolution_item["total_comments"], 1)
+                evolution_item["total_lines_per_blank_lines"] = total_lines / max(evolution_item["total_blanks"], 1)
+
+                pprint(evolution_item)
+                evolution_items.append(evolution_item)
+
+                if len(evolution_items) >= self.elastic.max_items_bulk:
+                    num_items += len(evolution_items)
+                    ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
+                    evolution_items = []
+
+                to_date = to_date + relativedelta(months=+interval_months)
+
+            if len(evolution_items) > 0:
+                num_items += len(evolution_items)
+                ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
+
+        if num_items != ins_items:
+            missing = num_items - ins_items
+            logger.error("%s/%s missing items for Study Enricher", str(missing), str(num_items))
+        else:
+            logger.info("%s items inserted for Study Enricher", str(num_items))
+
    def __fix_field_date(self, date_value):
        """Fix possible errors in the field date"""

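The study above walks each repository's history in month-aligned steps of interval_months using relativedelta. A standalone sketch of that bucketing loop (the dates and repository name are made up):

from datetime import datetime

from dateutil.relativedelta import relativedelta

# Illustrative stand-ins for get_to_date() and datetime_utcnow().
to_date = datetime(2019, 1, 17).replace(day=1, hour=0, minute=0, second=0)
today_date = datetime(2019, 7, 26).replace(day=1, hour=0, minute=0, second=0)
interval_months = 3

while to_date < today_date:
    # One study document id per repository per interval, as in the study above.
    print(to_date.strftime('%Y-%m-%dT%H:%M:%S') + "_grimoirelab-elk_" + str(interval_months))
    to_date = to_date + relativedelta(months=+interval_months)

# Prints the 2019-01-01 and 2019-04-01 buckets, then stops at 2019-07-01.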
188 changes: 188 additions & 0 deletions grimoire_elk/enriched/graal_study_evolution.py
@@ -0,0 +1,188 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# Authors:
# Valerio Cosentino <[email protected]>
# Nishchith Shetty <[email protected]>
#

from grimoirelab_toolkit.datetime import str_to_datetime


def get_unique_repository():
    """ Retrieve all the repository names from the index. """

    query_unique_repository = """
    {
        "size": 0,
        "aggs": {
            "unique_repos": {
                "terms": {
                    "field": "origin"
                }
            }
        }
    }
    """

    return query_unique_repository


def get_last_study_date(repository_url):
    """ Retrieve the last study_creation_date of the item corresponding
    to the given repository from the study index.
    """

    query_last_study_date = """
    {
        "size": 0,
        "aggs": {
            "1": {
                "max": {
                    "field": "study_creation_date"
                }
            }
        },
        "query": {
            "bool": {
                "filter": [{
                    "term": {
                        "origin.keyword": "%s"
                    }
                }]
            }
        }
    }
    """ % (repository_url)

    return query_last_study_date


def get_first_enriched_date(repository_url):
    """ Retrieve the first/oldest metadata__updated_on of the item
    corresponding to the given repository.
    """

    query_first_enriched_date = """
    {
        "size": 0,
        "aggs": {
            "1": {
                "top_hits": {
                    "docvalue_fields": [
                        "metadata__updated_on"
                    ],
                    "_source": "metadata__updated_on",
                    "size": 1,
                    "sort": [{
                        "commit_date": {
                            "order": "asc"
                        }
                    }]
                }
            }
        },
        "query": {
            "bool": {
                "filter": [{
                    "term": {
                        "origin": "%s"
                    }
                }]
            }
        }
    }
    """ % (repository_url)

    return query_first_enriched_date


def get_files_at_time(repository_url, to_date):
    """ Retrieve the latest change of each file up to to_date
    for the given repository.
    """

    query_files_at_time = """
    {
        "size": 0,
        "aggs": {
            "file_stats": {
                "terms": {
                    "field": "file_path",
                    "size": 2147483647,
                    "order": {
                        "_key": "desc"
                    }
                },
                "aggs": {
                    "1": {
                        "top_hits": {
                            "size": 1,
                            "sort": [{
                                "metadata__updated_on": {
                                    "order": "desc"
                                }
                            }]
                        }
                    }
                }
            }
        },
        "query": {
            "bool": {
                "filter": [{
                    "term": {
                        "origin": "%s"
                    }
                },
                {
                    "range": {
                        "metadata__updated_on": {
                            "lte": "%s"
                        }
                    }
                }]
            }
        }
    }
    """ % (repository_url, to_date)

    return query_files_at_time


def get_to_date(es_in, in_index, out_index, repository_url):
    """ Get the appropriate to_date value for incremental insertion. """
    study_data_available = False

    if es_in.indices.exists(index=out_index):
        last_study_date = es_in.search(
            index=out_index,
            body=get_last_study_date(repository_url))["aggregations"]["1"]

        if last_study_date["value"] is not None:
            study_data_available = True
            to_date = str_to_datetime(last_study_date["value_as_string"])

    if not study_data_available:
        first_item_date = es_in.search(
            index=in_index,
            body=get_first_enriched_date(repository_url))["aggregations"]["1"]["hits"]["hits"][0]["_source"]

        to_date = str_to_datetime(first_item_date["metadata__updated_on"])

    return to_date
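The query builders return plain strings, so their output can be inspected without a live Elasticsearch; json.loads doubles as a cheap structural check. A minimal sketch, assuming the module is importable as grimoire_elk.enriched.graal_study_evolution (the repository URL and date are made up):

import json

from grimoire_elk.enriched.graal_study_evolution import (get_files_at_time,
                                                         get_unique_repository)

# Made-up inputs; any origin URL and ISO date work here.
query = get_files_at_time("https://github.com/chaoss/grimoirelab-elk",
                          "2019-07-01T00:00:00")

# The %-substituted string is plain JSON, so it can be validated locally.
parsed = json.loads(query)
print(parsed["query"]["bool"]["filter"][1]["range"]["metadata__updated_on"])
# {'lte': '2019-07-01T00:00:00'}
print(json.loads(get_unique_repository())["aggs"]["unique_repos"]["terms"])
# {'field': 'origin'}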
