[study] Add repository analysis study
inishchith committed Jul 19, 2019
1 parent 30ea904 commit 12ec7bf
Showing 2 changed files with 258 additions and 2 deletions.
126 changes: 124 additions & 2 deletions grimoire_elk/enriched/cocom.py
@@ -21,10 +21,24 @@
# Nishchith Shetty <[email protected]>
#

import datetime

valeriocos commented on Jul 19, 2019:

Thank you @inishchith for your work. It looks nice! I left some minor comments.
What do you think should be improved? Do you see possible limitations with this implementation?

inishchith (Author) commented on Jul 19, 2019:

This is possibly a much better version than the initial one based on cache_dict. I'm yet to evaluate this approach; we've probably traded a bit more time for less memory (which shouldn't be too big of a problem, but needs to be addressed).
I'll share the evaluations later today. Thanks!
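
A minimal sketch of the trade-off being discussed (names and structure are illustrative, not from the commit): the earlier cache_dict approach keeps per-file metrics in memory for the whole run, while this study re-queries Elasticsearch at each interval boundary instead.

    # Memory-heavy variant: build the snapshot once and keep it around.
    def snapshot_with_cache(enriched_items):
        cache_dict = {}  # file_path -> latest metrics; grows with repository size
        for item in enriched_items:
            cache_dict[item["file_path"]] = item
        return cache_dict

    # Time-heavy variant (the approach in this commit): ask Elasticsearch
    # again for each interval boundary, holding almost nothing in memory.
    def snapshot_with_query(es_in, index, repo, to_date):
        return es_in.search(index=index,
                            body=get_files_at_time % (repo, to_date.isoformat()))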

import logging
from pprint import pprint

import dateutil.tz
from dateutil.relativedelta import relativedelta

from elasticsearch import Elasticsearch as ES, RequestsHttpConnection

from .enrich import (Enrich,
                     metadata)
from .graal_utils import (get_unique_repository,
                          get_last_study_date,
                          get_first_enriched_date,
                          get_files_at_time)

from grimoirelab_toolkit.datetime import str_to_datetime
from grimoire_elk.elastic import ElasticSearch

MAX_SIZE_BULK_ENRICHED_ITEMS = 200

@@ -33,6 +47,14 @@

class CocomEnrich(Enrich):

    def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None,
                 db_user='', db_password='', db_host=''):
        super().__init__(db_sortinghat, db_projects_map, json_projects_map,
                         db_user, db_password, db_host)

        self.studies = []
        self.studies.append(self.enrich_repo_analysis)
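
A sketch of how a study registered in self.studies gets invoked (the call pattern follows how grimoirelab-elk runs studies; the backend objects here are placeholders):

    enricher = CocomEnrich()
    for study in enricher.studies:
        # Each study receives the raw (ocean) and enriched backends;
        # keyword arguments such as interval_months override the defaults.
        study(ocean_backend, enrich_backend, interval_months=3)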

    def get_identities(self, item):
        """ Return the identities from an item """
        identities = []
@@ -98,7 +120,6 @@ def get_rich_items(self, item):
            eitem['commit_sha'] = entry['commit']
            eitem['author'] = entry['Author']
            eitem['committer'] = entry['Commit']
            eitem['commit'] = entry['commit']
            eitem['message'] = entry['message']
            eitem['author_date'] = self.__fix_field_date(entry['AuthorDate'])
            eitem['commit_date'] = self.__fix_field_date(entry['CommitDate'])
@@ -158,6 +179,107 @@ def enrich_items(self, ocean_backend, events=False):

        return num_items

    def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
                             out_index="cocom_enrich_graal_repo", interval_months=3,
                             date_field="grimoire_creation_date"):

        logger.info("Doing enrich_repository_analysis study for index {}"
                    .format(self.elastic.anonymize_url(self.elastic.index_url)))

        es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
                   verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)

        unique_repos = es_in.search(
            index=enrich_backend.elastic.index,
            body=get_unique_repository)

        repos = [repo['key'] for repo in unique_repos['aggregations']['unique_repos']['buckets']]
        metrics = ["ccn", "num_funs", "tokens", "loc", "comments", "blanks"]
        num_items = 0
        ins_items = 0

        for repo in repos:
            study_data_available = False

            if es_in.indices.exists(index=out_index):
                last_study_date = es_in.search(
                    index=out_index,
                    body=get_last_study_date % (repo))["aggregations"]["1"]

                if last_study_date["value"] is not None:
                    study_data_available = True
                    to_date = str_to_datetime(last_study_date["value_as_string"])

            if not study_data_available:
                first_item_date = es_in.search(
                    index=enrich_backend.elastic.index,
                    body=get_first_enriched_date % (repo))["aggregations"]["1"]["hits"]["hits"][0]["_source"]
                to_date = str_to_datetime(first_item_date["metadata__updated_on"])

            es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
            evolution_items = []

            # TODO: Incorporate full month logic
            to_date = to_date.replace(day=1)

valeriocos commented on Jul 19, 2019:

We could set to zero all the hours/minutes/seconds info, WDYT?

inishchith (Author) commented on Jul 19, 2019:

@valeriocos Yes. Agreed.
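
A minimal sketch of the suggestion above (illustrative; not part of this commit): replace() can zero the time-of-day fields in the same call that snaps the date to the first of the month.

    # Snap to the start of the month and drop hours/minutes/seconds,
    # so interval boundaries are stable across runs.
    to_date = to_date.replace(day=1, hour=0, minute=0, second=0, microsecond=0)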


            while to_date <= datetime.datetime.now(dateutil.tz.tzutc()):

valeriocos commented on Jul 19, 2019:

There is a function in grimoirelab-toolkit that returns the current datetime (e.g., https://github.com/chaoss/grimoirelab-elk/blob/master/grimoire_elk/enriched/bugzillarest.py#L28). For consistency, it's better to use that one.

inishchith (Author) commented on Jul 19, 2019:

Thanks for pointing it out. I've updated it :)
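
The toolkit helper being referred to is datetime_utcnow() from grimoirelab_toolkit.datetime; a sketch of the suggested change (assuming that helper, as used elsewhere in grimoirelab-elk):

    from grimoirelab_toolkit.datetime import datetime_utcnow

    # Timezone-aware UTC "now" from the toolkit, instead of composing
    # datetime.datetime.now() with dateutil.tz.tzutc() by hand.
    while to_date <= datetime_utcnow():
        ...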

                files_at_time = es_in.search(
                    index=enrich_backend.elastic.index,
                    body=get_files_at_time % (repo, to_date.isoformat()))

                if not len(files_at_time['aggregations']['file_stats']['buckets']):
                    to_date = to_date + relativedelta(months=+interval_months)
                    continue

                # TODO: Requires fix
                evolution_item = {

valeriocos commented on Jul 19, 2019:

The evolution item should probably also contain the interval_months, in case the same index includes results calculated with different intervals, WDYT?

inishchith (Author) commented on Jul 19, 2019:

Yes, it should have the field.
Thanks for pointing this out, as I have a concern here: if the index holds data for both a 6-month and a 1-month interval, the items would differ even where the dates overlap. Would this be a problem in our case? WDYT?

valeriocos commented on Jul 19, 2019:

Thank you for pointing this out @inishchith. I'm not sure whether it will be a problem or not; we could explore some solutions, for instance: (i) the study could delete the index if it detects a change in the interval, or (ii) we could allow the existence of more intervals in the index, but we should have a selector to filter the items generated with the same interval. WDYT?

inishchith (Author) commented on Jul 19, 2019:

@valeriocos I'm currently evaluating the approach; I can surely try the ideas you proposed and share the results soon!

Thanks!

"id": to_date.strftime('%Y-%m-%dT%H:%M:%S') + "_study_cocom",
# "commit_sha": eitem["commit_sha"],
"origin": repo,
# "file_path": eitem["file_path"],
# "modules": eitem["modules"],
# "author_date": eitem["author_date"],
# "commit_date": eitem["commit_date"],
"to_date": to_date.isoformat()
# "to_date": self.__fix_field_date(to_date.strftime('%Y-%m-%dT%H:%M:%S'))
}
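
Following up on the interval_months thread above, a minimal sketch of option (ii) (the field name is assumed for illustration; it is not in this commit): tag each item with the interval that produced it so consumers can filter on one interval.

    # Record the interval on every evolution item ...
    evolution_item["interval_months"] = interval_months

    # ... so a dashboard or query can select a single interval, e.g.:
    # {"query": {"term": {"interval_months": 3}}}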

                for file_ in files_at_time['aggregations']['file_stats']['buckets']:
                    file_details = file_["1"]["hits"]["hits"][0]["_source"]

                    for metric in metrics:
                        if file_details[metric] is not None:
                            evolution_item["total_" + metric] = evolution_item.get("total_" + metric, 0) + file_details[metric]

                if evolution_item["total_loc"]:
                    total_lines = evolution_item["total_comments"] + evolution_item["total_blanks"] + evolution_item["total_loc"]
                    evolution_item["total_comments_ratio"] = evolution_item["total_comments"] / total_lines
                    evolution_item["total_blanks_ratio"] = evolution_item["total_blanks"] / total_lines
                else:
                    evolution_item["total_comments_ratio"] = evolution_item["total_comments"]
                    evolution_item["total_blanks_ratio"] = evolution_item["total_blanks"]

                pprint(evolution_item)
                evolution_items.append(evolution_item)

                if len(evolution_items) >= self.elastic.max_items_bulk:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
                    evolution_items = []

                to_date = to_date + relativedelta(months=+interval_months)

            if len(evolution_items) > 0:
                num_items += len(evolution_items)
                ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())

        if num_items != ins_items:
            missing = num_items - ins_items
            logger.error("%s/%s missing items for Study Enricher", str(missing), str(num_items))
        else:
            logger.info("%s items inserted for Study Enricher", str(num_items))


    def __fix_field_date(self, date_value):
        """Fix possible errors in the field date"""

134 changes: 134 additions & 0 deletions grimoire_elk/enriched/graal_utils.py
@@ -0,0 +1,134 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# Authors:
# Valerio Cosentino <[email protected]>
# Nishchith Shetty <[email protected]>
#

# Elasticsearch DSL queries used by the repository analysis study.
# Each query string contains %s placeholders, filled in with the repository
# origin (and, for get_files_at_time, a date) via the % operator.

get_unique_repository = """
{
  "size": 0,
  "aggs": {
    "unique_repos": {
      "terms": {
        "field": "origin"
      }
    }
  }
}
"""

get_last_study_date = """
{
  "size": 0,
  "aggs": {
    "1": {
      "max": {
        "field": "to_date"
      }
    }
  },
  "query": {
    "bool": {
      "filter": [{
        "term": {
          "origin.keyword": "%s"
        }
      }]
    }
  }
}
"""

get_first_enriched_date = """
{
  "size": 0,
  "aggs": {
    "1": {
      "top_hits": {
        "docvalue_fields": [
          "metadata__updated_on"
        ],
        "_source": "metadata__updated_on",
        "size": 1,
        "sort": [{
          "commit_date": {
            "order": "asc"
          }
        }]
      }
    }
  },
  "query": {
    "bool": {
      "filter": [{
        "term": {
          "origin": "%s"
        }
      }]
    }
  }
}
"""

get_files_at_time = """
{
  "size": 0,
  "aggs": {
    "file_stats": {
      "terms": {
        "field": "file_path",
        "size": 2147483647,
        "order": {
          "_key": "desc"
        }
      },
      "aggs": {
        "1": {
          "top_hits": {
            "size": 1,
            "sort": [{
              "metadata__updated_on": {
                "order": "desc"
              }
            }]
          }
        }
      }
    }
  },
  "query": {
    "bool": {
      "filter": [{
        "term": {
          "origin": "%s"
        }
      }, {
        "range": {
          "metadata__updated_on": {
            "lte": "%s"
          }
        }
      }]
    }
  }
}
"""
