Skip to content

Commit

Permalink
[logger] Add logs for study and Fix CoLic query
Browse files Browse the repository at this point in the history
Add appropriate logs for the enrichers
Update CoCom Study method name
Fix Tests

Signed-off-by: inishchith <[email protected]>
  • Loading branch information
inishchith authored and valeriocos committed Aug 23, 2019
1 parent b9900af commit 640ac44
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 49 deletions.
28 changes: 19 additions & 9 deletions grimoire_elk/enriched/cocom.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=N
db_user, db_password, db_host)

self.studies = []
self.studies.append(self.enrich_repo_analysis)
self.studies.append(self.enrich_cocom_analysis)

def get_identities(self, item):
""" Return the identities from an item """
Expand Down Expand Up @@ -242,11 +242,11 @@ def enrich_items(self, ocean_backend, events=False):

return num_items

def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
out_index="cocom_enrich_graal_repo", interval_months=[3],
date_field="grimoire_creation_date"):
def enrich_cocom_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
out_index="cocom_enrich_graal_repo", interval_months=[3],
date_field="grimoire_creation_date"):

logger.info("[cocom] Starting enrich_repository_analysis study")
logger.info("[enrich-cocom-analysis] Start enrich_cocom_analysis study")

es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
Expand All @@ -259,11 +259,16 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal

repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])]
current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)

logger.info("[enrich-cocom-analysis] {} repositories to process".format(len(repositories)))
es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
es_out.add_alias("cocom_study")

num_items = 0
ins_items = 0

for repository_url in repositories:
es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
logger.info("[enrich-cocom-analysis] Start analysis for {}".format(repository_url))
evolution_items = []

for interval in interval_months:
Expand Down Expand Up @@ -306,6 +311,7 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal
evolution_item["total_loc_per_function"] = round(
evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1), 2)

evolution_item.update(self.get_grimoire_fields(evolution_item["study_creation_date"], "stats"))
evolution_items.append(evolution_item)

if len(evolution_items) >= self.elastic.max_items_bulk:
Expand All @@ -321,8 +327,12 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal

if num_items != ins_items:
missing = num_items - ins_items
logger.error("%s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items))
logger.error(
"[enrich-cocom-analysis] %s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items)
)
else:
logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items))
logger.info("[enrich-cocom-analysis] %s items inserted for Graal CoCom Analysis Study", str(num_items))

logger.info("[enrich-cocom-analysis] End analysis for {} with month interval".format(repository_url, interval))

logger.info("[cocom] Ending enrich_repository_analysis study")
logger.info("[enrich-cocom-analysis] End enrich_cocom_analysis study")
100 changes: 68 additions & 32 deletions grimoire_elk/enriched/colic.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@
from .enrich import (Enrich,
metadata)
from .graal_study_evolution import (get_to_date,
get_unique_repository,
get_files_at_time)
get_unique_repository)
from .utils import fix_field_date
from ..elastic_mapping import Mapping as BaseMapping

Expand Down Expand Up @@ -107,7 +106,43 @@ def has_identities(self):
def get_field_unique_id(self):
    """ Return the name of the unique id field, which is always "id" """

    unique_id_field = "id"
    return unique_id_field

def get_licensed_files(self, repository_url, to_date):
def __get_total_files(self, repository_url, to_date):
    """ Build the query that counts the files of a repository until to_date.

    The query returns no hits ("size" is 0). It filters the documents whose
    `origin` is equal to `repository_url` and whose `metadata__updated_on`
    is lower than or equal to `to_date`, and requests the cardinality of
    `file_path` under the aggregation named "1".

    :param repository_url: value matched against the `origin` field
    :param to_date: inclusive upper bound for `metadata__updated_on`

    :returns: the query serialized as a JSON string
    """
    # Local import: the import section of the module is not visible here.
    import json

    query_total_files = {
        "size": 0,
        "aggs": {
            "1": {
                "cardinality": {
                    "field": "file_path"
                }
            }
        },
        "query": {
            "bool": {
                "filter": [
                    {
                        "term": {
                            # str() keeps the behaviour of the former "%s"
                            # interpolation for non-string arguments.
                            "origin": str(repository_url)
                        }
                    },
                    {
                        "range": {
                            "metadata__updated_on": {
                                "lte": str(to_date)
                            }
                        }
                    }
                ]
            }
        }
    }

    # json.dumps escapes quotes and backslashes found in the values; plain
    # string interpolation into a JSON template would emit a broken query.
    return json.dumps(query_total_files)

def __get_licensed_files(self, repository_url, to_date):
""" Retrieve all the licensed files until the to_date, corresponding
to the given repository.
"""
Expand All @@ -124,18 +159,14 @@ def get_licensed_files(self, repository_url, to_date):
},
"query": {
"bool": {
"must": [{
"match_phrase": {
"has_license": {
"query": 1
}
"filter": [{
"term": {
"has_license": 1
}
},
{
"match_phrase": {
"origin": {
"query": "%s"
}
"term": {
"origin": "%s"
}
},
{
Expand All @@ -152,7 +183,7 @@ def get_licensed_files(self, repository_url, to_date):

return query_licensed_files

def get_copyrighted_files(self, repository_url, to_date):
def __get_copyrighted_files(self, repository_url, to_date):
""" Retrieve all the copyrighted files until the to_date, corresponding
to the given repository.
"""
Expand All @@ -169,18 +200,14 @@ def get_copyrighted_files(self, repository_url, to_date):
},
"query": {
"bool": {
"must": [{
"match_phrase": {
"has_copyright": {
"query": 1
}
"filter": [{
"term": {
"has_copyright": 1
}
},
{
"match_phrase": {
"origin": {
"query": "%s"
}
"term": {
"origin": "%s"
}
},
{
Expand Down Expand Up @@ -338,7 +365,7 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
out_index="colic_enrich_graal_repo", interval_months=[3],
date_field="grimoire_creation_date"):

logger.info("[colic] Starting enrich_colic_analysis study")
logger.info("[enrich-colic-analysis] Start enrich_colic_analysis study")

es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
Expand All @@ -350,12 +377,17 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
body=get_unique_repository())

repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])]

logger.info("[enrich-colic-analysis] {} repositories to process".format(len(repositories)))
es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
es_out.add_alias("colic_study")

current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
num_items = 0
ins_items = 0

for repository_url in repositories:
es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
logger.info("[enrich-colic-analysis] Start analysis for {}".format(repository_url))
evolution_items = []

for interval in interval_months:
Expand All @@ -366,20 +398,19 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
while to_month < current_month:
copyrighted_files_at_time = es_in.search(
index=in_index,
body=self.get_copyrighted_files(repository_url, to_month.isoformat()))
body=self.__get_copyrighted_files(repository_url, to_month.isoformat()))

licensed_files_at_time = es_in.search(
index=in_index,
body=self.get_licensed_files(repository_url, to_month.isoformat()))
body=self.__get_licensed_files(repository_url, to_month.isoformat()))

files_at_time = es_in.search(
index=in_index,
body=get_files_at_time(repository_url, to_month.isoformat()))
body=self.__get_total_files(repository_url, to_month.isoformat()))

licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"])
copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"])
# TODO: Fix - need more efficient query
total_files = len(files_at_time['aggregations']['file_stats'].get("buckets", []))
total_files = int(files_at_time["aggregations"]["1"]["value"])

if not total_files:
to_month = to_month + relativedelta(months=+interval)
Expand All @@ -396,6 +427,7 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
"total_files": total_files
}

evolution_item.update(self.get_grimoire_fields(evolution_item["study_creation_date"], "stats"))
evolution_items.append(evolution_item)

if len(evolution_items) >= self.elastic.max_items_bulk:
Expand All @@ -411,8 +443,12 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa

if num_items != ins_items:
missing = num_items - ins_items
logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items))
logger.error(
"[enrich-colic-analysis] %s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items)
)
else:
logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items))
logger.info("[enrich-colic-analysis] %s items inserted for Graal CoLic Analysis Study", str(num_items))

logger.info("[enrich-colic-analysis] End analysis for {} with month interval".format(repository_url, interval))

logger.info("[colic] Ending enrich_colic_analysis study")
logger.info("[enrich-colic-analysis] End enrich_colic_analysis study")
12 changes: 6 additions & 6 deletions tests/test_cocom.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,16 +124,16 @@ def test_raw_to_enrich(self):
def test_cocom_analysis_study(self):
""" Test that the cocom analysis study works correctly """

study, ocean_backend, enrich_backend = self._test_study('enrich_repo_analysis')
study, ocean_backend, enrich_backend = self._test_study('enrich_cocom_analysis')

with self.assertLogs(logger, level='INFO') as cm:

if study.__name__ == "enrich_repo_analysis":
if study.__name__ == "enrich_cocom_analysis":
study(ocean_backend, enrich_backend)
self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.cocom:[cocom] Starting '
'enrich_repository_analysis study')
self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.cocom:[cocom] Ending '
'enrich_repository_analysis study')
self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.cocom:[enrich-cocom-analysis] Start '
'enrich_cocom_analysis study')
self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.cocom:[enrich-cocom-analysis] End '
'enrich_cocom_analysis study')


if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions tests/test_colic.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,9 @@ def test_colic_analysis_study(self):

if study.__name__ == "enrich_colic_analysis":
study(ocean_backend, enrich_backend)
self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.colic:[colic] Starting '
self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.colic:[enrich-colic-analysis] Start '
'enrich_colic_analysis study')
self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.colic:[colic] Ending '
self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.colic:[enrich-colic-analysis] End '
'enrich_colic_analysis study')


Expand Down

0 comments on commit 640ac44

Please sign in to comment.