Skip to content

Commit

Permalink
[graal:tests] Add appropriate tests for Graal integration (WIP)
Browse files Browse the repository at this point in the history
Signed-off-by: inishchith <[email protected]>
  • Loading branch information
inishchith committed Jul 31, 2019
1 parent 1308300 commit 20eb820
Show file tree
Hide file tree
Showing 8 changed files with 758 additions and 113 deletions.
123 changes: 65 additions & 58 deletions grimoire_elk/enriched/cocom.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,14 @@ def __add_derived_metrics(self, file_analysis, eitem):
""" Add derived metrics fields """

# TODO: Fix logic: when a denominator is 0 the derived metric should be None, not computed with the denominator clamped to 1
if None not in [eitem["loc"], eitem["comments"], eitem["num_funs"]]:
eitem["loc_per_comment_lines"] = eitem["loc"] / max(eitem["comments"], 1)
eitem["loc_per_blank_lines"] = eitem["loc"] / max(eitem["blanks"], 1)
eitem["loc_per_function"] = eitem["loc"] / max(eitem["num_funs"], 1)
if eitem["loc"] is not None and eitem["comments"] is not None and eitem["num_funs"] is not None:
eitem["comments_per_loc"] = round(eitem["comments"] / max(eitem["loc"], 1), 2)
eitem["blanks_per_loc"] = round(eitem["blanks"] / max(eitem["loc"], 1), 2)
eitem["loc_per_function"] = round(eitem["loc"] / max(eitem["num_funs"], 1), 2)
else:
eitem["loc_per_comment_lines"] = eitem["loc_per_blank_lines"] = eitem["loc_per_function"] = None
eitem["comments_per_loc"] = None
eitem["blanks_per_loc"] = None
eitem["loc_per_function"] = None

return eitem

Expand Down Expand Up @@ -176,7 +178,7 @@ def enrich_items(self, ocean_backend, events=False):
return num_items

def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
out_index="cocom_enrich_graal_repo", interval_months=3,
out_index="cocom_enrich_graal_repo", interval_months=[3],
date_field="grimoire_creation_date"):

logger.info("Doing enrich_repository_analysis study for index {}"
Expand All @@ -185,71 +187,76 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal
es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
in_index = enrich_backend.elastic.index
interval_months = list(map(int, interval_months))

unique_repos = es_in.search(
index=in_index,
body=get_unique_repository())

repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])]
current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
num_items = 0
ins_items = 0

for repository_url in repositories:
es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
evolution_items = []

to_month = get_to_date(es_in, in_index, out_index, repository_url)
to_month = to_month.replace(day=1, hour=0, minute=0, second=0)
current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)

while to_month < current_month:
files_at_time = es_in.search(
index=in_index,
body=get_files_at_time(repository_url, to_month.isoformat())
)['aggregations']['file_stats'].get("buckets", [])

if not len(files_at_time):
to_month = to_month + relativedelta(months=+interval_months)
continue

repository_name = repository_url.split("/")[-1]
evolution_item = {
"id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval_months),
"origin": repository_url,
"interval_months": interval_months,
"study_creation_date": to_month.isoformat(),
"total_files": len(files_at_time)
}

for file_ in files_at_time:
file_details = file_["1"]["hits"]["hits"][0]["_source"]

for metric in self.metrics:
total_metric = "total_" + metric
evolution_item[total_metric] = evolution_item.get(total_metric, 0)
evolution_item[total_metric] += file_details[metric] if file_details[metric] is not None else 0

# TODO: Fix logic: when a denominator is 0 the derived metric should be None, not computed with the denominator clamped to 1
evolution_item["total_loc_per_comment_lines"] = evolution_item["total_loc"] / \
max(evolution_item["total_comments"], 1)
evolution_item["total_loc_per_blank_lines"] = evolution_item["total_loc"] / max(evolution_item["total_blanks"], 1)
evolution_item["total_loc_per_function"] = evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1)

evolution_items.append(evolution_item)

if len(evolution_items) >= self.elastic.max_items_bulk:
num_items += len(evolution_items)
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
evolution_items = []
for interval in interval_months:

to_month = to_month + relativedelta(months=+interval_months)
to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0)

if len(evolution_items) > 0:
num_items += len(evolution_items)
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
while to_month < current_month:
files_at_time = es_in.search(
index=in_index,
body=get_files_at_time(repository_url, to_month.isoformat())
)['aggregations']['file_stats'].get("buckets", [])

if num_items != ins_items:
missing = num_items - ins_items
logger.error("%s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items))
else:
logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items))
if not len(files_at_time):
to_month = to_month + relativedelta(months=+interval)
continue

repository_name = repository_url.split("/")[-1]
evolution_item = {
"id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval),
"origin": repository_url,
"interval_months": interval,
"study_creation_date": to_month.isoformat(),
"total_files": len(files_at_time)
}

for file_ in files_at_time:
file_details = file_["1"]["hits"]["hits"][0]["_source"]

for metric in self.metrics:
total_metric = "total_" + metric
evolution_item[total_metric] = evolution_item.get(total_metric, 0)
evolution_item[total_metric] += file_details[metric] if file_details[metric] is not None else 0

# TODO: Fix logic: when a denominator is 0 the derived metric should be None, not computed with the denominator clamped to 1
evolution_item["total_comments_per_loc"] = round(
evolution_item["total_comments"] / max(evolution_item["total_loc"], 1), 2)
evolution_item["total_blanks_per_loc"] = round(
evolution_item["total_blanks"] / max(evolution_item["total_loc"], 1), 2)
evolution_item["total_loc_per_function"] = round(
evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1), 2)

evolution_items.append(evolution_item)

if len(evolution_items) >= self.elastic.max_items_bulk:
num_items += len(evolution_items)
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
evolution_items = []

to_month = to_month + relativedelta(months=+interval)

if len(evolution_items) > 0:
num_items += len(evolution_items)
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())

if num_items != ins_items:
missing = num_items - ins_items
logger.error("%s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items))
else:
logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items))
104 changes: 56 additions & 48 deletions grimoire_elk/enriched/colic.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def has_identities(self):
def get_field_unique_id(self):
return "id"

def get_licensed_files(repository_url, to_date):
def get_licensed_files(self, repository_url, to_date):
""" Retrieve all the licensed files until the to_date, corresponding
to the given repository.
"""
Expand Down Expand Up @@ -108,7 +108,7 @@ def get_licensed_files(repository_url, to_date):

return query_licensed_files

def get_copyrighted_files(repository_url, to_date):
def get_copyrighted_files(self, repository_url, to_date):
""" Retrieve all the copyrighted files until the to_date, corresponding
to the given repository.
"""
Expand Down Expand Up @@ -260,7 +260,7 @@ def enrich_items(self, ocean_backend, events=False):
return num_items

def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
out_index="colic_enrich_graal_repo", interval_months=3,
out_index="colic_enrich_graal_repo", interval_months=[3],
date_field="grimoire_creation_date"):

logger.info("Doing enrich_colic_analysis study for index {}"
Expand All @@ -269,66 +269,74 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
in_index = enrich_backend.elastic.index
interval_months = list(map(int, interval_months))

unique_repos = es_in.search(
index=in_index,
body=get_unique_repository())

repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])]
current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
num_items = 0
ins_items = 0

for repository_url in repositories:
es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
evolution_items = []

to_month = get_to_date(es_in, in_index, out_index, repository_url)
to_month = to_month.replace(day=1, hour=0, minute=0, second=0)
current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)

while to_month < current_month:
copyrighted_files_at_time = es_in.search(
index=in_index,
body=self.get_copyrighted_files(repository_url, to_month.isoformat()))

licensed_files_at_time = es_in.search(
index=in_index,
body=self.get_licensed_files(repository_url, to_month.isoformat()))

files_at_time = es_in.search(
index=in_index,
body=get_files_at_time(repository_url, to_month.isoformat()))

licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"])
copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"])
total_files = int(files_at_time["aggregations"]["1"]["value"])

repository_name = repository_url.split("/")[-1]
evolution_item = {
"id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval_months),
"origin": repository_url,
"interval_months": interval_months,
"study_creation_date": to_month.isoformat(),
"licensed_files": licensed_files,
"copyrighted_files": copyrighted_files,
"total_files": total_files
}
for interval in interval_months:

to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0)

while to_month < current_month:
copyrighted_files_at_time = es_in.search(
index=in_index,
body=self.get_copyrighted_files(repository_url, to_month.isoformat()))

licensed_files_at_time = es_in.search(
index=in_index,
body=self.get_licensed_files(repository_url, to_month.isoformat()))

files_at_time = es_in.search(
index=in_index,
body=get_files_at_time(repository_url, to_month.isoformat()))

licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"])
copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"])
# TODO: Fix - use a cheaper query; the full file_stats aggregation is fetched here only to count its buckets
total_files = len(files_at_time['aggregations']['file_stats'].get("buckets", []))

if not total_files:
to_month = to_month + relativedelta(months=+interval)
continue

repository_name = repository_url.split("/")[-1]
evolution_item = {
"id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval),
"origin": repository_url,
"interval_months": interval,
"study_creation_date": to_month.isoformat(),
"licensed_files": licensed_files,
"copyrighted_files": copyrighted_files,
"total_files": total_files
}

evolution_items.append(evolution_item)
evolution_items.append(evolution_item)

if len(evolution_items) >= self.elastic.max_items_bulk:
num_items += len(evolution_items)
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
evolution_items = []
if len(evolution_items) >= self.elastic.max_items_bulk:
num_items += len(evolution_items)
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
evolution_items = []

to_month = to_month + relativedelta(months=+interval_months)
to_month = to_month + relativedelta(months=+interval)

if len(evolution_items) > 0:
num_items += len(evolution_items)
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
if len(evolution_items) > 0:
num_items += len(evolution_items)
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())

if num_items != ins_items:
missing = num_items - ins_items
logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items))
else:
logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items))
if num_items != ins_items:
missing = num_items - ins_items
logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items))
else:
logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items))
14 changes: 8 additions & 6 deletions grimoire_elk/enriched/graal_study_evolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def get_unique_repository():
return query_unique_repository


def get_last_study_date(repository_url):
def get_last_study_date(repository_url, interval):
""" Retrieve the last study_creation_date of the item corresponding
to given repository from the study index.
"""
Expand All @@ -64,11 +64,15 @@ def get_last_study_date(repository_url):
"term": {
"origin.keyword": "%s"
}
},{
"term":{
"interval_months": "%s"
}
}]
}
}
}
""" % (repository_url)
""" % (repository_url, interval)

return query_last_study_date

Expand Down Expand Up @@ -117,8 +121,6 @@ def get_files_at_time(repository_url, to_date):
corresponding to the given repository.
"""

# TODO: Fix for interval month matching

query_files_at_time = """
{
"size": 0,
Expand Down Expand Up @@ -167,14 +169,14 @@ def get_files_at_time(repository_url, to_date):
return query_files_at_time


def get_to_date(es_in, in_index, out_index, repository_url):
def get_to_date(es_in, in_index, out_index, repository_url, interval):
""" Get the appropriate to_date value for incremental insertion. """
study_data_available = False

if es_in.indices.exists(index=out_index):
last_study_date = es_in.search(
index=out_index,
body=get_last_study_date(repository_url))["aggregations"]["1"]
body=get_last_study_date(repository_url, interval))["aggregations"]["1"]

if last_study_date["value"] is not None:
study_data_available = True
Expand Down
2 changes: 1 addition & 1 deletion grimoire_elk/raw/graal.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def get_elastic_mappings(es_major):


class GraalOcean(ElasticOcean):
"""CoLic Ocean feeder"""
"""Graal Ocean feeder"""

mapping = Mapping

Expand Down
Loading

0 comments on commit 20eb820

Please sign in to comment.