__init__.py

# -*- coding: utf-8 -*-
"""
@author: Francesco Multari

Vocabulary as a Service
"""
import flask as flask
from flask import Flask, request, render_template
import numpy as np
import json as json
import pandas as pd
import requests
from df2gspread import gspread2df, df2gspread
import multiprocessing


import siris_tools as st

# Create the Flask application object; the @app.route decorators further down
# register their view functions on this instance.
app=Flask(__name__)
# Explicitly set the folder (named 'static') used for static assets.
app.static_folder = 'static'


def _countTable(frame, groupColumn, label):
    """Count non-null ``id`` values per distinct value of ``groupColumn``.

    Args:
        frame: DataFrame containing at least ``groupColumn`` and ``id``.
        groupColumn: name of the column to group by.
        label: name given to the grouping column in the returned table.

    Returns:
        DataFrame with columns ``[label, "Count", "CountPerc"]`` sorted by
        ``Count`` in descending order, with a fresh 0..n-1 index.
        ``CountPerc`` is the share of the total count as a whole percentage.
    """
    counts = frame.groupby([groupColumn])['id'].count()
    table = pd.DataFrame({label: counts.index, "Count": counts.values})

    # Scale to percent *before* rounding. Rounding the ratio to two decimals
    # and multiplying by 100 afterwards reintroduces binary floating-point
    # noise (0.29 * 100 == 28.999999999999996).
    table["CountPerc"] = (table["Count"] / table["Count"].sum() * 100).round()

    return table.sort_values(["Count"], ascending=False).reset_index(drop=True)


def performStatics(tags_joined):
    """Build the per-Taxon and per-keyword frequency tables.

    Args:
        tags_joined: DataFrame with (at least) the columns ``id``, ``Taxon``
            and ``keyword``; one row per tag occurrence.

    Returns:
        A tuple ``(taxon_table, keyword_table)``. The first has the columns
        ``Taxon, Count, CountPerc``; the second ``Keyword, Count, CountPerc``.
        Both are sorted by ``Count`` descending.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    return (
        _countTable(tags_joined, "Taxon", "Taxon"),
        _countTable(tags_joined, "keyword", "Keyword"),
    )


def _tidyTags(results, vocabulary, dataset):
    """Turn raw tagger output into per-tag rows and a per-document sheet.

    Args:
        results: output of ``tagTextCollection``; expanded with
            ``apply(pd.Series).stack()``.
            NOTE(review): the later merge on ``id`` presumably relies on the
            index of ``results`` being named ``id`` -- confirm against
            siris_tools.
        vocabulary: DataFrame providing the ``ID`` -> ``keyword`` mapping.
        dataset: DataFrame joined to the tags on ``id``.

    Returns:
        ``(tags_joined, sheet)``: the tag rows inner-joined with ``dataset``,
        and a frame with one row per ``(id, fulltext)`` whose ``keyword``
        column holds the comma-joined keywords.
    """
    tags = (
        results.apply(pd.Series)
        .stack()
        .reset_index()
        .rename(columns={0: 'ID', 'level_0': 'eid'})
        .drop(columns='level_1')
        .merge(vocabulary[['ID', 'keyword']], on='ID')
    )
    tags_joined = pd.merge(tags, dataset, on="id", how="inner")

    sheet = tags_joined[["id", "keyword", "fulltext"]]
    sheet = sheet.groupby(["id", "fulltext"])["keyword"].apply(','.join).reset_index()
    return tags_joined, sheet


def applyscriptCombiningLemma(vocabulary,outputHash,sheetNameOutput,dataset):
    """Tag ``dataset`` with ``vocabulary`` and upload the results.

    The dataset is tagged twice with ``st.SDGTagger``: once with
    ``lemmatize_text=True`` and once with ``lemmatize_text=False``. The
    following worksheets are uploaded to the spreadsheet ``outputHash``:

    * ``<sheetNameOutput>_V1``   -- keywords found by the lemmatized pass
      (an empty sheet if that pass could not be tidied up);
    * ``<sheetNameOutput>_V2-1`` -- keywords found by the non-lemmatized pass
      (skipped if that pass could not be tidied up);
    * ``<sheetNameOutput>_V3``, ``<sheetNameOutput>-stat_V3`` and
      ``<sheetNameOutput>-stat_keyword_V3`` -- both passes concatenated, plus
      the statistics tables computed by ``performStatics``.

    Side effect: ``dataset['fulltext']`` is cast to ``str`` in place.

    Args:
        vocabulary: DataFrame with at least the columns ``ID`` and ``keyword``.
        outputHash: identifier of the output spreadsheet.
        sheetNameOutput: prefix for the uploaded worksheet names.
        dataset: DataFrame with at least ``id`` and ``fulltext`` (``Taxon`` is
            needed for the statistics).

    Returns:
        A status string starting with ``"ERROR-"`` when the first upload
        fails, otherwise one starting with ``"DONE"``.
    """
    tagger = st.SDGTagger(vocabulary, lemmatize_text=True)
    cores = int(multiprocessing.cpu_count())

    dataset['fulltext'] = dataset['fulltext'].astype(str)
    results_True = tagger.tagTextCollection(dataset, text_id='id', text_column='fulltext', n_cores=cores)

    print ("RESULTS:")
    print (results_True.head())

    # Tidy up the tags. Any failure in the pipeline is treated as "no tags".
    # Only Exception is caught so that SystemExit / KeyboardInterrupt are not
    # silently swallowed.
    try:
        tags_joined_True, sheet_True = _tidyTags(results_True, vocabulary, dataset)
    except Exception as exc:
        print ("Could not tidy up the lemmatized tags:", repr(exc))
        tags_joined_True = pd.DataFrame(None)
        sheet_True = pd.DataFrame(columns=["id", "fulltext", "keyword"])

    if uploadDataframeToAGoogleSheet(sheet_True, outputHash, sheetNameOutput + "_V1") == "error":
        return ("ERROR-Error tyring  to open nonexistent or inaccessible spreadsheet for the output DATASET!")

    ############# Lemmatize False

    tagger = st.SDGTagger(vocabulary, lemmatize_text=False)
    results_False = tagger.tagTextCollection(dataset, text_id='id', text_column='fulltext', n_cores=cores)

    try:
        tags_joined_False, sheet_False = _tidyTags(results_False, vocabulary, dataset)

        # The per-pass statistics are not uploaded, but the call is kept
        # inside the try: if the statistics cannot be computed for this pass,
        # the pass is discarded and only the lemmatized results are used.
        performStatics(tags_joined_False)

        uploadDataframeToAGoogleSheet(sheet_False, outputHash, sheetNameOutput + "_V2-1")

        final_sheet = pd.concat([sheet_True, sheet_False]).reset_index(drop=True)
        final_tags = pd.concat([tags_joined_True, tags_joined_False]).reset_index(drop=True)
    except Exception as exc:
        print ("Non-lemmatized pass discarded:", repr(exc))
        final_sheet = sheet_True.copy()
        final_tags = tags_joined_True.copy()

    no_results = final_tags.empty
    if no_results:
        # Upload empty tables with the expected headers.
        final_sheet = pd.DataFrame(columns=["id", "fulltext", "keyword"])
        sheet_stat_toupload = pd.DataFrame(columns=["Taxon", "Count", "CountPerc"])
        sheet_stat_keyword_toupload = pd.DataFrame(columns=["Keyword", "Count", "CountPerc"])
    else:
        sheet_stat_toupload, sheet_stat_keyword_toupload = performStatics(final_tags)

    uploadDataframeToAGoogleSheet(final_sheet, outputHash, sheetNameOutput + "_V3")
    uploadDataframeToAGoogleSheet(sheet_stat_toupload, outputHash, sheetNameOutput + "-stat_V3")
    uploadDataframeToAGoogleSheet(sheet_stat_keyword_toupload, outputHash, sheetNameOutput + "-stat_keyword_V3")

    if no_results:
        return ("DONE - Process ended correctly. No results founded for the given vocabulary.")

    return ("DONE - Process ended correctly")
    

#utility function used to read the Google sheets content
def downalodFromGoogleSheet(hashId,sheetName):
    """Download a Google Sheets worksheet into a DataFrame.

    The first row is used as column names (``col_names=True``) and cells
    holding an empty string are replaced with NaN.

    Args:
        hashId: spreadsheet identifier handed to ``gspread2df.download``.
        sheetName: worksheet name. When it is the empty string no worksheet
            argument is passed, so the library's default applies.

    Returns:
        The downloaded DataFrame on success; the string ``"ERR"`` if the
        download raised ``SystemExit``; ``None`` if it raised
        ``RuntimeError``. Callers must check for both failure values.
    """
    # Only forward the worksheet name when one was actually supplied.
    target = (hashId,) if sheetName == "" else (hashId, sheetName)

    try:
        sheet = gspread2df.download(*target, col_names=True, row_names=False)
        # Use numpy directly: the ``pandas.np`` alias was deprecated in
        # pandas 1.0 and removed in pandas 2.0.
        return sheet.replace('', np.nan)
    except SystemExit:
        return ("ERR")
    except RuntimeError:
        return None


def uploadDataframeToAGoogleSheet(df,hashId,sheetName):
    """Upload ``df`` to worksheet ``sheetName`` of spreadsheet ``hashId``.

    Column names are written as a header row; the index is not written.

    Returns:
        ``"error"`` when ``df2gspread.upload`` returns ``None``, otherwise
        ``"ok"``.
    """
    outcome = df2gspread.upload(df, hashId, sheetName, col_names=True, row_names=False)
    return "error" if outcome is None else "ok"

def _downloadFailed(result):
    """Tell whether a ``downalodFromGoogleSheet`` result signals a failure.

    That function returns ``None`` or the string ``"ERR"`` on failure and a
    DataFrame on success, so any string or ``None`` counts as failed.
    """
    return result is None or isinstance(result, str)


@app.route('/',methods=["POST"])
def runScript():
    """Handle the POST form: load vocabulary and dataset, then run the tagging.

    Request fields read: ``inputHash``, ``sheetNameInput`` (vocabulary),
    ``outputHash``, ``sheetNameOutput`` (destination), ``dt`` (dataset
    selector: ``h2020``, ``regpub``, ``unipub`` or ``other``) and, for
    ``other``, ``datasetInput`` and ``sheetNameDataset``.

    Returns:
        ``"OK"`` on success, otherwise a message starting with ``"ERROR-"``.
    """
    #reading fields from the request
    inputHash=request.values.get('inputHash')
    sheetNameInput=request.values.get('sheetNameInput')
    outputHash=request.values.get('outputHash')
    sheetNameOutput=request.values.get("sheetNameOutput")
    dataset=request.values.get("dt")

    vocDownloaded=downalodFromGoogleSheet(inputHash,sheetNameInput)
    # Both failure values (None and "ERR") must be rejected here, otherwise
    # the column check below fails on a non-DataFrame.
    if _downloadFailed(vocDownloaded):
        return ("ERROR-Error tyring  to open nonexistent or inaccessible spreadsheet for the Vocabulary!")

    #sanity check
    if not all(c in vocDownloaded.columns for c in ("ID","keyword","extra")):
        return ("ERROR-Vocabulary must contains at least the columns: ID, keyword, extra.<br> An example of vocabulary is available here: https://docs.google.com/spreadsheets/d/1eLmPrRLZ3BNDP20eM-M-Tjx9tGeRjkMs8a-lNCyhRKI")

    pathUbuntu="/home/ubuntu/webapps/dyc-dynamic-controlled-vocabulary/"

    if dataset=="h2020":
        df=pd.read_csv(pathUbuntu+"static/data/csv/h2020_projects.csv",encoding="latin1")
    elif dataset=="regpub":
        df=pd.read_csv(pathUbuntu+"static/data/csv/regional_publications.csv")
    elif dataset=="unipub":
        df=pd.read_csv(pathUbuntu+"static/data/csv/university_publications.csv")
    elif dataset=="other":
        datasetInput=request.values.get("datasetInput")
        sheetNameDataset=request.values.get("sheetNameDataset")
        df=downalodFromGoogleSheet(datasetInput,sheetNameDataset)
        if _downloadFailed(df):
            return ("ERROR-Error tyring  to open nonexistent or inaccessible spreadsheet for the DATASET!")

        if not all(c in df.columns for c in ("id","Taxon","fulltext")):
            return ("ERROR-The dataset must contains at least the columns: id,Taxon,fulltext.<br/> An example of dataset is available here: https://drive.google.com/drive/folders/10e8fBA4JP80g6bCzW1lwipUGg7q12uEg")
    else:
        # A missing or unrecognised selector used to leave a placeholder
        # integer in ``df`` and crash inside the tagger; report it instead.
        return ("ERROR-Unknown dataset selected!")

    # Check that the output spreadsheet is reachable before the tagging run.
    if _downloadFailed(downalodFromGoogleSheet(outputHash,"")):
        return ("ERROR-Error tyring  to open nonexistent or inaccessible spreadsheet for the output DATASET!")

    outcome=applyscriptCombiningLemma(vocDownloaded,outputHash,sheetNameOutput,df)

    # Surface errors reported by the tagging step instead of always claiming
    # success; the success response stays "OK".
    if isinstance(outcome, str) and outcome.startswith("ERROR"):
        return outcome

    return ("OK")

@app.route('/',methods=["GET"])
def home():
    """Serve GET requests on ``/`` by rendering the ``index.html`` template."""
    page = flask.render_template('index.html')
    return page


# When the module is executed directly, start Flask's built-in server
# listening on every network interface (0.0.0.0) at port 8080.
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=8080)
# Alternative kept for reference: app.run() with Flask's default host/port.