import os
import re
import ast
import json
import yaml
import requests
import argparse
from datetime import datetime
import pandas as pd
import numpy as np
from helpers import *
from automatic_keyword_generator import *
from collections import Counter
import math
import pdb
def get_column_names():
    """Return the names of the similarity-score columns.

    One name is produced for every (scholar feature, proposal section) pair
    in the form ``<feature>_<section>_sim``. The scholar feature varies
    slowest and the proposal section fastest, i.e. the result starts with
    ``Keywords_desc_sim``, ``Keywords_title_sim``, ``Keywords_dept_sim``,
    ``Overview_desc_sim``, ...

    :return: The 15 column names (5 scholar features x 3 proposal sections)
    :rtype: `list` of `str`
    """
    # Scholar-profile features that are compared against the proposal.
    feat_cols = [
        'Keywords',
        'Overview',
        'Organization',
        'pub_keyword',
        'pub_title']
    # Proposal sections: description, title and department.
    cfp = ["desc", "title", "dept"]
    return [i + "_" + j + "_sim" for i in feat_cols for j in cfp]
class Top_Scholar_Identifier():
    """This is a class to identify the top N scholars for a given proposal.

    The proposal dataset created using 'main_extractor.py' will be utilized
    to get details of the proposal / grant.
    The analytical dataset of user-publications created using
    'create_analytical_data.py' will be utilized to get scholar profiles.
    """

    # Scholar-profile columns of the analytical dataset that are compared
    # against the proposal keywords. The order must stay in step with
    # get_column_names(), which names the resulting similarity columns.
    _FEATURE_COLUMNS = (
        'Keywords',
        'Overview',
        'Organization',
        'pub_keyword',
        'pub_title',
    )

    def __init__(self, n_cores, id_no, top_k, generator_, agency, params):
        """ Constructor

        :param n_cores: No: of CPU cores to be used for the process
            (0 means: use ``params['CPU_COUNT']``)
        :type n_cores: `int`
        :param id_no: Opportunity Number of the proposal
            ('' means: use ``params['PROPOSAL_ID']``)
        :type id_no: `str`
        :param top_k: The number of scholars to be recommended
            (0 means: use ``params['top_k_scholars']``)
        :type top_k: `int`
        :param generator_: The generator to be used for keyword extraction
        :type generator_: `str`
        :param agency: The agency which is awarding the grant
            ('nsf' or 'nih', in any letter case)
        :type agency: `str`
        :param params: Parameters read from the configuration file
        :type params: `dict`
        :raises KeyError: If the agency is unknown or a required
            configuration key is missing
        """
        # Set the parameters; command line "empty" values fall back to the
        # configuration file.
        self.n_cores = params['CPU_COUNT'] if n_cores == 0 else n_cores
        self.output_path = params['OUTPUT_PATH']
        self.id_no = params['PROPOSAL_ID'] if id_no == '' else id_no
        self.top_k = params['top_k_scholars'] if top_k == 0 else top_k
        self.generator_ = generator_
        # Agency abbreviations are matched case-insensitively.
        agency_map = {
            'NSF': 'National Science Foundation',
            'NIH': 'National Institutes of Health'}
        agency_name = agency_map[agency.upper()]
        self.proposal_data_file = os.path.join(
            self.output_path,
            params['AGENCIES_EXTRACTED_FILENAME_DICT'][agency_name])
        self.analytical_filename = params["ANALYTICAL_DATSET"]
        self.scholars_filename = params["SCHOLARS_DATASET"]

    def read_data(self):
        """ Function which will read data from the initialized CSV files
        regarding proposal and scholar details and save them as pandas
        dataframes (``user_df``, ``ad``, ``cfp_df``). The row of the
        requested proposal is stored in ``proposal``.

        :return: None
        :raises IndexError: If no proposal with the requested Opportunity
            Number exists in the proposal dataset
        """
        # Read scholars' basic data
        self.user_df = pd.read_csv(
            os.path.join(self.output_path, self.scholars_filename))
        # Read scholars' publication data
        self.ad = pd.read_csv(
            os.path.join(self.output_path, self.analytical_filename))
        # Read proposal data; missing cells become a blank string
        self.cfp_df = pd.read_csv(self.proposal_data_file).fillna(" ")
        matches = self.cfp_df[
            self.cfp_df["Opportunity Number"] == self.id_no
        ].reset_index(drop=True)
        if matches.empty:
            # Same exception type a bare .iloc[0] would raise, but with a
            # message that tells the user what went wrong.
            raise IndexError(
                f"No proposal with Opportunity Number {self.id_no!r} "
                f"found in {self.proposal_data_file}")
        # If several rows share the Opportunity Number, the first one wins.
        self.proposal = matches.iloc[0]

    def _section_keys(self, section):
        """Extract the keywords of one proposal section, dropping keys of
        three characters or fewer."""
        return [
            key for key in get_keys(
                self.proposal[section],
                generator=self.generator_,
                ntop=self.top_k) if len(key) > 3]

    def get_section_keys_for_proposal(self):
        """ Function to get keywords from Description, Title and Department
        sections of the proposal text. The results are stored in
        ``desc_keys``, ``title_keys`` and ``dept_keys``.

        :return: None
        """
        # Get keys from the description of proposal
        self.desc_keys = self._section_keys("Description")
        # Get keys from the Title of proposal
        self.title_keys = self._section_keys("Title")
        # Get keys from the Department of proposal
        self.dept_keys = self._section_keys("Department")

    def get_top_scholars(self, ntop_=20):
        """ Main function to calculate the scholars suitable for the given
        proposal

        :param ntop_: Currently unused and kept for backward compatibility;
            the number of scholars returned is the ``top_k`` value resolved
            in the constructor
        :type ntop_: `int`
        :return: self.recommend_df, the rows of the scholars dataset for the
            best matching scholars, indexed by ``User_id`` and ordered from
            best to worst match
        :rtype: class `Pandas.DataFrame`
        """
        # Create a list of lists: one sublist per (scholar feature, proposal
        # section) pair. Each sublist holds, for every row of the analytical
        # dataset, a tuple (user id, the user's feature value, proposal keys).
        n_rows = self.ad.shape[0]
        proposal_key_sets = [self.desc_keys, self.title_keys, self.dept_keys]
        self.similarity_lists = [
            list(zip(self.ad["user_id"],
                     self.ad[feature],
                     [proposal_keys] * n_rows))
            for feature in self._FEATURE_COLUMNS
            for proposal_keys in proposal_key_sets]

        # Run counter cosine similarity as parallel tasks
        self.score_lists = [
            parallelize(
                self.n_cores,
                func=counter_cosine_similarity,
                arg1=task)
            for task in self.similarity_lists]

        # Each result is read as a single-entry mapping {user_id: score}.
        # NOTE(review): this relies on every task returning its results in
        # the same row order - confirm against parallelize().
        col_names = get_column_names()
        self.sim_df = pd.DataFrame(
            {"user_id": [next(iter(res)) for res in self.score_lists[0]]})
        for col, scores in zip(col_names, self.score_lists):
            self.sim_df[col] = [next(iter(res.values())) for res in scores]

        # Total similarity is the sum of the similarity columns only; the
        # user_id column must not take part in the score.
        self.sim_df["total_sim"] = self.sim_df[col_names].sum(axis=1)
        print("Max of self.sim[total_sim] :", self.sim_df["total_sim"].max())
        self.sim_df.sort_values("total_sim", ascending=False, inplace=True)
        self.sub_df = self.sim_df[:self.top_k]
        self.ids = self.sub_df["user_id"].values.tolist()

        # Create dataframe with only top scholars, in ranking order
        is_top = self.user_df["User_id"].isin(self.ids)
        self.recommend_df = (
            self.user_df[is_top].set_index("User_id").loc[self.ids])
        return self.recommend_df
if __name__ == "__main__":
    # Read arguments from command line (cmd). If no input via cmd, use the
    # values from the config file.
    import sys  # needed for sys.exit() below

    parser = argparse.ArgumentParser(description="Parameter file")
    parser.add_argument(
        '--config_file',
        metavar='FILENAME',
        type=str,
        default='config.yml',
        help='Parameter file name in yaml format')
    parser.add_argument(
        '--top_k',
        metavar='TOP_K_SCHOLARS',
        type=int,
        default=0,
        help='Enter an integer K (K>0) to identify the top K scholars')
    parser.add_argument(
        '--proposal_id',
        metavar='PROPOSAL_ID',
        type=str,
        default='',
        help='ID of proposal for which scholars are to be identified')
    parser.add_argument(
        '--generator',
        metavar='KEYWORD_GENERATOR',
        type=str,
        default="Spacy",
        help='Generator for automatic keyword extraction')
    parser.add_argument(
        '--n_cores',
        metavar='CPU_COUNT',
        type=int,
        default=0,
        help='No of CPU threads to be used')
    parser.add_argument(
        '--agency',
        metavar='AGENCY',
        type=str,
        choices=[
            'nsf',
            'nih',
            'NSF',
            'NIH'],
        required=True,
        help='Agencies whose proposals are to be extracted')
    args = parser.parse_args()

    # Read configuration file. If not successful end the program.
    # Only file, decoding and YAML errors are handled here, so that
    # Ctrl-C and programming errors are not silently turned into exit(1).
    try:
        with open(args.config_file) as config_stream:
            params = yaml.safe_load(config_stream)
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        print(f'Error loading parameter file: {args.config_file}.')
        sys.exit(1)

    # Initialize a class object with all parameters
    obj = Top_Scholar_Identifier(
        n_cores=args.n_cores,
        agency=args.agency,
        id_no=args.proposal_id,
        top_k=args.top_k,
        generator_=args.generator,
        params=params)
    # Report the effective proposal ID (command line value, or the config
    # file fallback when --proposal_id was not given).
    print("\n\nRecommending Scholars for Proposal ID : ", obj.id_no)

    # Read the CSV files with the proposal and scholar details
    obj.read_data()
    # Extract keyword for proposal
    obj.get_section_keys_for_proposal()
    # Get recommendations. The number of scholars returned follows the
    # object's top_k setting; ntop_ does not affect the result.
    recommendations = obj.get_top_scholars(ntop_=20)
    # Save the recommendation
    # NOTE(review): recommendations is indexed by "User_id"; if
    # save_pandas_to_csv forwards index=False to DataFrame.to_csv, the
    # scholar ids are not written to the file - confirm this is intended.
    save_pandas_to_csv(
        df=recommendations,
        output_path=os.path.join(
            obj.output_path,
            params['PROPOSAL_RECOMMENDATIONS_FILENAME']),
        index=False)