Source code for user_profile_creation
import os
import sys
import json
import yaml
import argparse
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm
from helpers import extract_json, save_pandas_to_csv
import pdb
def get_userid(user_dict, key):
    """ Function to get a User ID (or any other field) from a JSON record
    :param user_dict: Decoded JSON record of a single user
    :type user_dict: `dict`
    :param key: Key under which the ID is stored in ``user_dict``
    :type key: `str`
    :return: The value stored under ``key``, or ``numpy.nan`` when the key
        is absent or ``user_dict`` cannot be indexed with it
    :rtype: `str` or `float`
    """
    try:
        return user_dict[key]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures are treated as "no id". KeyboardInterrupt and
        # SystemExit must propagate instead of being reported as a missing id.
        # The ``np.nan`` singleton itself is returned because callers filter
        # on identity (``is not np.nan``).
        return np.nan
class extract_user_profiles():
    """ Class which can extract profiles of all users from a university

    The constructor reads its configuration from the module-level ``params``
    mapping, so ``params`` must be defined before an instance is created.
    """

    # Column order of the one-row frame produced for every scholar.
    PROFILE_COLUMNS = (
        "User_id",
        "Netid",
        "Name",
        "Email",
        "Type",
        "Overview",
        "Keywords",
        "n_publications",
        "Publications",
        "Research",
        "n_research",
        "Awards",
        "n_awards",
        "Organizations",
        "Course",
        "Department",
    )

    # Headers sent with every profile request.
    REQUEST_HEADERS = {
        'accept': 'application/json, text/plain, */*'
    }

    # Seconds to wait for the server before giving up on a profile request;
    # without a timeout a stalled connection would block the run forever.
    REQUEST_TIMEOUT = 60

    # Errors that mean "this field is missing or has an unexpected shape".
    # Deliberately excludes KeyboardInterrupt / SystemExit.
    _FIELD_ERRORS = (KeyError, IndexError, TypeError)

    def __init__(self, univ_name, output_path):
        """ Constructor
        :param univ_name: Name of the university; used as key into
            ``params['UNIV_DETAILS']``
        :type univ_name: `str`
        :param output_path: Output directory. An empty string selects
            ``params['OUTPUT_PATH']``. The path is resolved relative to the
            directory containing this module.
        :type output_path: `str`
        :return: None
        """
        chosen_path = params['OUTPUT_PATH'] if output_path == '' else output_path
        self.output_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), chosen_path)

        # Use the constructor argument rather than the global command-line
        # namespace, so the class also works when instantiated from code.
        univ_details = params['UNIV_DETAILS'][univ_name]
        self.profile_url = univ_details['PROFILE_URL']
        self.base_url = univ_details['BASE_URL']
        self.end_url = univ_details['END_URL']

        # NOTE(review): the third argument is presumably the number of
        # records to request -- confirm against helpers.extract_json.
        self.sub_json = extract_json(self.base_url, self.end_url, 1)
        self.n_scholars = self.sub_json['page']['totalElements']
        self.scholars_dataset = params['SCHOLARS_DATASET']
        print("Total Scholars: ", self.n_scholars)

        # makedirs also creates missing parent directories and tolerates an
        # already existing directory.
        os.makedirs(self.output_path, exist_ok=True)

    def _field_or_none(self, key):
        """ Return ``self.user_dict[key]``, or None when it is unavailable """
        try:
            return self.user_dict[key]
        except self._FIELD_ERRORS:
            return None

    def _joined_labels(self, key):
        """ Join the ``label`` of every entry under ``key`` with ``||``
        :return: (joined labels, number of labels), or (None, 0) on failure
        """
        try:
            labels = [entry['label'] for entry in self.user_dict[key]]
            return "||".join(labels), len(labels)
        except self._FIELD_ERRORS:
            return None, 0

    def extract_info(self, url, user_id):
        """ Function to download a particular user's information and store it
        on the instance as ``self.user_dict``
        :param url: Base profile URL to which ``user_id`` is appended
        :type url: `str`
        :param user_id: ID of the particular scholar
        :type user_id: `str`
        :return: None
        """
        self.user_url = url + user_id
        response = requests.request(
            "GET",
            self.user_url,
            headers=self.REQUEST_HEADERS,
            timeout=self.REQUEST_TIMEOUT)
        self.user_dict = json.loads(response.text)
        return None

    def get_name(self):
        """ Function to extract name of the Scholar from University Page
        :return: Name of the Scholar
        :rtype: `str`
        :raises KeyError: if the profile has no ``name`` field
        """
        return self.user_dict['name']

    def get_email(self):
        """ Function to extract email of the Scholar from University Page
        :return: Email of the Scholar, or None when missing
        :rtype: `str`
        """
        return self._field_or_none('primaryEmail')

    def get_title(self):
        """ Function to extract Preferred title of the Scholar from University Page
        :return: Preferred title of the Scholar
        :rtype: `str`
        :raises KeyError: if the profile has no ``preferredTitle`` field
        """
        return self.user_dict['preferredTitle']

    def get_department_info(self):
        """ Function to extract Department info (including course area) of the Scholar from University Page

        For every position the label of its first organization is collected
        as course area and the label of that organization's first parent as
        department.
        :return: (course areas, departments), each joined with ``||``;
            (None, None) when the profile has no usable ``positions`` field
        :rtype: `Tuple (str, str)`
        """
        try:
            positions = self.user_dict["positions"]
            course_area = []
            department = []
            try:
                for position in positions:
                    organization = position['organizations'][0]
                    course_area.append(organization['label'])
                    department.append(organization['parent'][0]['label'])
            except self._FIELD_ERRORS:
                # Best effort: collection stops at the first malformed
                # position and whatever was gathered so far is returned.
                # NOTE(review): the two lists can then differ in length by
                # one (course label stored, department missing).
                pass
            return "||".join(course_area), "||".join(department)
        except self._FIELD_ERRORS:
            return None, None

    def get_overview(self):
        """ Function to extract Overview of the Scholar from University Page
        :return: Overview of the Scholar, or None when missing
        :rtype: `str`
        """
        return self._field_or_none('overview')

    def get_keywords(self):
        """ Function to extract keywords of the Scholar from University Page
        :return: Keywords of the Scholar joined with ``||``, or None when
            missing
        :rtype: `str`
        """
        try:
            return "||".join(self.user_dict['keywords'])
        except self._FIELD_ERRORS:
            return None

    def get_npublications(self):
        """ Function to get the number of publications of the Scholar from University Page
        :return: Number of publications, or None when missing
        :rtype: `int`
        """
        try:
            return len(self.user_dict["publications"])
        except self._FIELD_ERRORS:
            return None

    def get_publications(self):
        """ Function to extract publications of the Scholar from University Page
        :return: Publications of the Scholar serialized as a JSON string, or
            None when missing
        :rtype: `str`
        """
        try:
            return json.dumps(self.user_dict['publications'])
        except self._FIELD_ERRORS:
            return None

    def get_research(self):
        """ Function to extract Research areas of the Scholar from University Page
        :return: Research areas joined with ``||``, number of research areas;
            (None, 0) when missing
        :rtype: `Tuple (str, int)`
        """
        return self._joined_labels("researcherOn")

    def get_awards(self):
        """ Function to extract Awards and Honors of the Scholar from University Page
        :return: Awards joined with ``||``, number of awards; (None, 0) when
            missing
        :rtype: `Tuple (str, int)`
        """
        return self._joined_labels("awardsAndHonors")

    def get_organizations(self):
        """ Function to extract Organizations of the Scholar from University Page
        :return: Raw ``organizations`` field of the profile, or None when
            missing
        """
        return self._field_or_none('organizations')

    def get_department(self):
        """ Function to extract Department of the Scholar from University Page
        :return: Raw ``schools`` field of the profile, or None when missing
        """
        return self._field_or_none('schools')

    def get_netid(self):
        """ Function to extract NetID (University Unique Identifier) of the Scholar from University Page
        :return: NetID of the Scholar, or None when missing
        :rtype: `str`
        """
        return self._field_or_none('netid')

    def _build_profile(self, user_id):
        """ Build the one-row profile frame from the already loaded ``self.user_dict``
        :param user_id: The university provided User ID of the scholar
        :type user_id: `str`
        :return: One-row frame with ``PROFILE_COLUMNS``, or an empty frame
            with the same columns when a mandatory field is missing
        :rtype: `Pandas.DataFrame`
        """
        course, dept = self.get_department_info()
        research, r_len = self.get_research()
        awards, a_len = self.get_awards()
        try:
            org = "||".join(self.get_organizations())
        except TypeError:
            # Field missing (None) or not a list of strings.
            org = None
        try:
            return pd.DataFrame({
                "User_id": user_id,
                "Netid": self.get_netid(),
                "Name": self.get_name(),
                "Email": self.get_email(),
                "Type": self.get_title(),
                "Overview": self.get_overview(),
                "Keywords": self.get_keywords(),
                "n_publications": self.get_npublications(),
                "Publications": self.get_publications(),
                "Research": research,
                "n_research": r_len,
                "Awards": awards,
                "n_awards": a_len,
                "Organizations": org,
                "Course": course,
                "Department": dept
            }, index=[0])
        except (KeyError, IndexError, TypeError, ValueError):
            # Name/title missing or a value that does not fit a single row:
            # contribute no row, but keep the schema so concatenation with
            # the other profiles stays consistent.
            return pd.DataFrame(columns=list(self.PROFILE_COLUMNS))

    def get_profile(self, url, user_id):
        """ Function to extract all details of a scholar from University Page
        :param url: The base university URL from which Scholars' data can be extracted by appending their user_ids
        :type url: `str`
        :param user_id: The university provided User ID of the scholar
        :type user_id: `str`
        :return: Scholar Data in the form of Pandas.DataFrame (empty, with
            the profile columns, when the profile cannot be assembled)
        :rtype: `Pandas.DataFrame`
        """
        self.extract_info(url, user_id)
        return self._build_profile(user_id)

    def extract_profiles(self):
        """ Function to compile Scholar data of a particular university.
        All scholar records are requested in one call, the ID of each scholar
        is read from them, every profile is downloaded, and the combined
        table is saved as CSV in the output directory.
        :return: None
        """
        # From the main URL, list every scholar and collect their ids
        final_json = extract_json(self.base_url, self.end_url, self.n_scholars)
        user_jsons = final_json['_embedded']['individual']
        user_ids = [get_userid(d, "id") for d in user_jsons]
        # get_userid signals a missing id with the np.nan singleton
        user_ids = [i for i in user_ids if i is not np.nan]
        print("Total ids extracted for scholars", len(user_ids))

        # For each scholar, go to his/her summary page and extract relevant data
        profiles = [
            self.get_profile(self.profile_url, idx) for idx in tqdm(user_ids)
        ]

        # Save User profiles. Empty frames (failed profiles) carry no rows,
        # and pd.concat refuses an empty list, so handle both explicitly.
        frames = [frame for frame in profiles if not frame.empty]
        if frames:
            df = pd.concat(frames).reset_index(drop=True)
        else:
            df = pd.DataFrame(columns=list(self.PROFILE_COLUMNS))
        save_pandas_to_csv(
            df=df,
            output_path=os.path.join(
                self.output_path,
                self.scholars_dataset),
            index=False)
if __name__ == "__main__":
    # Script entry point: parse the command line, load the YAML parameter
    # file and export the scholar profiles of the selected university.
    #
    # `args` and `params` are intentionally left as module-level names:
    # extract_user_profiles.__init__ reads `params` from module scope.
    parser = argparse.ArgumentParser(description="Parameter file")
    parser.add_argument(
        '--config_file',
        metavar='FILENAME',
        type=str,
        default='config.yml',
        help='Parameter file name in yaml format')
    parser.add_argument(
        '--univ_name',
        metavar='UNIV_NAME',
        type=str,
        default='TAMU',
        choices=[
            'TAMU',
            'UFL'],
        help='NAME of University')
    # An empty output path makes the extractor fall back to the OUTPUT_PATH
    # entry of the parameter file.
    parser.add_argument(
        '--output_path',
        metavar='OUTPUT_PATH',
        type=str,
        default='',
        help='Path for saving output file')
    args = parser.parse_args()
    print("\n\nCreating User Profile")
    try:
        # The context manager closes the file even when parsing fails.
        with open(args.config_file) as config_handle:
            params = yaml.safe_load(config_handle)
    except (OSError, yaml.YAMLError):
        # Only a missing/unreadable file or invalid YAML is reported here;
        # Ctrl-C and other unrelated errors are not masked as config errors.
        print(f'Error loading parameter file: {args.config_file}.')
        sys.exit(1)
    profile_extractor_object = extract_user_profiles(
        args.univ_name, args.output_path)
    profile_extractor_object.extract_profiles()
    print("TASK COMPLETED : Successfully created User profiles")