Source code for user_profile_creation

import os
import sys

import json
import yaml
import argparse
import requests

import pandas as pd
import numpy as np

from tqdm import tqdm

from helpers import extract_json, save_pandas_to_csv

import pdb


def get_userid(user_dict, key):
    """
    Function to get a User ID (or any other field) from a scholar JSON record

    :param user_dict: Parsed JSON record of a single scholar
    :type user_dict: `dict`
    :param key: Name of the field that holds the ID
    :type key: `str`
    :return: Value stored under ``key``; ``np.nan`` when the field is
        missing or ``user_dict`` cannot be indexed with ``key``
    :rtype: `str` or `float`
    """
    try:
        return user_dict[key]
    except (KeyError, IndexError, TypeError):
        # Hand back the np.nan object itself (not float('nan')): the caller
        # in extract_profiles filters missing ids with `is not np.nan`.
        return np.nan
class extract_user_profiles():
    """
    Class which can extract profiles of all users from a university

    The scholar listing is fetched through ``extract_json`` and every
    individual profile is downloaded from ``PROFILE_URL + user_id``. The
    compiled table is written with ``save_pandas_to_csv``.
    """

    # Seconds to wait for a profile response; without a timeout a stalled
    # connection would block the whole extraction run indefinitely.
    REQUEST_TIMEOUT = 30

    # Column layout of the per-scholar frame built by get_profile.
    PROFILE_COLUMNS = [
        "User_id",
        "Netid",
        "Name",
        "Email",
        "Type",
        "Overview",
        "Keywords",
        "n_publications",
        "Publications",
        "Research",
        "n_research",
        "Awards",
        "n_awards",
        "Organizations",
        "Course",
        "Department",
    ]

    # Errors raised when a profile lacks a field or the field has an
    # unexpected shape. Deliberately narrower than BaseException so that
    # KeyboardInterrupt / SystemExit still stop the run.
    _MISSING_DATA_ERRORS = (KeyError, IndexError, TypeError)

    def __init__(self, univ_name, output_path, config=None):
        """
        Constructor

        :param univ_name: Name of the university; must be a key of
            ``UNIV_DETAILS`` in the configuration
        :type univ_name: `str`
        :param output_path: Folder for the output file, resolved relative to
            this module's directory. An empty string selects ``OUTPUT_PATH``
            from the configuration
        :type output_path: `str`
        :param config: Parsed configuration. When omitted, the module-level
            ``params`` loaded by the command line entry point is used
        :type config: `dict`
        :return: None
        """
        settings = params if config is None else config
        # Use the constructor argument, not the command line namespace, so
        # the class also works when it is instantiated from other code.
        univ_details = settings['UNIV_DETAILS'][univ_name]

        chosen_path = settings['OUTPUT_PATH'] if output_path == '' else output_path
        self.output_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), chosen_path)

        self.profile_url = univ_details['PROFILE_URL']
        self.base_url = univ_details['BASE_URL']
        self.end_url = univ_details['END_URL']

        # A listing request of size 1 is enough to read the total count.
        self.sub_json = extract_json(self.base_url, self.end_url, 1)
        self.n_scholars = self.sub_json['page']['totalElements']
        self.scholars_dataset = settings['SCHOLARS_DATASET']

        print("Total Scholars: ", self.n_scholars)

        # makedirs also creates missing parent folders and tolerates an
        # already existing directory.
        os.makedirs(self.output_path, exist_ok=True)

    def extract_info(self, url, user_id):
        """
        Function to download a particular user's profile and keep the parsed
        JSON in ``self.user_dict`` (the request URL is kept in
        ``self.user_url``)

        :param url: The profile URL prefix to which the ID is appended
        :type url: `str`
        :param user_id: ID of the particular scholar
        :type user_id: `str`
        :return: None
        """
        self.user_url = url + user_id
        headers = {'accept': 'application/json, text/plain, */*'}
        response = requests.get(
            self.user_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        self.user_dict = json.loads(response.text)
        return None

    def _optional_field(self, key):
        """
        Return ``self.user_dict[key]``, or None when it is not available
        """
        try:
            return self.user_dict[key]
        except self._MISSING_DATA_ERRORS:
            return None

    def _joined_labels(self, key):
        """
        Join the ``label`` of every entry under ``key`` with ``||``

        :return: Joined labels and their count; ``(None, 0)`` when the field
            is missing or malformed
        :rtype: `Tuple (str, int)`
        """
        try:
            labels = [entry['label'] for entry in self.user_dict[key]]
            return "||".join(labels), len(labels)
        except self._MISSING_DATA_ERRORS:
            return None, 0

    def get_name(self):
        """
        Function to extract name of the Scholar from University Page

        :param None:
        :return: Name of the Scholar
        :rtype: `str`
        :raises KeyError: If the profile has no ``name`` field
        """
        return self.user_dict['name']

    def get_email(self):
        """
        Function to extract email of the Scholar from University Page

        :param None:
        :return: Value of ``primaryEmail``, None when unavailable
        :rtype: `str`
        """
        return self._optional_field('primaryEmail')

    def get_title(self):
        """
        Function to extract Preferred title of the Scholar from University
        Page

        :param None:
        :return: Preferred title of the Scholar
        :rtype: `str`
        :raises KeyError: If the profile has no ``preferredTitle`` field
        """
        return self.user_dict['preferredTitle']

    def get_department_info(self):
        """
        Function to extract Department info (including course area) of the
        Scholar from University Page

        For every position the label of its first organization is the course
        area and the label of that organization's first parent is the
        department.

        :param None:
        :return: ``||``-joined course areas and ``||``-joined departments;
            ``(None, None)`` when the profile has no ``positions``
        :rtype: `Tuple (str, str)`
        """
        try:
            position_list = self.user_dict["positions"]
        except self._MISSING_DATA_ERRORS:
            return None, None

        course_area = []
        department = []
        try:
            for position in position_list:
                first_org = position['organizations'][0]
                course_area.append(first_org['label'])
                department.append(first_org['parent'][0]['label'])
        except self._MISSING_DATA_ERRORS:
            # Best effort: stop at the first malformed position and keep
            # whatever was collected before it.
            pass

        try:
            return "||".join(course_area), "||".join(department)
        except TypeError:
            # A label was not a string.
            return None, None

    def get_overview(self):
        """
        Function to extract Overview of the Scholar from University Page

        :param None:
        :return: Value of ``overview``, None when unavailable
        :rtype: `str`
        """
        return self._optional_field('overview')

    def get_keywords(self):
        """
        Function to extract keywords of the Scholar from University Page

        :param None:
        :return: ``||``-joined keywords, None when unavailable
        :rtype: `str`
        """
        try:
            return "||".join(self.user_dict['keywords'])
        except self._MISSING_DATA_ERRORS:
            return None

    def get_npublications(self):
        """
        Function to get the number of publications of the Scholar from
        University Page

        :param None:
        :return: Number of entries under ``publications``, None when
            unavailable
        :rtype: `int`
        """
        try:
            return len(self.user_dict["publications"])
        except self._MISSING_DATA_ERRORS:
            return None

    def get_publications(self):
        """
        Function to extract publications of the Scholar from University Page

        :param None:
        :return: ``publications`` serialized as a JSON string, None when
            unavailable
        :rtype: `str`
        """
        try:
            return json.dumps(self.user_dict['publications'])
        except self._MISSING_DATA_ERRORS:
            return None

    def get_research(self):
        """
        Function to extract Research areas of the Scholar from University
        Page

        :param None:
        :return: ``||``-joined labels under ``researcherOn`` and their count;
            ``(None, 0)`` when unavailable
        :rtype: `Tuple (str, int)`
        """
        return self._joined_labels("researcherOn")

    def get_awards(self):
        """
        Function to extract Awards and Honors of the Scholar from University
        Page

        :param None:
        :return: ``||``-joined labels under ``awardsAndHonors`` and their
            count; ``(None, 0)`` when unavailable
        :rtype: `Tuple (str, int)`
        """
        return self._joined_labels("awardsAndHonors")

    def get_organizations(self):
        """
        Function to extract Organizations of the Scholar from University Page

        :param None:
        :return: Value of ``organizations`` exactly as stored in the profile,
            None when unavailable
        """
        return self._optional_field('organizations')

    def get_department(self):
        """
        Function to extract Department of the Scholar from University Page

        :param None:
        :return: Value of ``schools`` exactly as stored in the profile, None
            when unavailable
        """
        return self._optional_field('schools')

    def get_netid(self):
        """
        Function to extract NetID (University Unique Identifier) of the
        Scholar from University Page

        :param None:
        :return: Value of ``netid``, None when unavailable
        :rtype: `str`
        """
        return self._optional_field('netid')

    def _build_profile_frame(self, user_id):
        """
        Build the one-row frame for the profile currently in
        ``self.user_dict``

        :param user_id: The university provided User ID of the scholar
        :type user_id: `str`
        :return: One-row frame with ``PROFILE_COLUMNS``; an empty frame with
            the same columns when a mandatory field (name, preferred title)
            is missing or a value cannot be stored in a single row
        :rtype: `Pandas.DataFrame`
        """
        course, dept = self.get_department_info()
        research, r_len = self.get_research()
        awards, a_len = self.get_awards()

        try:
            org = "||".join(self.get_organizations())
        except TypeError:
            # Field missing (None) or its entries are not plain strings.
            org = None

        try:
            return pd.DataFrame({
                "User_id": user_id,
                "Netid": self.get_netid(),
                "Name": self.get_name(),
                "Email": self.get_email(),
                "Type": self.get_title(),
                "Overview": self.get_overview(),
                "Keywords": self.get_keywords(),
                "n_publications": self.get_npublications(),
                "Publications": self.get_publications(),
                "Research": research,
                "n_research": r_len,
                "Awards": awards,
                "n_awards": a_len,
                "Organizations": org,
                "Course": course,
                "Department": dept
            }, index=[0])
        except self._MISSING_DATA_ERRORS + (ValueError,):
            # An empty frame contributes no rows when the profiles are
            # concatenated, so an unusable profile is simply left out.
            return pd.DataFrame(columns=self.PROFILE_COLUMNS)

    def get_profile(self, url, user_id):
        """
        Function to extract all details of a scholar from University Page

        :param url: The base university URL from which Scholars' data can be
            extracted by appending their user_ids
        :type url: `str`
        :param user_id: The university provided User ID of the scholar
        :type user_id: `str`
        :return: Scholar Data in the form of Pandas.DataFrame (one row, or
            empty when the profile is unusable)
        :rtype: `Pandas.DataFrame`
        """
        self.extract_info(url, user_id)
        return self._build_profile_frame(user_id)

    def extract_profiles(self):
        """
        Function to compile Scholar data of a particular university. The
        function requests the listing of all scholars, downloads the profile
        of each one and saves the combined table as CSV in the output folder.

        :param None:
        :return: None
        """
        # One listing request sized to the total number of scholars.
        final_json = extract_json(self.base_url, self.end_url, self.n_scholars)
        user_jsons = final_json['_embedded']['individual']

        # get_userid returns the np.nan singleton for records without an id.
        user_ids = [get_userid(record, "id") for record in user_jsons]
        user_ids = [uid for uid in user_ids if uid is not np.nan]
        print("Total ids extracted for scholars", len(user_ids))

        # For each scholar, go to his/her summary page and extract relevant
        # data
        profiles = [
            self.get_profile(self.profile_url, uid) for uid in tqdm(user_ids)
        ]

        # Unusable profiles come back as empty frames; drop them before
        # concatenating, and fall back to an empty table because pd.concat
        # raises on an empty list.
        usable = [frame for frame in profiles if not frame.empty]
        if usable:
            df = pd.concat(usable).reset_index(drop=True)
        else:
            df = pd.DataFrame(columns=self.PROFILE_COLUMNS)

        # Save User profiles
        save_pandas_to_csv(
            df=df,
            output_path=os.path.join(
                self.output_path,
                self.scholars_dataset),
            index=False)
if __name__ == "__main__":
    # Read arguments from command line (cmd). If no input via cmd, use config
    # file
    parser = argparse.ArgumentParser(description="Parameter file")
    parser.add_argument(
        '--config_file',
        metavar='FILENAME',
        type=str,
        default='config.yml',
        help='Parameter file name in yaml format')
    parser.add_argument(
        '--univ_name',
        metavar='UNIV_NAME',
        type=str,
        default='TAMU',
        choices=['TAMU', 'UFL'],
        help='NAME of University')
    parser.add_argument(
        '--output_path',
        metavar='OUTPUT_PATH',
        type=str,
        default='',
        help='Path for saving output file')

    # NOTE: `args` and `params` are intentionally module-level names; the
    # extractor class reads them as globals.
    args = parser.parse_args()

    print("\n\nCreating User Profile")

    try:
        # The context manager closes the file even when parsing fails.
        with open(args.config_file) as config_stream:
            params = yaml.safe_load(config_stream)
    except (OSError, yaml.YAMLError):
        # Only unreadable or malformed config files are reported here;
        # Ctrl-C and other interpreter-level exits propagate unchanged.
        print(f'Error loading parameter file: {args.config_file}.')
        sys.exit(1)

    profile_extractor_object = extract_user_profiles(
        args.univ_name, args.output_path)
    profile_extractor_object.extract_profiles()

    print("TASK COMPLETED : Successfully created User profiles")