Source code for main_extractor

from Agency_proposal_extractor.NIH_Extractor import NIHExtractor
from Agency_proposal_extractor.NSF_Extractor import NSFExtractor


import argparse
import yaml
import sys
import os
import pdb

import pandas as pd


class AgencyDataExtractor():
    """
    Extract proposal data from the web pages of the configured agencies.

    Currently supported agencies: NIH, NSF.
    """

    def __init__(self, n_cores, agencies, params):
        """
        Constructor.

        :param n_cores: Number of CPU cores to be used; ``0`` selects
            ``params['CPU_COUNT']``
        :type n_cores: `int`
        :param agencies: List of agency names from which data is to be
            extracted; an empty (or missing) list selects
            ``params['AGENCIES']``
        :type agencies: `List`
        :param params: Dictionary of default parameter values from the
            CONFIG.yml file
        :type params: `Dict`
        :return: None
        """
        self.n_cores = params['CPU_COUNT'] if n_cores == 0 else n_cores
        self.agencies_filenames = params['AGENCIES_FILENAME_DICT']
        # Truthiness check so that None / () also fall back to the configured
        # default instead of failing later when the agencies are iterated.
        self.agencies = agencies if agencies else params['AGENCIES']
        # OUTPUT_PATH is joined onto the directory containing this module, not
        # onto the current working directory.
        self.output_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            params['OUTPUT_PATH'])
        # Maps the agency name (as it appears in the 'AgencyName' column) to
        # the extractor class that handles that agency.
        self.agency_extractors = {
            'National Institutes of Health': NIHExtractor,
            'National Science Foundation': NSFExtractor}
        self.extracted_agencies_filenames = params[
            'AGENCIES_EXTRACTED_FILENAME_DICT']

    def extract_agency_proposals(self):
        """
        Run the extractor of every selected agency.

        For each agency the corresponding CSV file is read from the output
        path, the rows of that agency are selected, and their
        ``AdditionalInformationURL`` values are handed to the agency's
        extractor, whose ``extract_all`` is then called with the output path.

        A failure for one agency is reported and does not stop the remaining
        agencies. Interpreter-level signals (``KeyboardInterrupt``,
        ``SystemExit``) are deliberately not caught so the run can be aborted.

        :return: None
        """
        for agency in self.agencies:
            try:
                data = pd.read_csv(
                    os.path.join(
                        self.output_path,
                        self.agencies_filenames[agency]))
                urls = data[data['AgencyName'] ==
                            agency]['AdditionalInformationURL'].values
                extractor = self.agency_extractors[agency](
                    data=data,
                    urls=urls,
                    save_filename=self.extracted_agencies_filenames[agency])
                extractor.extract_all(
                    n_cores=self.n_cores,
                    output_path=self.output_path)
                print("Completed extraction for agency - :", agency)
            except Exception as err:
                # Best effort per agency: report the cause and carry on with
                # the next one rather than aborting the whole run.
                print("Error for Agency : ", agency, "-", repr(err))
if __name__ == "__main__":
    # Read arguments from the command line. Values not supplied there fall
    # back to the argparse defaults below and, inside AgencyDataExtractor, to
    # the values of the YAML config file.
    parser = argparse.ArgumentParser(description="Parameter file")
    parser.add_argument(
        '--config_file',
        metavar='FILENAME',
        type=str,
        default='config.yml',
        help='Parameter file name in yaml format')
    parser.add_argument(
        '-a',
        '--agencies',
        metavar='AGENCIES',
        nargs="*",
        default=[
            'National Science Foundation',
            'National Institutes of Health'],
        help='Agencies whose proposals are to be extracted')
    parser.add_argument(
        '--n_cores',
        metavar='CPU_COUNT',
        type=int,
        default=0,
        help='No of CPU threads to be used')
    args = parser.parse_args()

    print("\n\nExtracting Proposals from Agencies")

    try:
        # Context manager so the config file handle is closed even when
        # parsing fails.
        with open(args.config_file) as config_handle:
            params = yaml.safe_load(config_handle)
    except (OSError, ValueError, yaml.YAMLError) as err:
        # OSError: missing/unreadable file; ValueError: undecodable bytes;
        # YAMLError: malformed YAML. Ctrl-C is intentionally not caught.
        print(f'Error loading parameter file: {args.config_file}.')
        print(f'Reason: {err}')
        sys.exit(1)

    extractor = AgencyDataExtractor(
        n_cores=args.n_cores,
        agencies=args.agencies,
        params=params)
    extractor.extract_agency_proposals()

    print("TASK COMPLETED : Completed Extracting Proposals")